1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h"		/* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 
49 
50 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
51 
52 static void
53 vect_free_slp_tree (slp_tree node)
54 {
55   int i;
56   slp_tree child;
57 
58   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
59     vect_free_slp_tree (child);
60 
61   gimple *stmt;
62   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
63     /* After transform some stmts are removed and thus their vinfo is gone.  */
64     if (vinfo_for_stmt (stmt))
65       {
66 	gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
67 	STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
68       }
69 
70   SLP_TREE_CHILDREN (node).release ();
71   SLP_TREE_SCALAR_STMTS (node).release ();
72   SLP_TREE_VEC_STMTS (node).release ();
73   SLP_TREE_LOAD_PERMUTATION (node).release ();
74 
75   free (node);
76 }
77 
78 
79 /* Free the memory allocated for the SLP instance.  */
80 
81 void
82 vect_free_slp_instance (slp_instance instance)
83 {
84   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
85   SLP_INSTANCE_LOADS (instance).release ();
86   free (instance);
87 }
88 
89 
90 /* Create an SLP node for SCALAR_STMTS.  */
91 
92 static slp_tree
93 vect_create_new_slp_node (vec<gimple *> scalar_stmts)
94 {
95   slp_tree node;
96   gimple *stmt = scalar_stmts[0];
97   unsigned int nops;
98 
99   if (is_gimple_call (stmt))
100     nops = gimple_call_num_args (stmt);
101   else if (is_gimple_assign (stmt))
102     {
103       nops = gimple_num_ops (stmt) - 1;
104       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
105 	nops++;
106     }
107   else if (gimple_code (stmt) == GIMPLE_PHI)
108     nops = 0;
109   else
110     return NULL;
111 
112   node = XNEW (struct _slp_tree);
113   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
114   SLP_TREE_VEC_STMTS (node).create (0);
115   SLP_TREE_CHILDREN (node).create (nops);
116   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
117   SLP_TREE_TWO_OPERATORS (node) = false;
118   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
119 
120   unsigned i;
121   FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
122     STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;
123 
124   return node;
125 }
126 
127 
128 /* This structure is used in creation of an SLP tree.  Each instance
129    corresponds to the same operand in a group of scalar stmts in an SLP
130    node.  */
131 typedef struct _slp_oprnd_info
132 {
133   /* Def-stmts for the operands.  */
134   vec<gimple *> def_stmts;
135   /* Information about the first statement, its vector def-type, type, the
136      operand itself in case it's constant, and an indication if it's a pattern
137      stmt.  */
138   tree first_op_type;
139   enum vect_def_type first_dt;
140   bool first_pattern;
141   bool second_pattern;
142 } *slp_oprnd_info;
143 
144 
145 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
146    operand.  */
147 static vec<slp_oprnd_info>
148 vect_create_oprnd_info (int nops, int group_size)
149 {
150   int i;
151   slp_oprnd_info oprnd_info;
152   vec<slp_oprnd_info> oprnds_info;
153 
154   oprnds_info.create (nops);
155   for (i = 0; i < nops; i++)
156     {
157       oprnd_info = XNEW (struct _slp_oprnd_info);
158       oprnd_info->def_stmts.create (group_size);
159       oprnd_info->first_dt = vect_uninitialized_def;
160       oprnd_info->first_op_type = NULL_TREE;
161       oprnd_info->first_pattern = false;
162       oprnd_info->second_pattern = false;
163       oprnds_info.quick_push (oprnd_info);
164     }
165 
166   return oprnds_info;
167 }
168 
169 
170 /* Free operands info.  */
171 
172 static void
173 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
174 {
175   int i;
176   slp_oprnd_info oprnd_info;
177 
178   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
179     {
180       oprnd_info->def_stmts.release ();
181       XDELETE (oprnd_info);
182     }
183 
184   oprnds_info.release ();
185 }
186 
187 
188 /* Find the place of the data-ref in STMT in the interleaving chain that starts
189    from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
190 
191 int
192 vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
193 {
194   gimple *next_stmt = first_stmt;
195   int result = 0;
196 
197   if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
198     return -1;
199 
200   do
201     {
202       if (next_stmt == stmt)
203 	return result;
204       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
205       if (next_stmt)
206 	result += GROUP_GAP (vinfo_for_stmt (next_stmt));
207     }
208   while (next_stmt);
209 
210   return -1;
211 }
212 
213 /* Check whether it is possible to load COUNT elements of type ELT_MODE
214    using the method implemented by duplicate_and_interleave.  Return true
215    if so, returning the number of intermediate vectors in *NVECTORS_OUT
216    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
217    (if nonnull).  */
218 
219 bool
220 can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
221 				unsigned int *nvectors_out,
222 				tree *vector_type_out,
223 				tree *permutes)
224 {
225   poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
226   poly_int64 nelts;
227   unsigned int nvectors = 1;
228   for (;;)
229     {
230       scalar_int_mode int_mode;
231       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
232       if (multiple_p (current_vector_size, elt_bytes, &nelts)
233 	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
234 	{
235 	  tree int_type = build_nonstandard_integer_type
236 	    (GET_MODE_BITSIZE (int_mode), 1);
237 	  tree vector_type = build_vector_type (int_type, nelts);
238 	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
239 	    {
240 	      vec_perm_builder sel1 (nelts, 2, 3);
241 	      vec_perm_builder sel2 (nelts, 2, 3);
242 	      poly_int64 half_nelts = exact_div (nelts, 2);
243 	      for (unsigned int i = 0; i < 3; ++i)
244 		{
245 		  sel1.quick_push (i);
246 		  sel1.quick_push (i + nelts);
247 		  sel2.quick_push (half_nelts + i);
248 		  sel2.quick_push (half_nelts + i + nelts);
249 		}
250 	      vec_perm_indices indices1 (sel1, 2, nelts);
251 	      vec_perm_indices indices2 (sel2, 2, nelts);
252 	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
253 		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
254 		{
255 		  if (nvectors_out)
256 		    *nvectors_out = nvectors;
257 		  if (vector_type_out)
258 		    *vector_type_out = vector_type;
259 		  if (permutes)
260 		    {
261 		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
262 								indices1);
263 		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
264 								indices2);
265 		    }
266 		  return true;
267 		}
268 	    }
269 	}
270       if (!multiple_p (elt_bytes, 2, &elt_bytes))
271 	return false;
272       nvectors *= 2;
273     }
274 }
275 
276 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
277    they are of a valid type and that they match the defs of the first stmt of
278    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
279    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
280    indicates swap is required for cond_expr stmts.  Specifically, *SWAP
281    is 1 if STMT is cond and operands of comparison need to be swapped;
282    *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
283    If there is any operand swap in this function, *SWAP is set to non-zero
284    value.
285    If there was a fatal error return -1; if the error could be corrected by
286    swapping operands of father node of this one, return 1; if everything is
287    ok return 0.  */
288 static int
289 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
290 			     vec<gimple *> stmts, unsigned stmt_num,
291 			     vec<slp_oprnd_info> *oprnds_info)
292 {
293   gimple *stmt = stmts[stmt_num];
294   tree oprnd;
295   unsigned int i, number_of_oprnds;
296   gimple *def_stmt;
297   enum vect_def_type dt = vect_uninitialized_def;
298   bool pattern = false;
299   slp_oprnd_info oprnd_info;
300   int first_op_idx = 1;
301   bool commutative = false;
302   bool first_op_cond = false;
303   bool first = stmt_num == 0;
304   bool second = stmt_num == 1;
305 
306   if (is_gimple_call (stmt))
307     {
308       number_of_oprnds = gimple_call_num_args (stmt);
309       first_op_idx = 3;
310     }
311   else if (is_gimple_assign (stmt))
312     {
313       enum tree_code code = gimple_assign_rhs_code (stmt);
314       number_of_oprnds = gimple_num_ops (stmt) - 1;
315       /* Swap can only be done for cond_expr if asked to, otherwise we
316 	 could result in different comparison code to the first stmt.  */
317       if (gimple_assign_rhs_code (stmt) == COND_EXPR
318 	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
319 	{
320 	  first_op_cond = true;
321 	  number_of_oprnds++;
322 	}
323       else
324 	commutative = commutative_tree_code (code);
325     }
326   else
327     return -1;
328 
329   bool swapped = (*swap != 0);
330   gcc_assert (!swapped || first_op_cond);
331   for (i = 0; i < number_of_oprnds; i++)
332     {
333 again:
334       if (first_op_cond)
335 	{
336 	  /* Map indicating how operands of cond_expr should be swapped.  */
337 	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
338 	  int *map = maps[*swap];
339 
340 	  if (i < 2)
341 	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
342 	  else
343 	    oprnd = gimple_op (stmt, map[i]);
344 	}
345       else
346 	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
347 
348       oprnd_info = (*oprnds_info)[i];
349 
350       if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
351 	{
352 	  if (dump_enabled_p ())
353 	    {
354 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 			       "Build SLP failed: can't analyze def for ");
356 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
357               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 	    }
359 
360 	  return -1;
361 	}
362 
363       /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
364          from the pattern.  Check that all the stmts of the node are in the
365          pattern.  */
366       if (def_stmt && gimple_bb (def_stmt)
367           && vect_stmt_in_region_p (vinfo, def_stmt)
368           && vinfo_for_stmt (def_stmt)
369           && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
370 	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
371 	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
372         {
373           pattern = true;
374           if (!first && !oprnd_info->first_pattern
375 	      /* Allow different pattern state for the defs of the
376 		 first stmt in reduction chains.  */
377 	      && (oprnd_info->first_dt != vect_reduction_def
378 		  || (!second && !oprnd_info->second_pattern)))
379 	    {
380 	      if (i == 0
381 		  && !swapped
382 		  && commutative)
383 		{
384 		  swapped = true;
385 		  goto again;
386 		}
387 
388 	      if (dump_enabled_p ())
389 		{
390 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 				   "Build SLP failed: some of the stmts"
392 				   " are in a pattern, and others are not ");
393 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
394                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
395 		}
396 
397 	      return 1;
398             }
399 
400           def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
401           dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
402 
403           if (dt == vect_unknown_def_type)
404             {
405               if (dump_enabled_p ())
406                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 				 "Unsupported pattern.\n");
408               return -1;
409             }
410 
411           switch (gimple_code (def_stmt))
412             {
413             case GIMPLE_PHI:
414             case GIMPLE_ASSIGN:
415 	      break;
416 
417 	    default:
418 	      if (dump_enabled_p ())
419 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 				 "unsupported defining stmt:\n");
421 	      return -1;
422             }
423         }
424 
425       if (second)
426 	oprnd_info->second_pattern = pattern;
427 
428       if (first)
429 	{
430 	  oprnd_info->first_dt = dt;
431 	  oprnd_info->first_pattern = pattern;
432 	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
433 	}
434       else
435 	{
436 	  /* Not first stmt of the group, check that the def-stmt/s match
437 	     the def-stmt/s of the first stmt.  Allow different definition
438 	     types for reduction chains: the first stmt must be a
439 	     vect_reduction_def (a phi node), and the rest
440 	     vect_internal_def.  */
441 	  tree type = TREE_TYPE (oprnd);
442 	  if ((oprnd_info->first_dt != dt
443 	       && !(oprnd_info->first_dt == vect_reduction_def
444 		    && dt == vect_internal_def)
445 	       && !((oprnd_info->first_dt == vect_external_def
446 		     || oprnd_info->first_dt == vect_constant_def)
447 		    && (dt == vect_external_def
448 			|| dt == vect_constant_def)))
449 	      || !types_compatible_p (oprnd_info->first_op_type, type))
450 	    {
451 	      /* Try swapping operands if we got a mismatch.  */
452 	      if (i == 0
453 		  && !swapped
454 		  && commutative)
455 		{
456 		  swapped = true;
457 		  goto again;
458 		}
459 
460 	      if (dump_enabled_p ())
461 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
462 				 "Build SLP failed: different types\n");
463 
464 	      return 1;
465 	    }
466 	  if ((dt == vect_constant_def
467 	       || dt == vect_external_def)
468 	      && !current_vector_size.is_constant ()
469 	      && (TREE_CODE (type) == BOOLEAN_TYPE
470 		  || !can_duplicate_and_interleave_p (stmts.length (),
471 						      TYPE_MODE (type))))
472 	    {
473 	      if (dump_enabled_p ())
474 		{
475 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 				   "Build SLP failed: invalid type of def "
477 				   "for variable-length SLP ");
478 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
479 		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 		}
481 	      return -1;
482 	    }
483 	}
484 
485       /* Check the types of the definitions.  */
486       switch (dt)
487 	{
488 	case vect_constant_def:
489 	case vect_external_def:
490 	  break;
491 
492 	case vect_reduction_def:
493 	case vect_induction_def:
494 	case vect_internal_def:
495 	  oprnd_info->def_stmts.quick_push (def_stmt);
496 	  break;
497 
498 	default:
499 	  /* FORNOW: Not supported.  */
500 	  if (dump_enabled_p ())
501 	    {
502 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
503 			       "Build SLP failed: illegal type of def ");
504 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
505               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
506 	    }
507 
508 	  return -1;
509 	}
510     }
511 
512   /* Swap operands.  */
513   if (swapped)
514     {
515       /* If there are already uses of this stmt in a SLP instance then
516          we've committed to the operand order and can't swap it.  */
517       if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
518 	{
519 	  if (dump_enabled_p ())
520 	    {
521 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 			       "Build SLP failed: cannot swap operands of "
523 			       "shared stmt ");
524 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
525 	    }
526 	  return -1;
527 	}
528 
529       if (first_op_cond)
530 	{
531 	  tree cond = gimple_assign_rhs1 (stmt);
532 	  enum tree_code code = TREE_CODE (cond);
533 
534 	  /* Swap.  */
535 	  if (*swap == 1)
536 	    {
537 	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
538 				 &TREE_OPERAND (cond, 1));
539 	      TREE_SET_CODE (cond, swap_tree_comparison (code));
540 	    }
541 	  /* Invert.  */
542 	  else
543 	    {
544 	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
545 				 gimple_assign_rhs3_ptr (stmt));
546 	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
547 	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
548 	      gcc_assert (code != ERROR_MARK);
549 	      TREE_SET_CODE (cond, code);
550 	    }
551 	}
552       else
553 	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
554 			   gimple_assign_rhs2_ptr (stmt));
555       if (dump_enabled_p ())
556 	{
557 	  dump_printf_loc (MSG_NOTE, vect_location,
558 			   "swapped operands to match def types in ");
559 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
560 	}
561     }
562 
563   *swap = swapped;
564   return 0;
565 }
566 
567 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
568    caller's attempt to find the vector type in STMT with the narrowest
569    element type.  Return true if VECTYPE is nonnull and if it is valid
570    for VINFO.  When returning true, update MAX_NUNITS to reflect the
571    number of units in VECTYPE.  VINFO, GORUP_SIZE and MAX_NUNITS are
572    as for vect_build_slp_tree.  */
573 
574 static bool
575 vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
576 			tree vectype, poly_uint64 *max_nunits)
577 {
578   if (!vectype)
579     {
580       if (dump_enabled_p ())
581 	{
582 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
583 			   "Build SLP failed: unsupported data-type in ");
584 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
585 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
586 	}
587       /* Fatal mismatch.  */
588       return false;
589     }
590 
591   /* If populating the vector type requires unrolling then fail
592      before adjusting *max_nunits for basic-block vectorization.  */
593   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
594   unsigned HOST_WIDE_INT const_nunits;
595   if (is_a <bb_vec_info> (vinfo)
596       && (!nunits.is_constant (&const_nunits)
597 	  || const_nunits > group_size))
598     {
599       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 		       "Build SLP failed: unrolling required "
601 		       "in basic block SLP\n");
602       /* Fatal mismatch.  */
603       return false;
604     }
605 
606   /* In case of multiple types we need to detect the smallest type.  */
607   vect_update_max_nunits (max_nunits, vectype);
608   return true;
609 }
610 
611 /* Verify if the scalar stmts STMTS are isomorphic, require data
612    permutation or are of unsupported types of operation.  Return
613    true if they are, otherwise return false and indicate in *MATCHES
614    which stmts are not isomorphic to the first one.  If MATCHES[0]
615    is false then this indicates the comparison could not be
616    carried out or the stmts will never be vectorized by SLP.
617 
618    Note COND_EXPR is possibly ismorphic to another one after swapping its
619    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
620    the first stmt by swapping the two operands of comparison; set SWAP[i]
621    to 2 if stmt I is isormorphic to the first stmt by inverting the code
622    of comparison.  Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
623    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
624 
625 static bool
626 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
627 		       vec<gimple *> stmts, unsigned int group_size,
628 		       unsigned nops, poly_uint64 *max_nunits,
629 		       bool *matches, bool *two_operators)
630 {
631   unsigned int i;
632   gimple *first_stmt = stmts[0], *stmt = stmts[0];
633   enum tree_code first_stmt_code = ERROR_MARK;
634   enum tree_code alt_stmt_code = ERROR_MARK;
635   enum tree_code rhs_code = ERROR_MARK;
636   enum tree_code first_cond_code = ERROR_MARK;
637   tree lhs;
638   bool need_same_oprnds = false;
639   tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
640   optab optab;
641   int icode;
642   machine_mode optab_op2_mode;
643   machine_mode vec_mode;
644   HOST_WIDE_INT dummy;
645   gimple *first_load = NULL, *prev_first_load = NULL;
646 
647   /* For every stmt in NODE find its def stmt/s.  */
648   FOR_EACH_VEC_ELT (stmts, i, stmt)
649     {
650       swap[i] = 0;
651       matches[i] = false;
652 
653       if (dump_enabled_p ())
654 	{
655 	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
656 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
657 	}
658 
659       /* Fail to vectorize statements marked as unvectorizable.  */
660       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
661         {
662           if (dump_enabled_p ())
663             {
664               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 			       "Build SLP failed: unvectorizable statement ");
666               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
667             }
668 	  /* Fatal mismatch.  */
669 	  matches[0] = false;
670           return false;
671         }
672 
673       lhs = gimple_get_lhs (stmt);
674       if (lhs == NULL_TREE)
675 	{
676 	  if (dump_enabled_p ())
677 	    {
678 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 			       "Build SLP failed: not GIMPLE_ASSIGN nor "
680 			       "GIMPLE_CALL ");
681 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
682 	    }
683 	  /* Fatal mismatch.  */
684 	  matches[0] = false;
685 	  return false;
686 	}
687 
688       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
689       vectype = get_vectype_for_scalar_type (scalar_type);
690       if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
691 				   max_nunits))
692 	{
693 	  /* Fatal mismatch.  */
694 	  matches[0] = false;
695           return false;
696         }
697 
698       if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
699 	{
700 	  rhs_code = CALL_EXPR;
701 	  if (gimple_call_internal_p (call_stmt)
702 	      || gimple_call_tail_p (call_stmt)
703 	      || gimple_call_noreturn_p (call_stmt)
704 	      || !gimple_call_nothrow_p (call_stmt)
705 	      || gimple_call_chain (call_stmt))
706 	    {
707 	      if (dump_enabled_p ())
708 		{
709 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
710 				   "Build SLP failed: unsupported call type ");
711 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
712 				    call_stmt, 0);
713 		}
714 	      /* Fatal mismatch.  */
715 	      matches[0] = false;
716 	      return false;
717 	    }
718 	}
719       else
720 	rhs_code = gimple_assign_rhs_code (stmt);
721 
722       /* Check the operation.  */
723       if (i == 0)
724 	{
725 	  first_stmt_code = rhs_code;
726 
727 	  /* Shift arguments should be equal in all the packed stmts for a
728 	     vector shift with scalar shift operand.  */
729 	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
730 	      || rhs_code == LROTATE_EXPR
731 	      || rhs_code == RROTATE_EXPR)
732 	    {
733 	      vec_mode = TYPE_MODE (vectype);
734 
735 	      /* First see if we have a vector/vector shift.  */
736 	      optab = optab_for_tree_code (rhs_code, vectype,
737 					   optab_vector);
738 
739 	      if (!optab
740 		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
741 		{
742 		  /* No vector/vector shift, try for a vector/scalar shift.  */
743 		  optab = optab_for_tree_code (rhs_code, vectype,
744 					       optab_scalar);
745 
746 		  if (!optab)
747 		    {
748 		      if (dump_enabled_p ())
749 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 					 "Build SLP failed: no optab.\n");
751 		      /* Fatal mismatch.  */
752 		      matches[0] = false;
753 		      return false;
754 		    }
755 		  icode = (int) optab_handler (optab, vec_mode);
756 		  if (icode == CODE_FOR_nothing)
757 		    {
758 		      if (dump_enabled_p ())
759 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
760 					 "Build SLP failed: "
761 					 "op not supported by target.\n");
762 		      /* Fatal mismatch.  */
763 		      matches[0] = false;
764 		      return false;
765 		    }
766 		  optab_op2_mode = insn_data[icode].operand[2].mode;
767 		  if (!VECTOR_MODE_P (optab_op2_mode))
768 		    {
769 		      need_same_oprnds = true;
770 		      first_op1 = gimple_assign_rhs2 (stmt);
771 		    }
772 		}
773 	    }
774 	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
775             {
776               need_same_oprnds = true;
777               first_op1 = gimple_assign_rhs2 (stmt);
778             }
779 	}
780       else
781 	{
782 	  if (first_stmt_code != rhs_code
783 	      && alt_stmt_code == ERROR_MARK)
784 	    alt_stmt_code = rhs_code;
785 	  if (first_stmt_code != rhs_code
786 	      && (first_stmt_code != IMAGPART_EXPR
787 		  || rhs_code != REALPART_EXPR)
788 	      && (first_stmt_code != REALPART_EXPR
789 		  || rhs_code != IMAGPART_EXPR)
790 	      /* Handle mismatches in plus/minus by computing both
791 		 and merging the results.  */
792 	      && !((first_stmt_code == PLUS_EXPR
793 		    || first_stmt_code == MINUS_EXPR)
794 		   && (alt_stmt_code == PLUS_EXPR
795 		       || alt_stmt_code == MINUS_EXPR)
796 		   && rhs_code == alt_stmt_code)
797               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
798                    && (first_stmt_code == ARRAY_REF
799                        || first_stmt_code == BIT_FIELD_REF
800                        || first_stmt_code == INDIRECT_REF
801                        || first_stmt_code == COMPONENT_REF
802                        || first_stmt_code == MEM_REF)))
803 	    {
804 	      if (dump_enabled_p ())
805 		{
806 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 				   "Build SLP failed: different operation "
808 				   "in stmt ");
809 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
810 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
811 				   "original stmt ");
812 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
813 				    first_stmt, 0);
814 		}
815 	      /* Mismatch.  */
816 	      continue;
817 	    }
818 
819 	  if (need_same_oprnds
820 	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
821 	    {
822 	      if (dump_enabled_p ())
823 		{
824 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
825 				   "Build SLP failed: different shift "
826 				   "arguments in ");
827 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
828 		}
829 	      /* Mismatch.  */
830 	      continue;
831 	    }
832 
833 	  if (rhs_code == CALL_EXPR)
834 	    {
835 	      gimple *first_stmt = stmts[0];
836 	      if (gimple_call_num_args (stmt) != nops
837 		  || !operand_equal_p (gimple_call_fn (first_stmt),
838 				       gimple_call_fn (stmt), 0)
839 		  || gimple_call_fntype (first_stmt)
840 		     != gimple_call_fntype (stmt))
841 		{
842 		  if (dump_enabled_p ())
843 		    {
844 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
845 				       "Build SLP failed: different calls in ");
846 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
847 					stmt, 0);
848 		    }
849 		  /* Mismatch.  */
850 		  continue;
851 		}
852 	    }
853 	}
854 
855       /* Grouped store or load.  */
856       if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
857 	{
858 	  if (REFERENCE_CLASS_P (lhs))
859 	    {
860 	      /* Store.  */
861 	      ;
862 	    }
863 	  else
864 	    {
865 	      /* Load.  */
866               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
867               if (prev_first_load)
868                 {
869                   /* Check that there are no loads from different interleaving
870                      chains in the same node.  */
871                   if (prev_first_load != first_load)
872                     {
873                       if (dump_enabled_p ())
874                         {
875                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
876 					   vect_location,
877 					   "Build SLP failed: different "
878 					   "interleaving chains in one node ");
879                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
880 					    stmt, 0);
881                         }
882 		      /* Mismatch.  */
883 		      continue;
884                     }
885                 }
886               else
887                 prev_first_load = first_load;
888            }
889         } /* Grouped access.  */
890       else
891 	{
892 	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
893 	    {
894 	      /* Not grouped load.  */
895 	      if (dump_enabled_p ())
896 		{
897 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 				   "Build SLP failed: not grouped load ");
899 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
900 		}
901 
902 	      /* FORNOW: Not grouped loads are not supported.  */
903 	      /* Fatal mismatch.  */
904 	      matches[0] = false;
905 	      return false;
906 	    }
907 
908 	  /* Not memory operation.  */
909 	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
910 	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
911 	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
912 	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
913 	      && rhs_code != CALL_EXPR)
914 	    {
915 	      if (dump_enabled_p ())
916 		{
917 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
918 				   "Build SLP failed: operation");
919 		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
920 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
921 		}
922 	      /* Fatal mismatch.  */
923 	      matches[0] = false;
924 	      return false;
925 	    }
926 
927 	  if (rhs_code == COND_EXPR)
928 	    {
929 	      tree cond_expr = gimple_assign_rhs1 (stmt);
930 	      enum tree_code cond_code = TREE_CODE (cond_expr);
931 	      enum tree_code swap_code = ERROR_MARK;
932 	      enum tree_code invert_code = ERROR_MARK;
933 
934 	      if (i == 0)
935 		first_cond_code = TREE_CODE (cond_expr);
936 	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
937 		{
938 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
939 		  swap_code = swap_tree_comparison (cond_code);
940 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
941 		}
942 
943 	      if (first_cond_code == cond_code)
944 		;
945 	      /* Isomorphic can be achieved by swapping.  */
946 	      else if (first_cond_code == swap_code)
947 		swap[i] = 1;
948 	      /* Isomorphic can be achieved by inverting.  */
949 	      else if (first_cond_code == invert_code)
950 		swap[i] = 2;
951 	      else
952 		{
953 		  if (dump_enabled_p ())
954 		    {
955 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 				       "Build SLP failed: different"
957 				       " operation");
958 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
959 					stmt, 0);
960 		    }
961 		  /* Mismatch.  */
962 		  continue;
963 		}
964 	    }
965 	}
966 
967       matches[i] = true;
968     }
969 
970   for (i = 0; i < group_size; ++i)
971     if (!matches[i])
972       return false;
973 
974   /* If we allowed a two-operation SLP node verify the target can cope
975      with the permute we are going to use.  */
976   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
977   if (alt_stmt_code != ERROR_MARK
978       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
979     {
980       unsigned HOST_WIDE_INT count;
981       if (!nunits.is_constant (&count))
982 	{
983 	  if (dump_enabled_p ())
984 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
985 			     "Build SLP failed: different operations "
986 			     "not allowed with variable-length SLP.\n");
987 	  return false;
988 	}
989       vec_perm_builder sel (count, count, 1);
990       for (i = 0; i < count; ++i)
991 	{
992 	  unsigned int elt = i;
993 	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
994 	    elt += count;
995 	  sel.quick_push (elt);
996 	}
997       vec_perm_indices indices (sel, 2, count);
998       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
999 	{
1000 	  for (i = 0; i < group_size; ++i)
1001 	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
1002 	      {
1003 		matches[i] = false;
1004 		if (dump_enabled_p ())
1005 		  {
1006 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1007 				     "Build SLP failed: different operation "
1008 				     "in stmt ");
1009 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1010 				      stmts[i], 0);
1011 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 				     "original stmt ");
1013 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1014 				      first_stmt, 0);
1015 		  }
1016 	      }
1017 	  return false;
1018 	}
1019       *two_operators = true;
1020     }
1021 
1022   return true;
1023 }
1024 
1025 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1026    Note we never remove apart from at destruction time so we do not
1027    need a special value for deleted that differs from empty.  */
1028 struct bst_traits
1029 {
1030   typedef vec <gimple *> value_type;
1031   typedef vec <gimple *> compare_type;
1032   static inline hashval_t hash (value_type);
1033   static inline bool equal (value_type existing, value_type candidate);
1034   static inline bool is_empty (value_type x) { return !x.exists (); }
1035   static inline bool is_deleted (value_type x) { return !x.exists (); }
1036   static inline void mark_empty (value_type &x) { x.release (); }
1037   static inline void mark_deleted (value_type &x) { x.release (); }
1038   static inline void remove (value_type &x) { x.release (); }
1039 };
1040 inline hashval_t
1041 bst_traits::hash (value_type x)
1042 {
1043   inchash::hash h;
1044   for (unsigned i = 0; i < x.length (); ++i)
1045     h.add_int (gimple_uid (x[i]));
1046   return h.end ();
1047 }
1048 inline bool
1049 bst_traits::equal (value_type existing, value_type candidate)
1050 {
1051   if (existing.length () != candidate.length ())
1052     return false;
1053   for (unsigned i = 0; i < existing.length (); ++i)
1054     if (existing[i] != candidate[i])
1055       return false;
1056   return true;
1057 }
1058 
1059 typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
1060 static scalar_stmts_set_t *bst_fail;
1061 
1062 static slp_tree
1063 vect_build_slp_tree_2 (vec_info *vinfo,
1064 		       vec<gimple *> stmts, unsigned int group_size,
1065 		       poly_uint64 *max_nunits,
1066 		       vec<slp_tree> *loads,
1067 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
1068 		       unsigned max_tree_size);
1069 
1070 static slp_tree
1071 vect_build_slp_tree (vec_info *vinfo,
1072 		     vec<gimple *> stmts, unsigned int group_size,
1073 		     poly_uint64 *max_nunits, vec<slp_tree> *loads,
1074 		     bool *matches, unsigned *npermutes, unsigned *tree_size,
1075 		     unsigned max_tree_size)
1076 {
1077   if (bst_fail->contains (stmts))
1078     return NULL;
1079   slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
1080 					loads, matches, npermutes, tree_size,
1081 					max_tree_size);
1082   /* When SLP build fails for stmts record this, otherwise SLP build
1083      can be exponential in time when we allow to construct parts from
1084      scalars, see PR81723.  */
1085   if (! res)
1086     {
1087       vec <gimple *> x;
1088       x.create (stmts.length ());
1089       x.splice (stmts);
1090       bst_fail->add (x);
1091     }
1092   return res;
1093 }
1094 
1095 /* Recursively build an SLP tree starting from NODE.
1096    Fail (and return a value not equal to zero) if def-stmts are not
1097    isomorphic, require data permutation or are of unsupported types of
1098    operation.  Otherwise, return 0.
1099    The value returned is the depth in the SLP tree where a mismatch
1100    was found.  */
1101 
1102 static slp_tree
1103 vect_build_slp_tree_2 (vec_info *vinfo,
1104 		       vec<gimple *> stmts, unsigned int group_size,
1105 		       poly_uint64 *max_nunits,
1106 		       vec<slp_tree> *loads,
1107 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
1108 		       unsigned max_tree_size)
1109 {
1110   unsigned nops, i, this_tree_size = 0;
1111   poly_uint64 this_max_nunits = *max_nunits;
1112   gimple *stmt;
1113   slp_tree node;
1114 
1115   matches[0] = false;
1116 
1117   stmt = stmts[0];
1118   if (is_gimple_call (stmt))
1119     nops = gimple_call_num_args (stmt);
1120   else if (is_gimple_assign (stmt))
1121     {
1122       nops = gimple_num_ops (stmt) - 1;
1123       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1124 	nops++;
1125     }
1126   else if (gimple_code (stmt) == GIMPLE_PHI)
1127     nops = 0;
1128   else
1129     return NULL;
1130 
1131   /* If the SLP node is a PHI (induction or reduction), terminate
1132      the recursion.  */
1133   if (gimple_code (stmt) == GIMPLE_PHI)
1134     {
1135       tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1136       tree vectype = get_vectype_for_scalar_type (scalar_type);
1137       if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
1138 				   max_nunits))
1139 	return NULL;
1140 
1141       vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
1142       /* Induction from different IVs is not supported.  */
1143       if (def_type == vect_induction_def)
1144 	{
1145 	  FOR_EACH_VEC_ELT (stmts, i, stmt)
1146 	    if (stmt != stmts[0])
1147 	      return NULL;
1148 	}
1149       else
1150 	{
1151 	  /* Else def types have to match.  */
1152 	  FOR_EACH_VEC_ELT (stmts, i, stmt)
1153 	    {
1154 	      /* But for reduction chains only check on the first stmt.  */
1155 	      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
1156 		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
1157 		continue;
1158 	      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
1159 		return NULL;
1160 	    }
1161 	}
1162       node = vect_create_new_slp_node (stmts);
1163       return node;
1164     }
1165 
1166 
1167   bool two_operators = false;
1168   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1169   if (!vect_build_slp_tree_1 (vinfo, swap,
1170 			      stmts, group_size, nops,
1171 			      &this_max_nunits, matches, &two_operators))
1172     return NULL;
1173 
1174   /* If the SLP node is a load, terminate the recursion.  */
1175   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1176       && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1177     {
1178       *max_nunits = this_max_nunits;
1179       node = vect_create_new_slp_node (stmts);
1180       loads->safe_push (node);
1181       return node;
1182     }
1183 
1184   /* Get at the operands, verifying they are compatible.  */
1185   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1186   slp_oprnd_info oprnd_info;
1187   FOR_EACH_VEC_ELT (stmts, i, stmt)
1188     {
1189       int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1190 					     stmts, i, &oprnds_info);
1191       if (res != 0)
1192 	matches[(res == -1) ? 0 : i] = false;
1193       if (!matches[0])
1194 	break;
1195     }
1196   for (i = 0; i < group_size; ++i)
1197     if (!matches[i])
1198       {
1199 	vect_free_oprnd_info (oprnds_info);
1200 	return NULL;
1201       }
1202 
1203   auto_vec<slp_tree, 4> children;
1204   auto_vec<slp_tree> this_loads;
1205 
1206   stmt = stmts[0];
1207 
1208   if (tree_size)
1209     max_tree_size -= *tree_size;
1210 
1211   /* Create SLP_TREE nodes for the definition node/s.  */
1212   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1213     {
1214       slp_tree child;
1215       unsigned old_nloads = this_loads.length ();
1216       unsigned old_tree_size = this_tree_size;
1217       unsigned int j;
1218 
1219       if (oprnd_info->first_dt != vect_internal_def
1220 	  && oprnd_info->first_dt != vect_reduction_def
1221 	  && oprnd_info->first_dt != vect_induction_def)
1222         continue;
1223 
1224       if (++this_tree_size > max_tree_size)
1225 	{
1226 	  FOR_EACH_VEC_ELT (children, j, child)
1227 	    vect_free_slp_tree (child);
1228 	  vect_free_oprnd_info (oprnds_info);
1229 	  return NULL;
1230 	}
1231 
1232       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1233 					group_size, &this_max_nunits,
1234 					&this_loads, matches, npermutes,
1235 					&this_tree_size,
1236 					max_tree_size)) != NULL)
1237 	{
1238 	  /* If we have all children of child built up from scalars then just
1239 	     throw that away and build it up this node from scalars.  */
1240 	  if (!SLP_TREE_CHILDREN (child).is_empty ()
1241 	      /* ???  Rejecting patterns this way doesn't work.  We'd have to
1242 		 do extra work to cancel the pattern so the uses see the
1243 		 scalar version.  */
1244 	      && !is_pattern_stmt_p
1245 	            (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1246 	    {
1247 	      slp_tree grandchild;
1248 
1249 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1250 		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1251 		  break;
1252 	      if (!grandchild)
1253 		{
1254 		  /* Roll back.  */
1255 		  this_loads.truncate (old_nloads);
1256 		  this_tree_size = old_tree_size;
1257 		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1258 		    vect_free_slp_tree (grandchild);
1259 		  SLP_TREE_CHILDREN (child).truncate (0);
1260 
1261 		  dump_printf_loc (MSG_NOTE, vect_location,
1262 				   "Building parent vector operands from "
1263 				   "scalars instead\n");
1264 		  oprnd_info->def_stmts = vNULL;
1265 		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
1266 		  children.safe_push (child);
1267 		  continue;
1268 		}
1269 	    }
1270 
1271 	  oprnd_info->def_stmts = vNULL;
1272 	  children.safe_push (child);
1273 	  continue;
1274 	}
1275 
1276       /* If the SLP build failed fatally and we analyze a basic-block
1277          simply treat nodes we fail to build as externally defined
1278 	 (and thus build vectors from the scalar defs).
1279 	 The cost model will reject outright expensive cases.
1280 	 ???  This doesn't treat cases where permutation ultimatively
1281 	 fails (or we don't try permutation below).  Ideally we'd
1282 	 even compute a permutation that will end up with the maximum
1283 	 SLP tree size...  */
1284       if (is_a <bb_vec_info> (vinfo)
1285 	  && !matches[0]
1286 	  /* ???  Rejecting patterns this way doesn't work.  We'd have to
1287 	     do extra work to cancel the pattern so the uses see the
1288 	     scalar version.  */
1289 	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1290 	{
1291 	  dump_printf_loc (MSG_NOTE, vect_location,
1292 			   "Building vector operands from scalars\n");
1293 	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
1294 	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
1295 	  children.safe_push (child);
1296 	  oprnd_info->def_stmts = vNULL;
1297 	  continue;
1298 	}
1299 
1300       /* If the SLP build for operand zero failed and operand zero
1301 	 and one can be commutated try that for the scalar stmts
1302 	 that failed the match.  */
1303       if (i == 0
1304 	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
1305 	  && matches[0]
1306 	  /* ???  For COND_EXPRs we can swap the comparison operands
1307 	     as well as the arms under some constraints.  */
1308 	  && nops == 2
1309 	  && oprnds_info[1]->first_dt == vect_internal_def
1310 	  && is_gimple_assign (stmt)
1311 	  /* Do so only if the number of not successful permutes was nor more
1312 	     than a cut-ff as re-trying the recursive match on
1313 	     possibly each level of the tree would expose exponential
1314 	     behavior.  */
1315 	  && *npermutes < 4)
1316 	{
1317 	  /* See whether we can swap the matching or the non-matching
1318 	     stmt operands.  */
1319 	  bool swap_not_matching = true;
1320 	  do
1321 	    {
1322 	      for (j = 0; j < group_size; ++j)
1323 		{
1324 		  if (matches[j] != !swap_not_matching)
1325 		    continue;
1326 		  gimple *stmt = stmts[j];
1327 		  /* Verify if we can swap operands of this stmt.  */
1328 		  if (!is_gimple_assign (stmt)
1329 		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1330 		    {
1331 		      if (!swap_not_matching)
1332 			goto fail;
1333 		      swap_not_matching = false;
1334 		      break;
1335 		    }
1336 		  /* Verify if we can safely swap or if we committed to a
1337 		     specific operand order already.
1338 		     ???  Instead of modifying GIMPLE stmts here we could
1339 		     record whether we want to swap operands in the SLP
1340 		     node and temporarily do that when processing it
1341 		     (or wrap operand accessors in a helper).  */
1342 		  else if (swap[j] != 0
1343 			   || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
1344 		    {
1345 		      if (!swap_not_matching)
1346 			{
1347 			  if (dump_enabled_p ())
1348 			    {
1349 			      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1350 					       vect_location,
1351 					       "Build SLP failed: cannot swap "
1352 					       "operands of shared stmt ");
1353 			      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
1354 						TDF_SLIM, stmts[j], 0);
1355 			    }
1356 			  goto fail;
1357 			}
1358 		      swap_not_matching = false;
1359 		      break;
1360 		    }
1361 		}
1362 	    }
1363 	  while (j != group_size);
1364 
1365 	  /* Swap mismatched definition stmts.  */
1366 	  dump_printf_loc (MSG_NOTE, vect_location,
1367 			   "Re-trying with swapped operands of stmts ");
1368 	  for (j = 0; j < group_size; ++j)
1369 	    if (matches[j] == !swap_not_matching)
1370 	      {
1371 		std::swap (oprnds_info[0]->def_stmts[j],
1372 			   oprnds_info[1]->def_stmts[j]);
1373 		dump_printf (MSG_NOTE, "%d ", j);
1374 	      }
1375 	  dump_printf (MSG_NOTE, "\n");
1376 	  /* And try again with scratch 'matches' ... */
1377 	  bool *tem = XALLOCAVEC (bool, group_size);
1378 	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1379 					    group_size, &this_max_nunits,
1380 					    &this_loads, tem, npermutes,
1381 					    &this_tree_size,
1382 					    max_tree_size)) != NULL)
1383 	    {
1384 	      /* ... so if successful we can apply the operand swapping
1385 		 to the GIMPLE IL.  This is necessary because for example
1386 		 vect_get_slp_defs uses operand indexes and thus expects
1387 		 canonical operand order.  This is also necessary even
1388 		 if we end up building the operand from scalars as
1389 		 we'll continue to process swapped operand two.  */
1390 	      for (j = 0; j < group_size; ++j)
1391 		{
1392 		  gimple *stmt = stmts[j];
1393 		  gimple_set_plf (stmt, GF_PLF_1, false);
1394 		}
1395 	      for (j = 0; j < group_size; ++j)
1396 		{
1397 		  gimple *stmt = stmts[j];
1398 		  if (matches[j] == !swap_not_matching)
1399 		    {
1400 		      /* Avoid swapping operands twice.  */
1401 		      if (gimple_plf (stmt, GF_PLF_1))
1402 			continue;
1403 		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1404 					 gimple_assign_rhs2_ptr (stmt));
1405 		      gimple_set_plf (stmt, GF_PLF_1, true);
1406 		    }
1407 		}
1408 	      /* Verify we swap all duplicates or none.  */
1409 	      if (flag_checking)
1410 		for (j = 0; j < group_size; ++j)
1411 		  {
1412 		    gimple *stmt = stmts[j];
1413 		    gcc_assert (gimple_plf (stmt, GF_PLF_1)
1414 				== (matches[j] == !swap_not_matching));
1415 		  }
1416 
1417 	      /* If we have all children of child built up from scalars then
1418 		 just throw that away and build it up this node from scalars.  */
1419 	      if (!SLP_TREE_CHILDREN (child).is_empty ()
1420 		  /* ???  Rejecting patterns this way doesn't work.  We'd have
1421 		     to do extra work to cancel the pattern so the uses see the
1422 		     scalar version.  */
1423 		  && !is_pattern_stmt_p
1424 			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1425 		{
1426 		  unsigned int j;
1427 		  slp_tree grandchild;
1428 
1429 		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1430 		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1431 		      break;
1432 		  if (!grandchild)
1433 		    {
1434 		      /* Roll back.  */
1435 		      this_loads.truncate (old_nloads);
1436 		      this_tree_size = old_tree_size;
1437 		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1438 			vect_free_slp_tree (grandchild);
1439 		      SLP_TREE_CHILDREN (child).truncate (0);
1440 
1441 		      dump_printf_loc (MSG_NOTE, vect_location,
1442 				       "Building parent vector operands from "
1443 				       "scalars instead\n");
1444 		      oprnd_info->def_stmts = vNULL;
1445 		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
1446 		      children.safe_push (child);
1447 		      continue;
1448 		    }
1449 		}
1450 
1451 	      oprnd_info->def_stmts = vNULL;
1452 	      children.safe_push (child);
1453 	      continue;
1454 	    }
1455 
1456 	  ++*npermutes;
1457 	}
1458 
1459 fail:
1460       gcc_assert (child == NULL);
1461       FOR_EACH_VEC_ELT (children, j, child)
1462 	vect_free_slp_tree (child);
1463       vect_free_oprnd_info (oprnds_info);
1464       return NULL;
1465     }
1466 
1467   vect_free_oprnd_info (oprnds_info);
1468 
1469   if (tree_size)
1470     *tree_size += this_tree_size;
1471   *max_nunits = this_max_nunits;
1472   loads->safe_splice (this_loads);
1473 
1474   node = vect_create_new_slp_node (stmts);
1475   SLP_TREE_TWO_OPERATORS (node) = two_operators;
1476   SLP_TREE_CHILDREN (node).splice (children);
1477   return node;
1478 }
1479 
1480 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
1481 
1482 static void
1483 vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
1484 {
1485   int i;
1486   gimple *stmt;
1487   slp_tree child;
1488 
1489   dump_printf_loc (dump_kind, loc, "node%s\n",
1490 		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
1491 		   ? " (external)" : "");
1492   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1493     {
1494       dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
1495       dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1496     }
1497   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1498     vect_print_slp_tree (dump_kind, loc, child);
1499 }
1500 
1501 
1502 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1503    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1504    J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1505    stmts in NODE are to be marked.  */
1506 
1507 static void
1508 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1509 {
1510   int i;
1511   gimple *stmt;
1512   slp_tree child;
1513 
1514   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1515     return;
1516 
1517   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1518     if (j < 0 || i == j)
1519       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1520 
1521   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1522     vect_mark_slp_stmts (child, mark, j);
1523 }
1524 
1525 
1526 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1527 
1528 static void
1529 vect_mark_slp_stmts_relevant (slp_tree node)
1530 {
1531   int i;
1532   gimple *stmt;
1533   stmt_vec_info stmt_info;
1534   slp_tree child;
1535 
1536   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1537     return;
1538 
1539   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1540     {
1541       stmt_info = vinfo_for_stmt (stmt);
1542       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1543                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1544       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1545     }
1546 
1547   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1548     vect_mark_slp_stmts_relevant (child);
1549 }
1550 
1551 
1552 /* Rearrange the statements of NODE according to PERMUTATION.  */
1553 
1554 static void
1555 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1556                           vec<unsigned> permutation)
1557 {
1558   gimple *stmt;
1559   vec<gimple *> tmp_stmts;
1560   unsigned int i;
1561   slp_tree child;
1562 
1563   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1564     vect_slp_rearrange_stmts (child, group_size, permutation);
1565 
1566   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1567   tmp_stmts.create (group_size);
1568   tmp_stmts.quick_grow_cleared (group_size);
1569 
1570   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1571     tmp_stmts[permutation[i]] = stmt;
1572 
1573   SLP_TREE_SCALAR_STMTS (node).release ();
1574   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1575 }
1576 
1577 
1578 /* Attempt to reorder stmts in a reduction chain so that we don't
1579    require any load permutation.  Return true if that was possible,
1580    otherwise return false.  */
1581 
1582 static bool
1583 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1584 {
1585   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1586   unsigned int i, j;
1587   unsigned int lidx;
1588   slp_tree node, load;
1589 
1590   /* Compare all the permutation sequences to the first one.  We know
1591      that at least one load is permuted.  */
1592   node = SLP_INSTANCE_LOADS (slp_instn)[0];
1593   if (!node->load_permutation.exists ())
1594     return false;
1595   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1596     {
1597       if (!load->load_permutation.exists ())
1598 	return false;
1599       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1600 	if (lidx != node->load_permutation[j])
1601 	  return false;
1602     }
1603 
1604   /* Check that the loads in the first sequence are different and there
1605      are no gaps between them.  */
1606   auto_sbitmap load_index (group_size);
1607   bitmap_clear (load_index);
1608   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1609     {
1610       if (lidx >= group_size)
1611 	return false;
1612       if (bitmap_bit_p (load_index, lidx))
1613 	return false;
1614 
1615       bitmap_set_bit (load_index, lidx);
1616     }
1617   for (i = 0; i < group_size; i++)
1618     if (!bitmap_bit_p (load_index, i))
1619       return false;
1620 
1621   /* This permutation is valid for reduction.  Since the order of the
1622      statements in the nodes is not important unless they are memory
1623      accesses, we can rearrange the statements in all the nodes
1624      according to the order of the loads.  */
1625   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1626 			    node->load_permutation);
1627 
1628   /* We are done, no actual permutations need to be generated.  */
1629   poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1630   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1631     {
1632       gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1633       first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
1634       /* But we have to keep those permutations that are required because
1635          of handling of gaps.  */
1636       if (known_eq (unrolling_factor, 1U)
1637 	  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1638 	      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
1639 	SLP_TREE_LOAD_PERMUTATION (node).release ();
1640       else
1641 	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1642 	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1643     }
1644 
1645   return true;
1646 }
1647 
1648 /* Check if the required load permutations in the SLP instance
1649    SLP_INSTN are supported.  */
1650 
1651 static bool
1652 vect_supported_load_permutation_p (slp_instance slp_instn)
1653 {
1654   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1655   unsigned int i, j, k, next;
1656   slp_tree node;
1657   gimple *stmt, *load, *next_load;
1658 
1659   if (dump_enabled_p ())
1660     {
1661       dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1662       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1663 	if (node->load_permutation.exists ())
1664 	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1665 	    dump_printf (MSG_NOTE, "%d ", next);
1666 	else
1667 	  for (k = 0; k < group_size; ++k)
1668 	    dump_printf (MSG_NOTE, "%d ", k);
1669       dump_printf (MSG_NOTE, "\n");
1670     }
1671 
1672   /* In case of reduction every load permutation is allowed, since the order
1673      of the reduction statements is not important (as opposed to the case of
1674      grouped stores).  The only condition we need to check is that all the
1675      load nodes are of the same size and have the same permutation (and then
1676      rearrange all the nodes of the SLP instance according to this
1677      permutation).  */
1678 
1679   /* Check that all the load nodes are of the same size.  */
1680   /* ???  Can't we assert this? */
1681   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1682     if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1683       return false;
1684 
1685   node = SLP_INSTANCE_TREE (slp_instn);
1686   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1687 
1688   /* Reduction (there are no data-refs in the root).
1689      In reduction chain the order of the loads is not important.  */
1690   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1691       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1692     vect_attempt_slp_rearrange_stmts (slp_instn);
1693 
1694   /* In basic block vectorization we allow any subchain of an interleaving
1695      chain.
1696      FORNOW: not supported in loop SLP because of realignment compications.  */
1697   if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1698     {
1699       /* Check whether the loads in an instance form a subchain and thus
1700          no permutation is necessary.  */
1701       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1702         {
1703 	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1704 	    continue;
1705 	  bool subchain_p = true;
1706           next_load = NULL;
1707           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1708             {
1709               if (j != 0
1710 		  && (next_load != load
1711 		      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
1712 		{
1713 		  subchain_p = false;
1714 		  break;
1715 		}
1716               next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1717             }
1718 	  if (subchain_p)
1719 	    SLP_TREE_LOAD_PERMUTATION (node).release ();
1720 	  else
1721 	    {
1722 	      stmt_vec_info group_info
1723 		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1724 	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
1725 	      unsigned HOST_WIDE_INT nunits;
1726 	      unsigned k, maxk = 0;
1727 	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1728 		if (k > maxk)
1729 		  maxk = k;
1730 	      /* In BB vectorization we may not actually use a loaded vector
1731 		 accessing elements in excess of GROUP_SIZE.  */
1732 	      tree vectype = STMT_VINFO_VECTYPE (group_info);
1733 	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1734 		  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
1735 		{
1736 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 				   "BB vectorization with gaps at the end of "
1738 				   "a load is not supported\n");
1739 		  return false;
1740 		}
1741 
1742 	      /* Verify the permutation can be generated.  */
1743 	      vec<tree> tem;
1744 	      unsigned n_perms;
1745 	      if (!vect_transform_slp_perm_load (node, tem, NULL,
1746 						 1, slp_instn, true, &n_perms))
1747 		{
1748 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1749 				   vect_location,
1750 				   "unsupported load permutation\n");
1751 		  return false;
1752 		}
1753 	    }
1754         }
1755       return true;
1756     }
1757 
1758   /* For loop vectorization verify we can generate the permutation.  Be
1759      conservative about the vectorization factor, there are permutations
1760      that will use three vector inputs only starting from a specific factor
1761      and the vectorization factor is not yet final.
1762      ???  The SLP instance unrolling factor might not be the maximum one.  */
1763   unsigned n_perms;
1764   poly_uint64 test_vf
1765     = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1766 			     LOOP_VINFO_VECT_FACTOR
1767 			     (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
1768   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1769     if (node->load_permutation.exists ()
1770 	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1771 					  slp_instn, true, &n_perms))
1772       return false;
1773 
1774   return true;
1775 }
1776 
1777 
1778 /* Find the last store in SLP INSTANCE.  */
1779 
1780 gimple *
1781 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1782 {
1783   gimple *last = NULL, *stmt;
1784 
1785   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1786     {
1787       stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1788       if (is_pattern_stmt_p (stmt_vinfo))
1789 	last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1790       else
1791 	last = get_later_stmt (stmt, last);
1792     }
1793 
1794   return last;
1795 }
1796 
1797 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
1798 
1799 static void
1800 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1801 			 stmt_vector_for_cost *prologue_cost_vec,
1802 			 stmt_vector_for_cost *body_cost_vec,
1803 			 unsigned ncopies_for_cost,
1804 			 scalar_stmts_set_t* visited)
1805 {
1806   unsigned i, j;
1807   slp_tree child;
1808   gimple *stmt;
1809   stmt_vec_info stmt_info;
1810   tree lhs;
1811 
1812   /* If we already costed the exact same set of scalar stmts we're done.
1813      We share the generated vector stmts for those.  */
1814   if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
1815     return;
1816 
1817   visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());
1818 
1819   /* Recurse down the SLP tree.  */
1820   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1821     if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1822       vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1823 			       body_cost_vec, ncopies_for_cost, visited);
1824 
1825   /* Look at the first scalar stmt to determine the cost.  */
1826   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1827   stmt_info = vinfo_for_stmt (stmt);
1828   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1829     {
1830       vect_memory_access_type memory_access_type
1831 	= (STMT_VINFO_STRIDED_P (stmt_info)
1832 	   ? VMAT_STRIDED_SLP
1833 	   : VMAT_CONTIGUOUS);
1834       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1835 	vect_model_store_cost (stmt_info, ncopies_for_cost,
1836 			       memory_access_type, VLS_STORE,
1837 			       node, prologue_cost_vec, body_cost_vec);
1838       else
1839 	{
1840 	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1841 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1842 	    {
1843 	      /* If the load is permuted then the alignment is determined by
1844 		 the first group element not by the first scalar stmt DR.  */
1845 	      stmt = GROUP_FIRST_ELEMENT (stmt_info);
1846 	      stmt_info = vinfo_for_stmt (stmt);
1847 	      /* Record the cost for the permutation.  */
1848 	      unsigned n_perms;
1849 	      vect_transform_slp_perm_load (node, vNULL, NULL,
1850 					    ncopies_for_cost, instance, true,
1851 					    &n_perms);
1852 	      record_stmt_cost (body_cost_vec, n_perms, vec_perm,
1853 				stmt_info, 0, vect_body);
1854 	      unsigned assumed_nunits
1855 		= vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
1856 	      /* And adjust the number of loads performed.  This handles
1857 	         redundancies as well as loads that are later dead.  */
1858 	      auto_sbitmap perm (GROUP_SIZE (stmt_info));
1859 	      bitmap_clear (perm);
1860 	      for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
1861 		bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
1862 	      ncopies_for_cost = 0;
1863 	      bool load_seen = false;
1864 	      for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
1865 		{
1866 		  if (i % assumed_nunits == 0)
1867 		    {
1868 		      if (load_seen)
1869 			ncopies_for_cost++;
1870 		      load_seen = false;
1871 		    }
1872 		  if (bitmap_bit_p (perm, i))
1873 		    load_seen = true;
1874 		}
1875 	      if (load_seen)
1876 		ncopies_for_cost++;
1877 	      gcc_assert (ncopies_for_cost
1878 			  <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
1879 			      + assumed_nunits - 1) / assumed_nunits);
1880 	      poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1881 	      ncopies_for_cost *= estimated_poly_value (uf);
1882 	    }
1883 	  /* Record the cost for the vector loads.  */
1884 	  vect_model_load_cost (stmt_info, ncopies_for_cost,
1885 				memory_access_type, node, prologue_cost_vec,
1886 				body_cost_vec);
1887 	  return;
1888 	}
1889     }
1890   else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1891     {
1892       /* ncopies_for_cost is the number of IVs we generate.  */
1893       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1894 			stmt_info, 0, vect_body);
1895 
1896       /* Prologue cost for the initial values and step vector.  */
1897       record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1898 			CONSTANT_CLASS_P
1899 			  (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1900 			     (stmt_info))
1901 			? vector_load : vec_construct,
1902 			stmt_info, 0, vect_prologue);
1903       record_stmt_cost (prologue_cost_vec, 1,
1904 			CONSTANT_CLASS_P
1905 			  (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1906 			? vector_load : vec_construct,
1907 			stmt_info, 0, vect_prologue);
1908 
1909       /* ???  No easy way to get at the actual number of vector stmts
1910          to be geneated and thus the derived IVs.  */
1911     }
1912   else
1913     {
1914       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1915 			stmt_info, 0, vect_body);
1916       if (SLP_TREE_TWO_OPERATORS (node))
1917 	{
1918 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1919 			    stmt_info, 0, vect_body);
1920 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1921 			    stmt_info, 0, vect_body);
1922 	}
1923     }
1924 
1925   /* Push SLP node def-type to stmts.  */
1926   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1927     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1928       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1929 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1930 
1931   /* Scan operands and account for prologue cost of constants/externals.
1932      ???  This over-estimates cost for multiple uses and should be
1933      re-engineered.  */
1934   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1935   lhs = gimple_get_lhs (stmt);
1936   for (i = 0; i < gimple_num_ops (stmt); ++i)
1937     {
1938       tree op = gimple_op (stmt, i);
1939       gimple *def_stmt;
1940       enum vect_def_type dt;
1941       if (!op || op == lhs)
1942 	continue;
1943       if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
1944 	  && (dt == vect_constant_def || dt == vect_external_def))
1945 	{
1946 	  /* Without looking at the actual initializer a vector of
1947 	     constants can be implemented as load from the constant pool.
1948 	     When all elements are the same we can use a splat.  */
1949 	  tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
1950 	  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
1951 	  unsigned num_vects_to_check;
1952 	  unsigned HOST_WIDE_INT const_nunits;
1953 	  unsigned nelt_limit;
1954 	  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
1955 	      && ! multiple_p (const_nunits, group_size))
1956 	    {
1957 	      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
1958 	      nelt_limit = const_nunits;
1959 	    }
1960 	  else
1961 	    {
1962 	      /* If either the vector has variable length or the vectors
1963 	         are composed of repeated whole groups we only need to
1964 		 cost construction once.  All vectors will be the same.  */
1965 	      num_vects_to_check = 1;
1966 	      nelt_limit = group_size;
1967 	    }
1968 	  tree elt = NULL_TREE;
1969 	  unsigned nelt = 0;
1970 	  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
1971 	    {
1972 	      unsigned si = j % group_size;
1973 	      if (nelt == 0)
1974 		elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
1975 	      /* ???  We're just tracking whether all operands of a single
1976 		 vector initializer are the same, ideally we'd check if
1977 		 we emitted the same one already.  */
1978 	      else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
1979 		elt = NULL_TREE;
1980 	      nelt++;
1981 	      if (nelt == nelt_limit)
1982 		{
1983 		  /* ???  We need to pass down stmt_info for a vector type
1984 		     even if it points to the wrong stmt.  */
1985 		  record_stmt_cost (prologue_cost_vec, 1,
1986 				    dt == vect_external_def
1987 				    ? (elt ? scalar_to_vec : vec_construct)
1988 				    : vector_load,
1989 				    stmt_info, 0, vect_prologue);
1990 		  nelt = 0;
1991 		}
1992 	    }
1993 	}
1994     }
1995 
1996   /* Restore stmt def-types.  */
1997   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1998     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1999       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
2000 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
2001 }
2002 
2003 /* Compute the cost for the SLP instance INSTANCE.  */
2004 
2005 static void
2006 vect_analyze_slp_cost (slp_instance instance, void *data, scalar_stmts_set_t *visited)
2007 {
2008   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
2009   unsigned ncopies_for_cost;
2010   stmt_info_for_cost *si;
2011   unsigned i;
2012 
2013   /* Calculate the number of vector stmts to create based on the unrolling
2014      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
2015      GROUP_SIZE / NUNITS otherwise.  */
2016   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
2017   slp_tree node = SLP_INSTANCE_TREE (instance);
2018   stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2019   /* Get the estimated vectorization factor, which is always one for
2020      basic-block vectorization.  */
2021   unsigned int assumed_vf;
2022   if (STMT_VINFO_LOOP_VINFO (stmt_info))
2023     assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
2024   else
2025     assumed_vf = 1;
2026   /* For reductions look at a reduction operand in case the reduction
2027      operation is widening like DOT_PROD or SAD.  */
2028   tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
2029   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2030     {
2031       gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2032       switch (gimple_assign_rhs_code (stmt))
2033 	{
2034 	case DOT_PROD_EXPR:
2035 	case SAD_EXPR:
2036 	  vectype_for_cost = get_vectype_for_scalar_type
2037 	    (TREE_TYPE (gimple_assign_rhs1 (stmt)));
2038 	  break;
2039 	default:;
2040 	}
2041     }
2042   unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
2043   ncopies_for_cost = (least_common_multiple (assumed_nunits,
2044 					     group_size * assumed_vf)
2045 		      / assumed_nunits);
2046 
2047   prologue_cost_vec.create (10);
2048   body_cost_vec.create (10);
2049   vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
2050 			   &prologue_cost_vec, &body_cost_vec,
2051 			   ncopies_for_cost, visited);
2052 
2053   /* Record the prologue costs, which were delayed until we were
2054      sure that SLP was successful.  */
2055   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
2056     {
2057       struct _stmt_vec_info *stmt_info
2058 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2059       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2060 			    si->misalign, vect_prologue);
2061     }
2062 
2063   /* Record the instance's instructions in the target cost model.  */
2064   FOR_EACH_VEC_ELT (body_cost_vec, i, si)
2065     {
2066       struct _stmt_vec_info *stmt_info
2067 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2068       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2069 			    si->misalign, vect_body);
2070     }
2071 
2072   prologue_cost_vec.release ();
2073   body_cost_vec.release ();
2074 }
2075 
2076 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2077    one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
2078    the first GROUP1_SIZE stmts, since stores are consecutive), the second
2079    containing the remainder.
2080    Return the first stmt in the second group.  */
2081 
2082 static gimple *
2083 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2084 {
2085   stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2086   gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2087   gcc_assert (group1_size > 0);
2088   int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2089   gcc_assert (group2_size > 0);
2090   GROUP_SIZE (first_vinfo) = group1_size;
2091 
2092   gimple *stmt = first_stmt;
2093   for (unsigned i = group1_size; i > 1; i--)
2094     {
2095       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2096       gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2097     }
2098   /* STMT is now the last element of the first group.  */
2099   gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2100   GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2101 
2102   GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2103   for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2104     {
2105       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2106       gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2107     }
2108 
2109   /* For the second group, the GROUP_GAP is that before the original group,
2110      plus skipping over the first vector.  */
2111   GROUP_GAP (vinfo_for_stmt (group2)) =
2112     GROUP_GAP (first_vinfo) + group1_size;
2113 
2114   /* GROUP_GAP of the first group now has to skip over the second group too.  */
2115   GROUP_GAP (first_vinfo) += group2_size;
2116 
2117   if (dump_enabled_p ())
2118     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2119 		     group1_size, group2_size);
2120 
2121   return group2;
2122 }
2123 
2124 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2125    statements and a vector of NUNITS elements.  */
2126 
2127 static poly_uint64
2128 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2129 {
2130   return exact_div (common_multiple (nunits, group_size), group_size);
2131 }
2132 
2133 /* Analyze an SLP instance starting from a group of grouped stores.  Call
2134    vect_build_slp_tree to build a tree of packed stmts if possible.
2135    Return FALSE if it's impossible to SLP any stmt in the loop.  */
2136 
2137 static bool
2138 vect_analyze_slp_instance (vec_info *vinfo,
2139 			   gimple *stmt, unsigned max_tree_size)
2140 {
2141   slp_instance new_instance;
2142   slp_tree node;
2143   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2144   tree vectype, scalar_type = NULL_TREE;
2145   gimple *next;
2146   unsigned int i;
2147   vec<slp_tree> loads;
2148   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
2149   vec<gimple *> scalar_stmts;
2150 
2151   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2152     {
2153       if (dr)
2154         {
2155           scalar_type = TREE_TYPE (DR_REF (dr));
2156           vectype = get_vectype_for_scalar_type (scalar_type);
2157         }
2158       else
2159         {
2160           gcc_assert (is_a <loop_vec_info> (vinfo));
2161           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2162         }
2163 
2164       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2165     }
2166   else
2167     {
2168       gcc_assert (is_a <loop_vec_info> (vinfo));
2169       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2170       group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2171     }
2172 
2173   if (!vectype)
2174     {
2175       if (dump_enabled_p ())
2176         {
2177           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 			   "Build SLP failed: unsupported data-type ");
2179           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
2180           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2181         }
2182 
2183       return false;
2184     }
2185   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2186 
2187   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
2188   scalar_stmts.create (group_size);
2189   next = stmt;
2190   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2191     {
2192       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
2193       while (next)
2194         {
2195 	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
2196 	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2197 	    scalar_stmts.safe_push (
2198 		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2199 	  else
2200             scalar_stmts.safe_push (next);
2201           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2202         }
2203       /* Mark the first element of the reduction chain as reduction to properly
2204 	 transform the node.  In the reduction analysis phase only the last
2205 	 element of the chain is marked as reduction.  */
2206       if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2207 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2208     }
2209   else
2210     {
2211       /* Collect reduction statements.  */
2212       vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2213       for (i = 0; reductions.iterate (i, &next); i++)
2214 	scalar_stmts.safe_push (next);
2215     }
2216 
2217   loads.create (group_size);
2218 
2219   /* Build the tree for the SLP instance.  */
2220   bool *matches = XALLOCAVEC (bool, group_size);
2221   unsigned npermutes = 0;
2222   bst_fail = new scalar_stmts_set_t ();
2223   poly_uint64 max_nunits = nunits;
2224   node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2225 			      &max_nunits, &loads, matches, &npermutes,
2226 			      NULL, max_tree_size);
2227   delete bst_fail;
2228   if (node != NULL)
2229     {
2230       /* Calculate the unrolling factor based on the smallest type.  */
2231       poly_uint64 unrolling_factor
2232 	= calculate_unrolling_factor (max_nunits, group_size);
2233 
2234       if (maybe_ne (unrolling_factor, 1U)
2235 	  && is_a <bb_vec_info> (vinfo))
2236 	{
2237 	  unsigned HOST_WIDE_INT const_max_nunits;
2238 	  if (!max_nunits.is_constant (&const_max_nunits)
2239 	      || const_max_nunits > group_size)
2240 	    {
2241 	      if (dump_enabled_p ())
2242 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 				 "Build SLP failed: store group "
2244 				 "size not a multiple of the vector size "
2245 				 "in basic block SLP\n");
2246 	      vect_free_slp_tree (node);
2247 	      loads.release ();
2248 	      return false;
2249 	    }
2250 	  /* Fatal mismatch.  */
2251 	  matches[group_size / const_max_nunits * const_max_nunits] = false;
2252 	  vect_free_slp_tree (node);
2253 	  loads.release ();
2254 	}
2255       else
2256 	{
2257       /* Create a new SLP instance.  */
2258       new_instance = XNEW (struct _slp_instance);
2259       SLP_INSTANCE_TREE (new_instance) = node;
2260       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2261       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2262       SLP_INSTANCE_LOADS (new_instance) = loads;
2263 
2264       /* Compute the load permutation.  */
2265       slp_tree load_node;
2266       bool loads_permuted = false;
2267       FOR_EACH_VEC_ELT (loads, i, load_node)
2268 	{
2269 	  vec<unsigned> load_permutation;
2270 	  int j;
2271 	  gimple *load, *first_stmt;
2272 	  bool this_load_permuted = false;
2273 	  load_permutation.create (group_size);
2274 	  first_stmt = GROUP_FIRST_ELEMENT
2275 	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2276 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2277 	    {
2278 		  int load_place = vect_get_place_in_interleaving_chain
2279 				     (load, first_stmt);
2280 	      gcc_assert (load_place != -1);
2281 	      if (load_place != j)
2282 		this_load_permuted = true;
2283 	      load_permutation.safe_push (load_place);
2284 	    }
2285 	  if (!this_load_permuted
2286 	      /* The load requires permutation when unrolling exposes
2287 	         a gap either because the group is larger than the SLP
2288 		 group-size or because there is a gap between the groups.  */
2289 	      && (known_eq (unrolling_factor, 1U)
2290 		  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2291 		      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2292 	    {
2293 	      load_permutation.release ();
2294 	      continue;
2295 	    }
2296 	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2297 	  loads_permuted = true;
2298 	}
2299 
2300       if (loads_permuted)
2301         {
2302           if (!vect_supported_load_permutation_p (new_instance))
2303             {
2304               if (dump_enabled_p ())
2305                 {
2306                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 				   "Build SLP failed: unsupported load "
2308 				   "permutation ");
2309 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2310 					TDF_SLIM, stmt, 0);
2311                 }
2312               vect_free_slp_instance (new_instance);
2313               return false;
2314             }
2315         }
2316 
2317 	  /* If the loads and stores can be handled with load/store-lan
2318 	 instructions do not generate this SLP instance.  */
2319       if (is_a <loop_vec_info> (vinfo)
2320 	  && loads_permuted
2321 	  && dr && vect_store_lanes_supported (vectype, group_size, false))
2322 	{
2323 	  slp_tree load_node;
2324 	  FOR_EACH_VEC_ELT (loads, i, load_node)
2325 	    {
2326 	      gimple *first_stmt = GROUP_FIRST_ELEMENT
2327 		  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2328 	      stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2329 		  /* Use SLP for strided accesses (or if we
2330 		     can't load-lanes).  */
2331 	      if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2332 		  || ! vect_load_lanes_supported
2333 			(STMT_VINFO_VECTYPE (stmt_vinfo),
2334 			 GROUP_SIZE (stmt_vinfo), false))
2335 		break;
2336 	    }
2337 	  if (i == loads.length ())
2338 	    {
2339 	      if (dump_enabled_p ())
2340 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2341 				 "Built SLP cancelled: can use "
2342 				 "load/store-lanes\n");
2343 	      vect_free_slp_instance (new_instance);
2344 	      return false;
2345 	    }
2346 	}
2347 
2348       vinfo->slp_instances.safe_push (new_instance);
2349 
2350       if (dump_enabled_p ())
2351 	{
2352 	  dump_printf_loc (MSG_NOTE, vect_location,
2353 			   "Final SLP tree for instance:\n");
2354 	  vect_print_slp_tree (MSG_NOTE, vect_location, node);
2355 	}
2356 
2357       return true;
2358     }
2359     }
2360   else
2361     {
2362   /* Failed to SLP.  */
2363   /* Free the allocated memory.  */
2364   scalar_stmts.release ();
2365   loads.release ();
2366     }
2367 
2368   /* For basic block SLP, try to break the group up into multiples of the
2369      vector size.  */
2370   unsigned HOST_WIDE_INT const_nunits;
2371   if (is_a <bb_vec_info> (vinfo)
2372       && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2373       && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2374       && nunits.is_constant (&const_nunits))
2375     {
2376       /* We consider breaking the group only on VF boundaries from the existing
2377 	 start.  */
2378       for (i = 0; i < group_size; i++)
2379 	if (!matches[i]) break;
2380 
2381       if (i >= const_nunits && i < group_size)
2382 	{
2383 	  /* Split into two groups at the first vector boundary before i.  */
2384 	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2385 	  unsigned group1_size = i & ~(const_nunits - 1);
2386 
2387 	  gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2388 	  bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2389 	  /* If the first non-match was in the middle of a vector,
2390 	     skip the rest of that vector.  */
2391 	  if (group1_size < i)
2392 	    {
2393 	      i = group1_size + const_nunits;
2394 	      if (i < group_size)
2395 		rest = vect_split_slp_store_group (rest, const_nunits);
2396 	    }
2397 	  if (i < group_size)
2398 	    res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2399 	  return res;
2400 	}
2401       /* Even though the first vector did not all match, we might be able to SLP
2402 	 (some) of the remainder.  FORNOW ignore this possibility.  */
2403     }
2404 
2405   return false;
2406 }
2407 
2408 
2409 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
2410    trees of packed scalar stmts if SLP is possible.  */
2411 
2412 bool
2413 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2414 {
2415   unsigned int i;
2416   gimple *first_element;
2417 
2418   if (dump_enabled_p ())
2419     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2420 
2421   /* Find SLP sequences starting from groups of grouped stores.  */
2422   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2423     vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2424 
2425   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2426     {
2427       if (loop_vinfo->reduction_chains.length () > 0)
2428 	{
2429 	  /* Find SLP sequences starting from reduction chains.  */
2430 	  FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2431 	    if (! vect_analyze_slp_instance (vinfo, first_element,
2432 					     max_tree_size))
2433 	      {
2434 		/* Dissolve reduction chain group.  */
2435 		gimple *next, *stmt = first_element;
2436 		while (stmt)
2437 		  {
2438 		    stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2439 		    next = GROUP_NEXT_ELEMENT (vinfo);
2440 		    GROUP_FIRST_ELEMENT (vinfo) = NULL;
2441 		    GROUP_NEXT_ELEMENT (vinfo) = NULL;
2442 		    stmt = next;
2443 		  }
2444 		STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2445 		  = vect_internal_def;
2446 	      }
2447 	}
2448 
2449       /* Find SLP sequences starting from groups of reductions.  */
2450       if (loop_vinfo->reductions.length () > 1)
2451 	vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2452 				   max_tree_size);
2453     }
2454 
2455   return true;
2456 }
2457 
2458 
2459 /* For each possible SLP instance decide whether to SLP it and calculate overall
2460    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
2461    least one instance.  */
2462 
2463 bool
2464 vect_make_slp_decision (loop_vec_info loop_vinfo)
2465 {
2466   unsigned int i;
2467   poly_uint64 unrolling_factor = 1;
2468   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2469   slp_instance instance;
2470   int decided_to_slp = 0;
2471 
2472   if (dump_enabled_p ())
2473     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2474                      "\n");
2475 
2476   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2477     {
2478       /* FORNOW: SLP if you can.  */
2479       /* All unroll factors have the form current_vector_size * X for some
2480 	 rational X, so they must have a common multiple.  */
2481       unrolling_factor
2482 	= force_common_multiple (unrolling_factor,
2483 				 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2484 
2485       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
2486 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2487 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
2488       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2489       decided_to_slp++;
2490     }
2491 
2492   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2493 
2494   if (decided_to_slp && dump_enabled_p ())
2495     {
2496       dump_printf_loc (MSG_NOTE, vect_location,
2497 		       "Decided to SLP %d instances. Unrolling factor ",
2498 		       decided_to_slp);
2499       dump_dec (MSG_NOTE, unrolling_factor);
2500       dump_printf (MSG_NOTE, "\n");
2501     }
2502 
2503   return (decided_to_slp > 0);
2504 }
2505 
2506 
2507 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2508    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
2509 
2510 static void
2511 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2512 {
2513   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2514   imm_use_iterator imm_iter;
2515   gimple *use_stmt;
2516   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2517   slp_tree child;
2518   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2519   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2520   int j;
2521 
2522   /* Propagate hybrid down the SLP tree.  */
2523   if (stype == hybrid)
2524     ;
2525   else if (HYBRID_SLP_STMT (stmt_vinfo))
2526     stype = hybrid;
2527   else
2528     {
2529       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
2530       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2531       /* If we get a pattern stmt here we have to use the LHS of the
2532          original stmt for immediate uses.  */
2533       if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2534 	  && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2535 	stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2536       tree def;
2537       if (gimple_code (stmt) == GIMPLE_PHI)
2538 	def = gimple_phi_result (stmt);
2539       else
2540 	def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2541       if (def)
2542 	FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2543 	  {
2544 	    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2545 	      continue;
2546 	    use_vinfo = vinfo_for_stmt (use_stmt);
2547 	    if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2548 		&& STMT_VINFO_RELATED_STMT (use_vinfo))
2549 	      use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2550 	    if (!STMT_SLP_TYPE (use_vinfo)
2551 		&& (STMT_VINFO_RELEVANT (use_vinfo)
2552 		    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2553 		&& !(gimple_code (use_stmt) == GIMPLE_PHI
2554 		     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2555 	      {
2556 		if (dump_enabled_p ())
2557 		  {
2558 		    dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2559 				     "def in non-SLP stmt: ");
2560 		    dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2561 		  }
2562 		stype = hybrid;
2563 	      }
2564 	  }
2565     }
2566 
2567   if (stype == hybrid
2568       && !HYBRID_SLP_STMT (stmt_vinfo))
2569     {
2570       if (dump_enabled_p ())
2571 	{
2572 	  dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2573 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2574 	}
2575       STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2576     }
2577 
2578   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2579     if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2580       vect_detect_hybrid_slp_stmts (child, i, stype);
2581 }
2582 
2583 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
2584 
2585 static tree
2586 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2587 {
2588   walk_stmt_info *wi = (walk_stmt_info *)data;
2589   struct loop *loopp = (struct loop *)wi->info;
2590 
2591   if (wi->is_lhs)
2592     return NULL_TREE;
2593 
2594   if (TREE_CODE (*tp) == SSA_NAME
2595       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2596     {
2597       gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2598       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2599 	  && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2600 	{
2601 	  if (dump_enabled_p ())
2602 	    {
2603 	      dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2604 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2605 	    }
2606 	  STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2607 	}
2608     }
2609 
2610   return NULL_TREE;
2611 }
2612 
2613 static tree
2614 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2615 			  walk_stmt_info *)
2616 {
2617   stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2618   /* If the stmt is in a SLP instance then this isn't a reason
2619      to mark use definitions in other SLP instances as hybrid.  */
2620   if (! STMT_SLP_TYPE (use_vinfo)
2621       && (STMT_VINFO_RELEVANT (use_vinfo)
2622 	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2623       && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2624 	    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2625     ;
2626   else
2627     *handled = true;
2628   return NULL_TREE;
2629 }
2630 
2631 /* Find stmts that must be both vectorized and SLPed.  */
2632 
2633 void
2634 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2635 {
2636   unsigned int i;
2637   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2638   slp_instance instance;
2639 
2640   if (dump_enabled_p ())
2641     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2642                      "\n");
2643 
2644   /* First walk all pattern stmt in the loop and mark defs of uses as
2645      hybrid because immediate uses in them are not recorded.  */
2646   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2647     {
2648       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2649       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2650 	   gsi_next (&gsi))
2651 	{
2652 	  gimple *stmt = gsi_stmt (gsi);
2653 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2654 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2655 	    {
2656 	      walk_stmt_info wi;
2657 	      memset (&wi, 0, sizeof (wi));
2658 	      wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2659 	      gimple_stmt_iterator gsi2
2660 		= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2661 	      walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2662 				vect_detect_hybrid_slp_1, &wi);
2663 	      walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2664 			       vect_detect_hybrid_slp_2,
2665 			       vect_detect_hybrid_slp_1, &wi);
2666 	    }
2667 	}
2668     }
2669 
2670   /* Then walk the SLP instance trees marking stmts with uses in
2671      non-SLP stmts as hybrid, also propagating hybrid down the
2672      SLP tree, collecting the above info on-the-fly.  */
2673   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2674     {
2675       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2676 	vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2677 				      i, pure_slp);
2678     }
2679 }
2680 
2681 
2682 /* Initialize a bb_vec_info struct for the statements between
2683    REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */
2684 
2685 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2686 			    gimple_stmt_iterator region_end_in)
2687   : vec_info (vec_info::bb, init_cost (NULL)),
2688     bb (gsi_bb (region_begin_in)),
2689     region_begin (region_begin_in),
2690     region_end (region_end_in)
2691 {
2692   gimple_stmt_iterator gsi;
2693 
2694   for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2695        gsi_next (&gsi))
2696     {
2697       gimple *stmt = gsi_stmt (gsi);
2698       gimple_set_uid (stmt, 0);
2699       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
2700     }
2701 
2702   bb->aux = this;
2703 }
2704 
2705 
2706 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2707    stmts in the basic block.  */
2708 
2709 _bb_vec_info::~_bb_vec_info ()
2710 {
2711   for (gimple_stmt_iterator si = region_begin;
2712        gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2713     {
2714       gimple *stmt = gsi_stmt (si);
2715       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2716 
2717       if (stmt_info)
2718         /* Free stmt_vec_info.  */
2719         free_stmt_vec_info (stmt);
2720 
2721       /* Reset region marker.  */
2722       gimple_set_uid (stmt, -1);
2723     }
2724 
2725   bb->aux = NULL;
2726 }
2727 
2728 
2729 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2730    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2731 
2732    Return true if the operations are supported.  */
2733 
2734 static bool
2735 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2736 				  slp_instance node_instance)
2737 {
2738   bool dummy;
2739   int i, j;
2740   gimple *stmt;
2741   slp_tree child;
2742 
2743   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2744     return true;
2745 
2746   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2747     if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2748       return false;
2749 
2750   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2751   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2752   gcc_assert (stmt_info);
2753   gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2754 
2755   /* For BB vectorization vector types are assigned here.
2756      Memory accesses already got their vector type assigned
2757      in vect_analyze_data_refs.  */
2758   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2759   if (bb_vinfo
2760       && ! STMT_VINFO_DATA_REF (stmt_info))
2761     {
2762       gcc_assert (PURE_SLP_STMT (stmt_info));
2763 
2764       tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2765       if (dump_enabled_p ())
2766 	{
2767 	  dump_printf_loc (MSG_NOTE, vect_location,
2768 			   "get vectype for scalar type:  ");
2769 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2770 	  dump_printf (MSG_NOTE, "\n");
2771 	}
2772 
2773       tree vectype = get_vectype_for_scalar_type (scalar_type);
2774       if (!vectype)
2775 	{
2776 	  if (dump_enabled_p ())
2777 	    {
2778 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779 			       "not SLPed: unsupported data-type ");
2780 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2781 				 scalar_type);
2782 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2783 	    }
2784 	  return false;
2785 	}
2786 
2787       if (dump_enabled_p ())
2788 	{
2789 	  dump_printf_loc (MSG_NOTE, vect_location, "vectype:  ");
2790 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2791 	  dump_printf (MSG_NOTE, "\n");
2792 	}
2793 
2794       gimple *sstmt;
2795       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2796 	STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2797     }
2798 
2799   /* Calculate the number of vector statements to be created for the
2800      scalar stmts in this node.  For SLP reductions it is equal to the
2801      number of vector statements in the children (which has already been
2802      calculated by the recursive call).  Otherwise it is the number of
2803      scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2804      VF divided by the number of elements in a vector.  */
2805   if (GROUP_FIRST_ELEMENT (stmt_info)
2806       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2807     SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2808       = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2809   else
2810     {
2811       poly_uint64 vf;
2812       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2813 	vf = loop_vinfo->vectorization_factor;
2814       else
2815 	vf = 1;
2816       unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2817       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2818       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2819 	= vect_get_num_vectors (vf * group_size, vectype);
2820     }
2821 
2822   /* ???  We have to catch the case late where two first scalar stmts appear
2823      in multiple SLP children with different def type and fail.  Remember
2824      original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2825      match it when that is vect_internal_def.  */
2826   auto_vec<vect_def_type, 4> dt;
2827   dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2828   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2829     dt[j]
2830       = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]));
2831 
2832   /* Push SLP node def-type to stmt operands.  */
2833   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2834     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2835       STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2836 	= SLP_TREE_DEF_TYPE (child);
2837 
2838   /* Check everything worked out.  */
2839   bool res = true;
2840   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2841    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2842      {
2843        if (STMT_VINFO_DEF_TYPE
2844 	     (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2845 	   != SLP_TREE_DEF_TYPE (child))
2846 	 res = false;
2847      }
2848    else if (STMT_VINFO_DEF_TYPE
2849 	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) != dt[j])
2850      res = false;
2851   if (!res && dump_enabled_p ())
2852     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853 		     "not vectorized: same operand with different "
2854 		     "def type in stmt.\n");
2855   if (res)
2856     res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2857 
2858   /* Restore def-types.  */
2859   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2860     STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2861       = dt[j];
2862 
2863   return res;
2864 }
2865 
2866 
2867 /* Analyze statements in SLP instances of VINFO.  Return true if the
2868    operations are supported. */
2869 
2870 bool
2871 vect_slp_analyze_operations (vec_info *vinfo)
2872 {
2873   slp_instance instance;
2874   int i;
2875 
2876   if (dump_enabled_p ())
2877     dump_printf_loc (MSG_NOTE, vect_location,
2878 		     "=== vect_slp_analyze_operations ===\n");
2879 
2880   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2881     {
2882       if (!vect_slp_analyze_node_operations (vinfo,
2883 					     SLP_INSTANCE_TREE (instance),
2884 					     instance))
2885         {
2886 	  dump_printf_loc (MSG_NOTE, vect_location,
2887 			   "removing SLP instance operations starting from: ");
2888 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2889 			    SLP_TREE_SCALAR_STMTS
2890 			      (SLP_INSTANCE_TREE (instance))[0], 0);
2891 	  vect_free_slp_instance (instance);
2892           vinfo->slp_instances.ordered_remove (i);
2893 	}
2894       else
2895 	i++;
2896     }
2897 
2898   if (dump_enabled_p ())
2899     dump_printf_loc (MSG_NOTE, vect_location,
2900 		     "=== vect_analyze_slp_cost ===\n");
2901 
2902   /* Compute the costs of the SLP instances.  */
2903   scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
2904   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
2905     vect_analyze_slp_cost (instance, vinfo->target_cost_data, visited);
2906   delete visited;
2907 
2908   return !vinfo->slp_instances.is_empty ();
2909 }
2910 
2911 
2912 /* Compute the scalar cost of the SLP node NODE and its children
2913    and return it.  Do not account defs that are marked in LIFE and
2914    update LIFE according to uses of NODE.  */
2915 
2916 static unsigned
2917 vect_bb_slp_scalar_cost (basic_block bb,
2918 			 slp_tree node, vec<bool, va_heap> *life)
2919 {
2920   unsigned scalar_cost = 0;
2921   unsigned i;
2922   gimple *stmt;
2923   slp_tree child;
2924 
2925   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2926     {
2927       unsigned stmt_cost;
2928       ssa_op_iter op_iter;
2929       def_operand_p def_p;
2930       stmt_vec_info stmt_info;
2931 
2932       if ((*life)[i])
2933 	continue;
2934 
2935       /* If there is a non-vectorized use of the defs then the scalar
2936          stmt is kept live in which case we do not account it or any
2937 	 required defs in the SLP children in the scalar cost.  This
2938 	 way we make the vectorization more costly when compared to
2939 	 the scalar cost.  */
2940       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2941 	{
2942 	  imm_use_iterator use_iter;
2943 	  gimple *use_stmt;
2944 	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2945 	    if (!is_gimple_debug (use_stmt)
2946 		&& (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2947 					     use_stmt)
2948 		    || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2949 	      {
2950 		(*life)[i] = true;
2951 		BREAK_FROM_IMM_USE_STMT (use_iter);
2952 	      }
2953 	}
2954       if ((*life)[i])
2955 	continue;
2956 
2957       /* Count scalar stmts only once.  */
2958       if (gimple_visited_p (stmt))
2959 	continue;
2960       gimple_set_visited (stmt, true);
2961 
2962       stmt_info = vinfo_for_stmt (stmt);
2963       if (STMT_VINFO_DATA_REF (stmt_info))
2964         {
2965           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2966             stmt_cost = vect_get_stmt_cost (scalar_load);
2967           else
2968             stmt_cost = vect_get_stmt_cost (scalar_store);
2969         }
2970       else
2971         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2972 
2973       scalar_cost += stmt_cost;
2974     }
2975 
2976   auto_vec<bool, 20> subtree_life;
2977   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2978     {
2979       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2980 	{
2981 	  /* Do not directly pass LIFE to the recursive call, copy it to
2982 	     confine changes in the callee to the current child/subtree.  */
2983 	  subtree_life.safe_splice (*life);
2984 	  scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
2985 	  subtree_life.truncate (0);
2986 	}
2987     }
2988 
2989   return scalar_cost;
2990 }
2991 
2992 /* Check if vectorization of the basic block is profitable.  */
2993 
2994 static bool
2995 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2996 {
2997   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2998   slp_instance instance;
2999   int i;
3000   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3001   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3002 
3003   /* Calculate scalar cost.  */
3004   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3005     {
3006       auto_vec<bool, 20> life;
3007       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3008       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
3009 					      SLP_INSTANCE_TREE (instance),
3010 					      &life);
3011     }
3012 
3013   /* Unset visited flag.  */
3014   for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3015        gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3016     gimple_set_visited  (gsi_stmt (gsi), false);
3017 
3018   /* Complete the target-specific cost calculation.  */
3019   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3020 	       &vec_inside_cost, &vec_epilogue_cost);
3021 
3022   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3023 
3024   if (dump_enabled_p ())
3025     {
3026       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3027       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
3028 		   vec_inside_cost);
3029       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
3030       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
3031       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
3032     }
3033 
3034   /* Vectorization is profitable if its cost is more than the cost of scalar
3035      version.  Note that we err on the vector side for equal cost because
3036      the cost estimate is otherwise quite pessimistic (constant uses are
3037      free on the scalar side but cost a load on the vector side for
3038      example).  */
3039   if (vec_outside_cost + vec_inside_cost > scalar_cost)
3040     return false;
3041 
3042   return true;
3043 }
3044 
3045 /* Check if the basic block can be vectorized.  Returns a bb_vec_info
3046    if so and sets fatal to true if failure is independent of
3047    current_vector_size.  */
3048 
3049 static bb_vec_info
3050 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
3051 		       gimple_stmt_iterator region_end,
3052 		       vec<data_reference_p> datarefs, int n_stmts,
3053 		       bool &fatal)
3054 {
3055   bb_vec_info bb_vinfo;
3056   slp_instance instance;
3057   int i;
3058   poly_uint64 min_vf = 2;
3059 
3060   /* The first group of checks is independent of the vector size.  */
3061   fatal = true;
3062 
3063   if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3064     {
3065       if (dump_enabled_p ())
3066 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3067 			 "not vectorized: too many instructions in "
3068 			 "basic block.\n");
3069       free_data_refs (datarefs);
3070       return NULL;
3071     }
3072 
3073   bb_vinfo = new _bb_vec_info (region_begin, region_end);
3074   if (!bb_vinfo)
3075     return NULL;
3076 
3077   BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3078 
3079   /* Analyze the data references.  */
3080 
3081   if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
3082     {
3083       if (dump_enabled_p ())
3084         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3085 			 "not vectorized: unhandled data-ref in basic "
3086 			 "block.\n");
3087 
3088       delete bb_vinfo;
3089       return NULL;
3090     }
3091 
3092   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3093     {
3094       if (dump_enabled_p ())
3095         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3096 			 "not vectorized: not enough data-refs in "
3097 			 "basic block.\n");
3098 
3099       delete bb_vinfo;
3100       return NULL;
3101     }
3102 
3103   if (!vect_analyze_data_ref_accesses (bb_vinfo))
3104     {
3105      if (dump_enabled_p ())
3106        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3107 			"not vectorized: unhandled data access in "
3108 			"basic block.\n");
3109 
3110       delete bb_vinfo;
3111       return NULL;
3112     }
3113 
3114   /* If there are no grouped stores in the region there is no need
3115      to continue with pattern recog as vect_analyze_slp will fail
3116      anyway.  */
3117   if (bb_vinfo->grouped_stores.is_empty ())
3118     {
3119       if (dump_enabled_p ())
3120 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3121 			 "not vectorized: no grouped stores in "
3122 			 "basic block.\n");
3123 
3124       delete bb_vinfo;
3125       return NULL;
3126     }
3127 
3128   /* While the rest of the analysis below depends on it in some way.  */
3129   fatal = false;
3130 
3131   vect_pattern_recog (bb_vinfo);
3132 
3133   /* Check the SLP opportunities in the basic block, analyze and build SLP
3134      trees.  */
3135   if (!vect_analyze_slp (bb_vinfo, n_stmts))
3136     {
3137       if (dump_enabled_p ())
3138 	{
3139 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140 			   "Failed to SLP the basic block.\n");
3141 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3142 			   "not vectorized: failed to find SLP opportunities "
3143 			   "in basic block.\n");
3144 	}
3145 
3146       delete bb_vinfo;
3147       return NULL;
3148     }
3149 
3150   vect_record_base_alignments (bb_vinfo);
3151 
3152   /* Analyze and verify the alignment of data references and the
3153      dependence in the SLP instances.  */
3154   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3155     {
3156       if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3157 	  || ! vect_slp_analyze_instance_dependence (instance))
3158 	{
3159 	  dump_printf_loc (MSG_NOTE, vect_location,
3160 			   "removing SLP instance operations starting from: ");
3161 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3162 			    SLP_TREE_SCALAR_STMTS
3163 			      (SLP_INSTANCE_TREE (instance))[0], 0);
3164 	  vect_free_slp_instance (instance);
3165 	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3166 	  continue;
3167 	}
3168 
3169       /* Mark all the statements that we want to vectorize as pure SLP and
3170 	 relevant.  */
3171       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
3172       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3173 
3174       i++;
3175     }
3176   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3177     {
3178       delete bb_vinfo;
3179       return NULL;
3180     }
3181 
3182   if (!vect_slp_analyze_operations (bb_vinfo))
3183     {
3184       if (dump_enabled_p ())
3185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3186 			 "not vectorized: bad operation in basic block.\n");
3187 
3188       delete bb_vinfo;
3189       return NULL;
3190     }
3191 
3192   /* Cost model: check if the vectorization is worthwhile.  */
3193   if (!unlimited_cost_model (NULL)
3194       && !vect_bb_vectorization_profitable_p (bb_vinfo))
3195     {
3196       if (dump_enabled_p ())
3197         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3198 			 "not vectorized: vectorization is not "
3199 			 "profitable.\n");
3200 
3201       delete bb_vinfo;
3202       return NULL;
3203     }
3204 
3205   if (dump_enabled_p ())
3206     dump_printf_loc (MSG_NOTE, vect_location,
3207 		     "Basic block will be vectorized using SLP\n");
3208 
3209   return bb_vinfo;
3210 }
3211 
3212 
3213 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
3214    true if anything in the basic-block was vectorized.  */
3215 
3216 bool
3217 vect_slp_bb (basic_block bb)
3218 {
3219   bb_vec_info bb_vinfo;
3220   gimple_stmt_iterator gsi;
3221   bool any_vectorized = false;
3222   auto_vector_sizes vector_sizes;
3223 
3224   if (dump_enabled_p ())
3225     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
3226 
3227   /* Autodetect first vector size we try.  */
3228   current_vector_size = 0;
3229   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3230   unsigned int next_size = 0;
3231 
3232   gsi = gsi_start_bb (bb);
3233 
3234   poly_uint64 autodetected_vector_size = 0;
3235   while (1)
3236     {
3237       if (gsi_end_p (gsi))
3238 	break;
3239 
3240       gimple_stmt_iterator region_begin = gsi;
3241       vec<data_reference_p> datarefs = vNULL;
3242       int insns = 0;
3243 
3244       for (; !gsi_end_p (gsi); gsi_next (&gsi))
3245 	{
3246 	  gimple *stmt = gsi_stmt (gsi);
3247 	  if (is_gimple_debug (stmt))
3248 	    continue;
3249 	  insns++;
3250 
3251 	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
3252 	    vect_location = gimple_location (stmt);
3253 
3254 	  if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3255 	    break;
3256 	}
3257 
3258       /* Skip leading unhandled stmts.  */
3259       if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3260 	{
3261 	  gsi_next (&gsi);
3262 	  continue;
3263 	}
3264 
3265       gimple_stmt_iterator region_end = gsi;
3266 
3267       bool vectorized = false;
3268       bool fatal = false;
3269       bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3270 					datarefs, insns, fatal);
3271       if (bb_vinfo
3272 	  && dbg_cnt (vect_slp))
3273 	{
3274 	  if (dump_enabled_p ())
3275 	    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3276 
3277 	  vect_schedule_slp (bb_vinfo);
3278 
3279 	  if (dump_enabled_p ())
3280 	    dump_printf_loc (MSG_NOTE, vect_location,
3281 			     "basic block part vectorized\n");
3282 
3283 	  vectorized = true;
3284 	}
3285       delete bb_vinfo;
3286 
3287       any_vectorized |= vectorized;
3288 
3289       if (next_size == 0)
3290 	autodetected_vector_size = current_vector_size;
3291 
3292       if (next_size < vector_sizes.length ()
3293 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
3294 	next_size += 1;
3295 
3296       if (vectorized
3297 	  || next_size == vector_sizes.length ()
3298 	  || known_eq (current_vector_size, 0U)
3299 	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
3300 	     vector sizes will fail do not bother iterating.  */
3301 	  || fatal)
3302 	{
3303 	  if (gsi_end_p (region_end))
3304 	    break;
3305 
3306 	  /* Skip the unhandled stmt.  */
3307 	  gsi_next (&gsi);
3308 
3309 	  /* And reset vector sizes.  */
3310 	  current_vector_size = 0;
3311 	  next_size = 0;
3312 	}
3313       else
3314 	{
3315 	  /* Try the next biggest vector size.  */
3316 	  current_vector_size = vector_sizes[next_size++];
3317 	  if (dump_enabled_p ())
3318 	    {
3319 	      dump_printf_loc (MSG_NOTE, vect_location,
3320 			       "***** Re-trying analysis with "
3321 			       "vector size ");
3322 	      dump_dec (MSG_NOTE, current_vector_size);
3323 	      dump_printf (MSG_NOTE, "\n");
3324 	    }
3325 
3326 	  /* Start over.  */
3327 	  gsi = region_begin;
3328 	}
3329     }
3330 
3331   return any_vectorized;
3332 }
3333 
3334 
3335 /* Return 1 if vector type of boolean constant which is OPNUM
3336    operand in statement STMT is a boolean vector.  */
3337 
3338 static bool
3339 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3340 {
3341   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3342   enum tree_code code = gimple_expr_code (stmt);
3343   tree op, vectype;
3344   gimple *def_stmt;
3345   enum vect_def_type dt;
3346 
3347   /* For comparison and COND_EXPR type is chosen depending
3348      on the other comparison operand.  */
3349   if (TREE_CODE_CLASS (code) == tcc_comparison)
3350     {
3351       if (opnum)
3352 	op = gimple_assign_rhs1 (stmt);
3353       else
3354 	op = gimple_assign_rhs2 (stmt);
3355 
3356       if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3357 			       &dt, &vectype))
3358 	gcc_unreachable ();
3359 
3360       return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3361     }
3362 
3363   if (code == COND_EXPR)
3364     {
3365       tree cond = gimple_assign_rhs1 (stmt);
3366 
3367       if (TREE_CODE (cond) == SSA_NAME)
3368 	op = cond;
3369       else if (opnum)
3370 	op = TREE_OPERAND (cond, 1);
3371       else
3372 	op = TREE_OPERAND (cond, 0);
3373 
3374       if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3375 			       &dt, &vectype))
3376 	gcc_unreachable ();
3377 
3378       return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3379     }
3380 
3381   return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3382 }
3383 
3384 /* Build a variable-length vector in which the elements in ELTS are repeated
3385    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
3386    RESULTS and add any new instructions to SEQ.
3387 
3388    The approach we use is:
3389 
3390    (1) Find a vector mode VM with integer elements of mode IM.
3391 
3392    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3393        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
3394        from small vectors to IM.
3395 
3396    (3) Duplicate each ELTS'[I] into a vector of mode VM.
3397 
3398    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3399        correct byte contents.
3400 
3401    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3402 
3403    We try to find the largest IM for which this sequence works, in order
3404    to cut down on the number of interleaves.  */
3405 
3406 void
3407 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3408 			  unsigned int nresults, vec<tree> &results)
3409 {
3410   unsigned int nelts = elts.length ();
3411   tree element_type = TREE_TYPE (vector_type);
3412 
3413   /* (1) Find a vector mode VM with integer elements of mode IM.  */
3414   unsigned int nvectors = 1;
3415   tree new_vector_type;
3416   tree permutes[2];
3417   if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3418 				       &nvectors, &new_vector_type,
3419 				       permutes))
3420     gcc_unreachable ();
3421 
3422   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
3423   unsigned int partial_nelts = nelts / nvectors;
3424   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3425 
3426   tree_vector_builder partial_elts;
3427   auto_vec<tree, 32> pieces (nvectors * 2);
3428   pieces.quick_grow (nvectors * 2);
3429   for (unsigned int i = 0; i < nvectors; ++i)
3430     {
3431       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3432 	     ELTS' has mode IM.  */
3433       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3434       for (unsigned int j = 0; j < partial_nelts; ++j)
3435 	partial_elts.quick_push (elts[i * partial_nelts + j]);
3436       tree t = gimple_build_vector (seq, &partial_elts);
3437       t = gimple_build (seq, VIEW_CONVERT_EXPR,
3438 			TREE_TYPE (new_vector_type), t);
3439 
3440       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
3441       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3442     }
3443 
3444   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3445 	 correct byte contents.
3446 
3447      We need to repeat the following operation log2(nvectors) times:
3448 
3449 	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3450 	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3451 
3452      However, if each input repeats every N elements and the VF is
3453      a multiple of N * 2, the HI result is the same as the LO.  */
3454   unsigned int in_start = 0;
3455   unsigned int out_start = nvectors;
3456   unsigned int hi_start = nvectors / 2;
3457   /* A bound on the number of outputs needed to produce NRESULTS results
3458      in the final iteration.  */
3459   unsigned int noutputs_bound = nvectors * nresults;
3460   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3461     {
3462       noutputs_bound /= 2;
3463       unsigned int limit = MIN (noutputs_bound, nvectors);
3464       for (unsigned int i = 0; i < limit; ++i)
3465 	{
3466 	  if ((i & 1) != 0
3467 	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3468 			     2 * in_repeat))
3469 	    {
3470 	      pieces[out_start + i] = pieces[out_start + i - 1];
3471 	      continue;
3472 	    }
3473 
3474 	  tree output = make_ssa_name (new_vector_type);
3475 	  tree input1 = pieces[in_start + (i / 2)];
3476 	  tree input2 = pieces[in_start + (i / 2) + hi_start];
3477 	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3478 					       input1, input2,
3479 					       permutes[i & 1]);
3480 	  gimple_seq_add_stmt (seq, stmt);
3481 	  pieces[out_start + i] = output;
3482 	}
3483       std::swap (in_start, out_start);
3484     }
3485 
3486   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
3487   results.reserve (nresults);
3488   for (unsigned int i = 0; i < nresults; ++i)
3489     if (i < nvectors)
3490       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3491 					pieces[in_start + i]));
3492     else
3493       results.quick_push (results[i - nvectors]);
3494 }
3495 
3496 
3497 /* For constant and loop invariant defs of SLP_NODE this function returns
3498    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3499    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3500    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
3501    REDUC_INDEX is the index of the reduction operand in the statements, unless
3502    it is -1.  */
3503 
3504 static void
3505 vect_get_constant_vectors (tree op, slp_tree slp_node,
3506                            vec<tree> *vec_oprnds,
3507 			   unsigned int op_num, unsigned int number_of_vectors)
3508 {
3509   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3510   gimple *stmt = stmts[0];
3511   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3512   unsigned HOST_WIDE_INT nunits;
3513   tree vec_cst;
3514   unsigned j, number_of_places_left_in_vector;
3515   tree vector_type;
3516   tree vop;
3517   int group_size = stmts.length ();
3518   unsigned int vec_num, i;
3519   unsigned number_of_copies = 1;
3520   vec<tree> voprnds;
3521   voprnds.create (number_of_vectors);
3522   bool constant_p, is_store;
3523   tree neutral_op = NULL;
3524   enum tree_code code = gimple_expr_code (stmt);
3525   gimple_seq ctor_seq = NULL;
3526   auto_vec<tree, 16> permute_results;
3527 
3528   /* Check if vector type is a boolean vector.  */
3529   if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3530       && vect_mask_constant_operand_p (stmt, op_num))
3531     vector_type
3532       = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3533   else
3534     vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3535 
3536   if (STMT_VINFO_DATA_REF (stmt_vinfo))
3537     {
3538       is_store = true;
3539       op = gimple_assign_rhs1 (stmt);
3540     }
3541   else
3542     is_store = false;
3543 
3544   gcc_assert (op);
3545 
3546   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3547      created vectors. It is greater than 1 if unrolling is performed.
3548 
3549      For example, we have two scalar operands, s1 and s2 (e.g., group of
3550      strided accesses of size two), while NUNITS is four (i.e., four scalars
3551      of this type can be packed in a vector).  The output vector will contain
3552      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
3553      will be 2).
3554 
3555      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3556      containing the operands.
3557 
3558      For example, NUNITS is four as before, and the group size is 8
3559      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
3560      {s5, s6, s7, s8}.  */
3561 
3562   /* When using duplicate_and_interleave, we just need one element for
3563      each scalar statement.  */
3564   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3565     nunits = group_size;
3566 
3567   number_of_copies = nunits * number_of_vectors / group_size;
3568 
3569   number_of_places_left_in_vector = nunits;
3570   constant_p = true;
3571   tree_vector_builder elts (vector_type, nunits, 1);
3572   elts.quick_grow (nunits);
3573   bool place_after_defs = false;
3574   for (j = 0; j < number_of_copies; j++)
3575     {
3576       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3577         {
3578           if (is_store)
3579             op = gimple_assign_rhs1 (stmt);
3580           else
3581 	    {
3582 	      switch (code)
3583 		{
3584 		  case COND_EXPR:
3585 		    {
3586 		      tree cond = gimple_assign_rhs1 (stmt);
3587 		      if (TREE_CODE (cond) == SSA_NAME)
3588 			op = gimple_op (stmt, op_num + 1);
3589 		      else if (op_num == 0 || op_num == 1)
3590 			op = TREE_OPERAND (cond, op_num);
3591 		      else
3592 			{
3593 			  if (op_num == 2)
3594 			    op = gimple_assign_rhs2 (stmt);
3595 			  else
3596 			    op = gimple_assign_rhs3 (stmt);
3597 			}
3598 		    }
3599 		    break;
3600 
3601 		  case CALL_EXPR:
3602 		    op = gimple_call_arg (stmt, op_num);
3603 		    break;
3604 
3605 		  case LSHIFT_EXPR:
3606 		  case RSHIFT_EXPR:
3607 		  case LROTATE_EXPR:
3608 		  case RROTATE_EXPR:
3609 		    op = gimple_op (stmt, op_num + 1);
3610 		    /* Unlike the other binary operators, shifts/rotates have
3611 		       the shift count being int, instead of the same type as
3612 		       the lhs, so make sure the scalar is the right type if
3613 		       we are dealing with vectors of
3614 		       long long/long/short/char.  */
3615 		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3616 		      op = fold_convert (TREE_TYPE (vector_type), op);
3617 		    break;
3618 
3619 		  default:
3620 		    op = gimple_op (stmt, op_num + 1);
3621 		    break;
3622 		}
3623 	    }
3624 
3625           /* Create 'vect_ = {op0,op1,...,opn}'.  */
3626           number_of_places_left_in_vector--;
3627 	  tree orig_op = op;
3628 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3629 	    {
3630 	      if (CONSTANT_CLASS_P (op))
3631 		{
3632 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3633 		    {
3634 		      /* Can't use VIEW_CONVERT_EXPR for booleans because
3635 			 of possibly different sizes of scalar value and
3636 			 vector element.  */
3637 		      if (integer_zerop (op))
3638 			op = build_int_cst (TREE_TYPE (vector_type), 0);
3639 		      else if (integer_onep (op))
3640 			op = build_all_ones_cst (TREE_TYPE (vector_type));
3641 		      else
3642 			gcc_unreachable ();
3643 		    }
3644 		  else
3645 		    op = fold_unary (VIEW_CONVERT_EXPR,
3646 				     TREE_TYPE (vector_type), op);
3647 		  gcc_assert (op && CONSTANT_CLASS_P (op));
3648 		}
3649 	      else
3650 		{
3651 		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3652 		  gimple *init_stmt;
3653 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3654 		    {
3655 		      tree true_val
3656 			= build_all_ones_cst (TREE_TYPE (vector_type));
3657 		      tree false_val
3658 			= build_zero_cst (TREE_TYPE (vector_type));
3659 		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3660 		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3661 						       op, true_val,
3662 						       false_val);
3663 		    }
3664 		  else
3665 		    {
3666 		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3667 				   op);
3668 		      init_stmt
3669 			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3670 					       op);
3671 		    }
3672 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
3673 		  op = new_temp;
3674 		}
3675 	    }
3676 	  elts[number_of_places_left_in_vector] = op;
3677 	  if (!CONSTANT_CLASS_P (op))
3678 	    constant_p = false;
3679 	  if (TREE_CODE (orig_op) == SSA_NAME
3680 	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3681 	      && STMT_VINFO_BB_VINFO (stmt_vinfo)
3682 	      && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3683 		  == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3684 	    place_after_defs = true;
3685 
3686           if (number_of_places_left_in_vector == 0)
3687             {
3688 	      if (constant_p
3689 		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3690 		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3691 		vec_cst = gimple_build_vector (&ctor_seq, &elts);
3692 	      else
3693 		{
3694 		  if (vec_oprnds->is_empty ())
3695 		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
3696 					      number_of_vectors,
3697 					      permute_results);
3698 		  vec_cst = permute_results[number_of_vectors - j - 1];
3699 		}
3700 	      tree init;
3701 	      gimple_stmt_iterator gsi;
3702 	      if (place_after_defs)
3703 		{
3704 		  gsi = gsi_for_stmt
3705 		          (vect_find_last_scalar_stmt_in_slp (slp_node));
3706 		  init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3707 		}
3708 	      else
3709 		init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3710 	      if (ctor_seq != NULL)
3711 		{
3712 		  gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3713 		  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3714 		  ctor_seq = NULL;
3715 		}
3716 	      voprnds.quick_push (init);
3717 	      place_after_defs = false;
3718               number_of_places_left_in_vector = nunits;
3719 	      constant_p = true;
3720 	      elts.new_vector (vector_type, nunits, 1);
3721 	      elts.quick_grow (nunits);
3722             }
3723         }
3724     }
3725 
3726   /* Since the vectors are created in the reverse order, we should invert
3727      them.  */
3728   vec_num = voprnds.length ();
3729   for (j = vec_num; j != 0; j--)
3730     {
3731       vop = voprnds[j - 1];
3732       vec_oprnds->quick_push (vop);
3733     }
3734 
3735   voprnds.release ();
3736 
3737   /* In case that VF is greater than the unrolling factor needed for the SLP
3738      group of stmts, NUMBER_OF_VECTORS to be created is greater than
3739      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3740      to replicate the vectors.  */
3741   while (number_of_vectors > vec_oprnds->length ())
3742     {
3743       tree neutral_vec = NULL;
3744 
3745       if (neutral_op)
3746         {
3747           if (!neutral_vec)
3748 	    neutral_vec = build_vector_from_val (vector_type, neutral_op);
3749 
3750           vec_oprnds->quick_push (neutral_vec);
3751         }
3752       else
3753         {
3754           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3755             vec_oprnds->quick_push (vop);
3756         }
3757     }
3758 }
3759 
3760 
3761 /* Get vectorized definitions from SLP_NODE that contains corresponding
3762    vectorized def-stmts.  */
3763 
3764 static void
3765 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3766 {
3767   tree vec_oprnd;
3768   gimple *vec_def_stmt;
3769   unsigned int i;
3770 
3771   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3772 
3773   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3774     {
3775       gcc_assert (vec_def_stmt);
3776       if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3777 	vec_oprnd = gimple_phi_result (vec_def_stmt);
3778       else
3779 	vec_oprnd = gimple_get_lhs (vec_def_stmt);
3780       vec_oprnds->quick_push (vec_oprnd);
3781     }
3782 }
3783 
3784 
3785 /* Get vectorized definitions for SLP_NODE.
3786    If the scalar definitions are loop invariants or constants, collect them and
3787    call vect_get_constant_vectors() to create vector stmts.
3788    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3789    must be stored in the corresponding child of SLP_NODE, and we call
3790    vect_get_slp_vect_defs () to retrieve them.  */
3791 
3792 void
3793 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3794 		   vec<vec<tree> > *vec_oprnds)
3795 {
3796   gimple *first_stmt;
3797   int number_of_vects = 0, i;
3798   unsigned int child_index = 0;
3799   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3800   slp_tree child = NULL;
3801   vec<tree> vec_defs;
3802   tree oprnd;
3803   bool vectorized_defs;
3804 
3805   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3806   FOR_EACH_VEC_ELT (ops, i, oprnd)
3807     {
3808       /* For each operand we check if it has vectorized definitions in a child
3809 	 node or we need to create them (for invariants and constants).  We
3810 	 check if the LHS of the first stmt of the next child matches OPRND.
3811 	 If it does, we found the correct child.  Otherwise, we call
3812 	 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
3813 	 to check this child node for the next operand.  */
3814       vectorized_defs = false;
3815       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3816         {
3817           child = SLP_TREE_CHILDREN (slp_node)[child_index];
3818 
3819 	  /* We have to check both pattern and original def, if available.  */
3820 	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3821 	    {
3822 	      gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3823 	      gimple *related
3824 		= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3825 	      tree first_def_op;
3826 
3827 	      if (gimple_code (first_def) == GIMPLE_PHI)
3828 		first_def_op = gimple_phi_result (first_def);
3829 	      else
3830 		first_def_op = gimple_get_lhs (first_def);
3831 	      if (operand_equal_p (oprnd, first_def_op, 0)
3832 		  || (related
3833 		      && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3834 		{
3835 		  /* The number of vector defs is determined by the number of
3836 		     vector statements in the node from which we get those
3837 		     statements.  */
3838 		  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3839 		  vectorized_defs = true;
3840 		  child_index++;
3841 		}
3842 	    }
3843 	  else
3844 	    child_index++;
3845         }
3846 
3847       if (!vectorized_defs)
3848         {
3849           if (i == 0)
3850             {
3851               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3852               /* Number of vector stmts was calculated according to LHS in
3853                  vect_schedule_slp_instance (), fix it by replacing LHS with
3854                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
3855                  details.  */
3856               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3857                                              &rhs_size_unit);
3858               if (rhs_size_unit != lhs_size_unit)
3859                 {
3860                   number_of_vects *= rhs_size_unit;
3861                   number_of_vects /= lhs_size_unit;
3862                 }
3863             }
3864         }
3865 
3866       /* Allocate memory for vectorized defs.  */
3867       vec_defs = vNULL;
3868       vec_defs.create (number_of_vects);
3869 
3870       /* For reduction defs we call vect_get_constant_vectors (), since we are
3871          looking for initial loop invariant values.  */
3872       if (vectorized_defs)
3873         /* The defs are already vectorized.  */
3874 	vect_get_slp_vect_defs (child, &vec_defs);
3875       else
3876 	/* Build vectors from scalar defs.  */
3877 	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3878 				   number_of_vects);
3879 
3880       vec_oprnds->quick_push (vec_defs);
3881     }
3882 }
3883 
3884 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3885    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3886    permute statements for the SLP node NODE of the SLP instance
3887    SLP_NODE_INSTANCE.  */
3888 
3889 bool
3890 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3891 			      gimple_stmt_iterator *gsi, poly_uint64 vf,
3892 			      slp_instance slp_node_instance, bool analyze_only,
3893 			      unsigned *n_perms)
3894 {
3895   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3896   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3897   tree mask_element_type = NULL_TREE, mask_type;
3898   int vec_index = 0;
3899   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3900   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3901   unsigned int mask_element;
3902   machine_mode mode;
3903   unsigned HOST_WIDE_INT nunits, const_vf;
3904 
3905   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3906     return false;
3907 
3908   stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3909 
3910   mode = TYPE_MODE (vectype);
3911 
3912   /* At the moment, all permutations are represented using per-element
3913      indices, so we can't cope with variable vector lengths or
3914      vectorization factors.  */
3915   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3916       || !vf.is_constant (&const_vf))
3917     return false;
3918 
3919   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3920      same size as the vector element being permuted.  */
3921   mask_element_type = lang_hooks.types.type_for_mode
3922     (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3923   mask_type = get_vectype_for_scalar_type (mask_element_type);
3924   vec_perm_builder mask (nunits, nunits, 1);
3925   mask.quick_grow (nunits);
3926   vec_perm_indices indices;
3927 
3928   /* Initialize the vect stmts of NODE to properly insert the generated
3929      stmts later.  */
3930   if (! analyze_only)
3931     for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3932 	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3933       SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3934 
3935   /* Generate permutation masks for every NODE. Number of masks for each NODE
3936      is equal to GROUP_SIZE.
3937      E.g., we have a group of three nodes with three loads from the same
3938      location in each node, and the vector size is 4. I.e., we have a
3939      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3940      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3941      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3942      ...
3943 
3944      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3945      The last mask is illegal since we assume two operands for permute
3946      operation, and the mask element values can't be outside that range.
3947      Hence, the last mask must be converted into {2,5,5,5}.
3948      For the first two permutations we need the first and the second input
3949      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3950      we need the second and the third vectors: {b1,c1,a2,b2} and
3951      {c2,a3,b3,c3}.  */
3952 
3953   int vect_stmts_counter = 0;
3954   unsigned int index = 0;
3955   int first_vec_index = -1;
3956   int second_vec_index = -1;
3957   bool noop_p = true;
3958   *n_perms = 0;
3959 
3960   for (unsigned int j = 0; j < const_vf; j++)
3961     {
3962       for (int k = 0; k < group_size; k++)
3963 	{
3964 	  unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3965 			    + j * STMT_VINFO_GROUP_SIZE (stmt_info));
3966 	  vec_index = i / nunits;
3967 	  mask_element = i % nunits;
3968 	  if (vec_index == first_vec_index
3969 	      || first_vec_index == -1)
3970 	    {
3971 	      first_vec_index = vec_index;
3972 	    }
3973 	  else if (vec_index == second_vec_index
3974 		   || second_vec_index == -1)
3975 	    {
3976 	      second_vec_index = vec_index;
3977 	      mask_element += nunits;
3978 	    }
3979 	  else
3980 	    {
3981 	      if (dump_enabled_p ())
3982 		{
3983 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3984 				   "permutation requires at "
3985 				   "least three vectors ");
3986 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3987 				    stmt, 0);
3988 		}
3989 	      gcc_assert (analyze_only);
3990 	      return false;
3991 	    }
3992 
3993 	  gcc_assert (mask_element < 2 * nunits);
3994 	  if (mask_element != index)
3995 	    noop_p = false;
3996 	  mask[index++] = mask_element;
3997 
3998 	  if (index == nunits && !noop_p)
3999 	    {
4000 	      indices.new_vector (mask, 2, nunits);
4001 	      if (!can_vec_perm_const_p (mode, indices))
4002 		{
4003 		  if (dump_enabled_p ())
4004 		    {
4005 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4006 				       vect_location,
4007 				       "unsupported vect permute { ");
4008 		      for (i = 0; i < nunits; ++i)
4009 			{
4010 			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4011 			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4012 			}
4013 		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4014 		    }
4015 		  gcc_assert (analyze_only);
4016 		  return false;
4017 		}
4018 
4019 	      ++*n_perms;
4020 	    }
4021 
4022 	  if (index == nunits)
4023 	    {
4024 	      if (!analyze_only)
4025 		{
4026 		  tree mask_vec = NULL_TREE;
4027 
4028 		  if (! noop_p)
4029 		    mask_vec = vec_perm_indices_to_tree (mask_type, indices);
4030 
4031 		  if (second_vec_index == -1)
4032 		    second_vec_index = first_vec_index;
4033 
4034 		  /* Generate the permute statement if necessary.  */
4035 		  tree first_vec = dr_chain[first_vec_index];
4036 		  tree second_vec = dr_chain[second_vec_index];
4037 		  gimple *perm_stmt;
4038 		  if (! noop_p)
4039 		    {
4040 		      tree perm_dest
4041 			= vect_create_destination_var (gimple_assign_lhs (stmt),
4042 						       vectype);
4043 		      perm_dest = make_ssa_name (perm_dest);
4044 		      perm_stmt = gimple_build_assign (perm_dest,
4045 						       VEC_PERM_EXPR,
4046 						       first_vec, second_vec,
4047 						       mask_vec);
4048 		      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4049 		    }
4050 		  else
4051 		    /* If mask was NULL_TREE generate the requested
4052 		       identity transform.  */
4053 		    perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4054 
4055 		  /* Store the vector statement in NODE.  */
4056 		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4057 		}
4058 
4059 	      index = 0;
4060 	      first_vec_index = -1;
4061 	      second_vec_index = -1;
4062 	      noop_p = true;
4063 	    }
4064 	}
4065     }
4066 
4067   return true;
4068 }
4069 
4070 typedef hash_map <vec <gimple *>, slp_tree,
4071 		  simple_hashmap_traits <bst_traits, slp_tree> >
4072   scalar_stmts_to_slp_tree_map_t;
4073 
4074 /* Vectorize SLP instance tree in postorder.  */
4075 
4076 static bool
4077 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
4078 			    scalar_stmts_to_slp_tree_map_t *bst_map)
4079 {
4080   gimple *stmt;
4081   bool grouped_store, is_store;
4082   gimple_stmt_iterator si;
4083   stmt_vec_info stmt_info;
4084   unsigned int group_size;
4085   tree vectype;
4086   int i, j;
4087   slp_tree child;
4088 
4089   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4090     return false;
4091 
4092   /* See if we have already vectorized the same set of stmts and reuse their
4093      vectorized stmts.  */
4094   if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
4095     {
4096       SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
4097       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4098 	= SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
4099       return false;
4100     }
4101 
4102   bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
4103   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4104     vect_schedule_slp_instance (child, instance, bst_map);
4105 
4106   /* Push SLP node def-type to stmts.  */
4107   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4108     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4109       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4110 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
4111 
4112   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
4113   stmt_info = vinfo_for_stmt (stmt);
4114 
4115   /* VECTYPE is the type of the destination.  */
4116   vectype = STMT_VINFO_VECTYPE (stmt_info);
4117   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4118   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4119 
4120   if (!SLP_TREE_VEC_STMTS (node).exists ())
4121     SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4122 
4123   if (dump_enabled_p ())
4124     {
4125       dump_printf_loc (MSG_NOTE,vect_location,
4126 		       "------>vectorizing SLP node starting from: ");
4127       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4128     }
4129 
4130   /* Vectorized stmts go before the last scalar stmt which is where
4131      all uses are ready.  */
4132   si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
4133 
4134   /* Mark the first element of the reduction chain as reduction to properly
4135      transform the node.  In the analysis phase only the last element of the
4136      chain is marked as reduction.  */
4137   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
4138       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
4139     {
4140       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
4141       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4142     }
4143 
4144   /* Handle two-operation SLP nodes by vectorizing the group with
4145      both operations and then performing a merge.  */
4146   if (SLP_TREE_TWO_OPERATORS (node))
4147     {
4148       enum tree_code code0 = gimple_assign_rhs_code (stmt);
4149       enum tree_code ocode = ERROR_MARK;
4150       gimple *ostmt;
4151       vec_perm_builder mask (group_size, group_size, 1);
4152       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
4153 	if (gimple_assign_rhs_code (ostmt) != code0)
4154 	  {
4155 	    mask.quick_push (1);
4156 	    ocode = gimple_assign_rhs_code (ostmt);
4157 	  }
4158 	else
4159 	  mask.quick_push (0);
4160       if (ocode != ERROR_MARK)
4161 	{
4162 	  vec<gimple *> v0;
4163 	  vec<gimple *> v1;
4164 	  unsigned j;
4165 	  tree tmask = NULL_TREE;
4166 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4167 	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
4168 	  SLP_TREE_VEC_STMTS (node).truncate (0);
4169 	  gimple_assign_set_rhs_code (stmt, ocode);
4170 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4171 	  gimple_assign_set_rhs_code (stmt, code0);
4172 	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
4173 	  SLP_TREE_VEC_STMTS (node).truncate (0);
4174 	  tree meltype = build_nonstandard_integer_type
4175 	      (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4176 	  tree mvectype = get_same_sized_vectype (meltype, vectype);
4177 	  unsigned k = 0, l;
4178 	  for (j = 0; j < v0.length (); ++j)
4179 	    {
4180 	      /* Enforced by vect_build_slp_tree, which rejects variable-length
4181 		 vectors for SLP_TREE_TWO_OPERATORS.  */
4182 	      unsigned int const_nunits = nunits.to_constant ();
4183 	      tree_vector_builder melts (mvectype, const_nunits, 1);
4184 	      for (l = 0; l < const_nunits; ++l)
4185 		{
4186 		  if (k >= group_size)
4187 		    k = 0;
4188 		  tree t = build_int_cst (meltype,
4189 					  mask[k++] * const_nunits + l);
4190 		  melts.quick_push (t);
4191 		}
4192 	      tmask = melts.build ();
4193 
4194 	      /* ???  Not all targets support a VEC_PERM_EXPR with a
4195 	         constant mask that would translate to a vec_merge RTX
4196 		 (with their vec_perm_const_ok).  We can either not
4197 		 vectorize in that case or let veclower do its job.
4198 		 Unfortunately that isn't too great and at least for
4199 		 plus/minus we'd eventually like to match targets
4200 		 vector addsub instructions.  */
4201 	      gimple *vstmt;
4202 	      vstmt = gimple_build_assign (make_ssa_name (vectype),
4203 					   VEC_PERM_EXPR,
4204 					   gimple_assign_lhs (v0[j]),
4205 					   gimple_assign_lhs (v1[j]), tmask);
4206 	      vect_finish_stmt_generation (stmt, vstmt, &si);
4207 	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
4208 	    }
4209 	  v0.release ();
4210 	  v1.release ();
4211 	  return false;
4212 	}
4213     }
4214   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4215 
4216   /* Restore stmt def-types.  */
4217   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4218     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4219       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4220 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
4221 
4222   return is_store;
4223 }
4224 
4225 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4226    For loop vectorization this is done in vectorizable_call, but for SLP
4227    it needs to be deferred until end of vect_schedule_slp, because multiple
4228    SLP instances may refer to the same scalar stmt.  */
4229 
4230 static void
4231 vect_remove_slp_scalar_calls (slp_tree node)
4232 {
4233   gimple *stmt, *new_stmt;
4234   gimple_stmt_iterator gsi;
4235   int i;
4236   slp_tree child;
4237   tree lhs;
4238   stmt_vec_info stmt_info;
4239 
4240   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4241     return;
4242 
4243   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4244     vect_remove_slp_scalar_calls (child);
4245 
4246   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4247     {
4248       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
4249 	continue;
4250       stmt_info = vinfo_for_stmt (stmt);
4251       if (stmt_info == NULL
4252 	  || is_pattern_stmt_p (stmt_info)
4253 	  || !PURE_SLP_STMT (stmt_info))
4254 	continue;
4255       lhs = gimple_call_lhs (stmt);
4256       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4257       set_vinfo_for_stmt (new_stmt, stmt_info);
4258       set_vinfo_for_stmt (stmt, NULL);
4259       STMT_VINFO_STMT (stmt_info) = new_stmt;
4260       gsi = gsi_for_stmt (stmt);
4261       gsi_replace (&gsi, new_stmt, false);
4262       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4263     }
4264 }
4265 
4266 /* Generate vector code for all SLP instances in the loop/basic block.  */
4267 
4268 bool
4269 vect_schedule_slp (vec_info *vinfo)
4270 {
4271   vec<slp_instance> slp_instances;
4272   slp_instance instance;
4273   unsigned int i;
4274   bool is_store = false;
4275 
4276 
4277   scalar_stmts_to_slp_tree_map_t *bst_map
4278     = new scalar_stmts_to_slp_tree_map_t ();
4279   slp_instances = vinfo->slp_instances;
4280   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4281     {
4282       /* Schedule the tree of INSTANCE.  */
4283       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4284                                              instance, bst_map);
4285       if (dump_enabled_p ())
4286 	dump_printf_loc (MSG_NOTE, vect_location,
4287                          "vectorizing stmts using SLP.\n");
4288     }
4289   delete bst_map;
4290 
4291   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4292     {
4293       slp_tree root = SLP_INSTANCE_TREE (instance);
4294       gimple *store;
4295       unsigned int j;
4296       gimple_stmt_iterator gsi;
4297 
4298       /* Remove scalar call stmts.  Do not do this for basic-block
4299 	 vectorization as not all uses may be vectorized.
4300 	 ???  Why should this be necessary?  DCE should be able to
4301 	 remove the stmts itself.
4302 	 ???  For BB vectorization we can as well remove scalar
4303 	 stmts starting from the SLP tree root if they have no
4304 	 uses.  */
4305       if (is_a <loop_vec_info> (vinfo))
4306 	vect_remove_slp_scalar_calls (root);
4307 
4308       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
4309                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4310         {
4311           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
4312             break;
4313 
4314          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
4315            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
4316           /* Free the attached stmt_vec_info and remove the stmt.  */
4317           gsi = gsi_for_stmt (store);
4318 	  unlink_stmt_vdef (store);
4319           gsi_remove (&gsi, true);
4320 	  release_defs (store);
4321           free_stmt_vec_info (store);
4322         }
4323     }
4324 
4325   return is_store;
4326 }
4327