1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h"		/* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 
49 
50 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
51 
52 static void
53 vect_free_slp_tree (slp_tree node)
54 {
55   int i;
56   slp_tree child;
57 
58   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
59     vect_free_slp_tree (child);
60 
61   gimple *stmt;
62   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
63     /* After transform some stmts are removed and thus their vinfo is gone.  */
64     if (vinfo_for_stmt (stmt))
65       {
66 	gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
67 	STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
68       }
69 
70   SLP_TREE_CHILDREN (node).release ();
71   SLP_TREE_SCALAR_STMTS (node).release ();
72   SLP_TREE_VEC_STMTS (node).release ();
73   SLP_TREE_LOAD_PERMUTATION (node).release ();
74 
75   free (node);
76 }
77 
78 
79 /* Free the memory allocated for the SLP instance.  */
80 
81 void
82 vect_free_slp_instance (slp_instance instance)
83 {
84   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
85   SLP_INSTANCE_LOADS (instance).release ();
86   free (instance);
87 }
88 
89 
90 /* Create an SLP node for SCALAR_STMTS.  */
91 
92 static slp_tree
93 vect_create_new_slp_node (vec<gimple *> scalar_stmts)
94 {
95   slp_tree node;
96   gimple *stmt = scalar_stmts[0];
97   unsigned int nops;
98 
99   if (is_gimple_call (stmt))
100     nops = gimple_call_num_args (stmt);
101   else if (is_gimple_assign (stmt))
102     {
103       nops = gimple_num_ops (stmt) - 1;
104       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
105 	nops++;
106     }
107   else if (gimple_code (stmt) == GIMPLE_PHI)
108     nops = 0;
109   else
110     return NULL;
111 
112   node = XNEW (struct _slp_tree);
113   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
114   SLP_TREE_VEC_STMTS (node).create (0);
115   SLP_TREE_CHILDREN (node).create (nops);
116   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
117   SLP_TREE_TWO_OPERATORS (node) = false;
118   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
119 
120   unsigned i;
121   FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
122     STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;
123 
124   return node;
125 }
126 
127 
128 /* This structure is used in creation of an SLP tree.  Each instance
129    corresponds to the same operand in a group of scalar stmts in an SLP
130    node.  */
131 typedef struct _slp_oprnd_info
132 {
133   /* Def-stmts for the operands.  */
134   vec<gimple *> def_stmts;
135   /* Information about the first statement, its vector def-type, type, the
136      operand itself in case it's constant, and an indication if it's a pattern
137      stmt.  */
138   tree first_op_type;
139   enum vect_def_type first_dt;
140   bool first_pattern;
141   bool second_pattern;
142 } *slp_oprnd_info;
143 
144 
145 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
146    operand.  */
147 static vec<slp_oprnd_info>
148 vect_create_oprnd_info (int nops, int group_size)
149 {
150   int i;
151   slp_oprnd_info oprnd_info;
152   vec<slp_oprnd_info> oprnds_info;
153 
154   oprnds_info.create (nops);
155   for (i = 0; i < nops; i++)
156     {
157       oprnd_info = XNEW (struct _slp_oprnd_info);
158       oprnd_info->def_stmts.create (group_size);
159       oprnd_info->first_dt = vect_uninitialized_def;
160       oprnd_info->first_op_type = NULL_TREE;
161       oprnd_info->first_pattern = false;
162       oprnd_info->second_pattern = false;
163       oprnds_info.quick_push (oprnd_info);
164     }
165 
166   return oprnds_info;
167 }
168 
169 
170 /* Free operands info.  */
171 
172 static void
173 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
174 {
175   int i;
176   slp_oprnd_info oprnd_info;
177 
178   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
179     {
180       oprnd_info->def_stmts.release ();
181       XDELETE (oprnd_info);
182     }
183 
184   oprnds_info.release ();
185 }
186 
187 
188 /* Find the place of the data-ref in STMT in the interleaving chain that starts
189    from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
190 
191 int
192 vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
193 {
194   gimple *next_stmt = first_stmt;
195   int result = 0;
196 
197   if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
198     return -1;
199 
200   do
201     {
202       if (next_stmt == stmt)
203 	return result;
204       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
205       if (next_stmt)
206 	result += GROUP_GAP (vinfo_for_stmt (next_stmt));
207     }
208   while (next_stmt);
209 
210   return -1;
211 }
212 
213 /* Check whether it is possible to load COUNT elements of type ELT_MODE
214    using the method implemented by duplicate_and_interleave.  Return true
215    if so, returning the number of intermediate vectors in *NVECTORS_OUT
216    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
217    (if nonnull).  */
218 
219 bool
220 can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
221 				unsigned int *nvectors_out,
222 				tree *vector_type_out,
223 				tree *permutes)
224 {
225   poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
226   poly_int64 nelts;
227   unsigned int nvectors = 1;
228   for (;;)
229     {
230       scalar_int_mode int_mode;
231       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
232       if (multiple_p (current_vector_size, elt_bytes, &nelts)
233 	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
234 	{
235 	  tree int_type = build_nonstandard_integer_type
236 	    (GET_MODE_BITSIZE (int_mode), 1);
237 	  tree vector_type = build_vector_type (int_type, nelts);
238 	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
239 	    {
240 	      vec_perm_builder sel1 (nelts, 2, 3);
241 	      vec_perm_builder sel2 (nelts, 2, 3);
242 	      poly_int64 half_nelts = exact_div (nelts, 2);
243 	      for (unsigned int i = 0; i < 3; ++i)
244 		{
245 		  sel1.quick_push (i);
246 		  sel1.quick_push (i + nelts);
247 		  sel2.quick_push (half_nelts + i);
248 		  sel2.quick_push (half_nelts + i + nelts);
249 		}
250 	      vec_perm_indices indices1 (sel1, 2, nelts);
251 	      vec_perm_indices indices2 (sel2, 2, nelts);
252 	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
253 		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
254 		{
255 		  if (nvectors_out)
256 		    *nvectors_out = nvectors;
257 		  if (vector_type_out)
258 		    *vector_type_out = vector_type;
259 		  if (permutes)
260 		    {
261 		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
262 								indices1);
263 		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
264 								indices2);
265 		    }
266 		  return true;
267 		}
268 	    }
269 	}
270       if (!multiple_p (elt_bytes, 2, &elt_bytes))
271 	return false;
272       nvectors *= 2;
273     }
274 }
275 
276 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
277    they are of a valid type and that they match the defs of the first stmt of
278    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
279    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
280    indicates swap is required for cond_expr stmts.  Specifically, *SWAP
281    is 1 if STMT is cond and operands of comparison need to be swapped;
282    *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
283    If there is any operand swap in this function, *SWAP is set to non-zero
284    value.
285    If there was a fatal error return -1; if the error could be corrected by
286    swapping operands of father node of this one, return 1; if everything is
287    ok return 0.  */
288 static int
289 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
290 			     vec<gimple *> stmts, unsigned stmt_num,
291 			     vec<slp_oprnd_info> *oprnds_info)
292 {
293   gimple *stmt = stmts[stmt_num];
294   tree oprnd;
295   unsigned int i, number_of_oprnds;
296   gimple *def_stmt;
297   enum vect_def_type dt = vect_uninitialized_def;
298   bool pattern = false;
299   slp_oprnd_info oprnd_info;
300   int first_op_idx = 1;
301   bool commutative = false;
302   bool first_op_cond = false;
303   bool first = stmt_num == 0;
304   bool second = stmt_num == 1;
305 
306   if (is_gimple_call (stmt))
307     {
308       number_of_oprnds = gimple_call_num_args (stmt);
309       first_op_idx = 3;
310     }
311   else if (is_gimple_assign (stmt))
312     {
313       enum tree_code code = gimple_assign_rhs_code (stmt);
314       number_of_oprnds = gimple_num_ops (stmt) - 1;
315       /* Swap can only be done for cond_expr if asked to, otherwise we
316 	 could result in different comparison code to the first stmt.  */
317       if (gimple_assign_rhs_code (stmt) == COND_EXPR
318 	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
319 	{
320 	  first_op_cond = true;
321 	  number_of_oprnds++;
322 	}
323       else
324 	commutative = commutative_tree_code (code);
325     }
326   else
327     return -1;
328 
329   bool swapped = (*swap != 0);
330   gcc_assert (!swapped || first_op_cond);
331   for (i = 0; i < number_of_oprnds; i++)
332     {
333 again:
334       if (first_op_cond)
335 	{
336 	  /* Map indicating how operands of cond_expr should be swapped.  */
337 	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
338 	  int *map = maps[*swap];
339 
340 	  if (i < 2)
341 	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
342 	  else
343 	    oprnd = gimple_op (stmt, map[i]);
344 	}
345       else
346 	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
347 
348       oprnd_info = (*oprnds_info)[i];
349 
350       if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
351 	{
352 	  if (dump_enabled_p ())
353 	    {
354 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 			       "Build SLP failed: can't analyze def for ");
356 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
357               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 	    }
359 
360 	  return -1;
361 	}
362 
363       /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
364          from the pattern.  Check that all the stmts of the node are in the
365          pattern.  */
366       if (def_stmt && gimple_bb (def_stmt)
367           && vect_stmt_in_region_p (vinfo, def_stmt)
368           && vinfo_for_stmt (def_stmt)
369           && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
370 	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
371 	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
372         {
373           pattern = true;
374           if (!first && !oprnd_info->first_pattern
375 	      /* Allow different pattern state for the defs of the
376 		 first stmt in reduction chains.  */
377 	      && (oprnd_info->first_dt != vect_reduction_def
378 		  || (!second && !oprnd_info->second_pattern)))
379 	    {
380 	      if (i == 0
381 		  && !swapped
382 		  && commutative)
383 		{
384 		  swapped = true;
385 		  goto again;
386 		}
387 
388 	      if (dump_enabled_p ())
389 		{
390 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 				   "Build SLP failed: some of the stmts"
392 				   " are in a pattern, and others are not ");
393 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
394                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
395 		}
396 
397 	      return 1;
398             }
399 
400           def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
401           dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
402 
403           if (dt == vect_unknown_def_type)
404             {
405               if (dump_enabled_p ())
406                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 				 "Unsupported pattern.\n");
408               return -1;
409             }
410 
411           switch (gimple_code (def_stmt))
412             {
413             case GIMPLE_PHI:
414             case GIMPLE_ASSIGN:
415 	      break;
416 
417 	    default:
418 	      if (dump_enabled_p ())
419 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 				 "unsupported defining stmt:\n");
421 	      return -1;
422             }
423         }
424 
425       if (second)
426 	oprnd_info->second_pattern = pattern;
427 
428       if (first)
429 	{
430 	  oprnd_info->first_dt = dt;
431 	  oprnd_info->first_pattern = pattern;
432 	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
433 	}
434       else
435 	{
436 	  /* Not first stmt of the group, check that the def-stmt/s match
437 	     the def-stmt/s of the first stmt.  Allow different definition
438 	     types for reduction chains: the first stmt must be a
439 	     vect_reduction_def (a phi node), and the rest
440 	     vect_internal_def.  */
441 	  tree type = TREE_TYPE (oprnd);
442 	  if ((oprnd_info->first_dt != dt
443 	       && !(oprnd_info->first_dt == vect_reduction_def
444 		    && dt == vect_internal_def)
445 	       && !((oprnd_info->first_dt == vect_external_def
446 		     || oprnd_info->first_dt == vect_constant_def)
447 		    && (dt == vect_external_def
448 			|| dt == vect_constant_def)))
449 	      || !types_compatible_p (oprnd_info->first_op_type, type))
450 	    {
451 	      /* Try swapping operands if we got a mismatch.  */
452 	      if (i == 0
453 		  && !swapped
454 		  && commutative)
455 		{
456 		  swapped = true;
457 		  goto again;
458 		}
459 
460 	      if (dump_enabled_p ())
461 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
462 				 "Build SLP failed: different types\n");
463 
464 	      return 1;
465 	    }
466 	  if ((dt == vect_constant_def
467 	       || dt == vect_external_def)
468 	      && !current_vector_size.is_constant ()
469 	      && (TREE_CODE (type) == BOOLEAN_TYPE
470 		  || !can_duplicate_and_interleave_p (stmts.length (),
471 						      TYPE_MODE (type))))
472 	    {
473 	      if (dump_enabled_p ())
474 		{
475 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 				   "Build SLP failed: invalid type of def "
477 				   "for variable-length SLP ");
478 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
479 		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 		}
481 	      return -1;
482 	    }
483 	}
484 
485       /* Check the types of the definitions.  */
486       switch (dt)
487 	{
488 	case vect_constant_def:
489 	case vect_external_def:
490 	  break;
491 
492 	case vect_reduction_def:
493 	case vect_induction_def:
494 	case vect_internal_def:
495 	  oprnd_info->def_stmts.quick_push (def_stmt);
496 	  break;
497 
498 	default:
499 	  /* FORNOW: Not supported.  */
500 	  if (dump_enabled_p ())
501 	    {
502 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
503 			       "Build SLP failed: illegal type of def ");
504 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
505               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
506 	    }
507 
508 	  return -1;
509 	}
510     }
511 
512   /* Swap operands.  */
513   if (swapped)
514     {
515       /* If there are already uses of this stmt in a SLP instance then
516          we've committed to the operand order and can't swap it.  */
517       if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
518 	{
519 	  if (dump_enabled_p ())
520 	    {
521 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 			       "Build SLP failed: cannot swap operands of "
523 			       "shared stmt ");
524 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
525 	    }
526 	  return -1;
527 	}
528 
529       if (first_op_cond)
530 	{
531 	  tree cond = gimple_assign_rhs1 (stmt);
532 	  enum tree_code code = TREE_CODE (cond);
533 
534 	  /* Swap.  */
535 	  if (*swap == 1)
536 	    {
537 	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
538 				 &TREE_OPERAND (cond, 1));
539 	      TREE_SET_CODE (cond, swap_tree_comparison (code));
540 	    }
541 	  /* Invert.  */
542 	  else
543 	    {
544 	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
545 				 gimple_assign_rhs3_ptr (stmt));
546 	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
547 	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
548 	      gcc_assert (code != ERROR_MARK);
549 	      TREE_SET_CODE (cond, code);
550 	    }
551 	}
552       else
553 	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
554 			   gimple_assign_rhs2_ptr (stmt));
555       if (dump_enabled_p ())
556 	{
557 	  dump_printf_loc (MSG_NOTE, vect_location,
558 			   "swapped operands to match def types in ");
559 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
560 	}
561     }
562 
563   *swap = swapped;
564   return 0;
565 }
566 
567 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
568    caller's attempt to find the vector type in STMT with the narrowest
569    element type.  Return true if VECTYPE is nonnull and if it is valid
570    for VINFO.  When returning true, update MAX_NUNITS to reflect the
571    number of units in VECTYPE.  VINFO, GORUP_SIZE and MAX_NUNITS are
572    as for vect_build_slp_tree.  */
573 
574 static bool
575 vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
576 			tree vectype, poly_uint64 *max_nunits)
577 {
578   if (!vectype)
579     {
580       if (dump_enabled_p ())
581 	{
582 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
583 			   "Build SLP failed: unsupported data-type in ");
584 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
585 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
586 	}
587       /* Fatal mismatch.  */
588       return false;
589     }
590 
591   /* If populating the vector type requires unrolling then fail
592      before adjusting *max_nunits for basic-block vectorization.  */
593   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
594   unsigned HOST_WIDE_INT const_nunits;
595   if (is_a <bb_vec_info> (vinfo)
596       && (!nunits.is_constant (&const_nunits)
597 	  || const_nunits > group_size))
598     {
599       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 		       "Build SLP failed: unrolling required "
601 		       "in basic block SLP\n");
602       /* Fatal mismatch.  */
603       return false;
604     }
605 
606   /* In case of multiple types we need to detect the smallest type.  */
607   vect_update_max_nunits (max_nunits, vectype);
608   return true;
609 }
610 
611 /* Verify if the scalar stmts STMTS are isomorphic, require data
612    permutation or are of unsupported types of operation.  Return
613    true if they are, otherwise return false and indicate in *MATCHES
614    which stmts are not isomorphic to the first one.  If MATCHES[0]
615    is false then this indicates the comparison could not be
616    carried out or the stmts will never be vectorized by SLP.
617 
618    Note COND_EXPR is possibly ismorphic to another one after swapping its
619    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
620    the first stmt by swapping the two operands of comparison; set SWAP[i]
621    to 2 if stmt I is isormorphic to the first stmt by inverting the code
622    of comparison.  Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
623    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
624 
625 static bool
626 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
627 		       vec<gimple *> stmts, unsigned int group_size,
628 		       unsigned nops, poly_uint64 *max_nunits,
629 		       bool *matches, bool *two_operators)
630 {
631   unsigned int i;
632   gimple *first_stmt = stmts[0], *stmt = stmts[0];
633   enum tree_code first_stmt_code = ERROR_MARK;
634   enum tree_code alt_stmt_code = ERROR_MARK;
635   enum tree_code rhs_code = ERROR_MARK;
636   enum tree_code first_cond_code = ERROR_MARK;
637   tree lhs;
638   bool need_same_oprnds = false;
639   tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
640   optab optab;
641   int icode;
642   machine_mode optab_op2_mode;
643   machine_mode vec_mode;
644   HOST_WIDE_INT dummy;
645   gimple *first_load = NULL, *prev_first_load = NULL;
646 
647   /* For every stmt in NODE find its def stmt/s.  */
648   FOR_EACH_VEC_ELT (stmts, i, stmt)
649     {
650       swap[i] = 0;
651       matches[i] = false;
652 
653       if (dump_enabled_p ())
654 	{
655 	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
656 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
657 	}
658 
659       /* Fail to vectorize statements marked as unvectorizable.  */
660       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
661         {
662           if (dump_enabled_p ())
663             {
664               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 			       "Build SLP failed: unvectorizable statement ");
666               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
667             }
668 	  /* Fatal mismatch.  */
669 	  matches[0] = false;
670           return false;
671         }
672 
673       lhs = gimple_get_lhs (stmt);
674       if (lhs == NULL_TREE)
675 	{
676 	  if (dump_enabled_p ())
677 	    {
678 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 			       "Build SLP failed: not GIMPLE_ASSIGN nor "
680 			       "GIMPLE_CALL ");
681 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
682 	    }
683 	  /* Fatal mismatch.  */
684 	  matches[0] = false;
685 	  return false;
686 	}
687 
688       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
689       vectype = get_vectype_for_scalar_type (scalar_type);
690       if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
691 				   max_nunits))
692 	{
693 	  /* Fatal mismatch.  */
694 	  matches[0] = false;
695           return false;
696         }
697 
698       if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
699 	{
700 	  rhs_code = CALL_EXPR;
701 	  if (gimple_call_internal_p (call_stmt)
702 	      || gimple_call_tail_p (call_stmt)
703 	      || gimple_call_noreturn_p (call_stmt)
704 	      || !gimple_call_nothrow_p (call_stmt)
705 	      || gimple_call_chain (call_stmt))
706 	    {
707 	      if (dump_enabled_p ())
708 		{
709 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
710 				   "Build SLP failed: unsupported call type ");
711 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
712 				    call_stmt, 0);
713 		}
714 	      /* Fatal mismatch.  */
715 	      matches[0] = false;
716 	      return false;
717 	    }
718 	}
719       else
720 	rhs_code = gimple_assign_rhs_code (stmt);
721 
722       /* Check the operation.  */
723       if (i == 0)
724 	{
725 	  first_stmt_code = rhs_code;
726 
727 	  /* Shift arguments should be equal in all the packed stmts for a
728 	     vector shift with scalar shift operand.  */
729 	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
730 	      || rhs_code == LROTATE_EXPR
731 	      || rhs_code == RROTATE_EXPR)
732 	    {
733 	      vec_mode = TYPE_MODE (vectype);
734 
735 	      /* First see if we have a vector/vector shift.  */
736 	      optab = optab_for_tree_code (rhs_code, vectype,
737 					   optab_vector);
738 
739 	      if (!optab
740 		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
741 		{
742 		  /* No vector/vector shift, try for a vector/scalar shift.  */
743 		  optab = optab_for_tree_code (rhs_code, vectype,
744 					       optab_scalar);
745 
746 		  if (!optab)
747 		    {
748 		      if (dump_enabled_p ())
749 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 					 "Build SLP failed: no optab.\n");
751 		      /* Fatal mismatch.  */
752 		      matches[0] = false;
753 		      return false;
754 		    }
755 		  icode = (int) optab_handler (optab, vec_mode);
756 		  if (icode == CODE_FOR_nothing)
757 		    {
758 		      if (dump_enabled_p ())
759 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
760 					 "Build SLP failed: "
761 					 "op not supported by target.\n");
762 		      /* Fatal mismatch.  */
763 		      matches[0] = false;
764 		      return false;
765 		    }
766 		  optab_op2_mode = insn_data[icode].operand[2].mode;
767 		  if (!VECTOR_MODE_P (optab_op2_mode))
768 		    {
769 		      need_same_oprnds = true;
770 		      first_op1 = gimple_assign_rhs2 (stmt);
771 		    }
772 		}
773 	    }
774 	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
775             {
776               need_same_oprnds = true;
777               first_op1 = gimple_assign_rhs2 (stmt);
778             }
779 	}
780       else
781 	{
782 	  if (first_stmt_code != rhs_code
783 	      && alt_stmt_code == ERROR_MARK)
784 	    alt_stmt_code = rhs_code;
785 	  if (first_stmt_code != rhs_code
786 	      && (first_stmt_code != IMAGPART_EXPR
787 		  || rhs_code != REALPART_EXPR)
788 	      && (first_stmt_code != REALPART_EXPR
789 		  || rhs_code != IMAGPART_EXPR)
790 	      /* Handle mismatches in plus/minus by computing both
791 		 and merging the results.  */
792 	      && !((first_stmt_code == PLUS_EXPR
793 		    || first_stmt_code == MINUS_EXPR)
794 		   && (alt_stmt_code == PLUS_EXPR
795 		       || alt_stmt_code == MINUS_EXPR)
796 		   && rhs_code == alt_stmt_code)
797               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
798                    && (first_stmt_code == ARRAY_REF
799                        || first_stmt_code == BIT_FIELD_REF
800                        || first_stmt_code == INDIRECT_REF
801                        || first_stmt_code == COMPONENT_REF
802                        || first_stmt_code == MEM_REF)))
803 	    {
804 	      if (dump_enabled_p ())
805 		{
806 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 				   "Build SLP failed: different operation "
808 				   "in stmt ");
809 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
810 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
811 				   "original stmt ");
812 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
813 				    first_stmt, 0);
814 		}
815 	      /* Mismatch.  */
816 	      continue;
817 	    }
818 
819 	  if (need_same_oprnds
820 	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
821 	    {
822 	      if (dump_enabled_p ())
823 		{
824 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
825 				   "Build SLP failed: different shift "
826 				   "arguments in ");
827 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
828 		}
829 	      /* Mismatch.  */
830 	      continue;
831 	    }
832 
833 	  if (rhs_code == CALL_EXPR)
834 	    {
835 	      gimple *first_stmt = stmts[0];
836 	      if (gimple_call_num_args (stmt) != nops
837 		  || !operand_equal_p (gimple_call_fn (first_stmt),
838 				       gimple_call_fn (stmt), 0)
839 		  || gimple_call_fntype (first_stmt)
840 		     != gimple_call_fntype (stmt))
841 		{
842 		  if (dump_enabled_p ())
843 		    {
844 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
845 				       "Build SLP failed: different calls in ");
846 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
847 					stmt, 0);
848 		    }
849 		  /* Mismatch.  */
850 		  continue;
851 		}
852 	    }
853 	}
854 
855       /* Grouped store or load.  */
856       if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
857 	{
858 	  if (REFERENCE_CLASS_P (lhs))
859 	    {
860 	      /* Store.  */
861 	      ;
862 	    }
863 	  else
864 	    {
865 	      /* Load.  */
866               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
867               if (prev_first_load)
868                 {
869                   /* Check that there are no loads from different interleaving
870                      chains in the same node.  */
871                   if (prev_first_load != first_load)
872                     {
873                       if (dump_enabled_p ())
874                         {
875                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
876 					   vect_location,
877 					   "Build SLP failed: different "
878 					   "interleaving chains in one node ");
879                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
880 					    stmt, 0);
881                         }
882 		      /* Mismatch.  */
883 		      continue;
884                     }
885                 }
886               else
887                 prev_first_load = first_load;
888            }
889         } /* Grouped access.  */
890       else
891 	{
892 	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
893 	    {
894 	      /* Not grouped load.  */
895 	      if (dump_enabled_p ())
896 		{
897 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 				   "Build SLP failed: not grouped load ");
899 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
900 		}
901 
902 	      /* FORNOW: Not grouped loads are not supported.  */
903 	      /* Fatal mismatch.  */
904 	      matches[0] = false;
905 	      return false;
906 	    }
907 
908 	  /* Not memory operation.  */
909 	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
910 	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
911 	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
912 	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
913 	      && rhs_code != CALL_EXPR)
914 	    {
915 	      if (dump_enabled_p ())
916 		{
917 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
918 				   "Build SLP failed: operation");
919 		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
920 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
921 		}
922 	      /* Fatal mismatch.  */
923 	      matches[0] = false;
924 	      return false;
925 	    }
926 
927 	  if (rhs_code == COND_EXPR)
928 	    {
929 	      tree cond_expr = gimple_assign_rhs1 (stmt);
930 	      enum tree_code cond_code = TREE_CODE (cond_expr);
931 	      enum tree_code swap_code = ERROR_MARK;
932 	      enum tree_code invert_code = ERROR_MARK;
933 
934 	      if (i == 0)
935 		first_cond_code = TREE_CODE (cond_expr);
936 	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
937 		{
938 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
939 		  swap_code = swap_tree_comparison (cond_code);
940 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
941 		}
942 
943 	      if (first_cond_code == cond_code)
944 		;
945 	      /* Isomorphic can be achieved by swapping.  */
946 	      else if (first_cond_code == swap_code)
947 		swap[i] = 1;
948 	      /* Isomorphic can be achieved by inverting.  */
949 	      else if (first_cond_code == invert_code)
950 		swap[i] = 2;
951 	      else
952 		{
953 		  if (dump_enabled_p ())
954 		    {
955 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 				       "Build SLP failed: different"
957 				       " operation");
958 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
959 					stmt, 0);
960 		    }
961 		  /* Mismatch.  */
962 		  continue;
963 		}
964 	    }
965 	}
966 
967       matches[i] = true;
968     }
969 
970   for (i = 0; i < group_size; ++i)
971     if (!matches[i])
972       return false;
973 
974   /* If we allowed a two-operation SLP node verify the target can cope
975      with the permute we are going to use.  */
976   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
977   if (alt_stmt_code != ERROR_MARK
978       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
979     {
980       unsigned HOST_WIDE_INT count;
981       if (!nunits.is_constant (&count))
982 	{
983 	  if (dump_enabled_p ())
984 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
985 			     "Build SLP failed: different operations "
986 			     "not allowed with variable-length SLP.\n");
987 	  return false;
988 	}
989       vec_perm_builder sel (count, count, 1);
990       for (i = 0; i < count; ++i)
991 	{
992 	  unsigned int elt = i;
993 	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
994 	    elt += count;
995 	  sel.quick_push (elt);
996 	}
997       vec_perm_indices indices (sel, 2, count);
998       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
999 	{
1000 	  for (i = 0; i < group_size; ++i)
1001 	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
1002 	      {
1003 		matches[i] = false;
1004 		if (dump_enabled_p ())
1005 		  {
1006 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1007 				     "Build SLP failed: different operation "
1008 				     "in stmt ");
1009 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1010 				      stmts[i], 0);
1011 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 				     "original stmt ");
1013 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1014 				      first_stmt, 0);
1015 		  }
1016 	      }
1017 	  return false;
1018 	}
1019       *two_operators = true;
1020     }
1021 
1022   return true;
1023 }
1024 
1025 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1026    Note we never remove apart from at destruction time so we do not
1027    need a special value for deleted that differs from empty.  */
1028 struct bst_traits
1029 {
1030   typedef vec <gimple *> value_type;
1031   typedef vec <gimple *> compare_type;
1032   static inline hashval_t hash (value_type);
1033   static inline bool equal (value_type existing, value_type candidate);
1034   static inline bool is_empty (value_type x) { return !x.exists (); }
1035   static inline bool is_deleted (value_type x) { return !x.exists (); }
1036   static inline void mark_empty (value_type &x) { x.release (); }
1037   static inline void mark_deleted (value_type &x) { x.release (); }
1038   static inline void remove (value_type &x) { x.release (); }
1039 };
1040 inline hashval_t
1041 bst_traits::hash (value_type x)
1042 {
1043   inchash::hash h;
1044   for (unsigned i = 0; i < x.length (); ++i)
1045     h.add_int (gimple_uid (x[i]));
1046   return h.end ();
1047 }
1048 inline bool
1049 bst_traits::equal (value_type existing, value_type candidate)
1050 {
1051   if (existing.length () != candidate.length ())
1052     return false;
1053   for (unsigned i = 0; i < existing.length (); ++i)
1054     if (existing[i] != candidate[i])
1055       return false;
1056   return true;
1057 }
1058 
1059 typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
1060 static scalar_stmts_set_t *bst_fail;
1061 
1062 static slp_tree
1063 vect_build_slp_tree_2 (vec_info *vinfo,
1064 		       vec<gimple *> stmts, unsigned int group_size,
1065 		       poly_uint64 *max_nunits,
1066 		       vec<slp_tree> *loads,
1067 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
1068 		       unsigned max_tree_size);
1069 
1070 static slp_tree
1071 vect_build_slp_tree (vec_info *vinfo,
1072 		     vec<gimple *> stmts, unsigned int group_size,
1073 		     poly_uint64 *max_nunits, vec<slp_tree> *loads,
1074 		     bool *matches, unsigned *npermutes, unsigned *tree_size,
1075 		     unsigned max_tree_size)
1076 {
1077   if (bst_fail->contains (stmts))
1078     return NULL;
1079   slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
1080 					loads, matches, npermutes, tree_size,
1081 					max_tree_size);
1082   /* When SLP build fails for stmts record this, otherwise SLP build
1083      can be exponential in time when we allow to construct parts from
1084      scalars, see PR81723.  */
1085   if (! res)
1086     {
1087       vec <gimple *> x;
1088       x.create (stmts.length ());
1089       x.splice (stmts);
1090       bst_fail->add (x);
1091     }
1092   return res;
1093 }
1094 
1095 /* Recursively build an SLP tree starting from NODE.
1096    Fail (and return a value not equal to zero) if def-stmts are not
1097    isomorphic, require data permutation or are of unsupported types of
1098    operation.  Otherwise, return 0.
1099    The value returned is the depth in the SLP tree where a mismatch
1100    was found.  */
1101 
1102 static slp_tree
1103 vect_build_slp_tree_2 (vec_info *vinfo,
1104 		       vec<gimple *> stmts, unsigned int group_size,
1105 		       poly_uint64 *max_nunits,
1106 		       vec<slp_tree> *loads,
1107 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
1108 		       unsigned max_tree_size)
1109 {
1110   unsigned nops, i, this_tree_size = 0;
1111   poly_uint64 this_max_nunits = *max_nunits;
1112   gimple *stmt;
1113   slp_tree node;
1114 
1115   matches[0] = false;
1116 
1117   stmt = stmts[0];
1118   if (is_gimple_call (stmt))
1119     nops = gimple_call_num_args (stmt);
1120   else if (is_gimple_assign (stmt))
1121     {
1122       nops = gimple_num_ops (stmt) - 1;
1123       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1124 	nops++;
1125     }
1126   else if (gimple_code (stmt) == GIMPLE_PHI)
1127     nops = 0;
1128   else
1129     return NULL;
1130 
1131   /* If the SLP node is a PHI (induction or reduction), terminate
1132      the recursion.  */
1133   if (gimple_code (stmt) == GIMPLE_PHI)
1134     {
1135       tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1136       tree vectype = get_vectype_for_scalar_type (scalar_type);
1137       if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
1138 				   max_nunits))
1139 	return NULL;
1140 
1141       vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
1142       /* Induction from different IVs is not supported.  */
1143       if (def_type == vect_induction_def)
1144 	{
1145 	  FOR_EACH_VEC_ELT (stmts, i, stmt)
1146 	    if (stmt != stmts[0])
1147 	      return NULL;
1148 	}
1149       else
1150 	{
1151 	  /* Else def types have to match.  */
1152 	  FOR_EACH_VEC_ELT (stmts, i, stmt)
1153 	    {
1154 	      /* But for reduction chains only check on the first stmt.  */
1155 	      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
1156 		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
1157 		continue;
1158 	      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
1159 		return NULL;
1160 	    }
1161 	}
1162       node = vect_create_new_slp_node (stmts);
1163       return node;
1164     }
1165 
1166 
1167   bool two_operators = false;
1168   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1169   if (!vect_build_slp_tree_1 (vinfo, swap,
1170 			      stmts, group_size, nops,
1171 			      &this_max_nunits, matches, &two_operators))
1172     return NULL;
1173 
1174   /* If the SLP node is a load, terminate the recursion.  */
1175   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1176       && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1177     {
1178       *max_nunits = this_max_nunits;
1179       node = vect_create_new_slp_node (stmts);
1180       loads->safe_push (node);
1181       return node;
1182     }
1183 
1184   /* Get at the operands, verifying they are compatible.  */
1185   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1186   slp_oprnd_info oprnd_info;
1187   FOR_EACH_VEC_ELT (stmts, i, stmt)
1188     {
1189       int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1190 					     stmts, i, &oprnds_info);
1191       if (res != 0)
1192 	matches[(res == -1) ? 0 : i] = false;
1193       if (!matches[0])
1194 	break;
1195     }
1196   for (i = 0; i < group_size; ++i)
1197     if (!matches[i])
1198       {
1199 	vect_free_oprnd_info (oprnds_info);
1200 	return NULL;
1201       }
1202 
1203   auto_vec<slp_tree, 4> children;
1204   auto_vec<slp_tree> this_loads;
1205 
1206   stmt = stmts[0];
1207 
1208   if (tree_size)
1209     max_tree_size -= *tree_size;
1210 
1211   /* Create SLP_TREE nodes for the definition node/s.  */
1212   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1213     {
1214       slp_tree child;
1215       unsigned old_nloads = this_loads.length ();
1216       unsigned old_tree_size = this_tree_size;
1217       unsigned int j;
1218 
1219       if (oprnd_info->first_dt != vect_internal_def
1220 	  && oprnd_info->first_dt != vect_reduction_def
1221 	  && oprnd_info->first_dt != vect_induction_def)
1222         continue;
1223 
1224       if (++this_tree_size > max_tree_size)
1225 	{
1226 	  FOR_EACH_VEC_ELT (children, j, child)
1227 	    vect_free_slp_tree (child);
1228 	  vect_free_oprnd_info (oprnds_info);
1229 	  return NULL;
1230 	}
1231 
1232       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1233 					group_size, &this_max_nunits,
1234 					&this_loads, matches, npermutes,
1235 					&this_tree_size,
1236 					max_tree_size)) != NULL)
1237 	{
1238 	  /* If we have all children of child built up from scalars then just
1239 	     throw that away and build it up this node from scalars.  */
1240 	  if (!SLP_TREE_CHILDREN (child).is_empty ()
1241 	      /* ???  Rejecting patterns this way doesn't work.  We'd have to
1242 		 do extra work to cancel the pattern so the uses see the
1243 		 scalar version.  */
1244 	      && !is_pattern_stmt_p
1245 	            (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1246 	    {
1247 	      slp_tree grandchild;
1248 
1249 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1250 		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1251 		  break;
1252 	      if (!grandchild)
1253 		{
1254 		  /* Roll back.  */
1255 		  this_loads.truncate (old_nloads);
1256 		  this_tree_size = old_tree_size;
1257 		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1258 		    vect_free_slp_tree (grandchild);
1259 		  SLP_TREE_CHILDREN (child).truncate (0);
1260 
1261 		  dump_printf_loc (MSG_NOTE, vect_location,
1262 				   "Building parent vector operands from "
1263 				   "scalars instead\n");
1264 		  oprnd_info->def_stmts = vNULL;
1265 		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
1266 		  children.safe_push (child);
1267 		  continue;
1268 		}
1269 	    }
1270 
1271 	  oprnd_info->def_stmts = vNULL;
1272 	  children.safe_push (child);
1273 	  continue;
1274 	}
1275 
1276       /* If the SLP build failed fatally and we analyze a basic-block
1277          simply treat nodes we fail to build as externally defined
1278 	 (and thus build vectors from the scalar defs).
1279 	 The cost model will reject outright expensive cases.
1280 	 ???  This doesn't treat cases where permutation ultimatively
1281 	 fails (or we don't try permutation below).  Ideally we'd
1282 	 even compute a permutation that will end up with the maximum
1283 	 SLP tree size...  */
1284       if (is_a <bb_vec_info> (vinfo)
1285 	  && !matches[0]
1286 	  /* ???  Rejecting patterns this way doesn't work.  We'd have to
1287 	     do extra work to cancel the pattern so the uses see the
1288 	     scalar version.  */
1289 	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1290 	{
1291 	  dump_printf_loc (MSG_NOTE, vect_location,
1292 			   "Building vector operands from scalars\n");
1293 	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
1294 	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
1295 	  children.safe_push (child);
1296 	  oprnd_info->def_stmts = vNULL;
1297 	  continue;
1298 	}
1299 
1300       /* If the SLP build for operand zero failed and operand zero
1301 	 and one can be commutated try that for the scalar stmts
1302 	 that failed the match.  */
1303       if (i == 0
1304 	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
1305 	  && matches[0]
1306 	  /* ???  For COND_EXPRs we can swap the comparison operands
1307 	     as well as the arms under some constraints.  */
1308 	  && nops == 2
1309 	  && oprnds_info[1]->first_dt == vect_internal_def
1310 	  && is_gimple_assign (stmt)
1311 	  /* Do so only if the number of not successful permutes was nor more
1312 	     than a cut-ff as re-trying the recursive match on
1313 	     possibly each level of the tree would expose exponential
1314 	     behavior.  */
1315 	  && *npermutes < 4)
1316 	{
1317 	  /* See whether we can swap the matching or the non-matching
1318 	     stmt operands.  */
1319 	  bool swap_not_matching = true;
1320 	  do
1321 	    {
1322 	      for (j = 0; j < group_size; ++j)
1323 		{
1324 		  if (matches[j] != !swap_not_matching)
1325 		    continue;
1326 		  gimple *stmt = stmts[j];
1327 		  /* Verify if we can swap operands of this stmt.  */
1328 		  if (!is_gimple_assign (stmt)
1329 		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1330 		    {
1331 		      if (!swap_not_matching)
1332 			goto fail;
1333 		      swap_not_matching = false;
1334 		      break;
1335 		    }
1336 		  /* Verify if we can safely swap or if we committed to a
1337 		     specific operand order already.
1338 		     ???  Instead of modifying GIMPLE stmts here we could
1339 		     record whether we want to swap operands in the SLP
1340 		     node and temporarily do that when processing it
1341 		     (or wrap operand accessors in a helper).  */
1342 		  else if (swap[j] != 0
1343 			   || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
1344 		    {
1345 		      if (!swap_not_matching)
1346 			{
1347 			  if (dump_enabled_p ())
1348 			    {
1349 			      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1350 					       vect_location,
1351 					       "Build SLP failed: cannot swap "
1352 					       "operands of shared stmt ");
1353 			      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
1354 						TDF_SLIM, stmts[j], 0);
1355 			    }
1356 			  goto fail;
1357 			}
1358 		      swap_not_matching = false;
1359 		      break;
1360 		    }
1361 		}
1362 	    }
1363 	  while (j != group_size);
1364 
1365 	  /* Swap mismatched definition stmts.  */
1366 	  dump_printf_loc (MSG_NOTE, vect_location,
1367 			   "Re-trying with swapped operands of stmts ");
1368 	  for (j = 0; j < group_size; ++j)
1369 	    if (matches[j] == !swap_not_matching)
1370 	      {
1371 		std::swap (oprnds_info[0]->def_stmts[j],
1372 			   oprnds_info[1]->def_stmts[j]);
1373 		dump_printf (MSG_NOTE, "%d ", j);
1374 	      }
1375 	  dump_printf (MSG_NOTE, "\n");
1376 	  /* And try again with scratch 'matches' ... */
1377 	  bool *tem = XALLOCAVEC (bool, group_size);
1378 	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1379 					    group_size, &this_max_nunits,
1380 					    &this_loads, tem, npermutes,
1381 					    &this_tree_size,
1382 					    max_tree_size)) != NULL)
1383 	    {
1384 	      /* ... so if successful we can apply the operand swapping
1385 		 to the GIMPLE IL.  This is necessary because for example
1386 		 vect_get_slp_defs uses operand indexes and thus expects
1387 		 canonical operand order.  This is also necessary even
1388 		 if we end up building the operand from scalars as
1389 		 we'll continue to process swapped operand two.  */
1390 	      for (j = 0; j < group_size; ++j)
1391 		{
1392 		  gimple *stmt = stmts[j];
1393 		  gimple_set_plf (stmt, GF_PLF_1, false);
1394 		}
1395 	      for (j = 0; j < group_size; ++j)
1396 		{
1397 		  gimple *stmt = stmts[j];
1398 		  if (matches[j] == !swap_not_matching)
1399 		    {
1400 		      /* Avoid swapping operands twice.  */
1401 		      if (gimple_plf (stmt, GF_PLF_1))
1402 			continue;
1403 		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1404 					 gimple_assign_rhs2_ptr (stmt));
1405 		      gimple_set_plf (stmt, GF_PLF_1, true);
1406 		    }
1407 		}
1408 	      /* Verify we swap all duplicates or none.  */
1409 	      if (flag_checking)
1410 		for (j = 0; j < group_size; ++j)
1411 		  {
1412 		    gimple *stmt = stmts[j];
1413 		    gcc_assert (gimple_plf (stmt, GF_PLF_1)
1414 				== (matches[j] == !swap_not_matching));
1415 		  }
1416 
1417 	      /* If we have all children of child built up from scalars then
1418 		 just throw that away and build it up this node from scalars.  */
1419 	      if (!SLP_TREE_CHILDREN (child).is_empty ()
1420 		  /* ???  Rejecting patterns this way doesn't work.  We'd have
1421 		     to do extra work to cancel the pattern so the uses see the
1422 		     scalar version.  */
1423 		  && !is_pattern_stmt_p
1424 			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1425 		{
1426 		  unsigned int j;
1427 		  slp_tree grandchild;
1428 
1429 		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1430 		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1431 		      break;
1432 		  if (!grandchild)
1433 		    {
1434 		      /* Roll back.  */
1435 		      this_loads.truncate (old_nloads);
1436 		      this_tree_size = old_tree_size;
1437 		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1438 			vect_free_slp_tree (grandchild);
1439 		      SLP_TREE_CHILDREN (child).truncate (0);
1440 
1441 		      dump_printf_loc (MSG_NOTE, vect_location,
1442 				       "Building parent vector operands from "
1443 				       "scalars instead\n");
1444 		      oprnd_info->def_stmts = vNULL;
1445 		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
1446 		      children.safe_push (child);
1447 		      continue;
1448 		    }
1449 		}
1450 
1451 	      oprnd_info->def_stmts = vNULL;
1452 	      children.safe_push (child);
1453 	      continue;
1454 	    }
1455 
1456 	  ++*npermutes;
1457 	}
1458 
1459 fail:
1460       gcc_assert (child == NULL);
1461       FOR_EACH_VEC_ELT (children, j, child)
1462 	vect_free_slp_tree (child);
1463       vect_free_oprnd_info (oprnds_info);
1464       return NULL;
1465     }
1466 
1467   vect_free_oprnd_info (oprnds_info);
1468 
1469   if (tree_size)
1470     *tree_size += this_tree_size;
1471   *max_nunits = this_max_nunits;
1472   loads->safe_splice (this_loads);
1473 
1474   node = vect_create_new_slp_node (stmts);
1475   SLP_TREE_TWO_OPERATORS (node) = two_operators;
1476   SLP_TREE_CHILDREN (node).splice (children);
1477   return node;
1478 }
1479 
1480 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
1481 
1482 static void
1483 vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
1484 {
1485   int i;
1486   gimple *stmt;
1487   slp_tree child;
1488 
1489   dump_printf_loc (dump_kind, loc, "node%s\n",
1490 		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
1491 		   ? " (external)" : "");
1492   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1493     {
1494       dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
1495       dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1496     }
1497   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1498     vect_print_slp_tree (dump_kind, loc, child);
1499 }
1500 
1501 
1502 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1503    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1504    J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1505    stmts in NODE are to be marked.  */
1506 
1507 static void
1508 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1509 {
1510   int i;
1511   gimple *stmt;
1512   slp_tree child;
1513 
1514   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1515     return;
1516 
1517   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1518     if (j < 0 || i == j)
1519       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1520 
1521   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1522     vect_mark_slp_stmts (child, mark, j);
1523 }
1524 
1525 
1526 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1527 
1528 static void
1529 vect_mark_slp_stmts_relevant (slp_tree node)
1530 {
1531   int i;
1532   gimple *stmt;
1533   stmt_vec_info stmt_info;
1534   slp_tree child;
1535 
1536   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1537     return;
1538 
1539   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1540     {
1541       stmt_info = vinfo_for_stmt (stmt);
1542       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1543                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1544       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1545     }
1546 
1547   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1548     vect_mark_slp_stmts_relevant (child);
1549 }
1550 
1551 
1552 /* Rearrange the statements of NODE according to PERMUTATION.  */
1553 
1554 static void
1555 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1556                           vec<unsigned> permutation)
1557 {
1558   gimple *stmt;
1559   vec<gimple *> tmp_stmts;
1560   unsigned int i;
1561   slp_tree child;
1562 
1563   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1564     vect_slp_rearrange_stmts (child, group_size, permutation);
1565 
1566   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1567   tmp_stmts.create (group_size);
1568   tmp_stmts.quick_grow_cleared (group_size);
1569 
1570   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1571     tmp_stmts[permutation[i]] = stmt;
1572 
1573   SLP_TREE_SCALAR_STMTS (node).release ();
1574   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1575 }
1576 
1577 
1578 /* Attempt to reorder stmts in a reduction chain so that we don't
1579    require any load permutation.  Return true if that was possible,
1580    otherwise return false.  */
1581 
1582 static bool
1583 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1584 {
1585   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1586   unsigned int i, j;
1587   unsigned int lidx;
1588   slp_tree node, load;
1589 
1590   /* Compare all the permutation sequences to the first one.  We know
1591      that at least one load is permuted.  */
1592   node = SLP_INSTANCE_LOADS (slp_instn)[0];
1593   if (!node->load_permutation.exists ())
1594     return false;
1595   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1596     {
1597       if (!load->load_permutation.exists ())
1598 	return false;
1599       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1600 	if (lidx != node->load_permutation[j])
1601 	  return false;
1602     }
1603 
1604   /* Check that the loads in the first sequence are different and there
1605      are no gaps between them.  */
1606   auto_sbitmap load_index (group_size);
1607   bitmap_clear (load_index);
1608   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1609     {
1610       if (lidx >= group_size)
1611 	return false;
1612       if (bitmap_bit_p (load_index, lidx))
1613 	return false;
1614 
1615       bitmap_set_bit (load_index, lidx);
1616     }
1617   for (i = 0; i < group_size; i++)
1618     if (!bitmap_bit_p (load_index, i))
1619       return false;
1620 
1621   /* This permutation is valid for reduction.  Since the order of the
1622      statements in the nodes is not important unless they are memory
1623      accesses, we can rearrange the statements in all the nodes
1624      according to the order of the loads.  */
1625   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1626 			    node->load_permutation);
1627 
1628   /* We are done, no actual permutations need to be generated.  */
1629   poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1630   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1631     {
1632       gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1633       first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
1634       /* But we have to keep those permutations that are required because
1635          of handling of gaps.  */
1636       if (known_eq (unrolling_factor, 1U)
1637 	  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1638 	      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
1639 	SLP_TREE_LOAD_PERMUTATION (node).release ();
1640       else
1641 	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1642 	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1643     }
1644 
1645   return true;
1646 }
1647 
1648 /* Check if the required load permutations in the SLP instance
1649    SLP_INSTN are supported.  */
1650 
1651 static bool
1652 vect_supported_load_permutation_p (slp_instance slp_instn)
1653 {
1654   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1655   unsigned int i, j, k, next;
1656   slp_tree node;
1657   gimple *stmt, *load, *next_load;
1658 
1659   if (dump_enabled_p ())
1660     {
1661       dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1662       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1663 	if (node->load_permutation.exists ())
1664 	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1665 	    dump_printf (MSG_NOTE, "%d ", next);
1666 	else
1667 	  for (k = 0; k < group_size; ++k)
1668 	    dump_printf (MSG_NOTE, "%d ", k);
1669       dump_printf (MSG_NOTE, "\n");
1670     }
1671 
1672   /* In case of reduction every load permutation is allowed, since the order
1673      of the reduction statements is not important (as opposed to the case of
1674      grouped stores).  The only condition we need to check is that all the
1675      load nodes are of the same size and have the same permutation (and then
1676      rearrange all the nodes of the SLP instance according to this
1677      permutation).  */
1678 
1679   /* Check that all the load nodes are of the same size.  */
1680   /* ???  Can't we assert this? */
1681   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1682     if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1683       return false;
1684 
1685   node = SLP_INSTANCE_TREE (slp_instn);
1686   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1687 
1688   /* Reduction (there are no data-refs in the root).
1689      In reduction chain the order of the loads is not important.  */
1690   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1691       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1692     vect_attempt_slp_rearrange_stmts (slp_instn);
1693 
1694   /* In basic block vectorization we allow any subchain of an interleaving
1695      chain.
1696      FORNOW: not supported in loop SLP because of realignment compications.  */
1697   if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1698     {
1699       /* Check whether the loads in an instance form a subchain and thus
1700          no permutation is necessary.  */
1701       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1702         {
1703 	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1704 	    continue;
1705 	  bool subchain_p = true;
1706           next_load = NULL;
1707           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1708             {
1709               if (j != 0
1710 		  && (next_load != load
1711 		      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
1712 		{
1713 		  subchain_p = false;
1714 		  break;
1715 		}
1716               next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1717             }
1718 	  if (subchain_p)
1719 	    SLP_TREE_LOAD_PERMUTATION (node).release ();
1720 	  else
1721 	    {
1722 	      stmt_vec_info group_info
1723 		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1724 	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
1725 	      unsigned HOST_WIDE_INT nunits;
1726 	      unsigned k, maxk = 0;
1727 	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1728 		if (k > maxk)
1729 		  maxk = k;
1730 	      /* In BB vectorization we may not actually use a loaded vector
1731 		 accessing elements in excess of GROUP_SIZE.  */
1732 	      tree vectype = STMT_VINFO_VECTYPE (group_info);
1733 	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1734 		  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
1735 		{
1736 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 				   "BB vectorization with gaps at the end of "
1738 				   "a load is not supported\n");
1739 		  return false;
1740 		}
1741 
1742 	      /* Verify the permutation can be generated.  */
1743 	      vec<tree> tem;
1744 	      unsigned n_perms;
1745 	      if (!vect_transform_slp_perm_load (node, tem, NULL,
1746 						 1, slp_instn, true, &n_perms))
1747 		{
1748 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1749 				   vect_location,
1750 				   "unsupported load permutation\n");
1751 		  return false;
1752 		}
1753 	    }
1754         }
1755       return true;
1756     }
1757 
1758   /* For loop vectorization verify we can generate the permutation.  Be
1759      conservative about the vectorization factor, there are permutations
1760      that will use three vector inputs only starting from a specific factor
1761      and the vectorization factor is not yet final.
1762      ???  The SLP instance unrolling factor might not be the maximum one.  */
1763   unsigned n_perms;
1764   poly_uint64 test_vf
1765     = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1766 			     LOOP_VINFO_VECT_FACTOR
1767 			     (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
1768   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1769     if (node->load_permutation.exists ()
1770 	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1771 					  slp_instn, true, &n_perms))
1772       return false;
1773 
1774   return true;
1775 }
1776 
1777 
1778 /* Find the last store in SLP INSTANCE.  */
1779 
1780 gimple *
1781 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1782 {
1783   gimple *last = NULL, *stmt;
1784 
1785   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1786     {
1787       stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1788       if (is_pattern_stmt_p (stmt_vinfo))
1789 	last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1790       else
1791 	last = get_later_stmt (stmt, last);
1792     }
1793 
1794   return last;
1795 }
1796 
1797 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
1798 
1799 static void
1800 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1801 			 stmt_vector_for_cost *prologue_cost_vec,
1802 			 stmt_vector_for_cost *body_cost_vec,
1803 			 unsigned ncopies_for_cost,
1804 			 scalar_stmts_set_t* visited)
1805 {
1806   unsigned i, j;
1807   slp_tree child;
1808   gimple *stmt;
1809   stmt_vec_info stmt_info;
1810   tree lhs;
1811 
1812   /* If we already costed the exact same set of scalar stmts we're done.
1813      We share the generated vector stmts for those.  */
1814   if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
1815     return;
1816 
1817   visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());
1818 
1819   /* Recurse down the SLP tree.  */
1820   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1821     if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1822       vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1823 			       body_cost_vec, ncopies_for_cost, visited);
1824 
1825   /* Look at the first scalar stmt to determine the cost.  */
1826   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1827   stmt_info = vinfo_for_stmt (stmt);
1828   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1829     {
1830       vect_memory_access_type memory_access_type
1831 	= (STMT_VINFO_STRIDED_P (stmt_info)
1832 	   ? VMAT_STRIDED_SLP
1833 	   : VMAT_CONTIGUOUS);
1834       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1835 	vect_model_store_cost (stmt_info, ncopies_for_cost,
1836 			       memory_access_type, VLS_STORE,
1837 			       node, prologue_cost_vec, body_cost_vec);
1838       else
1839 	{
1840 	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1841 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1842 	    {
1843 	      /* If the load is permuted then the alignment is determined by
1844 		 the first group element not by the first scalar stmt DR.  */
1845 	      stmt = GROUP_FIRST_ELEMENT (stmt_info);
1846 	      stmt_info = vinfo_for_stmt (stmt);
1847 	      /* Record the cost for the permutation.  */
1848 	      unsigned n_perms;
1849 	      vect_transform_slp_perm_load (node, vNULL, NULL,
1850 					    ncopies_for_cost, instance, true,
1851 					    &n_perms);
1852 	      record_stmt_cost (body_cost_vec, n_perms, vec_perm,
1853 				stmt_info, 0, vect_body);
1854 	      unsigned assumed_nunits
1855 		= vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
1856 	      /* And adjust the number of loads performed.  This handles
1857 	         redundancies as well as loads that are later dead.  */
1858 	      auto_sbitmap perm (GROUP_SIZE (stmt_info));
1859 	      bitmap_clear (perm);
1860 	      for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
1861 		bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
1862 	      ncopies_for_cost = 0;
1863 	      bool load_seen = false;
1864 	      for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
1865 		{
1866 		  if (i % assumed_nunits == 0)
1867 		    {
1868 		      if (load_seen)
1869 			ncopies_for_cost++;
1870 		      load_seen = false;
1871 		    }
1872 		  if (bitmap_bit_p (perm, i))
1873 		    load_seen = true;
1874 		}
1875 	      if (load_seen)
1876 		ncopies_for_cost++;
1877 	      gcc_assert (ncopies_for_cost
1878 			  <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
1879 			      + assumed_nunits - 1) / assumed_nunits);
1880 	      poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1881 	      ncopies_for_cost *= estimated_poly_value (uf);
1882 	    }
1883 	  /* Record the cost for the vector loads.  */
1884 	  vect_model_load_cost (stmt_info, ncopies_for_cost,
1885 				memory_access_type, node, prologue_cost_vec,
1886 				body_cost_vec);
1887 	  return;
1888 	}
1889     }
1890   else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1891     {
1892       /* ncopies_for_cost is the number of IVs we generate.  */
1893       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1894 			stmt_info, 0, vect_body);
1895 
1896       /* Prologue cost for the initial values and step vector.  */
1897       record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1898 			CONSTANT_CLASS_P
1899 			  (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1900 			     (stmt_info))
1901 			? vector_load : vec_construct,
1902 			stmt_info, 0, vect_prologue);
1903       record_stmt_cost (prologue_cost_vec, 1,
1904 			CONSTANT_CLASS_P
1905 			  (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1906 			? vector_load : vec_construct,
1907 			stmt_info, 0, vect_prologue);
1908 
1909       /* ???  No easy way to get at the actual number of vector stmts
1910          to be geneated and thus the derived IVs.  */
1911     }
1912   else
1913     {
1914       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1915 			stmt_info, 0, vect_body);
1916       if (SLP_TREE_TWO_OPERATORS (node))
1917 	{
1918 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1919 			    stmt_info, 0, vect_body);
1920 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1921 			    stmt_info, 0, vect_body);
1922 	}
1923     }
1924 
1925   /* Push SLP node def-type to stmts.  */
1926   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1927     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1928       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1929 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1930 
1931   /* Scan operands and account for prologue cost of constants/externals.
1932      ???  This over-estimates cost for multiple uses and should be
1933      re-engineered.  */
1934   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1935   lhs = gimple_get_lhs (stmt);
1936   for (i = 0; i < gimple_num_ops (stmt); ++i)
1937     {
1938       tree op = gimple_op (stmt, i);
1939       gimple *def_stmt;
1940       enum vect_def_type dt;
1941       if (!op || op == lhs)
1942 	continue;
1943       if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
1944 	  && (dt == vect_constant_def || dt == vect_external_def))
1945 	{
1946 	  /* Without looking at the actual initializer a vector of
1947 	     constants can be implemented as load from the constant pool.
1948 	     When all elements are the same we can use a splat.  */
1949 	  tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
1950 	  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
1951 	  unsigned num_vects_to_check;
1952 	  unsigned HOST_WIDE_INT const_nunits;
1953 	  unsigned nelt_limit;
1954 	  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
1955 	      && ! multiple_p (const_nunits, group_size))
1956 	    {
1957 	      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
1958 	      nelt_limit = const_nunits;
1959 	    }
1960 	  else
1961 	    {
1962 	      /* If either the vector has variable length or the vectors
1963 	         are composed of repeated whole groups we only need to
1964 		 cost construction once.  All vectors will be the same.  */
1965 	      num_vects_to_check = 1;
1966 	      nelt_limit = group_size;
1967 	    }
1968 	  tree elt = NULL_TREE;
1969 	  unsigned nelt = 0;
1970 	  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
1971 	    {
1972 	      unsigned si = j % group_size;
1973 	      if (nelt == 0)
1974 		elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
1975 	      /* ???  We're just tracking whether all operands of a single
1976 		 vector initializer are the same, ideally we'd check if
1977 		 we emitted the same one already.  */
1978 	      else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
1979 		elt = NULL_TREE;
1980 	      nelt++;
1981 	      if (nelt == nelt_limit)
1982 		{
1983 		  /* ???  We need to pass down stmt_info for a vector type
1984 		     even if it points to the wrong stmt.  */
1985 		  record_stmt_cost (prologue_cost_vec, 1,
1986 				    dt == vect_external_def
1987 				    ? (elt ? scalar_to_vec : vec_construct)
1988 				    : vector_load,
1989 				    stmt_info, 0, vect_prologue);
1990 		  nelt = 0;
1991 		}
1992 	    }
1993 	}
1994     }
1995 
1996   /* Restore stmt def-types.  */
1997   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1998     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1999       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
2000 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
2001 }
2002 
2003 /* Compute the cost for the SLP instance INSTANCE.  */
2004 
2005 static void
2006 vect_analyze_slp_cost (slp_instance instance, void *data, scalar_stmts_set_t *visited)
2007 {
2008   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
2009   unsigned ncopies_for_cost;
2010   stmt_info_for_cost *si;
2011   unsigned i;
2012 
2013   /* Calculate the number of vector stmts to create based on the unrolling
2014      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
2015      GROUP_SIZE / NUNITS otherwise.  */
2016   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
2017   slp_tree node = SLP_INSTANCE_TREE (instance);
2018   stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2019   /* Get the estimated vectorization factor, which is always one for
2020      basic-block vectorization.  */
2021   unsigned int assumed_vf;
2022   if (STMT_VINFO_LOOP_VINFO (stmt_info))
2023     assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
2024   else
2025     assumed_vf = 1;
2026   /* For reductions look at a reduction operand in case the reduction
2027      operation is widening like DOT_PROD or SAD.  */
2028   tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
2029   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2030     {
2031       gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2032       switch (gimple_assign_rhs_code (stmt))
2033 	{
2034 	case DOT_PROD_EXPR:
2035 	case SAD_EXPR:
2036 	  vectype_for_cost = get_vectype_for_scalar_type
2037 	    (TREE_TYPE (gimple_assign_rhs1 (stmt)));
2038 	  break;
2039 	default:;
2040 	}
2041     }
2042   unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
2043   ncopies_for_cost = (least_common_multiple (assumed_nunits,
2044 					     group_size * assumed_vf)
2045 		      / assumed_nunits);
2046 
2047   prologue_cost_vec.create (10);
2048   body_cost_vec.create (10);
2049   vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
2050 			   &prologue_cost_vec, &body_cost_vec,
2051 			   ncopies_for_cost, visited);
2052 
2053   /* Record the prologue costs, which were delayed until we were
2054      sure that SLP was successful.  */
2055   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
2056     {
2057       struct _stmt_vec_info *stmt_info
2058 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2059       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2060 			    si->misalign, vect_prologue);
2061     }
2062 
2063   /* Record the instance's instructions in the target cost model.  */
2064   FOR_EACH_VEC_ELT (body_cost_vec, i, si)
2065     {
2066       struct _stmt_vec_info *stmt_info
2067 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2068       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2069 			    si->misalign, vect_body);
2070     }
2071 
2072   prologue_cost_vec.release ();
2073   body_cost_vec.release ();
2074 }
2075 
2076 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2077    one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
2078    the first GROUP1_SIZE stmts, since stores are consecutive), the second
2079    containing the remainder.
2080    Return the first stmt in the second group.  */
2081 
2082 static gimple *
2083 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2084 {
2085   stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2086   gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2087   gcc_assert (group1_size > 0);
2088   int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2089   gcc_assert (group2_size > 0);
2090   GROUP_SIZE (first_vinfo) = group1_size;
2091 
2092   gimple *stmt = first_stmt;
2093   for (unsigned i = group1_size; i > 1; i--)
2094     {
2095       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2096       gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2097     }
2098   /* STMT is now the last element of the first group.  */
2099   gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2100   GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2101 
2102   GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2103   for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2104     {
2105       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2106       gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2107     }
2108 
2109   /* For the second group, the GROUP_GAP is that before the original group,
2110      plus skipping over the first vector.  */
2111   GROUP_GAP (vinfo_for_stmt (group2)) =
2112     GROUP_GAP (first_vinfo) + group1_size;
2113 
2114   /* GROUP_GAP of the first group now has to skip over the second group too.  */
2115   GROUP_GAP (first_vinfo) += group2_size;
2116 
2117   if (dump_enabled_p ())
2118     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2119 		     group1_size, group2_size);
2120 
2121   return group2;
2122 }
2123 
2124 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2125    statements and a vector of NUNITS elements.  */
2126 
2127 static poly_uint64
2128 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2129 {
2130   return exact_div (common_multiple (nunits, group_size), group_size);
2131 }
2132 
2133 /* Analyze an SLP instance starting from a group of grouped stores.  Call
2134    vect_build_slp_tree to build a tree of packed stmts if possible.
2135    Return FALSE if it's impossible to SLP any stmt in the loop.  */
2136 
2137 static bool
2138 vect_analyze_slp_instance (vec_info *vinfo,
2139 			   gimple *stmt, unsigned max_tree_size)
2140 {
2141   slp_instance new_instance;
2142   slp_tree node;
2143   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2144   tree vectype, scalar_type = NULL_TREE;
2145   gimple *next;
2146   unsigned int i;
2147   vec<slp_tree> loads;
2148   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
2149   vec<gimple *> scalar_stmts;
2150 
2151   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2152     {
2153       if (dr)
2154         {
2155           scalar_type = TREE_TYPE (DR_REF (dr));
2156           vectype = get_vectype_for_scalar_type (scalar_type);
2157         }
2158       else
2159         {
2160           gcc_assert (is_a <loop_vec_info> (vinfo));
2161           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2162         }
2163 
2164       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2165     }
2166   else
2167     {
2168       gcc_assert (is_a <loop_vec_info> (vinfo));
2169       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2170       group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2171     }
2172 
2173   if (!vectype)
2174     {
2175       if (dump_enabled_p ())
2176         {
2177           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 			   "Build SLP failed: unsupported data-type ");
2179           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
2180           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2181         }
2182 
2183       return false;
2184     }
2185   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2186 
2187   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
2188   scalar_stmts.create (group_size);
2189   next = stmt;
2190   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2191     {
2192       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
2193       while (next)
2194         {
2195 	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
2196 	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2197 	    scalar_stmts.safe_push (
2198 		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2199 	  else
2200             scalar_stmts.safe_push (next);
2201           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2202         }
2203       /* Mark the first element of the reduction chain as reduction to properly
2204 	 transform the node.  In the reduction analysis phase only the last
2205 	 element of the chain is marked as reduction.  */
2206       if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2207 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2208     }
2209   else
2210     {
2211       /* Collect reduction statements.  */
2212       vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2213       for (i = 0; reductions.iterate (i, &next); i++)
2214 	scalar_stmts.safe_push (next);
2215     }
2216 
2217   loads.create (group_size);
2218 
2219   /* Build the tree for the SLP instance.  */
2220   bool *matches = XALLOCAVEC (bool, group_size);
2221   unsigned npermutes = 0;
2222   bst_fail = new scalar_stmts_set_t ();
2223   poly_uint64 max_nunits = nunits;
2224   node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2225 			      &max_nunits, &loads, matches, &npermutes,
2226 			      NULL, max_tree_size);
2227   delete bst_fail;
2228   if (node != NULL)
2229     {
2230       /* Calculate the unrolling factor based on the smallest type.  */
2231       poly_uint64 unrolling_factor
2232 	= calculate_unrolling_factor (max_nunits, group_size);
2233 
2234       if (maybe_ne (unrolling_factor, 1U)
2235 	  && is_a <bb_vec_info> (vinfo))
2236 	{
2237 	  unsigned HOST_WIDE_INT const_max_nunits;
2238 	  if (!max_nunits.is_constant (&const_max_nunits)
2239 	      || const_max_nunits > group_size)
2240 	    {
2241 	      if (dump_enabled_p ())
2242 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 				 "Build SLP failed: store group "
2244 				 "size not a multiple of the vector size "
2245 				 "in basic block SLP\n");
2246 	      vect_free_slp_tree (node);
2247 	      loads.release ();
2248 	      return false;
2249 	    }
2250 	  /* Fatal mismatch.  */
2251 	  matches[group_size / const_max_nunits * const_max_nunits] = false;
2252 	  vect_free_slp_tree (node);
2253 	  loads.release ();
2254 	}
2255       else
2256 	{
2257       /* Create a new SLP instance.  */
2258       new_instance = XNEW (struct _slp_instance);
2259       SLP_INSTANCE_TREE (new_instance) = node;
2260       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2261       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2262       SLP_INSTANCE_LOADS (new_instance) = loads;
2263 
2264       /* Compute the load permutation.  */
2265       slp_tree load_node;
2266       bool loads_permuted = false;
2267       FOR_EACH_VEC_ELT (loads, i, load_node)
2268 	{
2269 	  vec<unsigned> load_permutation;
2270 	  int j;
2271 	  gimple *load, *first_stmt;
2272 	  bool this_load_permuted = false;
2273 	  load_permutation.create (group_size);
2274 	  first_stmt = GROUP_FIRST_ELEMENT
2275 	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2276 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2277 	    {
2278 		  int load_place = vect_get_place_in_interleaving_chain
2279 				     (load, first_stmt);
2280 	      gcc_assert (load_place != -1);
2281 	      if (load_place != j)
2282 		this_load_permuted = true;
2283 	      load_permutation.safe_push (load_place);
2284 	    }
2285 	  if (!this_load_permuted
2286 	      /* The load requires permutation when unrolling exposes
2287 	         a gap either because the group is larger than the SLP
2288 		 group-size or because there is a gap between the groups.  */
2289 	      && (known_eq (unrolling_factor, 1U)
2290 		  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2291 		      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2292 	    {
2293 	      load_permutation.release ();
2294 	      continue;
2295 	    }
2296 	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2297 	  loads_permuted = true;
2298 	}
2299 
2300       if (loads_permuted)
2301         {
2302           if (!vect_supported_load_permutation_p (new_instance))
2303             {
2304               if (dump_enabled_p ())
2305                 {
2306                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 				   "Build SLP failed: unsupported load "
2308 				   "permutation ");
2309 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2310 					TDF_SLIM, stmt, 0);
2311                 }
2312               vect_free_slp_instance (new_instance);
2313               return false;
2314             }
2315         }
2316 
2317 	  /* If the loads and stores can be handled with load/store-lan
2318 	 instructions do not generate this SLP instance.  */
2319       if (is_a <loop_vec_info> (vinfo)
2320 	  && loads_permuted
2321 	  && dr && vect_store_lanes_supported (vectype, group_size, false))
2322 	{
2323 	  slp_tree load_node;
2324 	  FOR_EACH_VEC_ELT (loads, i, load_node)
2325 	    {
2326 	      gimple *first_stmt = GROUP_FIRST_ELEMENT
2327 		  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2328 	      stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2329 		  /* Use SLP for strided accesses (or if we
2330 		     can't load-lanes).  */
2331 	      if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2332 		  || ! vect_load_lanes_supported
2333 			(STMT_VINFO_VECTYPE (stmt_vinfo),
2334 			 GROUP_SIZE (stmt_vinfo), false))
2335 		break;
2336 	    }
2337 	  if (i == loads.length ())
2338 	    {
2339 	      if (dump_enabled_p ())
2340 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2341 				 "Built SLP cancelled: can use "
2342 				 "load/store-lanes\n");
2343 	      vect_free_slp_instance (new_instance);
2344 	      return false;
2345 	    }
2346 	}
2347 
2348       vinfo->slp_instances.safe_push (new_instance);
2349 
2350       if (dump_enabled_p ())
2351 	{
2352 	  dump_printf_loc (MSG_NOTE, vect_location,
2353 			   "Final SLP tree for instance:\n");
2354 	  vect_print_slp_tree (MSG_NOTE, vect_location, node);
2355 	}
2356 
2357       return true;
2358     }
2359     }
2360   else
2361     {
2362   /* Failed to SLP.  */
2363   /* Free the allocated memory.  */
2364   scalar_stmts.release ();
2365   loads.release ();
2366     }
2367 
2368   /* For basic block SLP, try to break the group up into multiples of the
2369      vector size.  */
2370   unsigned HOST_WIDE_INT const_nunits;
2371   if (is_a <bb_vec_info> (vinfo)
2372       && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2373       && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2374       && nunits.is_constant (&const_nunits))
2375     {
2376       /* We consider breaking the group only on VF boundaries from the existing
2377 	 start.  */
2378       for (i = 0; i < group_size; i++)
2379 	if (!matches[i]) break;
2380 
2381       if (i >= const_nunits && i < group_size)
2382 	{
2383 	  /* Split into two groups at the first vector boundary before i.  */
2384 	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2385 	  unsigned group1_size = i & ~(const_nunits - 1);
2386 
2387 	  gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2388 	  bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2389 	  /* If the first non-match was in the middle of a vector,
2390 	     skip the rest of that vector.  */
2391 	  if (group1_size < i)
2392 	    {
2393 	      i = group1_size + const_nunits;
2394 	      if (i < group_size)
2395 		rest = vect_split_slp_store_group (rest, const_nunits);
2396 	    }
2397 	  if (i < group_size)
2398 	    res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2399 	  return res;
2400 	}
2401       /* Even though the first vector did not all match, we might be able to SLP
2402 	 (some) of the remainder.  FORNOW ignore this possibility.  */
2403     }
2404 
2405   return false;
2406 }
2407 
2408 
2409 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
2410    trees of packed scalar stmts if SLP is possible.  */
2411 
2412 bool
2413 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2414 {
2415   unsigned int i;
2416   gimple *first_element;
2417 
2418   if (dump_enabled_p ())
2419     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2420 
2421   /* Find SLP sequences starting from groups of grouped stores.  */
2422   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2423     vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2424 
2425   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2426     {
2427       if (loop_vinfo->reduction_chains.length () > 0)
2428 	{
2429 	  /* Find SLP sequences starting from reduction chains.  */
2430 	  FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2431 	    if (! vect_analyze_slp_instance (vinfo, first_element,
2432 					     max_tree_size))
2433 	      {
2434 		/* Dissolve reduction chain group.  */
2435 		gimple *next, *stmt = first_element;
2436 		while (stmt)
2437 		  {
2438 		    stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2439 		    next = GROUP_NEXT_ELEMENT (vinfo);
2440 		    GROUP_FIRST_ELEMENT (vinfo) = NULL;
2441 		    GROUP_NEXT_ELEMENT (vinfo) = NULL;
2442 		    stmt = next;
2443 		  }
2444 		STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2445 		  = vect_internal_def;
2446 	      }
2447 	}
2448 
2449       /* Find SLP sequences starting from groups of reductions.  */
2450       if (loop_vinfo->reductions.length () > 1)
2451 	vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2452 				   max_tree_size);
2453     }
2454 
2455   return true;
2456 }
2457 
2458 
2459 /* For each possible SLP instance decide whether to SLP it and calculate overall
2460    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
2461    least one instance.  */
2462 
2463 bool
2464 vect_make_slp_decision (loop_vec_info loop_vinfo)
2465 {
2466   unsigned int i;
2467   poly_uint64 unrolling_factor = 1;
2468   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2469   slp_instance instance;
2470   int decided_to_slp = 0;
2471 
2472   if (dump_enabled_p ())
2473     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2474                      "\n");
2475 
2476   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2477     {
2478       /* FORNOW: SLP if you can.  */
2479       /* All unroll factors have the form current_vector_size * X for some
2480 	 rational X, so they must have a common multiple.  */
2481       unrolling_factor
2482 	= force_common_multiple (unrolling_factor,
2483 				 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2484 
2485       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
2486 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2487 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
2488       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2489       decided_to_slp++;
2490     }
2491 
2492   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2493 
2494   if (decided_to_slp && dump_enabled_p ())
2495     {
2496       dump_printf_loc (MSG_NOTE, vect_location,
2497 		       "Decided to SLP %d instances. Unrolling factor ",
2498 		       decided_to_slp);
2499       dump_dec (MSG_NOTE, unrolling_factor);
2500       dump_printf (MSG_NOTE, "\n");
2501     }
2502 
2503   return (decided_to_slp > 0);
2504 }
2505 
2506 
2507 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2508    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
2509 
2510 static void
2511 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2512 {
2513   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2514   imm_use_iterator imm_iter;
2515   gimple *use_stmt;
2516   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2517   slp_tree child;
2518   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2519   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2520   int j;
2521 
2522   /* Propagate hybrid down the SLP tree.  */
2523   if (stype == hybrid)
2524     ;
2525   else if (HYBRID_SLP_STMT (stmt_vinfo))
2526     stype = hybrid;
2527   else
2528     {
2529       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
2530       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2531       /* If we get a pattern stmt here we have to use the LHS of the
2532          original stmt for immediate uses.  */
2533       if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2534 	  && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2535 	stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2536       tree def;
2537       if (gimple_code (stmt) == GIMPLE_PHI)
2538 	def = gimple_phi_result (stmt);
2539       else
2540 	def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2541       if (def)
2542 	FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2543 	  {
2544 	    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2545 	      continue;
2546 	    use_vinfo = vinfo_for_stmt (use_stmt);
2547 	    if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2548 		&& STMT_VINFO_RELATED_STMT (use_vinfo))
2549 	      use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2550 	    if (!STMT_SLP_TYPE (use_vinfo)
2551 		&& (STMT_VINFO_RELEVANT (use_vinfo)
2552 		    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2553 		&& !(gimple_code (use_stmt) == GIMPLE_PHI
2554 		     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2555 	      {
2556 		if (dump_enabled_p ())
2557 		  {
2558 		    dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2559 				     "def in non-SLP stmt: ");
2560 		    dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2561 		  }
2562 		stype = hybrid;
2563 	      }
2564 	  }
2565     }
2566 
2567   if (stype == hybrid
2568       && !HYBRID_SLP_STMT (stmt_vinfo))
2569     {
2570       if (dump_enabled_p ())
2571 	{
2572 	  dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2573 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2574 	}
2575       STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2576     }
2577 
2578   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2579     if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2580       vect_detect_hybrid_slp_stmts (child, i, stype);
2581 }
2582 
2583 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
2584 
2585 static tree
2586 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2587 {
2588   walk_stmt_info *wi = (walk_stmt_info *)data;
2589   struct loop *loopp = (struct loop *)wi->info;
2590 
2591   if (wi->is_lhs)
2592     return NULL_TREE;
2593 
2594   if (TREE_CODE (*tp) == SSA_NAME
2595       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2596     {
2597       gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2598       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2599 	  && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2600 	{
2601 	  if (dump_enabled_p ())
2602 	    {
2603 	      dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2604 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2605 	    }
2606 	  STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2607 	}
2608     }
2609 
2610   return NULL_TREE;
2611 }
2612 
2613 static tree
2614 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2615 			  walk_stmt_info *)
2616 {
2617   stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2618   /* If the stmt is in a SLP instance then this isn't a reason
2619      to mark use definitions in other SLP instances as hybrid.  */
2620   if (! STMT_SLP_TYPE (use_vinfo)
2621       && (STMT_VINFO_RELEVANT (use_vinfo)
2622 	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2623       && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2624 	    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2625     ;
2626   else
2627     *handled = true;
2628   return NULL_TREE;
2629 }
2630 
2631 /* Find stmts that must be both vectorized and SLPed.  */
2632 
2633 void
2634 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2635 {
2636   unsigned int i;
2637   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2638   slp_instance instance;
2639 
2640   if (dump_enabled_p ())
2641     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2642                      "\n");
2643 
2644   /* First walk all pattern stmt in the loop and mark defs of uses as
2645      hybrid because immediate uses in them are not recorded.  */
2646   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2647     {
2648       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2649       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2650 	   gsi_next (&gsi))
2651 	{
2652 	  gimple *stmt = gsi_stmt (gsi);
2653 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2654 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2655 	    {
2656 	      walk_stmt_info wi;
2657 	      memset (&wi, 0, sizeof (wi));
2658 	      wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2659 	      gimple_stmt_iterator gsi2
2660 		= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2661 	      walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2662 				vect_detect_hybrid_slp_1, &wi);
2663 	      walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2664 			       vect_detect_hybrid_slp_2,
2665 			       vect_detect_hybrid_slp_1, &wi);
2666 	    }
2667 	}
2668     }
2669 
2670   /* Then walk the SLP instance trees marking stmts with uses in
2671      non-SLP stmts as hybrid, also propagating hybrid down the
2672      SLP tree, collecting the above info on-the-fly.  */
2673   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2674     {
2675       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2676 	vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2677 				      i, pure_slp);
2678     }
2679 }
2680 
2681 
2682 /* Initialize a bb_vec_info struct for the statements between
2683    REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */
2684 
2685 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2686 			    gimple_stmt_iterator region_end_in)
2687   : vec_info (vec_info::bb, init_cost (NULL)),
2688     bb (gsi_bb (region_begin_in)),
2689     region_begin (region_begin_in),
2690     region_end (region_end_in)
2691 {
2692   gimple_stmt_iterator gsi;
2693 
2694   for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2695        gsi_next (&gsi))
2696     {
2697       gimple *stmt = gsi_stmt (gsi);
2698       gimple_set_uid (stmt, 0);
2699       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
2700     }
2701 
2702   bb->aux = this;
2703 }
2704 
2705 
2706 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2707    stmts in the basic block.  */
2708 
2709 _bb_vec_info::~_bb_vec_info ()
2710 {
2711   for (gimple_stmt_iterator si = region_begin;
2712        gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2713     {
2714       gimple *stmt = gsi_stmt (si);
2715       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2716 
2717       if (stmt_info)
2718         /* Free stmt_vec_info.  */
2719         free_stmt_vec_info (stmt);
2720 
2721       /* Reset region marker.  */
2722       gimple_set_uid (stmt, -1);
2723     }
2724 
2725   bb->aux = NULL;
2726 }
2727 
2728 
2729 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2730    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2731 
2732    Return true if the operations are supported.  */
2733 
2734 static bool
2735 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2736 				  slp_instance node_instance)
2737 {
2738   bool dummy;
2739   int i, j;
2740   gimple *stmt;
2741   slp_tree child;
2742 
2743   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2744     return true;
2745 
2746   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2747     if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2748       return false;
2749 
2750   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2751   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2752   gcc_assert (stmt_info);
2753   gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2754 
2755   /* For BB vectorization vector types are assigned here.
2756      Memory accesses already got their vector type assigned
2757      in vect_analyze_data_refs.  */
2758   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2759   if (bb_vinfo
2760       && ! STMT_VINFO_DATA_REF (stmt_info))
2761     {
2762       gcc_assert (PURE_SLP_STMT (stmt_info));
2763 
2764       tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2765       if (dump_enabled_p ())
2766 	{
2767 	  dump_printf_loc (MSG_NOTE, vect_location,
2768 			   "get vectype for scalar type:  ");
2769 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2770 	  dump_printf (MSG_NOTE, "\n");
2771 	}
2772 
2773       tree vectype = get_vectype_for_scalar_type (scalar_type);
2774       if (!vectype)
2775 	{
2776 	  if (dump_enabled_p ())
2777 	    {
2778 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779 			       "not SLPed: unsupported data-type ");
2780 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2781 				 scalar_type);
2782 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2783 	    }
2784 	  return false;
2785 	}
2786 
2787       if (dump_enabled_p ())
2788 	{
2789 	  dump_printf_loc (MSG_NOTE, vect_location, "vectype:  ");
2790 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2791 	  dump_printf (MSG_NOTE, "\n");
2792 	}
2793 
2794       gimple *sstmt;
2795       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2796 	STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2797     }
2798 
2799   /* Calculate the number of vector statements to be created for the
2800      scalar stmts in this node.  For SLP reductions it is equal to the
2801      number of vector statements in the children (which has already been
2802      calculated by the recursive call).  Otherwise it is the number of
2803      scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2804      VF divided by the number of elements in a vector.  */
2805   if (GROUP_FIRST_ELEMENT (stmt_info)
2806       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2807     SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2808       = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2809   else
2810     {
2811       poly_uint64 vf;
2812       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2813 	vf = loop_vinfo->vectorization_factor;
2814       else
2815 	vf = 1;
2816       unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2817       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2818       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2819 	= vect_get_num_vectors (vf * group_size, vectype);
2820     }
2821 
2822   /* Push SLP node def-type to stmt operands.  */
2823   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2824     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2825       STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2826 	= SLP_TREE_DEF_TYPE (child);
2827   bool res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2828   /* Restore def-types.  */
2829   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2830     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2831       STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2832 	= vect_internal_def;
2833   if (! res)
2834     return false;
2835 
2836   return true;
2837 }
2838 
2839 
2840 /* Analyze statements in SLP instances of VINFO.  Return true if the
2841    operations are supported. */
2842 
2843 bool
2844 vect_slp_analyze_operations (vec_info *vinfo)
2845 {
2846   slp_instance instance;
2847   int i;
2848 
2849   if (dump_enabled_p ())
2850     dump_printf_loc (MSG_NOTE, vect_location,
2851 		     "=== vect_slp_analyze_operations ===\n");
2852 
2853   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2854     {
2855       if (!vect_slp_analyze_node_operations (vinfo,
2856 					     SLP_INSTANCE_TREE (instance),
2857 					     instance))
2858         {
2859 	  dump_printf_loc (MSG_NOTE, vect_location,
2860 			   "removing SLP instance operations starting from: ");
2861 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2862 			    SLP_TREE_SCALAR_STMTS
2863 			      (SLP_INSTANCE_TREE (instance))[0], 0);
2864 	  vect_free_slp_instance (instance);
2865           vinfo->slp_instances.ordered_remove (i);
2866 	}
2867       else
2868 	i++;
2869     }
2870 
2871   if (dump_enabled_p ())
2872     dump_printf_loc (MSG_NOTE, vect_location,
2873 		     "=== vect_analyze_slp_cost ===\n");
2874 
2875   /* Compute the costs of the SLP instances.  */
2876   scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
2877   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
2878     vect_analyze_slp_cost (instance, vinfo->target_cost_data, visited);
2879   delete visited;
2880 
2881   return !vinfo->slp_instances.is_empty ();
2882 }
2883 
2884 
2885 /* Compute the scalar cost of the SLP node NODE and its children
2886    and return it.  Do not account defs that are marked in LIFE and
2887    update LIFE according to uses of NODE.  */
2888 
2889 static unsigned
2890 vect_bb_slp_scalar_cost (basic_block bb,
2891 			 slp_tree node, vec<bool, va_heap> *life)
2892 {
2893   unsigned scalar_cost = 0;
2894   unsigned i;
2895   gimple *stmt;
2896   slp_tree child;
2897 
2898   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2899     {
2900       unsigned stmt_cost;
2901       ssa_op_iter op_iter;
2902       def_operand_p def_p;
2903       stmt_vec_info stmt_info;
2904 
2905       if ((*life)[i])
2906 	continue;
2907 
2908       /* If there is a non-vectorized use of the defs then the scalar
2909          stmt is kept live in which case we do not account it or any
2910 	 required defs in the SLP children in the scalar cost.  This
2911 	 way we make the vectorization more costly when compared to
2912 	 the scalar cost.  */
2913       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2914 	{
2915 	  imm_use_iterator use_iter;
2916 	  gimple *use_stmt;
2917 	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2918 	    if (!is_gimple_debug (use_stmt)
2919 		&& (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2920 					     use_stmt)
2921 		    || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2922 	      {
2923 		(*life)[i] = true;
2924 		BREAK_FROM_IMM_USE_STMT (use_iter);
2925 	      }
2926 	}
2927       if ((*life)[i])
2928 	continue;
2929 
2930       /* Count scalar stmts only once.  */
2931       if (gimple_visited_p (stmt))
2932 	continue;
2933       gimple_set_visited (stmt, true);
2934 
2935       stmt_info = vinfo_for_stmt (stmt);
2936       if (STMT_VINFO_DATA_REF (stmt_info))
2937         {
2938           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2939             stmt_cost = vect_get_stmt_cost (scalar_load);
2940           else
2941             stmt_cost = vect_get_stmt_cost (scalar_store);
2942         }
2943       else
2944         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2945 
2946       scalar_cost += stmt_cost;
2947     }
2948 
2949   auto_vec<bool, 20> subtree_life;
2950   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2951     {
2952       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2953 	{
2954 	  /* Do not directly pass LIFE to the recursive call, copy it to
2955 	     confine changes in the callee to the current child/subtree.  */
2956 	  subtree_life.safe_splice (*life);
2957 	  scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
2958 	  subtree_life.truncate (0);
2959 	}
2960     }
2961 
2962   return scalar_cost;
2963 }
2964 
2965 /* Check if vectorization of the basic block is profitable.  */
2966 
2967 static bool
2968 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2969 {
2970   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2971   slp_instance instance;
2972   int i;
2973   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2974   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2975 
2976   /* Calculate scalar cost.  */
2977   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2978     {
2979       auto_vec<bool, 20> life;
2980       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2981       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2982 					      SLP_INSTANCE_TREE (instance),
2983 					      &life);
2984     }
2985 
2986   /* Unset visited flag.  */
2987   for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2988        gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2989     gimple_set_visited  (gsi_stmt (gsi), false);
2990 
2991   /* Complete the target-specific cost calculation.  */
2992   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2993 	       &vec_inside_cost, &vec_epilogue_cost);
2994 
2995   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2996 
2997   if (dump_enabled_p ())
2998     {
2999       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3000       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
3001 		   vec_inside_cost);
3002       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
3003       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
3004       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
3005     }
3006 
3007   /* Vectorization is profitable if its cost is more than the cost of scalar
3008      version.  Note that we err on the vector side for equal cost because
3009      the cost estimate is otherwise quite pessimistic (constant uses are
3010      free on the scalar side but cost a load on the vector side for
3011      example).  */
3012   if (vec_outside_cost + vec_inside_cost > scalar_cost)
3013     return false;
3014 
3015   return true;
3016 }
3017 
3018 /* Check if the basic block can be vectorized.  Returns a bb_vec_info
3019    if so and sets fatal to true if failure is independent of
3020    current_vector_size.  */
3021 
3022 static bb_vec_info
3023 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
3024 		       gimple_stmt_iterator region_end,
3025 		       vec<data_reference_p> datarefs, int n_stmts,
3026 		       bool &fatal)
3027 {
3028   bb_vec_info bb_vinfo;
3029   slp_instance instance;
3030   int i;
3031   poly_uint64 min_vf = 2;
3032 
3033   /* The first group of checks is independent of the vector size.  */
3034   fatal = true;
3035 
3036   if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3037     {
3038       if (dump_enabled_p ())
3039 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3040 			 "not vectorized: too many instructions in "
3041 			 "basic block.\n");
3042       free_data_refs (datarefs);
3043       return NULL;
3044     }
3045 
3046   bb_vinfo = new _bb_vec_info (region_begin, region_end);
3047   if (!bb_vinfo)
3048     return NULL;
3049 
3050   BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3051 
3052   /* Analyze the data references.  */
3053 
3054   if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
3055     {
3056       if (dump_enabled_p ())
3057         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 			 "not vectorized: unhandled data-ref in basic "
3059 			 "block.\n");
3060 
3061       delete bb_vinfo;
3062       return NULL;
3063     }
3064 
3065   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3066     {
3067       if (dump_enabled_p ())
3068         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3069 			 "not vectorized: not enough data-refs in "
3070 			 "basic block.\n");
3071 
3072       delete bb_vinfo;
3073       return NULL;
3074     }
3075 
3076   if (!vect_analyze_data_ref_accesses (bb_vinfo))
3077     {
3078      if (dump_enabled_p ())
3079        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080 			"not vectorized: unhandled data access in "
3081 			"basic block.\n");
3082 
3083       delete bb_vinfo;
3084       return NULL;
3085     }
3086 
3087   /* If there are no grouped stores in the region there is no need
3088      to continue with pattern recog as vect_analyze_slp will fail
3089      anyway.  */
3090   if (bb_vinfo->grouped_stores.is_empty ())
3091     {
3092       if (dump_enabled_p ())
3093 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3094 			 "not vectorized: no grouped stores in "
3095 			 "basic block.\n");
3096 
3097       delete bb_vinfo;
3098       return NULL;
3099     }
3100 
3101   /* While the rest of the analysis below depends on it in some way.  */
3102   fatal = false;
3103 
3104   vect_pattern_recog (bb_vinfo);
3105 
3106   /* Check the SLP opportunities in the basic block, analyze and build SLP
3107      trees.  */
3108   if (!vect_analyze_slp (bb_vinfo, n_stmts))
3109     {
3110       if (dump_enabled_p ())
3111 	{
3112 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3113 			   "Failed to SLP the basic block.\n");
3114 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3115 			   "not vectorized: failed to find SLP opportunities "
3116 			   "in basic block.\n");
3117 	}
3118 
3119       delete bb_vinfo;
3120       return NULL;
3121     }
3122 
3123   vect_record_base_alignments (bb_vinfo);
3124 
3125   /* Analyze and verify the alignment of data references and the
3126      dependence in the SLP instances.  */
3127   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3128     {
3129       if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3130 	  || ! vect_slp_analyze_instance_dependence (instance))
3131 	{
3132 	  dump_printf_loc (MSG_NOTE, vect_location,
3133 			   "removing SLP instance operations starting from: ");
3134 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3135 			    SLP_TREE_SCALAR_STMTS
3136 			      (SLP_INSTANCE_TREE (instance))[0], 0);
3137 	  vect_free_slp_instance (instance);
3138 	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3139 	  continue;
3140 	}
3141 
3142       /* Mark all the statements that we want to vectorize as pure SLP and
3143 	 relevant.  */
3144       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
3145       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3146 
3147       i++;
3148     }
3149   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3150     {
3151       delete bb_vinfo;
3152       return NULL;
3153     }
3154 
3155   if (!vect_slp_analyze_operations (bb_vinfo))
3156     {
3157       if (dump_enabled_p ())
3158         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159 			 "not vectorized: bad operation in basic block.\n");
3160 
3161       delete bb_vinfo;
3162       return NULL;
3163     }
3164 
3165   /* Cost model: check if the vectorization is worthwhile.  */
3166   if (!unlimited_cost_model (NULL)
3167       && !vect_bb_vectorization_profitable_p (bb_vinfo))
3168     {
3169       if (dump_enabled_p ())
3170         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3171 			 "not vectorized: vectorization is not "
3172 			 "profitable.\n");
3173 
3174       delete bb_vinfo;
3175       return NULL;
3176     }
3177 
3178   if (dump_enabled_p ())
3179     dump_printf_loc (MSG_NOTE, vect_location,
3180 		     "Basic block will be vectorized using SLP\n");
3181 
3182   return bb_vinfo;
3183 }
3184 
3185 
3186 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
3187    true if anything in the basic-block was vectorized.  */
3188 
3189 bool
3190 vect_slp_bb (basic_block bb)
3191 {
3192   bb_vec_info bb_vinfo;
3193   gimple_stmt_iterator gsi;
3194   bool any_vectorized = false;
3195   auto_vector_sizes vector_sizes;
3196 
3197   if (dump_enabled_p ())
3198     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
3199 
3200   /* Autodetect first vector size we try.  */
3201   current_vector_size = 0;
3202   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3203   unsigned int next_size = 0;
3204 
3205   gsi = gsi_start_bb (bb);
3206 
3207   poly_uint64 autodetected_vector_size = 0;
3208   while (1)
3209     {
3210       if (gsi_end_p (gsi))
3211 	break;
3212 
3213       gimple_stmt_iterator region_begin = gsi;
3214       vec<data_reference_p> datarefs = vNULL;
3215       int insns = 0;
3216 
3217       for (; !gsi_end_p (gsi); gsi_next (&gsi))
3218 	{
3219 	  gimple *stmt = gsi_stmt (gsi);
3220 	  if (is_gimple_debug (stmt))
3221 	    continue;
3222 	  insns++;
3223 
3224 	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
3225 	    vect_location = gimple_location (stmt);
3226 
3227 	  if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3228 	    break;
3229 	}
3230 
3231       /* Skip leading unhandled stmts.  */
3232       if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3233 	{
3234 	  gsi_next (&gsi);
3235 	  continue;
3236 	}
3237 
3238       gimple_stmt_iterator region_end = gsi;
3239 
3240       bool vectorized = false;
3241       bool fatal = false;
3242       bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3243 					datarefs, insns, fatal);
3244       if (bb_vinfo
3245 	  && dbg_cnt (vect_slp))
3246 	{
3247 	  if (dump_enabled_p ())
3248 	    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3249 
3250 	  vect_schedule_slp (bb_vinfo);
3251 
3252 	  if (dump_enabled_p ())
3253 	    dump_printf_loc (MSG_NOTE, vect_location,
3254 			     "basic block part vectorized\n");
3255 
3256 	  vectorized = true;
3257 	}
3258       delete bb_vinfo;
3259 
3260       any_vectorized |= vectorized;
3261 
3262       if (next_size == 0)
3263 	autodetected_vector_size = current_vector_size;
3264 
3265       if (next_size < vector_sizes.length ()
3266 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
3267 	next_size += 1;
3268 
3269       if (vectorized
3270 	  || next_size == vector_sizes.length ()
3271 	  || known_eq (current_vector_size, 0U)
3272 	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
3273 	     vector sizes will fail do not bother iterating.  */
3274 	  || fatal)
3275 	{
3276 	  if (gsi_end_p (region_end))
3277 	    break;
3278 
3279 	  /* Skip the unhandled stmt.  */
3280 	  gsi_next (&gsi);
3281 
3282 	  /* And reset vector sizes.  */
3283 	  current_vector_size = 0;
3284 	  next_size = 0;
3285 	}
3286       else
3287 	{
3288 	  /* Try the next biggest vector size.  */
3289 	  current_vector_size = vector_sizes[next_size++];
3290 	  if (dump_enabled_p ())
3291 	    {
3292 	      dump_printf_loc (MSG_NOTE, vect_location,
3293 			       "***** Re-trying analysis with "
3294 			       "vector size ");
3295 	      dump_dec (MSG_NOTE, current_vector_size);
3296 	      dump_printf (MSG_NOTE, "\n");
3297 	    }
3298 
3299 	  /* Start over.  */
3300 	  gsi = region_begin;
3301 	}
3302     }
3303 
3304   return any_vectorized;
3305 }
3306 
3307 
3308 /* Return 1 if vector type of boolean constant which is OPNUM
3309    operand in statement STMT is a boolean vector.  */
3310 
3311 static bool
3312 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3313 {
3314   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3315   enum tree_code code = gimple_expr_code (stmt);
3316   tree op, vectype;
3317   gimple *def_stmt;
3318   enum vect_def_type dt;
3319 
3320   /* For comparison and COND_EXPR type is chosen depending
3321      on the other comparison operand.  */
3322   if (TREE_CODE_CLASS (code) == tcc_comparison)
3323     {
3324       if (opnum)
3325 	op = gimple_assign_rhs1 (stmt);
3326       else
3327 	op = gimple_assign_rhs2 (stmt);
3328 
3329       if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3330 			       &dt, &vectype))
3331 	gcc_unreachable ();
3332 
3333       return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3334     }
3335 
3336   if (code == COND_EXPR)
3337     {
3338       tree cond = gimple_assign_rhs1 (stmt);
3339 
3340       if (TREE_CODE (cond) == SSA_NAME)
3341 	op = cond;
3342       else if (opnum)
3343 	op = TREE_OPERAND (cond, 1);
3344       else
3345 	op = TREE_OPERAND (cond, 0);
3346 
3347       if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3348 			       &dt, &vectype))
3349 	gcc_unreachable ();
3350 
3351       return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3352     }
3353 
3354   return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3355 }
3356 
3357 /* Build a variable-length vector in which the elements in ELTS are repeated
3358    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
3359    RESULTS and add any new instructions to SEQ.
3360 
3361    The approach we use is:
3362 
3363    (1) Find a vector mode VM with integer elements of mode IM.
3364 
3365    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3366        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
3367        from small vectors to IM.
3368 
3369    (3) Duplicate each ELTS'[I] into a vector of mode VM.
3370 
3371    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3372        correct byte contents.
3373 
3374    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3375 
3376    We try to find the largest IM for which this sequence works, in order
3377    to cut down on the number of interleaves.  */
3378 
3379 void
3380 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3381 			  unsigned int nresults, vec<tree> &results)
3382 {
3383   unsigned int nelts = elts.length ();
3384   tree element_type = TREE_TYPE (vector_type);
3385 
3386   /* (1) Find a vector mode VM with integer elements of mode IM.  */
3387   unsigned int nvectors = 1;
3388   tree new_vector_type;
3389   tree permutes[2];
3390   if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3391 				       &nvectors, &new_vector_type,
3392 				       permutes))
3393     gcc_unreachable ();
3394 
3395   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
3396   unsigned int partial_nelts = nelts / nvectors;
3397   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3398 
3399   tree_vector_builder partial_elts;
3400   auto_vec<tree, 32> pieces (nvectors * 2);
3401   pieces.quick_grow (nvectors * 2);
3402   for (unsigned int i = 0; i < nvectors; ++i)
3403     {
3404       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3405 	     ELTS' has mode IM.  */
3406       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3407       for (unsigned int j = 0; j < partial_nelts; ++j)
3408 	partial_elts.quick_push (elts[i * partial_nelts + j]);
3409       tree t = gimple_build_vector (seq, &partial_elts);
3410       t = gimple_build (seq, VIEW_CONVERT_EXPR,
3411 			TREE_TYPE (new_vector_type), t);
3412 
3413       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
3414       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3415     }
3416 
3417   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3418 	 correct byte contents.
3419 
3420      We need to repeat the following operation log2(nvectors) times:
3421 
3422 	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3423 	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3424 
3425      However, if each input repeats every N elements and the VF is
3426      a multiple of N * 2, the HI result is the same as the LO.  */
3427   unsigned int in_start = 0;
3428   unsigned int out_start = nvectors;
3429   unsigned int hi_start = nvectors / 2;
3430   /* A bound on the number of outputs needed to produce NRESULTS results
3431      in the final iteration.  */
3432   unsigned int noutputs_bound = nvectors * nresults;
3433   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3434     {
3435       noutputs_bound /= 2;
3436       unsigned int limit = MIN (noutputs_bound, nvectors);
3437       for (unsigned int i = 0; i < limit; ++i)
3438 	{
3439 	  if ((i & 1) != 0
3440 	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3441 			     2 * in_repeat))
3442 	    {
3443 	      pieces[out_start + i] = pieces[out_start + i - 1];
3444 	      continue;
3445 	    }
3446 
3447 	  tree output = make_ssa_name (new_vector_type);
3448 	  tree input1 = pieces[in_start + (i / 2)];
3449 	  tree input2 = pieces[in_start + (i / 2) + hi_start];
3450 	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3451 					       input1, input2,
3452 					       permutes[i & 1]);
3453 	  gimple_seq_add_stmt (seq, stmt);
3454 	  pieces[out_start + i] = output;
3455 	}
3456       std::swap (in_start, out_start);
3457     }
3458 
3459   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
3460   results.reserve (nresults);
3461   for (unsigned int i = 0; i < nresults; ++i)
3462     if (i < nvectors)
3463       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3464 					pieces[in_start + i]));
3465     else
3466       results.quick_push (results[i - nvectors]);
3467 }
3468 
3469 
3470 /* For constant and loop invariant defs of SLP_NODE this function returns
3471    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3472    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3473    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
3474    REDUC_INDEX is the index of the reduction operand in the statements, unless
3475    it is -1.  */
3476 
3477 static void
3478 vect_get_constant_vectors (tree op, slp_tree slp_node,
3479                            vec<tree> *vec_oprnds,
3480 			   unsigned int op_num, unsigned int number_of_vectors)
3481 {
3482   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3483   gimple *stmt = stmts[0];
3484   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3485   unsigned HOST_WIDE_INT nunits;
3486   tree vec_cst;
3487   unsigned j, number_of_places_left_in_vector;
3488   tree vector_type;
3489   tree vop;
3490   int group_size = stmts.length ();
3491   unsigned int vec_num, i;
3492   unsigned number_of_copies = 1;
3493   vec<tree> voprnds;
3494   voprnds.create (number_of_vectors);
3495   bool constant_p, is_store;
3496   tree neutral_op = NULL;
3497   enum tree_code code = gimple_expr_code (stmt);
3498   gimple_seq ctor_seq = NULL;
3499   auto_vec<tree, 16> permute_results;
3500 
3501   /* Check if vector type is a boolean vector.  */
3502   if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3503       && vect_mask_constant_operand_p (stmt, op_num))
3504     vector_type
3505       = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3506   else
3507     vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3508 
3509   if (STMT_VINFO_DATA_REF (stmt_vinfo))
3510     {
3511       is_store = true;
3512       op = gimple_assign_rhs1 (stmt);
3513     }
3514   else
3515     is_store = false;
3516 
3517   gcc_assert (op);
3518 
3519   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3520      created vectors. It is greater than 1 if unrolling is performed.
3521 
3522      For example, we have two scalar operands, s1 and s2 (e.g., group of
3523      strided accesses of size two), while NUNITS is four (i.e., four scalars
3524      of this type can be packed in a vector).  The output vector will contain
3525      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
3526      will be 2).
3527 
3528      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3529      containing the operands.
3530 
3531      For example, NUNITS is four as before, and the group size is 8
3532      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
3533      {s5, s6, s7, s8}.  */
3534 
3535   /* When using duplicate_and_interleave, we just need one element for
3536      each scalar statement.  */
3537   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3538     nunits = group_size;
3539 
3540   number_of_copies = nunits * number_of_vectors / group_size;
3541 
3542   number_of_places_left_in_vector = nunits;
3543   constant_p = true;
3544   tree_vector_builder elts (vector_type, nunits, 1);
3545   elts.quick_grow (nunits);
3546   bool place_after_defs = false;
3547   for (j = 0; j < number_of_copies; j++)
3548     {
3549       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3550         {
3551           if (is_store)
3552             op = gimple_assign_rhs1 (stmt);
3553           else
3554 	    {
3555 	      switch (code)
3556 		{
3557 		  case COND_EXPR:
3558 		    {
3559 		      tree cond = gimple_assign_rhs1 (stmt);
3560 		      if (TREE_CODE (cond) == SSA_NAME)
3561 			op = gimple_op (stmt, op_num + 1);
3562 		      else if (op_num == 0 || op_num == 1)
3563 			op = TREE_OPERAND (cond, op_num);
3564 		      else
3565 			{
3566 			  if (op_num == 2)
3567 			    op = gimple_assign_rhs2 (stmt);
3568 			  else
3569 			    op = gimple_assign_rhs3 (stmt);
3570 			}
3571 		    }
3572 		    break;
3573 
3574 		  case CALL_EXPR:
3575 		    op = gimple_call_arg (stmt, op_num);
3576 		    break;
3577 
3578 		  case LSHIFT_EXPR:
3579 		  case RSHIFT_EXPR:
3580 		  case LROTATE_EXPR:
3581 		  case RROTATE_EXPR:
3582 		    op = gimple_op (stmt, op_num + 1);
3583 		    /* Unlike the other binary operators, shifts/rotates have
3584 		       the shift count being int, instead of the same type as
3585 		       the lhs, so make sure the scalar is the right type if
3586 		       we are dealing with vectors of
3587 		       long long/long/short/char.  */
3588 		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3589 		      op = fold_convert (TREE_TYPE (vector_type), op);
3590 		    break;
3591 
3592 		  default:
3593 		    op = gimple_op (stmt, op_num + 1);
3594 		    break;
3595 		}
3596 	    }
3597 
3598           /* Create 'vect_ = {op0,op1,...,opn}'.  */
3599           number_of_places_left_in_vector--;
3600 	  tree orig_op = op;
3601 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3602 	    {
3603 	      if (CONSTANT_CLASS_P (op))
3604 		{
3605 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3606 		    {
3607 		      /* Can't use VIEW_CONVERT_EXPR for booleans because
3608 			 of possibly different sizes of scalar value and
3609 			 vector element.  */
3610 		      if (integer_zerop (op))
3611 			op = build_int_cst (TREE_TYPE (vector_type), 0);
3612 		      else if (integer_onep (op))
3613 			op = build_all_ones_cst (TREE_TYPE (vector_type));
3614 		      else
3615 			gcc_unreachable ();
3616 		    }
3617 		  else
3618 		    op = fold_unary (VIEW_CONVERT_EXPR,
3619 				     TREE_TYPE (vector_type), op);
3620 		  gcc_assert (op && CONSTANT_CLASS_P (op));
3621 		}
3622 	      else
3623 		{
3624 		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3625 		  gimple *init_stmt;
3626 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3627 		    {
3628 		      tree true_val
3629 			= build_all_ones_cst (TREE_TYPE (vector_type));
3630 		      tree false_val
3631 			= build_zero_cst (TREE_TYPE (vector_type));
3632 		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3633 		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3634 						       op, true_val,
3635 						       false_val);
3636 		    }
3637 		  else
3638 		    {
3639 		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3640 				   op);
3641 		      init_stmt
3642 			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3643 					       op);
3644 		    }
3645 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
3646 		  op = new_temp;
3647 		}
3648 	    }
3649 	  elts[number_of_places_left_in_vector] = op;
3650 	  if (!CONSTANT_CLASS_P (op))
3651 	    constant_p = false;
3652 	  if (TREE_CODE (orig_op) == SSA_NAME
3653 	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3654 	      && STMT_VINFO_BB_VINFO (stmt_vinfo)
3655 	      && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3656 		  == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3657 	    place_after_defs = true;
3658 
3659           if (number_of_places_left_in_vector == 0)
3660             {
3661 	      if (constant_p
3662 		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3663 		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3664 		vec_cst = gimple_build_vector (&ctor_seq, &elts);
3665 	      else
3666 		{
3667 		  if (vec_oprnds->is_empty ())
3668 		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
3669 					      number_of_vectors,
3670 					      permute_results);
3671 		  vec_cst = permute_results[number_of_vectors - j - 1];
3672 		}
3673 	      tree init;
3674 	      gimple_stmt_iterator gsi;
3675 	      if (place_after_defs)
3676 		{
3677 		  gsi = gsi_for_stmt
3678 		          (vect_find_last_scalar_stmt_in_slp (slp_node));
3679 		  init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3680 		}
3681 	      else
3682 		init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3683 	      if (ctor_seq != NULL)
3684 		{
3685 		  gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3686 		  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3687 		  ctor_seq = NULL;
3688 		}
3689 	      voprnds.quick_push (init);
3690 	      place_after_defs = false;
3691               number_of_places_left_in_vector = nunits;
3692 	      constant_p = true;
3693 	      elts.new_vector (vector_type, nunits, 1);
3694 	      elts.quick_grow (nunits);
3695             }
3696         }
3697     }
3698 
3699   /* Since the vectors are created in the reverse order, we should invert
3700      them.  */
3701   vec_num = voprnds.length ();
3702   for (j = vec_num; j != 0; j--)
3703     {
3704       vop = voprnds[j - 1];
3705       vec_oprnds->quick_push (vop);
3706     }
3707 
3708   voprnds.release ();
3709 
3710   /* In case that VF is greater than the unrolling factor needed for the SLP
3711      group of stmts, NUMBER_OF_VECTORS to be created is greater than
3712      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3713      to replicate the vectors.  */
3714   while (number_of_vectors > vec_oprnds->length ())
3715     {
3716       tree neutral_vec = NULL;
3717 
3718       if (neutral_op)
3719         {
3720           if (!neutral_vec)
3721 	    neutral_vec = build_vector_from_val (vector_type, neutral_op);
3722 
3723           vec_oprnds->quick_push (neutral_vec);
3724         }
3725       else
3726         {
3727           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3728             vec_oprnds->quick_push (vop);
3729         }
3730     }
3731 }
3732 
3733 
3734 /* Get vectorized definitions from SLP_NODE that contains corresponding
3735    vectorized def-stmts.  */
3736 
3737 static void
3738 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3739 {
3740   tree vec_oprnd;
3741   gimple *vec_def_stmt;
3742   unsigned int i;
3743 
3744   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3745 
3746   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3747     {
3748       gcc_assert (vec_def_stmt);
3749       if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3750 	vec_oprnd = gimple_phi_result (vec_def_stmt);
3751       else
3752 	vec_oprnd = gimple_get_lhs (vec_def_stmt);
3753       vec_oprnds->quick_push (vec_oprnd);
3754     }
3755 }
3756 
3757 
3758 /* Get vectorized definitions for SLP_NODE.
3759    If the scalar definitions are loop invariants or constants, collect them and
3760    call vect_get_constant_vectors() to create vector stmts.
3761    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3762    must be stored in the corresponding child of SLP_NODE, and we call
3763    vect_get_slp_vect_defs () to retrieve them.  */
3764 
3765 void
3766 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3767 		   vec<vec<tree> > *vec_oprnds)
3768 {
3769   gimple *first_stmt;
3770   int number_of_vects = 0, i;
3771   unsigned int child_index = 0;
3772   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3773   slp_tree child = NULL;
3774   vec<tree> vec_defs;
3775   tree oprnd;
3776   bool vectorized_defs;
3777 
3778   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3779   FOR_EACH_VEC_ELT (ops, i, oprnd)
3780     {
3781       /* For each operand we check if it has vectorized definitions in a child
3782 	 node or we need to create them (for invariants and constants).  We
3783 	 check if the LHS of the first stmt of the next child matches OPRND.
3784 	 If it does, we found the correct child.  Otherwise, we call
3785 	 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
3786 	 to check this child node for the next operand.  */
3787       vectorized_defs = false;
3788       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3789         {
3790           child = SLP_TREE_CHILDREN (slp_node)[child_index];
3791 
3792 	  /* We have to check both pattern and original def, if available.  */
3793 	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3794 	    {
3795 	      gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3796 	      gimple *related
3797 		= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3798 	      tree first_def_op;
3799 
3800 	      if (gimple_code (first_def) == GIMPLE_PHI)
3801 		first_def_op = gimple_phi_result (first_def);
3802 	      else
3803 		first_def_op = gimple_get_lhs (first_def);
3804 	      if (operand_equal_p (oprnd, first_def_op, 0)
3805 		  || (related
3806 		      && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3807 		{
3808 		  /* The number of vector defs is determined by the number of
3809 		     vector statements in the node from which we get those
3810 		     statements.  */
3811 		  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3812 		  vectorized_defs = true;
3813 		  child_index++;
3814 		}
3815 	    }
3816 	  else
3817 	    child_index++;
3818         }
3819 
3820       if (!vectorized_defs)
3821         {
3822           if (i == 0)
3823             {
3824               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3825               /* Number of vector stmts was calculated according to LHS in
3826                  vect_schedule_slp_instance (), fix it by replacing LHS with
3827                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
3828                  details.  */
3829               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3830                                              &rhs_size_unit);
3831               if (rhs_size_unit != lhs_size_unit)
3832                 {
3833                   number_of_vects *= rhs_size_unit;
3834                   number_of_vects /= lhs_size_unit;
3835                 }
3836             }
3837         }
3838 
3839       /* Allocate memory for vectorized defs.  */
3840       vec_defs = vNULL;
3841       vec_defs.create (number_of_vects);
3842 
3843       /* For reduction defs we call vect_get_constant_vectors (), since we are
3844          looking for initial loop invariant values.  */
3845       if (vectorized_defs)
3846         /* The defs are already vectorized.  */
3847 	vect_get_slp_vect_defs (child, &vec_defs);
3848       else
3849 	/* Build vectors from scalar defs.  */
3850 	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3851 				   number_of_vects);
3852 
3853       vec_oprnds->quick_push (vec_defs);
3854     }
3855 }
3856 
3857 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3858    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3859    permute statements for the SLP node NODE of the SLP instance
3860    SLP_NODE_INSTANCE.  */
3861 
3862 bool
3863 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3864 			      gimple_stmt_iterator *gsi, poly_uint64 vf,
3865 			      slp_instance slp_node_instance, bool analyze_only,
3866 			      unsigned *n_perms)
3867 {
3868   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3869   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3870   tree mask_element_type = NULL_TREE, mask_type;
3871   int vec_index = 0;
3872   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3873   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3874   unsigned int mask_element;
3875   machine_mode mode;
3876   unsigned HOST_WIDE_INT nunits, const_vf;
3877 
3878   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3879     return false;
3880 
3881   stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3882 
3883   mode = TYPE_MODE (vectype);
3884 
3885   /* At the moment, all permutations are represented using per-element
3886      indices, so we can't cope with variable vector lengths or
3887      vectorization factors.  */
3888   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3889       || !vf.is_constant (&const_vf))
3890     return false;
3891 
3892   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3893      same size as the vector element being permuted.  */
3894   mask_element_type = lang_hooks.types.type_for_mode
3895     (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3896   mask_type = get_vectype_for_scalar_type (mask_element_type);
3897   vec_perm_builder mask (nunits, nunits, 1);
3898   mask.quick_grow (nunits);
3899   vec_perm_indices indices;
3900 
3901   /* Initialize the vect stmts of NODE to properly insert the generated
3902      stmts later.  */
3903   if (! analyze_only)
3904     for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3905 	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3906       SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3907 
3908   /* Generate permutation masks for every NODE. Number of masks for each NODE
3909      is equal to GROUP_SIZE.
3910      E.g., we have a group of three nodes with three loads from the same
3911      location in each node, and the vector size is 4. I.e., we have a
3912      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3913      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3914      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3915      ...
3916 
3917      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3918      The last mask is illegal since we assume two operands for permute
3919      operation, and the mask element values can't be outside that range.
3920      Hence, the last mask must be converted into {2,5,5,5}.
3921      For the first two permutations we need the first and the second input
3922      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3923      we need the second and the third vectors: {b1,c1,a2,b2} and
3924      {c2,a3,b3,c3}.  */
3925 
3926   int vect_stmts_counter = 0;
3927   unsigned int index = 0;
3928   int first_vec_index = -1;
3929   int second_vec_index = -1;
3930   bool noop_p = true;
3931   *n_perms = 0;
3932 
3933   for (unsigned int j = 0; j < const_vf; j++)
3934     {
3935       for (int k = 0; k < group_size; k++)
3936 	{
3937 	  unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3938 			    + j * STMT_VINFO_GROUP_SIZE (stmt_info));
3939 	  vec_index = i / nunits;
3940 	  mask_element = i % nunits;
3941 	  if (vec_index == first_vec_index
3942 	      || first_vec_index == -1)
3943 	    {
3944 	      first_vec_index = vec_index;
3945 	    }
3946 	  else if (vec_index == second_vec_index
3947 		   || second_vec_index == -1)
3948 	    {
3949 	      second_vec_index = vec_index;
3950 	      mask_element += nunits;
3951 	    }
3952 	  else
3953 	    {
3954 	      if (dump_enabled_p ())
3955 		{
3956 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3957 				   "permutation requires at "
3958 				   "least three vectors ");
3959 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3960 				    stmt, 0);
3961 		}
3962 	      gcc_assert (analyze_only);
3963 	      return false;
3964 	    }
3965 
3966 	  gcc_assert (mask_element < 2 * nunits);
3967 	  if (mask_element != index)
3968 	    noop_p = false;
3969 	  mask[index++] = mask_element;
3970 
3971 	  if (index == nunits && !noop_p)
3972 	    {
3973 	      indices.new_vector (mask, 2, nunits);
3974 	      if (!can_vec_perm_const_p (mode, indices))
3975 		{
3976 		  if (dump_enabled_p ())
3977 		    {
3978 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3979 				       vect_location,
3980 				       "unsupported vect permute { ");
3981 		      for (i = 0; i < nunits; ++i)
3982 			{
3983 			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3984 			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3985 			}
3986 		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3987 		    }
3988 		  gcc_assert (analyze_only);
3989 		  return false;
3990 		}
3991 
3992 	      ++*n_perms;
3993 	    }
3994 
3995 	  if (index == nunits)
3996 	    {
3997 	      if (!analyze_only)
3998 		{
3999 		  tree mask_vec = NULL_TREE;
4000 
4001 		  if (! noop_p)
4002 		    mask_vec = vec_perm_indices_to_tree (mask_type, indices);
4003 
4004 		  if (second_vec_index == -1)
4005 		    second_vec_index = first_vec_index;
4006 
4007 		  /* Generate the permute statement if necessary.  */
4008 		  tree first_vec = dr_chain[first_vec_index];
4009 		  tree second_vec = dr_chain[second_vec_index];
4010 		  gimple *perm_stmt;
4011 		  if (! noop_p)
4012 		    {
4013 		      tree perm_dest
4014 			= vect_create_destination_var (gimple_assign_lhs (stmt),
4015 						       vectype);
4016 		      perm_dest = make_ssa_name (perm_dest);
4017 		      perm_stmt = gimple_build_assign (perm_dest,
4018 						       VEC_PERM_EXPR,
4019 						       first_vec, second_vec,
4020 						       mask_vec);
4021 		      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4022 		    }
4023 		  else
4024 		    /* If mask was NULL_TREE generate the requested
4025 		       identity transform.  */
4026 		    perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4027 
4028 		  /* Store the vector statement in NODE.  */
4029 		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4030 		}
4031 
4032 	      index = 0;
4033 	      first_vec_index = -1;
4034 	      second_vec_index = -1;
4035 	      noop_p = true;
4036 	    }
4037 	}
4038     }
4039 
4040   return true;
4041 }
4042 
4043 typedef hash_map <vec <gimple *>, slp_tree,
4044 		  simple_hashmap_traits <bst_traits, slp_tree> >
4045   scalar_stmts_to_slp_tree_map_t;
4046 
4047 /* Vectorize SLP instance tree in postorder.  */
4048 
4049 static bool
4050 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
4051 			    scalar_stmts_to_slp_tree_map_t *bst_map)
4052 {
4053   gimple *stmt;
4054   bool grouped_store, is_store;
4055   gimple_stmt_iterator si;
4056   stmt_vec_info stmt_info;
4057   unsigned int group_size;
4058   tree vectype;
4059   int i, j;
4060   slp_tree child;
4061 
4062   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4063     return false;
4064 
4065   /* See if we have already vectorized the same set of stmts and reuse their
4066      vectorized stmts.  */
4067   slp_tree &leader
4068     = bst_map->get_or_insert (SLP_TREE_SCALAR_STMTS (node).copy ());
4069   if (leader)
4070     {
4071       SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (leader));
4072       return false;
4073     }
4074 
4075   leader = node;
4076   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4077     vect_schedule_slp_instance (child, instance, bst_map);
4078 
4079   /* Push SLP node def-type to stmts.  */
4080   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4081     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4082       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4083 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
4084 
4085   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
4086   stmt_info = vinfo_for_stmt (stmt);
4087 
4088   /* VECTYPE is the type of the destination.  */
4089   vectype = STMT_VINFO_VECTYPE (stmt_info);
4090   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4091   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4092 
4093   if (!SLP_TREE_VEC_STMTS (node).exists ())
4094     SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4095 
4096   if (dump_enabled_p ())
4097     {
4098       dump_printf_loc (MSG_NOTE,vect_location,
4099 		       "------>vectorizing SLP node starting from: ");
4100       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4101     }
4102 
4103   /* Vectorized stmts go before the last scalar stmt which is where
4104      all uses are ready.  */
4105   si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
4106 
4107   /* Mark the first element of the reduction chain as reduction to properly
4108      transform the node.  In the analysis phase only the last element of the
4109      chain is marked as reduction.  */
4110   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
4111       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
4112     {
4113       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
4114       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4115     }
4116 
4117   /* Handle two-operation SLP nodes by vectorizing the group with
4118      both operations and then performing a merge.  */
4119   if (SLP_TREE_TWO_OPERATORS (node))
4120     {
4121       enum tree_code code0 = gimple_assign_rhs_code (stmt);
4122       enum tree_code ocode = ERROR_MARK;
4123       gimple *ostmt;
4124       vec_perm_builder mask (group_size, group_size, 1);
4125       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
4126 	if (gimple_assign_rhs_code (ostmt) != code0)
4127 	  {
4128 	    mask.quick_push (1);
4129 	    ocode = gimple_assign_rhs_code (ostmt);
4130 	  }
4131 	else
4132 	  mask.quick_push (0);
4133       if (ocode != ERROR_MARK)
4134 	{
4135 	  vec<gimple *> v0;
4136 	  vec<gimple *> v1;
4137 	  unsigned j;
4138 	  tree tmask = NULL_TREE;
4139 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4140 	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
4141 	  SLP_TREE_VEC_STMTS (node).truncate (0);
4142 	  gimple_assign_set_rhs_code (stmt, ocode);
4143 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4144 	  gimple_assign_set_rhs_code (stmt, code0);
4145 	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
4146 	  SLP_TREE_VEC_STMTS (node).truncate (0);
4147 	  tree meltype = build_nonstandard_integer_type
4148 	      (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4149 	  tree mvectype = get_same_sized_vectype (meltype, vectype);
4150 	  unsigned k = 0, l;
4151 	  for (j = 0; j < v0.length (); ++j)
4152 	    {
4153 	      /* Enforced by vect_build_slp_tree, which rejects variable-length
4154 		 vectors for SLP_TREE_TWO_OPERATORS.  */
4155 	      unsigned int const_nunits = nunits.to_constant ();
4156 	      tree_vector_builder melts (mvectype, const_nunits, 1);
4157 	      for (l = 0; l < const_nunits; ++l)
4158 		{
4159 		  if (k >= group_size)
4160 		    k = 0;
4161 		  tree t = build_int_cst (meltype,
4162 					  mask[k++] * const_nunits + l);
4163 		  melts.quick_push (t);
4164 		}
4165 	      tmask = melts.build ();
4166 
4167 	      /* ???  Not all targets support a VEC_PERM_EXPR with a
4168 	         constant mask that would translate to a vec_merge RTX
4169 		 (with their vec_perm_const_ok).  We can either not
4170 		 vectorize in that case or let veclower do its job.
4171 		 Unfortunately that isn't too great and at least for
4172 		 plus/minus we'd eventually like to match targets
4173 		 vector addsub instructions.  */
4174 	      gimple *vstmt;
4175 	      vstmt = gimple_build_assign (make_ssa_name (vectype),
4176 					   VEC_PERM_EXPR,
4177 					   gimple_assign_lhs (v0[j]),
4178 					   gimple_assign_lhs (v1[j]), tmask);
4179 	      vect_finish_stmt_generation (stmt, vstmt, &si);
4180 	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
4181 	    }
4182 	  v0.release ();
4183 	  v1.release ();
4184 	  return false;
4185 	}
4186     }
4187   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4188 
4189   /* Restore stmt def-types.  */
4190   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4191     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4192       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4193 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
4194 
4195   return is_store;
4196 }
4197 
4198 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4199    For loop vectorization this is done in vectorizable_call, but for SLP
4200    it needs to be deferred until end of vect_schedule_slp, because multiple
4201    SLP instances may refer to the same scalar stmt.  */
4202 
4203 static void
4204 vect_remove_slp_scalar_calls (slp_tree node)
4205 {
4206   gimple *stmt, *new_stmt;
4207   gimple_stmt_iterator gsi;
4208   int i;
4209   slp_tree child;
4210   tree lhs;
4211   stmt_vec_info stmt_info;
4212 
4213   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4214     return;
4215 
4216   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4217     vect_remove_slp_scalar_calls (child);
4218 
4219   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4220     {
4221       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
4222 	continue;
4223       stmt_info = vinfo_for_stmt (stmt);
4224       if (stmt_info == NULL
4225 	  || is_pattern_stmt_p (stmt_info)
4226 	  || !PURE_SLP_STMT (stmt_info))
4227 	continue;
4228       lhs = gimple_call_lhs (stmt);
4229       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4230       set_vinfo_for_stmt (new_stmt, stmt_info);
4231       set_vinfo_for_stmt (stmt, NULL);
4232       STMT_VINFO_STMT (stmt_info) = new_stmt;
4233       gsi = gsi_for_stmt (stmt);
4234       gsi_replace (&gsi, new_stmt, false);
4235       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4236     }
4237 }
4238 
4239 /* Generate vector code for all SLP instances in the loop/basic block.  */
4240 
4241 bool
4242 vect_schedule_slp (vec_info *vinfo)
4243 {
4244   vec<slp_instance> slp_instances;
4245   slp_instance instance;
4246   unsigned int i;
4247   bool is_store = false;
4248 
4249 
4250   scalar_stmts_to_slp_tree_map_t *bst_map
4251     = new scalar_stmts_to_slp_tree_map_t ();
4252   slp_instances = vinfo->slp_instances;
4253   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4254     {
4255       /* Schedule the tree of INSTANCE.  */
4256       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4257                                              instance, bst_map);
4258       if (dump_enabled_p ())
4259 	dump_printf_loc (MSG_NOTE, vect_location,
4260                          "vectorizing stmts using SLP.\n");
4261     }
4262   delete bst_map;
4263 
4264   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4265     {
4266       slp_tree root = SLP_INSTANCE_TREE (instance);
4267       gimple *store;
4268       unsigned int j;
4269       gimple_stmt_iterator gsi;
4270 
4271       /* Remove scalar call stmts.  Do not do this for basic-block
4272 	 vectorization as not all uses may be vectorized.
4273 	 ???  Why should this be necessary?  DCE should be able to
4274 	 remove the stmts itself.
4275 	 ???  For BB vectorization we can as well remove scalar
4276 	 stmts starting from the SLP tree root if they have no
4277 	 uses.  */
4278       if (is_a <loop_vec_info> (vinfo))
4279 	vect_remove_slp_scalar_calls (root);
4280 
4281       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
4282                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4283         {
4284           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
4285             break;
4286 
4287          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
4288            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
4289           /* Free the attached stmt_vec_info and remove the stmt.  */
4290           gsi = gsi_for_stmt (store);
4291 	  unlink_stmt_vdef (store);
4292           gsi_remove (&gsi, true);
4293 	  release_defs (store);
4294           free_stmt_vec_info (store);
4295         }
4296     }
4297 
4298   return is_store;
4299 }
4300