/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
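
/* Illustrative example (not part of the original sources): for a group
   whose first stmt is the gassign "x_1 = a_2 < b_3 ? y_4 : z_5",
   gimple_num_ops is 4 (the lhs plus the three RHS operands), so NOPS
   above becomes 4 - 1 + 1 = 4: once vect_get_and_check_slp_defs splits
   the COND_EXPR's comparison, the node has two operand slots for the
   comparison operands plus two for the arms.  */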


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Information about the first statement: its vector def-type, the type
     of its operand, and whether any def-stmt in the group is a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
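
/* Illustrative example (assuming DR_GROUP_GAP of a non-leading group
   member is its element distance from the previous member, i.e. 1 for a
   contiguous access): for the load group a[0], a[1], a[3] the loop above
   returns place 0 for a[0], 1 for a[1] and 1 + 2 = 3 for a[3], since the
   skipped element a[2] is accounted for by a[3]'s DR_GROUP_GAP.  */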

/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
				unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
	{
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type = build_vector_type (int_type, nelts);
	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
	    {
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
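
/* A worked example of the search above (a sketch, assuming
   current_vector_size is a constant 16 bytes): for COUNT == 4 elements
   of QImode, ELT_BYTES starts at 4, so NELTS is 4 and the intermediate
   vector is a vector of four 32-bit integers.  SEL1 then encodes the
   interleave-low selector {0, 4, 1, 5} and SEL2 the interleave-high
   selector {2, 6, 3, 7} on two such vectors; if the target supports both
   permutations, the function succeeds with *NVECTORS_OUT == 1.  */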

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check
   that they are of a valid type and that they match the defs of the first
   stmt of the SLP group (stored in OPRNDS_INFO).  This function tries to
   match stmts by swapping operands of STMTS[STMT_NUM] when possible.  A
   nonzero incoming *SWAP indicates that a swap is required for a cond_expr
   stmt: *SWAP is 1 if STMT is a cond and the operands of its comparison
   need to be swapped; *SWAP is 2 if STMT is a cond and its comparison code
   needs to be inverted.  If any operands are swapped in this function,
   *SWAP is set to a nonzero value on return.
   Return -1 on a fatal error; return 1 if the error could be corrected by
   swapping operands in the parent of this node; return 0 if everything is
   OK.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swapping can only be done for cond_expr if asked to, otherwise we
	 could end up with a comparison code different from the first
	 stmt's.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && dt == vect_internal_def)
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !current_vector_size.is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (stmts.length (),
						      TYPE_MODE (type))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	  break;

	case vect_reduction_def:
	case vect_induction_def:
	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in an SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: cannot swap operands of "
			     "shared stmt %G", stmt_info->stmt);
	  return -1;
	}

      if (first_op_cond)
	{
	  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	{
	  unsigned int op = commutative_op + first_op_idx;
	  swap_ssa_operands (stmt_info->stmt,
			     gimple_op_ptr (stmt_info->stmt, op),
			     gimple_op_ptr (stmt_info->stmt, op + 1));
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
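
/* For instance (illustrative only): two calls "x = sqrtf (a)" and
   "y = sqrtf (b)" have the same combined_fn, callee and fntype and can
   be grouped, whereas a "sqrtf" call next to a "cosf" call already
   fails the gimple_call_combined_fn check above.  */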

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
			       unsigned int group_size, tree vectype,
			       tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
	elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
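
/* Illustrative example: for a two-stmt group {PLUS_EXPR, MINUS_EXPR}
   (so ALT_STMT_CODE == MINUS_EXPR) and a four-element vector type, the
   selector built above is {0, 5, 2, 7}: even lanes are taken from the
   vector of PLUS results (the first permute input) and odd lanes from
   the vector of MINUS results (the second), i.e. an addsub-style
   blend.  */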

/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation, and are of supported types of operation.  Return true if
   so; otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this indicates
   that the comparison could not be carried out or that the stmts will
   never be vectorized by SLP.

   Note that a COND_EXPR is possibly isomorphic to another one after
   swapping its operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR that
   is isomorphic to the first stmt by swapping the two operands of its
   comparison; set SWAP[i] to 2 if stmt I is isomorphic to the first stmt
   by inverting the comparison code.  Taking A1 >= B1 ? X1 : Y1 as an
   example, it can be swapped to B1 <= A1 ? X1 : Y1, or inverted to
   A1 < B1 ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
					   &nunits_vectype)
	  || (nunits_vectype
	      && !vect_record_max_nunits (stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;
	  if ((gimple_call_internal_p (call_stmt)
	       && (!vectorizable_internal_fn_p
		   (gimple_call_internal_fn (call_stmt))))
	      || gimple_call_tail_p (call_stmt)
	      || gimple_call_noreturn_p (call_stmt)
	      || !gimple_call_nothrow_p (call_stmt)
	      || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      if (vectype == boolean_type_node)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: shift of a"
				     " boolean.\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}

	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different shift "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
                    }
                }
              else
                prev_first_load = first_load;
           }
        } /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Non-grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Non-grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
	  || !vect_two_operations_perm_ok_p (stmts, group_size,
					     vectype, alt_stmt_code))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt %G", stmts[i]->stmt);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt %G", first_stmt_info->stmt);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
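
/* Note that the map above caches failures as well as successes:
   vect_build_slp_tree below records a NULL slp_tree for a stmt set whose
   build failed, so a repeated query for the same stmts can bail out
   early instead of re-running the whole recursive build.  */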

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     unsigned max_tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size,
					max_tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}
/* Recursively build an SLP tree for the group of stmts STMTS.  Fail (and
   return NULL) if the def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation, recording in
   MATCHES which stmts failed to match the first one, as for
   vect_build_slp_tree_1.  Otherwise return the new SLP node.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    {
	      /* But for reduction chains only check on the first stmt.  */
	      if (!STMT_VINFO_DATA_REF (other_info)
		  && REDUC_GROUP_FIRST_ELEMENT (other_info)
		  && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
		continue;
	      if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
		return NULL;
	    }
	}
      else
	return NULL;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
        continue;

      if (++this_tree_size > max_tree_size)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
			     vect_location,
			     "Build SLP failed: SLP tree too large\n");
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);
	  vect_free_oprnd_info (oprnds_info);
	  return NULL;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size,
					max_tree_size, bst_map)) != NULL)
	{
	  /* If all children of this child were built up from scalars then
	     just throw the child away and build it from scalars
	     instead.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ???  Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !oprnd_info->any_pattern)
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild, false);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Building parent vector operands from "
				     "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ???  This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ???  Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operands zero
	 and one can be commuted, try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ???  For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		  /* Verify if we can safely swap or if we committed to a
		     specific operand order already.
		     ???  Instead of modifying GIMPLE stmts here we could
		     record whether we want to swap operands in the SLP
		     node and temporarily do that when processing it
		     (or wrap operand accessors in a helper).  */
		  else if (swap[j] != 0
			   || STMT_VINFO_NUM_SLP_USES (stmt_info))
		    {
		      if (!swap_not_matching)
			{
			  if (dump_enabled_p ())
			    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					     vect_location,
					     "Build SLP failed: cannot swap "
					     "operands of shared stmt %G",
					     stmts[j]->stmt);
			  goto fail;
			}
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ... */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size,
					    max_tree_size, bst_map)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
	      for (j = 0; j < group_size; ++j)
		if (matches[j] == !swap_not_matching)
		  {
		    gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
		    /* Avoid swapping operands twice.  */
		    if (gimple_plf (stmt, GF_PLF_1))
		      continue;
		    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
				       gimple_assign_rhs2_ptr (stmt));
		    gimple_set_plf (stmt, GF_PLF_1, true);
		  }
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
			      == (matches[j] == !swap_not_matching));

	      /* If all children of this child were built up from scalars
		 then just throw the child away and build it from scalars
		 instead.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ???  Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !oprnd_info->any_pattern)
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
			vect_free_slp_tree (grandchild, false);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Building parent vector operands from "
					 "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
		   ? " (external)" : "", node,
		   estimated_poly_value (node->max_nunits));
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt);
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  vec<stmt_vec_info> tmp_stmts;
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    tmp_stmts[permutation[i]] = stmt_info;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
1587 
1588 
1589 /* Attempt to reorder stmts in a reduction chain so that we don't
1590    require any load permutation.  Return true if that was possible,
1591    otherwise return false.  */
1592 
1593 static bool
vect_attempt_slp_rearrange_stmts(slp_instance slp_instn)1594 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1595 {
1596   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1597   unsigned int i, j;
1598   unsigned int lidx;
1599   slp_tree node, load;
1600 
1601   /* Compare all the permutation sequences to the first one.  We know
1602      that at least one load is permuted.  */
1603   node = SLP_INSTANCE_LOADS (slp_instn)[0];
1604   if (!node->load_permutation.exists ())
1605     return false;
1606   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1607     {
1608       if (!load->load_permutation.exists ())
1609 	return false;
1610       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1611 	if (lidx != node->load_permutation[j])
1612 	  return false;
1613     }
1614 
1615   /* Check that the loads in the first sequence are different and there
1616      are no gaps between them.  */
1617   auto_sbitmap load_index (group_size);
1618   bitmap_clear (load_index);
1619   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1620     {
1621       if (lidx >= group_size)
1622 	return false;
1623       if (bitmap_bit_p (load_index, lidx))
1624 	return false;
1625 
1626       bitmap_set_bit (load_index, lidx);
1627     }
1628   for (i = 0; i < group_size; i++)
1629     if (!bitmap_bit_p (load_index, i))
1630       return false;
1631 
1632   /* This permutation is valid for reduction.  Since the order of the
1633      statements in the nodes is not important unless they are memory
1634      accesses, we can rearrange the statements in all the nodes
1635      according to the order of the loads.  */
1636   hash_set<slp_tree> visited;
1637   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1638 			    node->load_permutation, visited);
1639 
1640   /* We are done, no actual permutations need to be generated.  */
1641   poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1642   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1643     {
1644       stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1645       first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1646       /* But we have to keep those permutations that are required because
1647          of handling of gaps.  */
1648       if (known_eq (unrolling_factor, 1U)
1649 	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
1650 	      && DR_GROUP_GAP (first_stmt_info) == 0))
1651 	SLP_TREE_LOAD_PERMUTATION (node).release ();
1652       else
1653 	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1654 	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1655     }
1656 
1657   return true;
1658 }
1659 
1660 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
1661 
1662 static void
vect_gather_slp_loads(slp_instance inst,slp_tree node,hash_set<slp_tree> & visited)1663 vect_gather_slp_loads (slp_instance inst, slp_tree node,
1664 		       hash_set<slp_tree> &visited)
1665 {
1666   if (visited.add (node))
1667     return;
1668 
1669   if (SLP_TREE_CHILDREN (node).length () == 0)
1670     {
1671       stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1672       if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
1673 	  && STMT_VINFO_GROUPED_ACCESS (stmt_info)
1674 	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1675 	SLP_INSTANCE_LOADS (inst).safe_push (node);
1676     }
1677   else
1678     {
1679       unsigned i;
1680       slp_tree child;
1681       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1682 	vect_gather_slp_loads (inst, child, visited);
1683     }
1684 }
1685 
1686 static void
vect_gather_slp_loads(slp_instance inst,slp_tree node)1687 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1688 {
1689   hash_set<slp_tree> visited;
1690   vect_gather_slp_loads (inst, node, visited);
1691 }
1692 
1693 /* Check if the required load permutations in the SLP instance
1694    SLP_INSTN are supported.  */
1695 
1696 static bool
vect_supported_load_permutation_p(slp_instance slp_instn)1697 vect_supported_load_permutation_p (slp_instance slp_instn)
1698 {
1699   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1700   unsigned int i, j, k, next;
1701   slp_tree node;
1702 
1703   if (dump_enabled_p ())
1704     {
1705       dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1706       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1707 	if (node->load_permutation.exists ())
1708 	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1709 	    dump_printf (MSG_NOTE, "%d ", next);
1710 	else
1711 	  for (k = 0; k < group_size; ++k)
1712 	    dump_printf (MSG_NOTE, "%d ", k);
1713       dump_printf (MSG_NOTE, "\n");
1714     }
1715 
1716   /* In case of reduction every load permutation is allowed, since the order
1717      of the reduction statements is not important (as opposed to the case of
1718      grouped stores).  The only condition we need to check is that all the
1719      load nodes are of the same size and have the same permutation (and then
1720      rearrange all the nodes of the SLP instance according to this
1721      permutation).  */
1722 
1723   /* Check that all the load nodes are of the same size.  */
1724   /* ???  Can't we assert this? */
1725   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1726     if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1727       return false;
1728 
1729   node = SLP_INSTANCE_TREE (slp_instn);
1730   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1731 
1732   /* Reduction (there are no data-refs in the root).
1733      In reduction chain the order of the loads is not important.  */
1734   if (!STMT_VINFO_DATA_REF (stmt_info)
1735       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1736     vect_attempt_slp_rearrange_stmts (slp_instn);
1737 
1738   /* In basic block vectorization we allow any subchain of an interleaving
1739      chain.
1740      FORNOW: not supported in loop SLP because of realignment compications.  */
1741   if (STMT_VINFO_BB_VINFO (stmt_info))
1742     {
1743       /* Check whether the loads in an instance form a subchain and thus
1744          no permutation is necessary.  */
1745       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1746         {
1747 	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1748 	    continue;
1749 	  bool subchain_p = true;
1750 	  stmt_vec_info next_load_info = NULL;
1751 	  stmt_vec_info load_info;
1752 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1753 	    {
1754 	      if (j != 0
1755 		  && (next_load_info != load_info
1756 		      || DR_GROUP_GAP (load_info) != 1))
1757 		{
1758 		  subchain_p = false;
1759 		  break;
1760 		}
1761 	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1762 	    }
1763 	  if (subchain_p)
1764 	    SLP_TREE_LOAD_PERMUTATION (node).release ();
1765 	  else
1766 	    {
1767 	      stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1768 	      group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1769 	      unsigned HOST_WIDE_INT nunits;
1770 	      unsigned k, maxk = 0;
1771 	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1772 		if (k > maxk)
1773 		  maxk = k;
1774 	      /* In BB vectorization we may not actually use a loaded vector
1775 		 accessing elements in excess of DR_GROUP_SIZE.  */
1776 	      tree vectype = STMT_VINFO_VECTYPE (group_info);
1777 	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1778 		  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
1779 		{
1780 		  if (dump_enabled_p ())
1781 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782 				     "BB vectorization with gaps at the end of "
1783 				     "a load is not supported\n");
1784 		  return false;
1785 		}
1786 
1787 	      /* Verify the permutation can be generated.  */
1788 	      vec<tree> tem;
1789 	      unsigned n_perms;
1790 	      if (!vect_transform_slp_perm_load (node, tem, NULL,
1791 						 1, slp_instn, true, &n_perms))
1792 		{
1793 		  if (dump_enabled_p ())
1794 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1795 				     vect_location,
1796 				     "unsupported load permutation\n");
1797 		  return false;
1798 		}
1799 	    }
1800         }
1801       return true;
1802     }
1803 
1804   /* For loop vectorization verify we can generate the permutation.  Be
1805      conservative about the vectorization factor, there are permutations
1806      that will use three vector inputs only starting from a specific factor
1807      and the vectorization factor is not yet final.
1808      ???  The SLP instance unrolling factor might not be the maximum one.  */
1809   unsigned n_perms;
1810   poly_uint64 test_vf
1811     = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1812 			     LOOP_VINFO_VECT_FACTOR
1813 			     (STMT_VINFO_LOOP_VINFO (stmt_info)));
1814   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1815     if (node->load_permutation.exists ()
1816 	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1817 					  slp_instn, true, &n_perms))
1818       return false;
1819 
1820   return true;
1821 }
1822 
1823 
1824 /* Find the last store in SLP INSTANCE.  */
1825 
1826 stmt_vec_info
vect_find_last_scalar_stmt_in_slp(slp_tree node)1827 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1828 {
1829   stmt_vec_info last = NULL;
1830   stmt_vec_info stmt_vinfo;
1831 
1832   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1833     {
1834       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1835       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1836     }
1837 
1838   return last;
1839 }
1840 
1841 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1842    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1843    (also containing the first GROUP1_SIZE stmts, since stores are
1844    consecutive), the second containing the remainder.
1845    Return the first stmt in the second group.  */
1846 
1847 static stmt_vec_info
vect_split_slp_store_group(stmt_vec_info first_vinfo,unsigned group1_size)1848 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1849 {
1850   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1851   gcc_assert (group1_size > 0);
1852   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1853   gcc_assert (group2_size > 0);
1854   DR_GROUP_SIZE (first_vinfo) = group1_size;
1855 
1856   stmt_vec_info stmt_info = first_vinfo;
1857   for (unsigned i = group1_size; i > 1; i--)
1858     {
1859       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1860       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1861     }
1862   /* STMT is now the last element of the first group.  */
1863   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1864   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1865 
1866   DR_GROUP_SIZE (group2) = group2_size;
1867   for (stmt_info = group2; stmt_info;
1868        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1869     {
1870       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1871       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1872     }
1873 
1874   /* For the second group, the DR_GROUP_GAP is that before the original group,
1875      plus skipping over the first vector.  */
1876   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1877 
1878   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
1879   DR_GROUP_GAP (first_vinfo) += group2_size;
1880 
1881   if (dump_enabled_p ())
1882     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1883 		     group1_size, group2_size);
1884 
1885   return group2;
1886 }
1887 
1888 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1889    statements and a vector of NUNITS elements.  */
1890 
1891 static poly_uint64
calculate_unrolling_factor(poly_uint64 nunits,unsigned int group_size)1892 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1893 {
1894   return exact_div (common_multiple (nunits, group_size), group_size);
1895 }
1896 
1897 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1898    vect_build_slp_tree to build a tree of packed stmts if possible.
1899    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1900 
1901 static bool
vect_analyze_slp_instance(vec_info * vinfo,stmt_vec_info stmt_info,unsigned max_tree_size)1902 vect_analyze_slp_instance (vec_info *vinfo,
1903 			   stmt_vec_info stmt_info, unsigned max_tree_size)
1904 {
1905   slp_instance new_instance;
1906   slp_tree node;
1907   unsigned int group_size;
1908   tree vectype, scalar_type = NULL_TREE;
1909   unsigned int i;
1910   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1911   vec<stmt_vec_info> scalar_stmts;
1912 
1913   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1914     {
1915       scalar_type = TREE_TYPE (DR_REF (dr));
1916       vectype = get_vectype_for_scalar_type (scalar_type);
1917       group_size = DR_GROUP_SIZE (stmt_info);
1918     }
1919   else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1920     {
1921       gcc_assert (is_a <loop_vec_info> (vinfo));
1922       vectype = STMT_VINFO_VECTYPE (stmt_info);
1923       group_size = REDUC_GROUP_SIZE (stmt_info);
1924     }
1925   else
1926     {
1927       gcc_assert (is_a <loop_vec_info> (vinfo));
1928       vectype = STMT_VINFO_VECTYPE (stmt_info);
1929       group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1930     }
1931 
1932   if (!vectype)
1933     {
1934       if (dump_enabled_p ())
1935 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936 			 "Build SLP failed: unsupported data-type %T\n",
1937 			 scalar_type);
1938 
1939       return false;
1940     }
1941   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1942 
1943   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1944   scalar_stmts.create (group_size);
1945   stmt_vec_info next_info = stmt_info;
1946   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1947     {
1948       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1949       while (next_info)
1950         {
1951 	  scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1952 	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
1953         }
1954     }
1955   else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1956     {
1957       /* Collect the reduction stmts and store them in
1958 	 SLP_TREE_SCALAR_STMTS.  */
1959       while (next_info)
1960         {
1961 	  scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1962 	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
1963         }
1964       /* Mark the first element of the reduction chain as reduction to properly
1965 	 transform the node.  In the reduction analysis phase only the last
1966 	 element of the chain is marked as reduction.  */
1967       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
1968       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
1969 	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
1970     }
1971   else
1972     {
1973       /* Collect reduction statements.  */
1974       vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1975       for (i = 0; reductions.iterate (i, &next_info); i++)
1976 	scalar_stmts.safe_push (next_info);
1977     }
1978 
1979   /* Build the tree for the SLP instance.  */
1980   bool *matches = XALLOCAVEC (bool, group_size);
1981   unsigned npermutes = 0;
1982   scalar_stmts_to_slp_tree_map_t *bst_map
1983     = new scalar_stmts_to_slp_tree_map_t ();
1984   poly_uint64 max_nunits = nunits;
1985   node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
1986 			      &max_nunits, matches, &npermutes,
1987 			      NULL, max_tree_size, bst_map);
1988   /* The map keeps a reference on SLP nodes built, release that.  */
1989   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1990        it != bst_map->end (); ++it)
1991     if ((*it).second)
1992       vect_free_slp_tree ((*it).second, false);
1993   delete bst_map;
1994   if (node != NULL)
1995     {
1996       /* Calculate the unrolling factor based on the smallest type.  */
1997       poly_uint64 unrolling_factor
1998 	= calculate_unrolling_factor (max_nunits, group_size);
1999 
2000       if (maybe_ne (unrolling_factor, 1U)
2001 	  && is_a <bb_vec_info> (vinfo))
2002 	{
2003 	  unsigned HOST_WIDE_INT const_max_nunits;
2004 	  if (!max_nunits.is_constant (&const_max_nunits)
2005 	      || const_max_nunits > group_size)
2006 	    {
2007 	      if (dump_enabled_p ())
2008 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 				 "Build SLP failed: store group "
2010 				 "size not a multiple of the vector size "
2011 				 "in basic block SLP\n");
2012 	      vect_free_slp_tree (node, false);
2013 	      return false;
2014 	    }
2015 	  /* Fatal mismatch.  */
2016 	  matches[group_size / const_max_nunits * const_max_nunits] = false;
2017 	  vect_free_slp_tree (node, false);
2018 	}
2019       else
2020 	{
2021 	  /* Create a new SLP instance.  */
2022 	  new_instance = XNEW (struct _slp_instance);
2023 	  SLP_INSTANCE_TREE (new_instance) = node;
2024 	  SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2025 	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2026 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
2027 	  vect_gather_slp_loads (new_instance, node);
2028 
2029 	  /* Compute the load permutation.  */
2030 	  slp_tree load_node;
2031 	  bool loads_permuted = false;
2032 	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2033 	    {
2034 	      vec<unsigned> load_permutation;
2035 	      int j;
2036 	      stmt_vec_info load_info;
2037 	      bool this_load_permuted = false;
2038 	      load_permutation.create (group_size);
2039 	      stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2040 		  (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2041 	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2042 		{
2043 		  int load_place = vect_get_place_in_interleaving_chain
2044 		      (load_info, first_stmt_info);
2045 		  gcc_assert (load_place != -1);
2046 		  if (load_place != j)
2047 		    this_load_permuted = true;
2048 		  load_permutation.safe_push (load_place);
2049 		}
2050 	      if (!this_load_permuted
2051 		  /* The load requires permutation when unrolling exposes
2052 		     a gap either because the group is larger than the SLP
2053 		     group-size or because there is a gap between the groups.  */
2054 		  && (known_eq (unrolling_factor, 1U)
2055 		      || (group_size == DR_GROUP_SIZE (first_stmt_info)
2056 			  && DR_GROUP_GAP (first_stmt_info) == 0)))
2057 		{
2058 		  load_permutation.release ();
2059 		  continue;
2060 		}
2061 	      SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2062 	      loads_permuted = true;
2063 	    }
2064 
2065 	  if (loads_permuted)
2066 	    {
2067 	      if (!vect_supported_load_permutation_p (new_instance))
2068 		{
2069 		  if (dump_enabled_p ())
2070 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2071 				     "Build SLP failed: unsupported load "
2072 				     "permutation %G", stmt_info->stmt);
2073 		  vect_free_slp_instance (new_instance, false);
2074 		  return false;
2075 		}
2076 	    }
2077 
2078 	  /* If the loads and stores can be handled with load/store-lan
2079 	     instructions do not generate this SLP instance.  */
2080 	  if (is_a <loop_vec_info> (vinfo)
2081 	      && loads_permuted
2082 	      && dr && vect_store_lanes_supported (vectype, group_size, false))
2083 	    {
2084 	      slp_tree load_node;
2085 	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2086 		{
2087 		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2088 		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2089 		  /* Use SLP for strided accesses (or if we can't load-lanes).  */
2090 		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2091 		      || ! vect_load_lanes_supported
2092 		      (STMT_VINFO_VECTYPE (stmt_vinfo),
2093 		       DR_GROUP_SIZE (stmt_vinfo), false))
2094 		    break;
2095 		}
2096 	      if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2097 		{
2098 		  if (dump_enabled_p ())
2099 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 				     "Built SLP cancelled: can use "
2101 				     "load/store-lanes\n");
2102 		  vect_free_slp_instance (new_instance, false);
2103 		  return false;
2104 		}
2105 	    }
2106 
2107 	  vinfo->slp_instances.safe_push (new_instance);
2108 
2109 	  if (dump_enabled_p ())
2110 	    {
2111 	      dump_printf_loc (MSG_NOTE, vect_location,
2112 			       "Final SLP tree for instance:\n");
2113 	      vect_print_slp_tree (MSG_NOTE, vect_location, node);
2114 	    }
2115 
2116 	  return true;
2117 	}
2118     }
2119   else
2120     {
2121       /* Failed to SLP.  */
2122       /* Free the allocated memory.  */
2123       scalar_stmts.release ();
2124     }
2125 
2126   /* For basic block SLP, try to break the group up into multiples of the
2127      vector size.  */
2128   unsigned HOST_WIDE_INT const_nunits;
2129   if (is_a <bb_vec_info> (vinfo)
2130       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2131       && DR_GROUP_FIRST_ELEMENT (stmt_info)
2132       && nunits.is_constant (&const_nunits))
2133     {
2134       /* We consider breaking the group only on VF boundaries from the existing
2135 	 start.  */
2136       for (i = 0; i < group_size; i++)
2137 	if (!matches[i]) break;
2138 
2139       if (i >= const_nunits && i < group_size)
2140 	{
2141 	  /* Split into two groups at the first vector boundary before i.  */
2142 	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2143 	  unsigned group1_size = i & ~(const_nunits - 1);
2144 
2145 	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2146 							   group1_size);
2147 	  bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2148 						max_tree_size);
2149 	  /* If the first non-match was in the middle of a vector,
2150 	     skip the rest of that vector.  */
2151 	  if (group1_size < i)
2152 	    {
2153 	      i = group1_size + const_nunits;
2154 	      if (i < group_size)
2155 		rest = vect_split_slp_store_group (rest, const_nunits);
2156 	    }
2157 	  if (i < group_size)
2158 	    res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2159 	  return res;
2160 	}
2161       /* Even though the first vector did not all match, we might be able to SLP
2162 	 (some) of the remainder.  FORNOW ignore this possibility.  */
2163     }
2164 
2165   return false;
2166 }
2167 
2168 
2169 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
2170    trees of packed scalar stmts if SLP is possible.  */
2171 
2172 opt_result
vect_analyze_slp(vec_info * vinfo,unsigned max_tree_size)2173 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2174 {
2175   unsigned int i;
2176   stmt_vec_info first_element;
2177 
2178   DUMP_VECT_SCOPE ("vect_analyze_slp");
2179 
2180   /* Find SLP sequences starting from groups of grouped stores.  */
2181   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2182     vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2183 
2184   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2185     {
2186       if (loop_vinfo->reduction_chains.length () > 0)
2187 	{
2188 	  /* Find SLP sequences starting from reduction chains.  */
2189 	  FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2190 	    if (! vect_analyze_slp_instance (vinfo, first_element,
2191 					     max_tree_size))
2192 	      {
2193 		/* Dissolve reduction chain group.  */
2194 		stmt_vec_info vinfo = first_element;
2195 		while (vinfo)
2196 		  {
2197 		    stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2198 		    REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2199 		    REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2200 		    vinfo = next;
2201 		  }
2202 		STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2203 	      }
2204 	}
2205 
2206       /* Find SLP sequences starting from groups of reductions.  */
2207       if (loop_vinfo->reductions.length () > 1)
2208 	vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2209 				   max_tree_size);
2210     }
2211 
2212   return opt_result::success ();
2213 }
2214 
2215 
2216 /* For each possible SLP instance decide whether to SLP it and calculate overall
2217    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
2218    least one instance.  */
2219 
2220 bool
vect_make_slp_decision(loop_vec_info loop_vinfo)2221 vect_make_slp_decision (loop_vec_info loop_vinfo)
2222 {
2223   unsigned int i;
2224   poly_uint64 unrolling_factor = 1;
2225   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2226   slp_instance instance;
2227   int decided_to_slp = 0;
2228 
2229   DUMP_VECT_SCOPE ("vect_make_slp_decision");
2230 
2231   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2232     {
2233       /* FORNOW: SLP if you can.  */
2234       /* All unroll factors have the form current_vector_size * X for some
2235 	 rational X, so they must have a common multiple.  */
2236       unrolling_factor
2237 	= force_common_multiple (unrolling_factor,
2238 				 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2239 
2240       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
2241 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2242 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
2243       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2244       decided_to_slp++;
2245     }
2246 
2247   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2248 
2249   if (decided_to_slp && dump_enabled_p ())
2250     {
2251       dump_printf_loc (MSG_NOTE, vect_location,
2252 		       "Decided to SLP %d instances. Unrolling factor ",
2253 		       decided_to_slp);
2254       dump_dec (MSG_NOTE, unrolling_factor);
2255       dump_printf (MSG_NOTE, "\n");
2256     }
2257 
2258   return (decided_to_slp > 0);
2259 }
2260 
2261 
2262 /* Private data for vect_detect_hybrid_slp.  */
2263 struct vdhs_data
2264 {
2265   loop_vec_info loop_vinfo;
2266   vec<stmt_vec_info> *worklist;
2267 };
2268 
2269 /* Walker for walk_gimple_op.  */
2270 
2271 static tree
vect_detect_hybrid_slp(tree * tp,int *,void * data)2272 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2273 {
2274   walk_stmt_info *wi = (walk_stmt_info *)data;
2275   vdhs_data *dat = (vdhs_data *)wi->info;
2276 
2277   if (wi->is_lhs)
2278     return NULL_TREE;
2279 
2280   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2281   if (!def_stmt_info)
2282     return NULL_TREE;
2283   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2284   if (PURE_SLP_STMT (def_stmt_info))
2285     {
2286       if (dump_enabled_p ())
2287 	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2288 			 def_stmt_info->stmt);
2289       STMT_SLP_TYPE (def_stmt_info) = hybrid;
2290       dat->worklist->safe_push (def_stmt_info);
2291     }
2292 
2293   return NULL_TREE;
2294 }
2295 
2296 /* Find stmts that must be both vectorized and SLPed.  */
2297 
2298 void
vect_detect_hybrid_slp(loop_vec_info loop_vinfo)2299 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2300 {
2301   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2302 
2303   /* All stmts participating in SLP are marked pure_slp, all other
2304      stmts are loop_vect.
2305      First collect all loop_vect stmts into a worklist.  */
2306   auto_vec<stmt_vec_info> worklist;
2307   for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2308     {
2309       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2310       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2311 	   gsi_next (&gsi))
2312 	{
2313 	  gphi *phi = gsi.phi ();
2314 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2315 	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2316 	    worklist.safe_push (stmt_info);
2317 	}
2318       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2319 	   gsi_next (&gsi))
2320 	{
2321 	  gimple *stmt = gsi_stmt (gsi);
2322 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2323 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2324 	    {
2325 	      for (gimple_stmt_iterator gsi2
2326 		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2327 		   !gsi_end_p (gsi2); gsi_next (&gsi2))
2328 		{
2329 		  stmt_vec_info patt_info
2330 		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2331 		  if (!STMT_SLP_TYPE (patt_info)
2332 		      && STMT_VINFO_RELEVANT (patt_info))
2333 		    worklist.safe_push (patt_info);
2334 		}
2335 	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2336 	    }
2337 	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2338 	    worklist.safe_push (stmt_info);
2339 	}
2340     }
2341 
2342   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2343      mark any SLP vectorized stmt as hybrid.
2344      ???  We're visiting def stmts N times (once for each non-SLP and
2345      once for each hybrid-SLP use).  */
2346   walk_stmt_info wi;
2347   vdhs_data dat;
2348   dat.worklist = &worklist;
2349   dat.loop_vinfo = loop_vinfo;
2350   memset (&wi, 0, sizeof (wi));
2351   wi.info = (void *)&dat;
2352   while (!worklist.is_empty ())
2353     {
2354       stmt_vec_info stmt_info = worklist.pop ();
2355       /* Since SSA operands are not set up for pattern stmts we need
2356 	 to use walk_gimple_op.  */
2357       wi.is_lhs = 0;
2358       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2359     }
2360 }
2361 
2362 
2363 /* Initialize a bb_vec_info struct for the statements between
2364    REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */
2365 
_bb_vec_info(gimple_stmt_iterator region_begin_in,gimple_stmt_iterator region_end_in,vec_info_shared * shared)2366 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2367 			    gimple_stmt_iterator region_end_in,
2368 			    vec_info_shared *shared)
2369   : vec_info (vec_info::bb, init_cost (NULL), shared),
2370     bb (gsi_bb (region_begin_in)),
2371     region_begin (region_begin_in),
2372     region_end (region_end_in)
2373 {
2374   gimple_stmt_iterator gsi;
2375 
2376   for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2377        gsi_next (&gsi))
2378     {
2379       gimple *stmt = gsi_stmt (gsi);
2380       gimple_set_uid (stmt, 0);
2381       add_stmt (stmt);
2382     }
2383 
2384   bb->aux = this;
2385 }
2386 
2387 
2388 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2389    stmts in the basic block.  */
2390 
~_bb_vec_info()2391 _bb_vec_info::~_bb_vec_info ()
2392 {
2393   for (gimple_stmt_iterator si = region_begin;
2394        gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2395     /* Reset region marker.  */
2396     gimple_set_uid (gsi_stmt (si), -1);
2397 
2398   bb->aux = NULL;
2399 }
2400 
2401 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
2402    given then that child nodes have already been processed, and that
2403    their def types currently match their SLP node's def type.  */
2404 
2405 static bool
vect_slp_analyze_node_operations_1(vec_info * vinfo,slp_tree node,slp_instance node_instance,stmt_vector_for_cost * cost_vec)2406 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2407 				    slp_instance node_instance,
2408 				    stmt_vector_for_cost *cost_vec)
2409 {
2410   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2411   gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2412 
2413   /* For BB vectorization vector types are assigned here.
2414      Memory accesses already got their vector type assigned
2415      in vect_analyze_data_refs.  */
2416   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2417   if (bb_vinfo
2418       && ! STMT_VINFO_DATA_REF (stmt_info))
2419     {
2420       tree vectype, nunits_vectype;
2421       if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2422 					   &nunits_vectype))
2423 	/* We checked this when building the node.  */
2424 	gcc_unreachable ();
2425       if (vectype == boolean_type_node)
2426 	{
2427 	  vectype = vect_get_mask_type_for_stmt (stmt_info);
2428 	  if (!vectype)
2429 	    /* vect_get_mask_type_for_stmt has already explained the
2430 	       failure.  */
2431 	    return false;
2432 	}
2433 
2434       stmt_vec_info sstmt_info;
2435       unsigned int i;
2436       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2437 	STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2438     }
2439 
2440   /* Calculate the number of vector statements to be created for the
2441      scalar stmts in this node.  For SLP reductions it is equal to the
2442      number of vector statements in the children (which has already been
2443      calculated by the recursive call).  Otherwise it is the number of
2444      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2445      VF divided by the number of elements in a vector.  */
2446   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2447       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2448     SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2449       = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2450   else
2451     {
2452       poly_uint64 vf;
2453       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2454 	vf = loop_vinfo->vectorization_factor;
2455       else
2456 	vf = 1;
2457       unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2458       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2459       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2460 	= vect_get_num_vectors (vf * group_size, vectype);
2461     }
2462 
2463   bool dummy;
2464   return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2465 }
2466 
2467 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2468    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2469 
2470    Return true if the operations are supported.  */
2471 
2472 static bool
vect_slp_analyze_node_operations(vec_info * vinfo,slp_tree node,slp_instance node_instance,scalar_stmts_to_slp_tree_map_t * visited,scalar_stmts_to_slp_tree_map_t * lvisited,stmt_vector_for_cost * cost_vec)2473 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2474 				  slp_instance node_instance,
2475 				  scalar_stmts_to_slp_tree_map_t *visited,
2476 				  scalar_stmts_to_slp_tree_map_t *lvisited,
2477 				  stmt_vector_for_cost *cost_vec)
2478 {
2479   int i, j;
2480   slp_tree child;
2481 
2482   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2483     return true;
2484 
2485   /* If we already analyzed the exact same set of scalar stmts we're done.
2486      We share the generated vector stmts for those.  */
2487   slp_tree *leader;
2488   if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2489       || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2490     {
2491       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2492 	= SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2493       return true;
2494     }
2495 
2496   /* The SLP graph is acyclic so not caching whether we failed or succeeded
2497      doesn't result in any issue since we throw away the lvisited set
2498      when we fail.  */
2499   lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2500 
2501   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2502     if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2503 					   visited, lvisited, cost_vec))
2504       return false;
2505 
2506   /* ???  We have to catch the case late where two first scalar stmts appear
2507      in multiple SLP children with different def type and fail.  Remember
2508      original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2509      match it when that is vect_internal_def.  */
2510   auto_vec<vect_def_type, 4> dt;
2511   dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2512   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2513     dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2514 
2515   /* Push SLP node def-type to stmt operands.  */
2516   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2517     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2518       STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2519 	= SLP_TREE_DEF_TYPE (child);
2520 
2521   /* Check everything worked out.  */
2522   bool res = true;
2523   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2524     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2525       {
2526 	if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2527 	    != SLP_TREE_DEF_TYPE (child))
2528 	  res = false;
2529       }
2530     else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j])
2531       res = false;
2532   if (!res && dump_enabled_p ())
2533     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2534 		     "not vectorized: same operand with different "
2535 		     "def type in stmt.\n");
2536 
2537   if (res)
2538     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2539 					      cost_vec);
2540 
2541   /* Restore def-types.  */
2542   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2543     STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2544 
2545   return res;
2546 }
2547 
2548 
2549 /* Analyze statements in SLP instances of VINFO.  Return true if the
2550    operations are supported. */
2551 
2552 bool
vect_slp_analyze_operations(vec_info * vinfo)2553 vect_slp_analyze_operations (vec_info *vinfo)
2554 {
2555   slp_instance instance;
2556   int i;
2557 
2558   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2559 
2560   scalar_stmts_to_slp_tree_map_t *visited
2561     = new scalar_stmts_to_slp_tree_map_t ();
2562   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2563     {
2564       scalar_stmts_to_slp_tree_map_t lvisited;
2565       stmt_vector_for_cost cost_vec;
2566       cost_vec.create (2);
2567       if (!vect_slp_analyze_node_operations (vinfo,
2568 					     SLP_INSTANCE_TREE (instance),
2569 					     instance, visited, &lvisited,
2570 					     &cost_vec))
2571         {
2572 	  slp_tree node = SLP_INSTANCE_TREE (instance);
2573 	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2574 	  if (dump_enabled_p ())
2575 	    dump_printf_loc (MSG_NOTE, vect_location,
2576 			     "removing SLP instance operations starting from: %G",
2577 			     stmt_info->stmt);
2578 	  vect_free_slp_instance (instance, false);
2579           vinfo->slp_instances.ordered_remove (i);
2580 	  cost_vec.release ();
2581 	}
2582       else
2583 	{
2584 	  for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2585 	       x != lvisited.end(); ++x)
2586 	    visited->put ((*x).first.copy (), (*x).second);
2587 	  i++;
2588 
2589 	  add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2590 	  cost_vec.release ();
2591 	}
2592     }
2593   delete visited;
2594 
2595   return !vinfo->slp_instances.is_empty ();
2596 }
2597 
2598 
2599 /* Compute the scalar cost of the SLP node NODE and its children
2600    and return it.  Do not account defs that are marked in LIFE and
2601    update LIFE according to uses of NODE.  */
2602 
2603 static void
vect_bb_slp_scalar_cost(basic_block bb,slp_tree node,vec<bool,va_heap> * life,stmt_vector_for_cost * cost_vec,hash_set<slp_tree> & visited)2604 vect_bb_slp_scalar_cost (basic_block bb,
2605 			 slp_tree node, vec<bool, va_heap> *life,
2606 			 stmt_vector_for_cost *cost_vec,
2607 			 hash_set<slp_tree> &visited)
2608 {
2609   unsigned i;
2610   stmt_vec_info stmt_info;
2611   slp_tree child;
2612 
2613   if (visited.add (node))
2614     return;
2615 
2616   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2617     {
2618       gimple *stmt = stmt_info->stmt;
2619       vec_info *vinfo = stmt_info->vinfo;
2620       ssa_op_iter op_iter;
2621       def_operand_p def_p;
2622 
2623       if ((*life)[i])
2624 	continue;
2625 
2626       /* If there is a non-vectorized use of the defs then the scalar
2627          stmt is kept live in which case we do not account it or any
2628 	 required defs in the SLP children in the scalar cost.  This
2629 	 way we make the vectorization more costly when compared to
2630 	 the scalar cost.  */
2631       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2632 	{
2633 	  imm_use_iterator use_iter;
2634 	  gimple *use_stmt;
2635 	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2636 	    if (!is_gimple_debug (use_stmt))
2637 	      {
2638 		stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2639 		if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2640 		  {
2641 		    (*life)[i] = true;
2642 		    BREAK_FROM_IMM_USE_STMT (use_iter);
2643 		  }
2644 	      }
2645 	}
2646       if ((*life)[i])
2647 	continue;
2648 
2649       /* Count scalar stmts only once.  */
2650       if (gimple_visited_p (stmt))
2651 	continue;
2652       gimple_set_visited (stmt, true);
2653 
2654       vect_cost_for_stmt kind;
2655       if (STMT_VINFO_DATA_REF (stmt_info))
2656         {
2657           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2658 	    kind = scalar_load;
2659           else
2660 	    kind = scalar_store;
2661         }
2662       else
2663 	kind = scalar_stmt;
2664       record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2665     }
2666 
2667   auto_vec<bool, 20> subtree_life;
2668   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2669     {
2670       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2671 	{
2672 	  /* Do not directly pass LIFE to the recursive call, copy it to
2673 	     confine changes in the callee to the current child/subtree.  */
2674 	  subtree_life.safe_splice (*life);
2675 	  vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2676 				   visited);
2677 	  subtree_life.truncate (0);
2678 	}
2679     }
2680 }
2681 
2682 static void
vect_bb_slp_scalar_cost(basic_block bb,slp_tree node,vec<bool,va_heap> * life,stmt_vector_for_cost * cost_vec)2683 vect_bb_slp_scalar_cost (basic_block bb,
2684 			 slp_tree node, vec<bool, va_heap> *life,
2685 			 stmt_vector_for_cost *cost_vec)
2686 {
2687   hash_set<slp_tree> visited;
2688   vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2689 }
2690 
2691 /* Check if vectorization of the basic block is profitable.  */
2692 
2693 static bool
vect_bb_vectorization_profitable_p(bb_vec_info bb_vinfo)2694 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2695 {
2696   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2697   slp_instance instance;
2698   int i;
2699   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2700   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2701 
2702   /* Calculate scalar cost.  */
2703   stmt_vector_for_cost scalar_costs;
2704   scalar_costs.create (0);
2705   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2706     {
2707       auto_vec<bool, 20> life;
2708       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2709       vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2710 			       SLP_INSTANCE_TREE (instance),
2711 			       &life, &scalar_costs);
2712     }
2713   void *target_cost_data = init_cost (NULL);
2714   add_stmt_costs (target_cost_data, &scalar_costs);
2715   scalar_costs.release ();
2716   unsigned dummy;
2717   finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2718   destroy_cost_data (target_cost_data);
2719 
2720   /* Unset visited flag.  */
2721   for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2722        gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2723     gimple_set_visited  (gsi_stmt (gsi), false);
2724 
2725   /* Complete the target-specific cost calculation.  */
2726   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2727 	       &vec_inside_cost, &vec_epilogue_cost);
2728 
2729   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2730 
2731   if (dump_enabled_p ())
2732     {
2733       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2734       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2735 		   vec_inside_cost);
2736       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2737       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2738       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2739     }
2740 
2741   /* Vectorization is profitable if its cost is more than the cost of scalar
2742      version.  Note that we err on the vector side for equal cost because
2743      the cost estimate is otherwise quite pessimistic (constant uses are
2744      free on the scalar side but cost a load on the vector side for
2745      example).  */
2746   if (vec_outside_cost + vec_inside_cost > scalar_cost)
2747     return false;
2748 
2749   return true;
2750 }
2751 
2752 /* Check if the basic block can be vectorized.  Returns a bb_vec_info
2753    if so and sets fatal to true if failure is independent of
2754    current_vector_size.  */
2755 
2756 static bb_vec_info
vect_slp_analyze_bb_1(gimple_stmt_iterator region_begin,gimple_stmt_iterator region_end,vec<data_reference_p> datarefs,int n_stmts,bool & fatal,vec_info_shared * shared)2757 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2758 		       gimple_stmt_iterator region_end,
2759 		       vec<data_reference_p> datarefs, int n_stmts,
2760 		       bool &fatal, vec_info_shared *shared)
2761 {
2762   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2763 
2764   bb_vec_info bb_vinfo;
2765   slp_instance instance;
2766   int i;
2767   poly_uint64 min_vf = 2;
2768 
2769   /* The first group of checks is independent of the vector size.  */
2770   fatal = true;
2771 
2772   if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2773     {
2774       if (dump_enabled_p ())
2775 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 			 "not vectorized: too many instructions in "
2777 			 "basic block.\n");
2778       free_data_refs (datarefs);
2779       return NULL;
2780     }
2781 
2782   bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2783   if (!bb_vinfo)
2784     return NULL;
2785 
2786   BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2787   bb_vinfo->shared->save_datarefs ();
2788 
2789   /* Analyze the data references.  */
2790 
2791   if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2792     {
2793       if (dump_enabled_p ())
2794         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2795 			 "not vectorized: unhandled data-ref in basic "
2796 			 "block.\n");
2797 
2798       delete bb_vinfo;
2799       return NULL;
2800     }
2801 
2802   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2803     {
2804       if (dump_enabled_p ())
2805         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2806 			 "not vectorized: not enough data-refs in "
2807 			 "basic block.\n");
2808 
2809       delete bb_vinfo;
2810       return NULL;
2811     }
2812 
2813   if (!vect_analyze_data_ref_accesses (bb_vinfo))
2814     {
2815      if (dump_enabled_p ())
2816        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 			"not vectorized: unhandled data access in "
2818 			"basic block.\n");
2819 
2820       delete bb_vinfo;
2821       return NULL;
2822     }
2823 
2824   /* If there are no grouped stores in the region there is no need
2825      to continue with pattern recog as vect_analyze_slp will fail
2826      anyway.  */
2827   if (bb_vinfo->grouped_stores.is_empty ())
2828     {
2829       if (dump_enabled_p ())
2830 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2831 			 "not vectorized: no grouped stores in "
2832 			 "basic block.\n");
2833 
2834       delete bb_vinfo;
2835       return NULL;
2836     }
2837 
2838   /* While the rest of the analysis below depends on it in some way.  */
2839   fatal = false;
2840 
2841   vect_pattern_recog (bb_vinfo);
2842 
2843   /* Check the SLP opportunities in the basic block, analyze and build SLP
2844      trees.  */
2845   if (!vect_analyze_slp (bb_vinfo, n_stmts))
2846     {
2847       if (dump_enabled_p ())
2848 	{
2849 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2850 			   "Failed to SLP the basic block.\n");
2851 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2852 			   "not vectorized: failed to find SLP opportunities "
2853 			   "in basic block.\n");
2854 	}
2855 
2856       delete bb_vinfo;
2857       return NULL;
2858     }
2859 
2860   vect_record_base_alignments (bb_vinfo);
2861 
2862   /* Analyze and verify the alignment of data references and the
2863      dependence in the SLP instances.  */
2864   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2865     {
2866       if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2867 	  || ! vect_slp_analyze_instance_dependence (instance))
2868 	{
2869 	  slp_tree node = SLP_INSTANCE_TREE (instance);
2870 	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2871 	  if (dump_enabled_p ())
2872 	    dump_printf_loc (MSG_NOTE, vect_location,
2873 			     "removing SLP instance operations starting from: %G",
2874 			     stmt_info->stmt);
2875 	  vect_free_slp_instance (instance, false);
2876 	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2877 	  continue;
2878 	}
2879 
2880       /* Mark all the statements that we want to vectorize as pure SLP and
2881 	 relevant.  */
2882       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2883       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2884 
2885       i++;
2886     }
2887   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2888     {
2889       delete bb_vinfo;
2890       return NULL;
2891     }
2892 
2893   if (!vect_slp_analyze_operations (bb_vinfo))
2894     {
2895       if (dump_enabled_p ())
2896         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2897 			 "not vectorized: bad operation in basic block.\n");
2898 
2899       delete bb_vinfo;
2900       return NULL;
2901     }
2902 
2903   /* Cost model: check if the vectorization is worthwhile.  */
2904   if (!unlimited_cost_model (NULL)
2905       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2906     {
2907       if (dump_enabled_p ())
2908         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909 			 "not vectorized: vectorization is not "
2910 			 "profitable.\n");
2911 
2912       delete bb_vinfo;
2913       return NULL;
2914     }
2915 
2916   if (dump_enabled_p ())
2917     dump_printf_loc (MSG_NOTE, vect_location,
2918 		     "Basic block will be vectorized using SLP\n");
2919 
2920   return bb_vinfo;
2921 }
2922 
2923 
2924 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
2925    true if anything in the basic-block was vectorized.  */
2926 
2927 bool
vect_slp_bb(basic_block bb)2928 vect_slp_bb (basic_block bb)
2929 {
2930   bb_vec_info bb_vinfo;
2931   gimple_stmt_iterator gsi;
2932   bool any_vectorized = false;
2933   auto_vector_sizes vector_sizes;
2934 
2935   /* Autodetect first vector size we try.  */
2936   current_vector_size = 0;
2937   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2938   unsigned int next_size = 0;
2939 
2940   gsi = gsi_start_bb (bb);
2941 
2942   poly_uint64 autodetected_vector_size = 0;
2943   while (1)
2944     {
2945       if (gsi_end_p (gsi))
2946 	break;
2947 
2948       gimple_stmt_iterator region_begin = gsi;
2949       vec<data_reference_p> datarefs = vNULL;
2950       int insns = 0;
2951 
2952       for (; !gsi_end_p (gsi); gsi_next (&gsi))
2953 	{
2954 	  gimple *stmt = gsi_stmt (gsi);
2955 	  if (is_gimple_debug (stmt))
2956 	    continue;
2957 	  insns++;
2958 
2959 	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
2960 	    vect_location = stmt;
2961 
2962 	  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
2963 	    break;
2964 	}
2965 
2966       /* Skip leading unhandled stmts.  */
2967       if (gsi_stmt (region_begin) == gsi_stmt (gsi))
2968 	{
2969 	  gsi_next (&gsi);
2970 	  continue;
2971 	}
2972 
2973       gimple_stmt_iterator region_end = gsi;
2974 
2975       bool vectorized = false;
2976       bool fatal = false;
2977       vec_info_shared shared;
2978       bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
2979 					datarefs, insns, fatal, &shared);
2980       if (bb_vinfo
2981 	  && dbg_cnt (vect_slp))
2982 	{
2983 	  if (dump_enabled_p ())
2984 	    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
2985 
2986 	  bb_vinfo->shared->check_datarefs ();
2987 	  vect_schedule_slp (bb_vinfo);
2988 
2989 	  unsigned HOST_WIDE_INT bytes;
2990 	  if (dump_enabled_p ())
2991 	    {
2992 	      if (current_vector_size.is_constant (&bytes))
2993 		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2994 				 "basic block part vectorized using %wu byte "
2995 				 "vectors\n", bytes);
2996 	      else
2997 		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2998 				 "basic block part vectorized using variable "
2999 				 "length vectors\n");
3000 	    }
3001 
3002 	  vectorized = true;
3003 	}
3004       delete bb_vinfo;
3005 
3006       any_vectorized |= vectorized;
3007 
3008       if (next_size == 0)
3009 	autodetected_vector_size = current_vector_size;
3010 
3011       if (next_size < vector_sizes.length ()
3012 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
3013 	next_size += 1;
3014 
3015       if (vectorized
3016 	  || next_size == vector_sizes.length ()
3017 	  || known_eq (current_vector_size, 0U)
3018 	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
3019 	     vector sizes will fail do not bother iterating.  */
3020 	  || fatal)
3021 	{
3022 	  if (gsi_end_p (region_end))
3023 	    break;
3024 
3025 	  /* Skip the unhandled stmt.  */
3026 	  gsi_next (&gsi);
3027 
3028 	  /* And reset vector sizes.  */
3029 	  current_vector_size = 0;
3030 	  next_size = 0;
3031 	}
3032       else
3033 	{
3034 	  /* Try the next biggest vector size.  */
3035 	  current_vector_size = vector_sizes[next_size++];
3036 	  if (dump_enabled_p ())
3037 	    {
3038 	      dump_printf_loc (MSG_NOTE, vect_location,
3039 			       "***** Re-trying analysis with "
3040 			       "vector size ");
3041 	      dump_dec (MSG_NOTE, current_vector_size);
3042 	      dump_printf (MSG_NOTE, "\n");
3043 	    }
3044 
3045 	  /* Start over.  */
3046 	  gsi = region_begin;
3047 	}
3048     }
3049 
3050   return any_vectorized;
3051 }
3052 
3053 
3054 /* Return 1 if vector type STMT_VINFO is a boolean vector.  */
3055 
3056 static bool
vect_mask_constant_operand_p(stmt_vec_info stmt_vinfo)3057 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
{
  enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
  tree op, vectype;
  enum vect_def_type dt;

  /* For comparisons and COND_EXPRs the vector type is chosen based on
     the other, non-constant comparison operand.  */
  if (TREE_CODE_CLASS (code) == tcc_comparison)
    {
      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
      op = gimple_assign_rhs1 (stmt);

      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
	gcc_unreachable ();

      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
    }

  if (code == COND_EXPR)
    {
      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
      tree cond = gimple_assign_rhs1 (stmt);

      if (TREE_CODE (cond) == SSA_NAME)
	op = cond;
      else
	op = TREE_OPERAND (cond, 0);

      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
	gcc_unreachable ();

      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
    }

  return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
}

/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
			  unsigned int nresults, vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
	 correct byte contents.

     We need to repeat the following operation log2(nvectors) times:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int hi_start = nvectors / 2;
  /* A bound on the number of outputs needed to produce NRESULTS results
     in the final iteration.  */
  unsigned int noutputs_bound = nvectors * nresults;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      noutputs_bound /= 2;
      unsigned int limit = MIN (noutputs_bound, nvectors);
      for (unsigned int i = 0; i < limit; ++i)
	{
	  if ((i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    {
	      pieces[out_start + i] = pieces[out_start + i - 1];
	      continue;
	    }

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (i / 2)];
	  tree input2 = pieces[in_start + (i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + i] = output;
	}
      std::swap (in_start, out_start);
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - nvectors]);
}
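
/* A minimal standalone sketch (not GCC code) of step (4) above: start from
   NVECTORS splatted inputs and apply log2(NVECTORS) rounds of low/high
   interleaving, after which each output holds the periodic pattern
   e0,e1,...,e{n-1},e0,...  VLEN and the element type are illustrative
   assumptions standing in for the variable-length vectors.  */
#if 0
#include <stdio.h>

#define VLEN 8		/* Lanes per (fixed-size stand-in) vector.  */
#define NVECTORS 4	/* Number of distinct elements/splats.  */

static void
interleave_lo_hi (const int *a, const int *b, int *lo, int *hi)
{
  for (int i = 0; i < VLEN / 2; i++)
    {
      lo[2 * i] = a[i];			/* Low halves, interleaved.  */
      lo[2 * i + 1] = b[i];
      hi[2 * i] = a[VLEN / 2 + i];	/* High halves, interleaved.  */
      hi[2 * i + 1] = b[VLEN / 2 + i];
    }
}

int
main (void)
{
  int pieces[2 * NVECTORS][VLEN];

  /* Step (3): splat element I into pieces[I].  */
  for (int i = 0; i < NVECTORS; i++)
    for (int j = 0; j < VLEN; j++)
      pieces[i][j] = i;

  /* Step (4): log2(NVECTORS) rounds of lo/hi interleaves.  */
  int in = 0, out = NVECTORS;
  for (int rep = 1; rep < NVECTORS; rep *= 2)
    {
      for (int i = 0; i < NVECTORS; i += 2)
	interleave_lo_hi (pieces[in + i / 2],
			  pieces[in + i / 2 + NVECTORS / 2],
			  pieces[out + i], pieces[out + i + 1]);
      int tmp = in; in = out; out = tmp;
    }

  /* Each result now repeats 0,1,2,3,0,1,2,3.  */
  for (int j = 0; j < VLEN; j++)
    printf ("%d ", pieces[in][j]);
  printf ("\n");
  return 0;
}
#endif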


/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS
   of the scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to
   create.  */

static void
vect_get_constant_vectors (tree op, slp_tree slp_node,
                           vec<tree> *vec_oprnds,
			   unsigned int op_num, unsigned int number_of_vectors)
{
  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  stmt_vec_info stmt_vinfo = stmts[0];
  gimple *stmt = stmt_vinfo->stmt;
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = stmts.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  vec<tree> voprnds;
  voprnds.create (number_of_vectors);
  bool constant_p, is_store;
  tree neutral_op = NULL;
  enum tree_code code = gimple_expr_code (stmt);
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* Check if vector type is a boolean vector.  */
  if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
      && vect_mask_constant_operand_p (stmt_vinfo))
    vector_type
      = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
  else
    vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    is_store = false;

  gcc_assert (op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */
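
  /* Worked example of the arithmetic below (illustrative numbers only):
     with GROUP_SIZE == 2, NUNITS == 4 and NUMBER_OF_VECTORS == 1 we get
     NUMBER_OF_COPIES == 4 * 1 / 2 == 2, i.e. {s1, s2, s1, s2}; with
     GROUP_SIZE == 8, NUNITS == 4 and NUMBER_OF_VECTORS == 2 we get
     NUMBER_OF_COPIES == 4 * 2 / 8 == 1, i.e. each scalar is used once
     across the two vectors {s1, s2, s3, s4} and {s5, s6, s7, s8}.  */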

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  bool place_after_defs = false;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
        {
	  stmt = stmt_vinfo->stmt;
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
	    {
	      switch (code)
		{
		  case COND_EXPR:
		    {
		      tree cond = gimple_assign_rhs1 (stmt);
		      if (TREE_CODE (cond) == SSA_NAME)
			op = gimple_op (stmt, op_num + 1);
		      else if (op_num == 0 || op_num == 1)
			op = TREE_OPERAND (cond, op_num);
		      else
			{
			  if (op_num == 2)
			    op = gimple_assign_rhs2 (stmt);
			  else
			    op = gimple_assign_rhs3 (stmt);
			}
		    }
		    break;

		  case CALL_EXPR:
		    op = gimple_call_arg (stmt, op_num);
		    break;

		  case LSHIFT_EXPR:
		  case RSHIFT_EXPR:
		  case LROTATE_EXPR:
		  case RROTATE_EXPR:
		    op = gimple_op (stmt, op_num + 1);
		    /* Unlike the other binary operators, shifts/rotates have
		       the shift count being int, instead of the same type as
		       the lhs, so make sure the scalar is the right type if
		       we are dealing with vectors of
		       long long/long/short/char.  */
		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
		      op = fold_convert (TREE_TYPE (vector_type), op);
		    break;

		  default:
		    op = gimple_op (stmt, op_num + 1);
		    break;
		}
	    }

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
	  tree orig_op = op;
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && STMT_VINFO_BB_VINFO (stmt_vinfo)
	      && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
		  == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
	    place_after_defs = true;

          if (number_of_places_left_in_vector == 0)
            {
	      if (constant_p
		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  if (vec_oprnds->is_empty ())
		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
					      number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      tree init;
	      gimple_stmt_iterator gsi;
	      if (place_after_defs)
		{
		  stmt_vec_info last_stmt_info
		    = vect_find_last_scalar_stmt_in_slp (slp_node);
		  gsi = gsi_for_stmt (last_stmt_info->stmt);
		  init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
					   &gsi);
		}
	      else
		init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
					 NULL);
	      if (ctor_seq != NULL)
		{
		  gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
		  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
		  ctor_seq = NULL;
		}
	      voprnds.quick_push (init);
	      place_after_defs = false;
              number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
            }
        }
    }

  /* The vectors are created in reverse order, so reverse them here.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      vec_oprnds->quick_push (vop);
    }

  voprnds.release ();

  /* If the vectorization factor is greater than the unrolling factor needed
     for the SLP group of stmts, the NUMBER_OF_VECTORS to be created is
     greater than NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and
     hence we have to replicate the vectors.  */
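  /* E.g. (illustrative numbers only): with two vectors built from the
     group and NUMBER_OF_VECTORS == 4, the loop below pushes copies of
     the two existing vectors (or of a neutral vector, if one is given)
     until four vector defs are available.  */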
  while (number_of_vectors > vec_oprnds->length ())
    {
      tree neutral_vec = NULL;

      if (neutral_op)
        {
          if (!neutral_vec)
	    neutral_vec = build_vector_from_val (vector_type, neutral_op);

          vec_oprnds->quick_push (neutral_vec);
        }
      else
        {
          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
            vec_oprnds->quick_push (vop);
        }
    }
}


/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
{
  tree vec_oprnd;
  stmt_vec_info vec_def_stmt_info;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());

  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
    {
      gcc_assert (vec_def_stmt_info);
      if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
	vec_oprnd = gimple_phi_result (vec_def_phi);
      else
	vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
      vec_oprnds->quick_push (vec_oprnd);
    }
}


/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
		   vec<vec<tree> > *vec_oprnds)
{
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
	 node or we need to create them (for invariants and constants).  We
	 check if the LHS of the first stmt of the next child matches OPRND.
	 If it does, we found the correct child.  Otherwise, we call
	 vect_get_constant_vectors () and do not advance CHILD_INDEX, so
	 that this child node is checked again for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

	  /* We have to check both pattern and original def, if available.  */
	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    {
	      stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
	      stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
	      tree first_def_op;

	      if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
		first_def_op = gimple_phi_result (first_def);
	      else
		first_def_op = gimple_get_lhs (first_def_info->stmt);
	      if (operand_equal_p (oprnd, first_def_op, 0)
		  || (related
		      && operand_equal_p (oprnd,
					  gimple_get_lhs (related->stmt), 0)))
		{
		  /* The number of vector defs is determined by the number of
		     vector statements in the node from which we get those
		     statements.  */
		  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
		  vectorized_defs = true;
		  child_index++;
		}
	    }
	  else
	    child_index++;
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* The number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (); fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
	      vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
					     &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs)
        /* The defs are already vectorized.  */
	vect_get_slp_vect_defs (child, &vec_defs);
      else
	/* Build vectors from scalar defs.  */
	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
				   number_of_vects);

      vec_oprnds->quick_push (vec_defs);
    }
}
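
/* Worked example (illustrative numbers only) for the LHS/RHS fixup in
   vect_get_slp_defs above: for stmts storing 1-byte chars computed from
   4-byte int operands, one LHS vector covers four times as many scalars
   as an RHS vector, so a plan of 1 vector stmt on the LHS side needs
   1 * 4 / 1 == 4 vector defs on the RHS side.  */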

/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
			      gimple_stmt_iterator *gsi, poly_uint64 vf,
			      slp_instance slp_node_instance, bool analyze_only,
			      unsigned *n_perms)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  vec_info *vinfo = stmt_info->vinfo;
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  unsigned int mask_element;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE.  The number of masks for
     each NODE is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for a permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
		      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
	 (a) all permutes can use the same mask; and
	 (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
	  || !vf.is_constant (&const_vf))
	return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
    }

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
			+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
      if (repeating_p)
	{
	  first_vec_index = 0;
	  mask_element = i;
	}
      else
	{
	  /* Enforced before the loop when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  vec_index = i / const_nunits;
	  mask_element = i % const_nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    {
	      first_vec_index = vec_index;
	    }
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      mask_element += const_nunits;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "permutation requires at "
				 "least three vectors %G",
				 stmt_info->stmt);
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * const_nunits);
	}

      if (mask_element != index)
	noop_p = false;
      mask[index++] = mask_element;

      if (index == count && !noop_p)
	{
	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
	  if (!can_vec_perm_const_p (mode, indices))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (analyze_only);
	      return false;
	    }

	  ++*n_perms;
	}

      if (index == count)
	{
	  if (!analyze_only)
	    {
	      tree mask_vec = NULL_TREE;

	      if (! noop_p)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      if (second_vec_index == -1)
		second_vec_index = first_vec_index;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  stmt_vec_info perm_stmt_info;
		  if (! noop_p)
		    {
		      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
		      tree perm_dest
			= vect_create_destination_var (gimple_assign_lhs (stmt),
						       vectype);
		      perm_dest = make_ssa_name (perm_dest);
		      gassign *perm_stmt
			= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
					       first_vec, second_vec,
					       mask_vec);
		      perm_stmt_info
			= vect_finish_stmt_generation (stmt_info, perm_stmt,
						       gsi);
		    }
		  else
		    /* If mask was NULL_TREE generate the requested
		       identity transform.  */
		    perm_stmt_info = vinfo->lookup_def (first_vec);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
		    = perm_stmt_info;
		}
	    }

	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  return true;
}
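
/* A minimal standalone sketch (not GCC code) of the mask computation in
   vect_transform_slp_perm_load, reproducing the a's example from the
   comment above: GROUP_SIZE == DR_GROUP_SIZE == 3, NUNITS == 4, VF == 4
   and a load permutation of {0,0,0} yield the masks {0,0,0,3} {3,3,6,6}
   {2,5,5,5}, where elements >= 4 select from the second input vector.
   The three-vector failure check is omitted for brevity.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const unsigned group_size = 3, dr_group_size = 3, nunits = 4, vf = 4;
  const unsigned load_perm[3] = { 0, 0, 0 };	/* All a's.  */
  int first_vec = -1;
  unsigned index = 0;

  for (unsigned j = 0; j < vf * group_size; j++)
    {
      unsigned i = (j / group_size) * dr_group_size
		   + load_perm[j % group_size];
      int vec_index = i / nunits;
      unsigned mask_element = i % nunits;
      if (first_vec == -1 || vec_index == first_vec)
	first_vec = vec_index;
      else
	/* Assume at most two input vectors per permute.  */
	mask_element += nunits;
      printf ("%u%s", mask_element, ++index == nunits ? "\n" : ",");
      if (index == nunits)
	{
	  index = 0;
	  first_vec = -1;
	}
    }
  return 0;
}
#endif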

/* Vectorize SLP instance tree in postorder.  */

static void
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
			    scalar_stmts_to_slp_tree_map_t *bst_map)
{
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int group_size;
  tree vectype;
  int i, j;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  /* See if we have already vectorized the node in the graph of the
     SLP instance.  */
  if (SLP_TREE_VEC_STMTS (node).exists ())
    return;

  /* See if we have already vectorized the same set of stmts and reuse their
     vectorized stmts across instances.  */
  if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
    {
      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
      return;
    }

  bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, bst_map);

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      {
	stmt_vec_info child_stmt_info;
	FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
	  STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
      }

  stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing SLP node starting from: %G",
		     stmt_info->stmt);

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
  si = gsi_for_stmt (last_stmt_info->stmt);

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  /* Handle two-operation SLP nodes by vectorizing the group with
     both operations and then performing a merge.  */
  if (SLP_TREE_TWO_OPERATORS (node))
    {
      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      vec_perm_builder mask (group_size, group_size, 1);
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      mask.quick_push (1);
	      ocode = gimple_assign_rhs_code (ostmt);
	    }
	  else
	    mask.quick_push (0);
	}
      if (ocode != ERROR_MARK)
	{
	  vec<stmt_vec_info> v0;
	  vec<stmt_vec_info> v1;
	  unsigned j;
	  tree tmask = NULL_TREE;
	  vect_transform_stmt (stmt_info, &si, node, instance);
	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
	  SLP_TREE_VEC_STMTS (node).truncate (0);
	  gimple_assign_set_rhs_code (stmt, ocode);
	  vect_transform_stmt (stmt_info, &si, node, instance);
	  gimple_assign_set_rhs_code (stmt, code0);
	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
	  SLP_TREE_VEC_STMTS (node).truncate (0);
	  tree meltype = build_nonstandard_integer_type
	      (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
	  tree mvectype = get_same_sized_vectype (meltype, vectype);
	  unsigned k = 0, l;
	  for (j = 0; j < v0.length (); ++j)
	    {
	      /* Enforced by vect_build_slp_tree, which rejects variable-length
		 vectors for SLP_TREE_TWO_OPERATORS.  */
	      unsigned int const_nunits = nunits.to_constant ();
	      tree_vector_builder melts (mvectype, const_nunits, 1);
	      for (l = 0; l < const_nunits; ++l)
		{
		  if (k >= group_size)
		    k = 0;
		  tree t = build_int_cst (meltype,
					  mask[k++] * const_nunits + l);
		  melts.quick_push (t);
		}
	      tmask = melts.build ();

	      /* ???  Not all targets support a VEC_PERM_EXPR with a
	         constant mask that would translate to a vec_merge RTX
		 (with their vec_perm_const_ok).  We can either not
		 vectorize in that case or let veclower do its job.
		 Unfortunately that isn't too great and at least for
		 plus/minus we'd eventually like to match targets
		 vector addsub instructions.  */
	      gimple *vstmt;
	      vstmt = gimple_build_assign (make_ssa_name (vectype),
					   VEC_PERM_EXPR,
					   gimple_assign_lhs (v0[j]->stmt),
					   gimple_assign_lhs (v1[j]->stmt),
					   tmask);
	      SLP_TREE_VEC_STMTS (node).quick_push
		(vect_finish_stmt_generation (stmt_info, vstmt, &si));
	    }
	  v0.release ();
	  v1.release ();
	  return;
	}
    }
  vect_transform_stmt (stmt_info, &si, node, instance);

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      {
	stmt_vec_info child_stmt_info;
	FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
	  STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
      }
}
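
/* A minimal standalone sketch (not GCC code) of the two-operator merge
   mask built above: for an alternating {+,-,+,-} group with
   CONST_NUNITS == 4, mask[k] is 0 for code0 lanes and 1 for the other
   code, so the VEC_PERM_EXPR selectors mask[k] * const_nunits + l come
   out as {0,5,2,7}, i.e. lanes 0 and 2 from the code0 vector and lanes
   1 and 3 from the ocode vector.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const unsigned const_nunits = 4, group_size = 4;
  const unsigned mask[4] = { 0, 1, 0, 1 };	/* 0: code0, 1: ocode.  */
  unsigned k = 0;

  for (unsigned l = 0; l < const_nunits; ++l)
    {
      if (k >= group_size)
	k = 0;
      printf ("%u ", mask[k++] * const_nunits + l);	/* 0 5 2 7  */
    }
  printf ("\n");
  return 0;
}
#endif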

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      if (is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (node, visited);
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();
  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
				  instance, bst_map);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }
  delete bst_map;

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ???  Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ???  For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
	  if (!STMT_VINFO_DATA_REF (store_info))
	    break;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);
        }
    }
}