1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h"		/* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 
49 
50 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
51 
52 static void
53 vect_free_slp_tree (slp_tree node)
54 {
55   int i;
56   slp_tree child;
57 
58   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
59     vect_free_slp_tree (child);
60 
61   gimple *stmt;
62   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
63     /* After transform some stmts are removed and thus their vinfo is gone.  */
64     if (vinfo_for_stmt (stmt))
65       {
66 	gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
67 	STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
68       }
69 
70   SLP_TREE_CHILDREN (node).release ();
71   SLP_TREE_SCALAR_STMTS (node).release ();
72   SLP_TREE_VEC_STMTS (node).release ();
73   SLP_TREE_LOAD_PERMUTATION (node).release ();
74 
75   free (node);
76 }
77 
78 
79 /* Free the memory allocated for the SLP instance.  */
80 
81 void
82 vect_free_slp_instance (slp_instance instance)
83 {
84   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
85   SLP_INSTANCE_LOADS (instance).release ();
86   free (instance);
87 }
88 
89 
90 /* Create an SLP node for SCALAR_STMTS.  */
91 
92 static slp_tree
93 vect_create_new_slp_node (vec<gimple *> scalar_stmts)
94 {
95   slp_tree node;
96   gimple *stmt = scalar_stmts[0];
97   unsigned int nops;
98 
99   if (is_gimple_call (stmt))
100     nops = gimple_call_num_args (stmt);
101   else if (is_gimple_assign (stmt))
102     {
103       nops = gimple_num_ops (stmt) - 1;
104       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
105 	nops++;
106     }
107   else if (gimple_code (stmt) == GIMPLE_PHI)
108     nops = 0;
109   else
110     return NULL;
111 
112   node = XNEW (struct _slp_tree);
113   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
114   SLP_TREE_VEC_STMTS (node).create (0);
115   SLP_TREE_CHILDREN (node).create (nops);
116   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
117   SLP_TREE_TWO_OPERATORS (node) = false;
118   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
119 
120   unsigned i;
121   FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
122     STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;
123 
124   return node;
125 }
126 
127 
128 /* This structure is used in creation of an SLP tree.  Each instance
129    corresponds to the same operand in a group of scalar stmts in an SLP
130    node.  */
131 typedef struct _slp_oprnd_info
132 {
133   /* Def-stmts for the operands.  */
134   vec<gimple *> def_stmts;
135   /* Information about the first statement: its vector def-type, the type
136      of its operand, and an indication whether it is a pattern stmt (also
137      tracked for the second stmt, for reduction chains).  */
138   tree first_op_type;
139   enum vect_def_type first_dt;
140   bool first_pattern;
141   bool second_pattern;
142 } *slp_oprnd_info;
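
/* An illustrative example (not taken from the sources): for the
   two-stmt group { c0 = a0 + b0;  c1 = a1 + b1; } two slp_oprnd_info
   entries are built, the first collecting the def stmts of a0 and a1,
   the second those of b0 and b1.  */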
143 
144 
145 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
146    operand.  */
147 static vec<slp_oprnd_info>
148 vect_create_oprnd_info (int nops, int group_size)
149 {
150   int i;
151   slp_oprnd_info oprnd_info;
152   vec<slp_oprnd_info> oprnds_info;
153 
154   oprnds_info.create (nops);
155   for (i = 0; i < nops; i++)
156     {
157       oprnd_info = XNEW (struct _slp_oprnd_info);
158       oprnd_info->def_stmts.create (group_size);
159       oprnd_info->first_dt = vect_uninitialized_def;
160       oprnd_info->first_op_type = NULL_TREE;
161       oprnd_info->first_pattern = false;
162       oprnd_info->second_pattern = false;
163       oprnds_info.quick_push (oprnd_info);
164     }
165 
166   return oprnds_info;
167 }
168 
169 
170 /* Free operands info.  */
171 
172 static void
173 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
174 {
175   int i;
176   slp_oprnd_info oprnd_info;
177 
178   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
179     {
180       oprnd_info->def_stmts.release ();
181       XDELETE (oprnd_info);
182     }
183 
184   oprnds_info.release ();
185 }
186 
187 
188 /* Find the place of the data-ref in STMT in the interleaving chain that starts
189    from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
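
/* An illustrative example (not from the original sources): for the
   store group
     a[0] = x;  a[1] = y;  a[4] = z;
   where GROUP_GAP of the a[4] element is 3, the places returned for
   the three stmts are 0, 1 and 4: each element's GROUP_GAP is its
   distance from the previous element in the chain.  */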
190 
191 int
192 vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
193 {
194   gimple *next_stmt = first_stmt;
195   int result = 0;
196 
197   if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
198     return -1;
199 
200   do
201     {
202       if (next_stmt == stmt)
203 	return result;
204       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
205       if (next_stmt)
206 	result += GROUP_GAP (vinfo_for_stmt (next_stmt));
207     }
208   while (next_stmt);
209 
210   return -1;
211 }
212 
213 /* Check whether it is possible to load COUNT elements of type ELT_MODE
214    using the method implemented by duplicate_and_interleave.  Return true
215    if so, returning the number of intermediate vectors in *NVECTORS_OUT
216    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
217    (if nonnull).  */
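
/* As an illustrative sketch (not from the sources): with vectors of
   eight of the integer "words" built below, the first permutation
   interleaves the low halves of the two input vectors, selecting
   { 0, 8, 1, 9, 2, 10, 3, 11 }, and the second interleaves the high
   halves, selecting { 4, 12, 5, 13, 6, 14, 7, 15 }.  */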
218 
219 bool
220 can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
221 				unsigned int *nvectors_out,
222 				tree *vector_type_out,
223 				tree *permutes)
224 {
225   poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
226   poly_int64 nelts;
227   unsigned int nvectors = 1;
228   for (;;)
229     {
230       scalar_int_mode int_mode;
231       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
232       if (multiple_p (current_vector_size, elt_bytes, &nelts)
233 	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
234 	{
235 	  tree int_type = build_nonstandard_integer_type
236 	    (GET_MODE_BITSIZE (int_mode), 1);
237 	  tree vector_type = build_vector_type (int_type, nelts);
238 	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
239 	    {
240 	      vec_perm_builder sel1 (nelts, 2, 3);
241 	      vec_perm_builder sel2 (nelts, 2, 3);
242 	      poly_int64 half_nelts = exact_div (nelts, 2);
243 	      for (unsigned int i = 0; i < 3; ++i)
244 		{
245 		  sel1.quick_push (i);
246 		  sel1.quick_push (i + nelts);
247 		  sel2.quick_push (half_nelts + i);
248 		  sel2.quick_push (half_nelts + i + nelts);
249 		}
250 	      vec_perm_indices indices1 (sel1, 2, nelts);
251 	      vec_perm_indices indices2 (sel2, 2, nelts);
252 	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
253 		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
254 		{
255 		  if (nvectors_out)
256 		    *nvectors_out = nvectors;
257 		  if (vector_type_out)
258 		    *vector_type_out = vector_type;
259 		  if (permutes)
260 		    {
261 		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
262 								indices1);
263 		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
264 								indices2);
265 		    }
266 		  return true;
267 		}
268 	    }
269 	}
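      /* The attempt at this element size failed; split each element in
	 half and retry, doubling the number of intermediate vectors.  */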
270       if (!multiple_p (elt_bytes, 2, &elt_bytes))
271 	return false;
272       nvectors *= 2;
273     }
274 }
275 
276 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
277    they are of a valid type and that they match the defs of the first stmt of
278    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
279    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
280    indicates swap is required for cond_expr stmts.  Specifically, *SWAP
281    is 1 if STMT is cond and operands of comparison need to be swapped;
282    *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
283    If there is any operand swap in this function, *SWAP is set to a
284    non-zero value.
285    If there was a fatal error return -1; if the error could be corrected by
286    swapping operands of the parent node of this one, return 1; if everything
287    is ok return 0.  */
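
/* For example (hypothetical stmts): if the first stmt is
     x0 = a0 < b0 ? p0 : q0;
   then a later stmt  x1 = b1 > a1 ? p1 : q1;  matches it with
   *SWAP == 1 (swap the comparison operands), while
   x1 = a1 >= b1 ? q1 : p1;  matches it with *SWAP == 2 (invert the
   comparison code and swap the arms).  */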
288 static int
289 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
290 			     vec<gimple *> stmts, unsigned stmt_num,
291 			     vec<slp_oprnd_info> *oprnds_info)
292 {
293   gimple *stmt = stmts[stmt_num];
294   tree oprnd;
295   unsigned int i, number_of_oprnds;
296   gimple *def_stmt;
297   enum vect_def_type dt = vect_uninitialized_def;
298   bool pattern = false;
299   slp_oprnd_info oprnd_info;
300   int first_op_idx = 1;
301   bool commutative = false;
302   bool first_op_cond = false;
303   bool first = stmt_num == 0;
304   bool second = stmt_num == 1;
305 
306   if (is_gimple_call (stmt))
307     {
308       number_of_oprnds = gimple_call_num_args (stmt);
309       first_op_idx = 3;
310     }
311   else if (is_gimple_assign (stmt))
312     {
313       enum tree_code code = gimple_assign_rhs_code (stmt);
314       number_of_oprnds = gimple_num_ops (stmt) - 1;
315       /* Swap can only be done for cond_expr if asked to, otherwise we
316 	 could end up with a different comparison code from the first stmt.  */
317       if (gimple_assign_rhs_code (stmt) == COND_EXPR
318 	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
319 	{
320 	  first_op_cond = true;
321 	  number_of_oprnds++;
322 	}
323       else
324 	commutative = commutative_tree_code (code);
325     }
326   else
327     return -1;
328 
329   bool swapped = (*swap != 0);
330   gcc_assert (!swapped || first_op_cond);
331   for (i = 0; i < number_of_oprnds; i++)
332     {
333 again:
334       if (first_op_cond)
335 	{
336 	  /* Map indicating how operands of cond_expr should be swapped.  */
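	  /* Row 0: no swap; row 1: swap the two operands of the
	     comparison; row 2: read the then/else arms swapped (the
	     comparison code itself is inverted later, in the
	     *SWAP == 2 case below).  */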
337 	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
338 	  int *map = maps[*swap];
339 
340 	  if (i < 2)
341 	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
342 	  else
343 	    oprnd = gimple_op (stmt, map[i]);
344 	}
345       else
346 	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
347 
348       oprnd_info = (*oprnds_info)[i];
349 
350       if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
351 	{
352 	  if (dump_enabled_p ())
353 	    {
354 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 			       "Build SLP failed: can't analyze def for ");
356 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
357               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 	    }
359 
360 	  return -1;
361 	}
362 
363       /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
364          from the pattern.  Check that all the stmts of the node are in the
365          pattern.  */
366       if (def_stmt && gimple_bb (def_stmt)
367           && vect_stmt_in_region_p (vinfo, def_stmt)
368           && vinfo_for_stmt (def_stmt)
369           && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
370 	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
371 	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
372         {
373           pattern = true;
374           if (!first && !oprnd_info->first_pattern
375 	      /* Allow different pattern state for the defs of the
376 		 first stmt in reduction chains.  */
377 	      && (oprnd_info->first_dt != vect_reduction_def
378 		  || (!second && !oprnd_info->second_pattern)))
379 	    {
380 	      if (i == 0
381 		  && !swapped
382 		  && commutative)
383 		{
384 		  swapped = true;
385 		  goto again;
386 		}
387 
388 	      if (dump_enabled_p ())
389 		{
390 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 				   "Build SLP failed: some of the stmts"
392 				   " are in a pattern, and others are not ");
393 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
394                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
395 		}
396 
397 	      return 1;
398             }
399 
400           def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
401           dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
402 
403           if (dt == vect_unknown_def_type)
404             {
405               if (dump_enabled_p ())
406                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 				 "Unsupported pattern.\n");
408               return -1;
409             }
410 
411           switch (gimple_code (def_stmt))
412             {
413             case GIMPLE_PHI:
414             case GIMPLE_ASSIGN:
415 	      break;
416 
417 	    default:
418 	      if (dump_enabled_p ())
419 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 				 "unsupported defining stmt:\n");
421 	      return -1;
422             }
423         }
424 
425       if (second)
426 	oprnd_info->second_pattern = pattern;
427 
428       if (first)
429 	{
430 	  oprnd_info->first_dt = dt;
431 	  oprnd_info->first_pattern = pattern;
432 	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
433 	}
434       else
435 	{
436 	  /* Not first stmt of the group, check that the def-stmt/s match
437 	     the def-stmt/s of the first stmt.  Allow different definition
438 	     types for reduction chains: the first stmt must be a
439 	     vect_reduction_def (a phi node), and the rest
440 	     vect_internal_def.  */
441 	  tree type = TREE_TYPE (oprnd);
442 	  if ((oprnd_info->first_dt != dt
443 	       && !(oprnd_info->first_dt == vect_reduction_def
444 		    && dt == vect_internal_def)
445 	       && !((oprnd_info->first_dt == vect_external_def
446 		     || oprnd_info->first_dt == vect_constant_def)
447 		    && (dt == vect_external_def
448 			|| dt == vect_constant_def)))
449 	      || !types_compatible_p (oprnd_info->first_op_type, type))
450 	    {
451 	      /* Try swapping operands if we got a mismatch.  */
452 	      if (i == 0
453 		  && !swapped
454 		  && commutative)
455 		{
456 		  swapped = true;
457 		  goto again;
458 		}
459 
460 	      if (dump_enabled_p ())
461 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
462 				 "Build SLP failed: different types\n");
463 
464 	      return 1;
465 	    }
466 	  if ((dt == vect_constant_def
467 	       || dt == vect_external_def)
468 	      && !current_vector_size.is_constant ()
469 	      && (TREE_CODE (type) == BOOLEAN_TYPE
470 		  || !can_duplicate_and_interleave_p (stmts.length (),
471 						      TYPE_MODE (type))))
472 	    {
473 	      if (dump_enabled_p ())
474 		{
475 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 				   "Build SLP failed: invalid type of def "
477 				   "for variable-length SLP ");
478 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
479 		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 		}
481 	      return -1;
482 	    }
483 	}
484 
485       /* Check the types of the definitions.  */
486       switch (dt)
487 	{
488 	case vect_constant_def:
489 	case vect_external_def:
490 	  break;
491 
492 	case vect_reduction_def:
493 	case vect_induction_def:
494 	case vect_internal_def:
495 	  oprnd_info->def_stmts.quick_push (def_stmt);
496 	  break;
497 
498 	default:
499 	  /* FORNOW: Not supported.  */
500 	  if (dump_enabled_p ())
501 	    {
502 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
503 			       "Build SLP failed: illegal type of def ");
504 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
505               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
506 	    }
507 
508 	  return -1;
509 	}
510     }
511 
512   /* Swap operands.  */
513   if (swapped)
514     {
515       /* If there are already uses of this stmt in an SLP instance then
516          we've committed to the operand order and can't swap it.  */
517       if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
518 	{
519 	  if (dump_enabled_p ())
520 	    {
521 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 			       "Build SLP failed: cannot swap operands of "
523 			       "shared stmt ");
524 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
525 	    }
526 	  return -1;
527 	}
528 
529       if (first_op_cond)
530 	{
531 	  tree cond = gimple_assign_rhs1 (stmt);
532 	  enum tree_code code = TREE_CODE (cond);
533 
534 	  /* Swap.  */
535 	  if (*swap == 1)
536 	    {
537 	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
538 				 &TREE_OPERAND (cond, 1));
539 	      TREE_SET_CODE (cond, swap_tree_comparison (code));
540 	    }
541 	  /* Invert.  */
542 	  else
543 	    {
544 	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
545 				 gimple_assign_rhs3_ptr (stmt));
546 	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
547 	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
548 	      gcc_assert (code != ERROR_MARK);
549 	      TREE_SET_CODE (cond, code);
550 	    }
551 	}
552       else
553 	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
554 			   gimple_assign_rhs2_ptr (stmt));
555       if (dump_enabled_p ())
556 	{
557 	  dump_printf_loc (MSG_NOTE, vect_location,
558 			   "swapped operands to match def types in ");
559 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
560 	}
561     }
562 
563   *swap = swapped;
564   return 0;
565 }
566 
567 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
568    caller's attempt to find the vector type in STMT with the narrowest
569    element type.  Return true if VECTYPE is nonnull and if it is valid
570    for VINFO.  When returning true, update MAX_NUNITS to reflect the
571    number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
572    as for vect_build_slp_tree.  */
573 
574 static bool
575 vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
576 			tree vectype, poly_uint64 *max_nunits)
577 {
578   if (!vectype)
579     {
580       if (dump_enabled_p ())
581 	{
582 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
583 			   "Build SLP failed: unsupported data-type in ");
584 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
585 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
586 	}
587       /* Fatal mismatch.  */
588       return false;
589     }
590 
591   /* If populating the vector type requires unrolling then fail
592      before adjusting *max_nunits for basic-block vectorization.  */
593   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
594   unsigned HOST_WIDE_INT const_nunits;
595   if (is_a <bb_vec_info> (vinfo)
596       && (!nunits.is_constant (&const_nunits)
597 	  || const_nunits > group_size))
598     {
599       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 		       "Build SLP failed: unrolling required "
601 		       "in basic block SLP\n");
602       /* Fatal mismatch.  */
603       return false;
604     }
605 
606   /* In case of multiple types we need to detect the smallest type.  */
607   vect_update_max_nunits (max_nunits, vectype);
608   return true;
609 }
610 
611 /* Verify that the scalar stmts STMTS are isomorphic, do not require
612    data permutation, and are of supported types of operation.  Return
613    true if so; otherwise return false and indicate in *MATCHES which
614    stmts are not isomorphic to the first one.  If MATCHES[0] is false
615    then this indicates that the comparison could not be carried out or
616    that the stmts will never be vectorized by SLP.
617 
618    Note a COND_EXPR may be isomorphic to another one after swapping its
619    operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
620    the first stmt by swapping the two operands of its comparison; set
621    SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting
622    the comparison code.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
623    swapped to B1 <= A1 ? X1 : Y1, or inverted to A1 < B1 ? Y1 : X1.  */
624 
625 static bool
626 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
627 		       vec<gimple *> stmts, unsigned int group_size,
628 		       unsigned nops, poly_uint64 *max_nunits,
629 		       bool *matches, bool *two_operators)
630 {
631   unsigned int i;
632   gimple *first_stmt = stmts[0], *stmt = stmts[0];
633   enum tree_code first_stmt_code = ERROR_MARK;
634   enum tree_code alt_stmt_code = ERROR_MARK;
635   enum tree_code rhs_code = ERROR_MARK;
636   enum tree_code first_cond_code = ERROR_MARK;
637   tree lhs;
638   bool need_same_oprnds = false;
639   tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
640   optab optab;
641   int icode;
642   machine_mode optab_op2_mode;
643   machine_mode vec_mode;
644   HOST_WIDE_INT dummy;
645   gimple *first_load = NULL, *prev_first_load = NULL;
646 
647   /* For every stmt in NODE find its def stmt/s.  */
648   FOR_EACH_VEC_ELT (stmts, i, stmt)
649     {
650       swap[i] = 0;
651       matches[i] = false;
652 
653       if (dump_enabled_p ())
654 	{
655 	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
656 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
657 	}
658 
659       /* Fail to vectorize statements marked as unvectorizable.  */
660       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
661         {
662           if (dump_enabled_p ())
663             {
664               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 			       "Build SLP failed: unvectorizable statement ");
666               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
667             }
668 	  /* Fatal mismatch.  */
669 	  matches[0] = false;
670           return false;
671         }
672 
673       lhs = gimple_get_lhs (stmt);
674       if (lhs == NULL_TREE)
675 	{
676 	  if (dump_enabled_p ())
677 	    {
678 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 			       "Build SLP failed: not GIMPLE_ASSIGN nor "
680 			       "GIMPLE_CALL ");
681 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
682 	    }
683 	  /* Fatal mismatch.  */
684 	  matches[0] = false;
685 	  return false;
686 	}
687 
688       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
689       vectype = get_vectype_for_scalar_type (scalar_type);
690       if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
691 				   max_nunits))
692 	{
693 	  /* Fatal mismatch.  */
694 	  matches[0] = false;
695           return false;
696         }
697 
698       if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
699 	{
700 	  rhs_code = CALL_EXPR;
701 	  if (gimple_call_internal_p (call_stmt)
702 	      || gimple_call_tail_p (call_stmt)
703 	      || gimple_call_noreturn_p (call_stmt)
704 	      || !gimple_call_nothrow_p (call_stmt)
705 	      || gimple_call_chain (call_stmt))
706 	    {
707 	      if (dump_enabled_p ())
708 		{
709 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
710 				   "Build SLP failed: unsupported call type ");
711 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
712 				    call_stmt, 0);
713 		}
714 	      /* Fatal mismatch.  */
715 	      matches[0] = false;
716 	      return false;
717 	    }
718 	}
719       else
720 	rhs_code = gimple_assign_rhs_code (stmt);
721 
722       /* Check the operation.  */
723       if (i == 0)
724 	{
725 	  first_stmt_code = rhs_code;
726 
727 	  /* Shift arguments should be equal in all the packed stmts for a
728 	     vector shift with scalar shift operand.  */
729 	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
730 	      || rhs_code == LROTATE_EXPR
731 	      || rhs_code == RROTATE_EXPR)
732 	    {
733 	      vec_mode = TYPE_MODE (vectype);
734 
735 	      /* First see if we have a vector/vector shift.  */
736 	      optab = optab_for_tree_code (rhs_code, vectype,
737 					   optab_vector);
738 
739 	      if (!optab
740 		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
741 		{
742 		  /* No vector/vector shift, try for a vector/scalar shift.  */
743 		  optab = optab_for_tree_code (rhs_code, vectype,
744 					       optab_scalar);
745 
746 		  if (!optab)
747 		    {
748 		      if (dump_enabled_p ())
749 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 					 "Build SLP failed: no optab.\n");
751 		      /* Fatal mismatch.  */
752 		      matches[0] = false;
753 		      return false;
754 		    }
755 		  icode = (int) optab_handler (optab, vec_mode);
756 		  if (icode == CODE_FOR_nothing)
757 		    {
758 		      if (dump_enabled_p ())
759 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
760 					 "Build SLP failed: "
761 					 "op not supported by target.\n");
762 		      /* Fatal mismatch.  */
763 		      matches[0] = false;
764 		      return false;
765 		    }
766 		  optab_op2_mode = insn_data[icode].operand[2].mode;
767 		  if (!VECTOR_MODE_P (optab_op2_mode))
768 		    {
769 		      need_same_oprnds = true;
770 		      first_op1 = gimple_assign_rhs2 (stmt);
771 		    }
772 		}
773 	    }
774 	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
775             {
776               need_same_oprnds = true;
777               first_op1 = gimple_assign_rhs2 (stmt);
778             }
779 	}
780       else
781 	{
782 	  if (first_stmt_code != rhs_code
783 	      && alt_stmt_code == ERROR_MARK)
784 	    alt_stmt_code = rhs_code;
785 	  if (first_stmt_code != rhs_code
786 	      && (first_stmt_code != IMAGPART_EXPR
787 		  || rhs_code != REALPART_EXPR)
788 	      && (first_stmt_code != REALPART_EXPR
789 		  || rhs_code != IMAGPART_EXPR)
790 	      /* Handle mismatches in plus/minus by computing both
791 		 and merging the results.  */
792 	      && !((first_stmt_code == PLUS_EXPR
793 		    || first_stmt_code == MINUS_EXPR)
794 		   && (alt_stmt_code == PLUS_EXPR
795 		       || alt_stmt_code == MINUS_EXPR)
796 		   && rhs_code == alt_stmt_code)
797               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
798                    && (first_stmt_code == ARRAY_REF
799                        || first_stmt_code == BIT_FIELD_REF
800                        || first_stmt_code == INDIRECT_REF
801                        || first_stmt_code == COMPONENT_REF
802                        || first_stmt_code == MEM_REF)))
803 	    {
804 	      if (dump_enabled_p ())
805 		{
806 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 				   "Build SLP failed: different operation "
808 				   "in stmt ");
809 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
810 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
811 				   "original stmt ");
812 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
813 				    first_stmt, 0);
814 		}
815 	      /* Mismatch.  */
816 	      continue;
817 	    }
818 
819 	  if (need_same_oprnds
820 	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
821 	    {
822 	      if (dump_enabled_p ())
823 		{
824 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
825 				   "Build SLP failed: different shift "
826 				   "arguments in ");
827 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
828 		}
829 	      /* Mismatch.  */
830 	      continue;
831 	    }
832 
833 	  if (rhs_code == CALL_EXPR)
834 	    {
835 	      gimple *first_stmt = stmts[0];
836 	      if (gimple_call_num_args (stmt) != nops
837 		  || !operand_equal_p (gimple_call_fn (first_stmt),
838 				       gimple_call_fn (stmt), 0)
839 		  || gimple_call_fntype (first_stmt)
840 		     != gimple_call_fntype (stmt))
841 		{
842 		  if (dump_enabled_p ())
843 		    {
844 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
845 				       "Build SLP failed: different calls in ");
846 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
847 					stmt, 0);
848 		    }
849 		  /* Mismatch.  */
850 		  continue;
851 		}
852 	    }
853 	}
854 
855       /* Grouped store or load.  */
856       if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
857 	{
858 	  if (REFERENCE_CLASS_P (lhs))
859 	    {
860 	      /* Store.  */
861 	      ;
862 	    }
863 	  else
864 	    {
865 	      /* Load.  */
866               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
867               if (prev_first_load)
868                 {
869                   /* Check that there are no loads from different interleaving
870                      chains in the same node.  */
871                   if (prev_first_load != first_load)
872                     {
873                       if (dump_enabled_p ())
874                         {
875                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
876 					   vect_location,
877 					   "Build SLP failed: different "
878 					   "interleaving chains in one node ");
879                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
880 					    stmt, 0);
881                         }
882 		      /* Mismatch.  */
883 		      continue;
884                     }
885                 }
886               else
887                 prev_first_load = first_load;
888            }
889         } /* Grouped access.  */
890       else
891 	{
892 	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
893 	    {
894 	      /* Not grouped load.  */
895 	      if (dump_enabled_p ())
896 		{
897 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 				   "Build SLP failed: not grouped load ");
899 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
900 		}
901 
902 	      /* FORNOW: Non-grouped loads are not supported.  */
903 	      /* Fatal mismatch.  */
904 	      matches[0] = false;
905 	      return false;
906 	    }
907 
908 	  /* Not a memory operation.  */
909 	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
910 	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
911 	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
912 	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
913 	      && rhs_code != CALL_EXPR)
914 	    {
915 	      if (dump_enabled_p ())
916 		{
917 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
918 				   "Build SLP failed: operation");
919 		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
920 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
921 		}
922 	      /* Fatal mismatch.  */
923 	      matches[0] = false;
924 	      return false;
925 	    }
926 
927 	  if (rhs_code == COND_EXPR)
928 	    {
929 	      tree cond_expr = gimple_assign_rhs1 (stmt);
930 	      enum tree_code cond_code = TREE_CODE (cond_expr);
931 	      enum tree_code swap_code = ERROR_MARK;
932 	      enum tree_code invert_code = ERROR_MARK;
933 
934 	      if (i == 0)
935 		first_cond_code = TREE_CODE (cond_expr);
936 	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
937 		{
938 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
939 		  swap_code = swap_tree_comparison (cond_code);
940 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
941 		}
942 
943 	      if (first_cond_code == cond_code)
944 		;
945 	      /* Isomorphic can be achieved by swapping.  */
946 	      else if (first_cond_code == swap_code)
947 		swap[i] = 1;
948 	      /* Isomorphic can be achieved by inverting.  */
949 	      else if (first_cond_code == invert_code)
950 		swap[i] = 2;
951 	      else
952 		{
953 		  if (dump_enabled_p ())
954 		    {
955 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 				       "Build SLP failed: different"
957 				       " operation");
958 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
959 					stmt, 0);
960 		    }
961 		  /* Mismatch.  */
962 		  continue;
963 		}
964 	    }
965 	}
966 
967       matches[i] = true;
968     }
969 
970   for (i = 0; i < group_size; ++i)
971     if (!matches[i])
972       return false;
973 
974   /* If we allowed a two-operation SLP node verify the target can cope
975      with the permute we are going to use.  */
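  /* An illustrative example (not from the sources): for the addsub
     group { a0 + b0, a1 - b1 } with four-element vectors, one all-PLUS
     and one all-MINUS vector stmt are generated and blended with the
     permute indices { 0, 5, 2, 7 }; lanes whose scalar stmt uses the
     alternate code select from the second vector.  */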
976   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
977   if (alt_stmt_code != ERROR_MARK
978       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
979     {
980       unsigned HOST_WIDE_INT count;
981       if (!nunits.is_constant (&count))
982 	{
983 	  if (dump_enabled_p ())
984 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
985 			     "Build SLP failed: different operations "
986 			     "not allowed with variable-length SLP.\n");
987 	  return false;
988 	}
989       vec_perm_builder sel (count, count, 1);
990       for (i = 0; i < count; ++i)
991 	{
992 	  unsigned int elt = i;
993 	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
994 	    elt += count;
995 	  sel.quick_push (elt);
996 	}
997       vec_perm_indices indices (sel, 2, count);
998       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
999 	{
1000 	  for (i = 0; i < group_size; ++i)
1001 	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
1002 	      {
1003 		matches[i] = false;
1004 		if (dump_enabled_p ())
1005 		  {
1006 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1007 				     "Build SLP failed: different operation "
1008 				     "in stmt ");
1009 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1010 				      stmts[i], 0);
1011 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 				     "original stmt ");
1013 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1014 				      first_stmt, 0);
1015 		  }
1016 	      }
1017 	  return false;
1018 	}
1019       *two_operators = true;
1020     }
1021 
1022   return true;
1023 }
1024 
1025 /* Traits for the hash_set used to record failed SLP builds for a stmt set.
1026    Note that we never remove entries apart from at destruction time, so we
1027    do not need a special value for deleted that differs from empty.  */
1028 struct bst_traits
1029 {
1030   typedef vec <gimple *> value_type;
1031   typedef vec <gimple *> compare_type;
1032   static inline hashval_t hash (value_type);
1033   static inline bool equal (value_type existing, value_type candidate);
1034   static inline bool is_empty (value_type x) { return !x.exists (); }
1035   static inline bool is_deleted (value_type x) { return !x.exists (); }
1036   static inline void mark_empty (value_type &x) { x.release (); }
1037   static inline void mark_deleted (value_type &x) { x.release (); }
1038   static inline void remove (value_type &x) { x.release (); }
1039 };
1040 inline hashval_t
1041 bst_traits::hash (value_type x)
1042 {
1043   inchash::hash h;
1044   for (unsigned i = 0; i < x.length (); ++i)
1045     h.add_int (gimple_uid (x[i]));
1046   return h.end ();
1047 }
1048 inline bool
1049 bst_traits::equal (value_type existing, value_type candidate)
1050 {
1051   if (existing.length () != candidate.length ())
1052     return false;
1053   for (unsigned i = 0; i < existing.length (); ++i)
1054     if (existing[i] != candidate[i])
1055       return false;
1056   return true;
1057 }
1058 
1059 typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
1060 static scalar_stmts_set_t *bst_fail;
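
/* BST_FAIL memoizes the sets of scalar stmts for which an SLP build
   has already failed; vect_build_slp_tree consults it before invoking
   the worker function vect_build_slp_tree_2 below.  */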
1061 
1062 static slp_tree
1063 vect_build_slp_tree_2 (vec_info *vinfo,
1064 		       vec<gimple *> stmts, unsigned int group_size,
1065 		       poly_uint64 *max_nunits,
1066 		       vec<slp_tree> *loads,
1067 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
1068 		       unsigned max_tree_size);
1069 
1070 static slp_tree
1071 vect_build_slp_tree (vec_info *vinfo,
1072 		     vec<gimple *> stmts, unsigned int group_size,
1073 		     poly_uint64 *max_nunits, vec<slp_tree> *loads,
1074 		     bool *matches, unsigned *npermutes, unsigned *tree_size,
1075 		     unsigned max_tree_size)
1076 {
1077   if (bst_fail->contains (stmts))
1078     return NULL;
1079   slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
1080 					loads, matches, npermutes, tree_size,
1081 					max_tree_size);
1082   /* When the SLP build fails for these stmts, record that fact;
1083      otherwise the SLP build can take exponential time when we allow
1084      building parts from scalars, see PR81723.  */
1085   if (! res)
1086     {
1087       vec <gimple *> x;
1088       x.create (stmts.length ());
1089       x.splice (stmts);
1090       bst_fail->add (x);
1091     }
1092   return res;
1093 }
1094 
1095 /* Recursively build an SLP tree for the group of scalar stmts STMTS.
1096    Fail (and return NULL) if the def-stmts are not isomorphic, require
1097    data permutation or are of unsupported types of operation.
1098    Otherwise return the built SLP node.  On failure, MATCHES indicates
1099    which stmts are not isomorphic to the first one (MATCHES[0] false
1100    signals a fatal mismatch).  */
1101 
1102 static slp_tree
1103 vect_build_slp_tree_2 (vec_info *vinfo,
1104 		       vec<gimple *> stmts, unsigned int group_size,
1105 		       poly_uint64 *max_nunits,
1106 		       vec<slp_tree> *loads,
1107 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
1108 		       unsigned max_tree_size)
1109 {
1110   unsigned nops, i, this_tree_size = 0;
1111   poly_uint64 this_max_nunits = *max_nunits;
1112   gimple *stmt;
1113   slp_tree node;
1114 
1115   matches[0] = false;
1116 
1117   stmt = stmts[0];
1118   if (is_gimple_call (stmt))
1119     nops = gimple_call_num_args (stmt);
1120   else if (is_gimple_assign (stmt))
1121     {
1122       nops = gimple_num_ops (stmt) - 1;
1123       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1124 	nops++;
1125     }
1126   else if (gimple_code (stmt) == GIMPLE_PHI)
1127     nops = 0;
1128   else
1129     return NULL;
1130 
1131   /* If the SLP node is a PHI (induction or reduction), terminate
1132      the recursion.  */
1133   if (gimple_code (stmt) == GIMPLE_PHI)
1134     {
1135       tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1136       tree vectype = get_vectype_for_scalar_type (scalar_type);
1137       if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
1138 				   max_nunits))
1139 	return NULL;
1140 
1141       vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
1142       /* Induction from different IVs is not supported.  */
1143       if (def_type == vect_induction_def)
1144 	{
1145 	  FOR_EACH_VEC_ELT (stmts, i, stmt)
1146 	    if (stmt != stmts[0])
1147 	      return NULL;
1148 	}
1149       else
1150 	{
1151 	  /* Else def types have to match.  */
1152 	  FOR_EACH_VEC_ELT (stmts, i, stmt)
1153 	    {
1154 	      /* But for reduction chains only check on the first stmt.  */
1155 	      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
1156 		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
1157 		continue;
1158 	      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
1159 		return NULL;
1160 	    }
1161 	}
1162       node = vect_create_new_slp_node (stmts);
1163       return node;
1164     }
1165 
1166 
1167   bool two_operators = false;
1168   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1169   if (!vect_build_slp_tree_1 (vinfo, swap,
1170 			      stmts, group_size, nops,
1171 			      &this_max_nunits, matches, &two_operators))
1172     return NULL;
1173 
1174   /* If the SLP node is a load, terminate the recursion.  */
1175   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1176       && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1177     {
1178       *max_nunits = this_max_nunits;
1179       node = vect_create_new_slp_node (stmts);
1180       loads->safe_push (node);
1181       return node;
1182     }
1183 
1184   /* Get at the operands, verifying they are compatible.  */
1185   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1186   slp_oprnd_info oprnd_info;
1187   FOR_EACH_VEC_ELT (stmts, i, stmt)
1188     {
1189       int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1190 					     stmts, i, &oprnds_info);
1191       if (res != 0)
1192 	matches[(res == -1) ? 0 : i] = false;
1193       if (!matches[0])
1194 	break;
1195     }
1196   for (i = 0; i < group_size; ++i)
1197     if (!matches[i])
1198       {
1199 	vect_free_oprnd_info (oprnds_info);
1200 	return NULL;
1201       }
1202 
1203   auto_vec<slp_tree, 4> children;
1204   auto_vec<slp_tree> this_loads;
1205 
1206   stmt = stmts[0];
1207 
1208   if (tree_size)
1209     max_tree_size -= *tree_size;
1210 
1211   /* Create SLP_TREE nodes for the definition node/s.  */
1212   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1213     {
1214       slp_tree child;
1215       unsigned old_nloads = this_loads.length ();
1216       unsigned old_tree_size = this_tree_size;
1217       unsigned int j;
1218 
1219       if (oprnd_info->first_dt != vect_internal_def
1220 	  && oprnd_info->first_dt != vect_reduction_def
1221 	  && oprnd_info->first_dt != vect_induction_def)
1222         continue;
1223 
1224       if (++this_tree_size > max_tree_size)
1225 	{
1226 	  FOR_EACH_VEC_ELT (children, j, child)
1227 	    vect_free_slp_tree (child);
1228 	  vect_free_oprnd_info (oprnds_info);
1229 	  return NULL;
1230 	}
1231 
1232       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1233 					group_size, &this_max_nunits,
1234 					&this_loads, matches, npermutes,
1235 					&this_tree_size,
1236 					max_tree_size)) != NULL)
1237 	{
1238 	  /* If we have all children of child built up from scalars then just
1239 	     throw that away and build this node up from scalars instead.
1240 	  if (!SLP_TREE_CHILDREN (child).is_empty ()
1241 	      /* ???  Rejecting patterns this way doesn't work.  We'd have to
1242 		 do extra work to cancel the pattern so the uses see the
1243 		 scalar version.  */
1244 	      && !is_pattern_stmt_p
1245 	            (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1246 	    {
1247 	      slp_tree grandchild;
1248 
1249 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1250 		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1251 		  break;
1252 	      if (!grandchild)
1253 		{
1254 		  /* Roll back.  */
1255 		  this_loads.truncate (old_nloads);
1256 		  this_tree_size = old_tree_size;
1257 		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1258 		    vect_free_slp_tree (grandchild);
1259 		  SLP_TREE_CHILDREN (child).truncate (0);
1260 
1261 		  dump_printf_loc (MSG_NOTE, vect_location,
1262 				   "Building parent vector operands from "
1263 				   "scalars instead\n");
1264 		  oprnd_info->def_stmts = vNULL;
1265 		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
1266 		  children.safe_push (child);
1267 		  continue;
1268 		}
1269 	    }
1270 
1271 	  oprnd_info->def_stmts = vNULL;
1272 	  children.safe_push (child);
1273 	  continue;
1274 	}
1275 
1276       /* If the SLP build failed fatally and we analyze a basic-block
1277          simply treat nodes we fail to build as externally defined
1278 	 (and thus build vectors from the scalar defs).
1279 	 The cost model will reject outright expensive cases.
1280 	 ???  This doesn't treat cases where permutation ultimately
1281 	 fails (or we don't try permutation below).  Ideally we'd
1282 	 even compute a permutation that will end up with the maximum
1283 	 SLP tree size...  */
1284       if (is_a <bb_vec_info> (vinfo)
1285 	  && !matches[0]
1286 	  /* ???  Rejecting patterns this way doesn't work.  We'd have to
1287 	     do extra work to cancel the pattern so the uses see the
1288 	     scalar version.  */
1289 	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1290 	{
1291 	  dump_printf_loc (MSG_NOTE, vect_location,
1292 			   "Building vector operands from scalars\n");
1293 	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
1294 	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
1295 	  children.safe_push (child);
1296 	  oprnd_info->def_stmts = vNULL;
1297 	  continue;
1298 	}
1299 
1300       /* If the SLP build for operand zero failed and operands zero
1301 	 and one can be commuted then try that for the scalar stmts
1302 	 that failed the match.  */
1303       if (i == 0
1304 	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
1305 	  && matches[0]
1306 	  /* ???  For COND_EXPRs we can swap the comparison operands
1307 	     as well as the arms under some constraints.  */
1308 	  && nops == 2
1309 	  && oprnds_info[1]->first_dt == vect_internal_def
1310 	  && is_gimple_assign (stmt)
1311 	  /* Swapping operands for reductions breaks assumptions later on.  */
1312 	  && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
1313 	  && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_double_reduction_def
1314 	  /* Do so only if the number of not successful permutes was no more
1315 	     than a cut-off, as re-trying the recursive match on
1316 	     possibly each level of the tree would expose exponential
1317 	     behavior.  */
1318 	  && *npermutes < 4)
1319 	{
1320 	  /* See whether we can swap the matching or the non-matching
1321 	     stmt operands.  */
1322 	  bool swap_not_matching = true;
1323 	  do
1324 	    {
1325 	      for (j = 0; j < group_size; ++j)
1326 		{
1327 		  if (matches[j] != !swap_not_matching)
1328 		    continue;
1329 		  gimple *stmt = stmts[j];
1330 		  /* Verify if we can swap operands of this stmt.  */
1331 		  if (!is_gimple_assign (stmt)
1332 		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1333 		    {
1334 		      if (!swap_not_matching)
1335 			goto fail;
1336 		      swap_not_matching = false;
1337 		      break;
1338 		    }
1339 		  /* Verify if we can safely swap or if we committed to a
1340 		     specific operand order already.
1341 		     ???  Instead of modifying GIMPLE stmts here we could
1342 		     record whether we want to swap operands in the SLP
1343 		     node and temporarily do that when processing it
1344 		     (or wrap operand accessors in a helper).  */
1345 		  else if (swap[j] != 0
1346 			   || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
1347 		    {
1348 		      if (!swap_not_matching)
1349 			{
1350 			  if (dump_enabled_p ())
1351 			    {
1352 			      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1353 					       vect_location,
1354 					       "Build SLP failed: cannot swap "
1355 					       "operands of shared stmt ");
1356 			      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
1357 						TDF_SLIM, stmts[j], 0);
1358 			    }
1359 			  goto fail;
1360 			}
1361 		      swap_not_matching = false;
1362 		      break;
1363 		    }
1364 		}
1365 	    }
1366 	  while (j != group_size);
1367 
1368 	  /* Swap mismatched definition stmts.  */
1369 	  dump_printf_loc (MSG_NOTE, vect_location,
1370 			   "Re-trying with swapped operands of stmts ");
1371 	  for (j = 0; j < group_size; ++j)
1372 	    if (matches[j] == !swap_not_matching)
1373 	      {
1374 		std::swap (oprnds_info[0]->def_stmts[j],
1375 			   oprnds_info[1]->def_stmts[j]);
1376 		dump_printf (MSG_NOTE, "%d ", j);
1377 	      }
1378 	  dump_printf (MSG_NOTE, "\n");
1379 	  /* And try again with scratch 'matches' ... */
1380 	  bool *tem = XALLOCAVEC (bool, group_size);
1381 	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1382 					    group_size, &this_max_nunits,
1383 					    &this_loads, tem, npermutes,
1384 					    &this_tree_size,
1385 					    max_tree_size)) != NULL)
1386 	    {
1387 	      /* ... so if successful we can apply the operand swapping
1388 		 to the GIMPLE IL.  This is necessary because for example
1389 		 vect_get_slp_defs uses operand indexes and thus expects
1390 		 canonical operand order.  This is also necessary even
1391 		 if we end up building the operand from scalars as
1392 		 we'll continue to process swapped operand two.  */
1393 	      for (j = 0; j < group_size; ++j)
1394 		{
1395 		  gimple *stmt = stmts[j];
1396 		  gimple_set_plf (stmt, GF_PLF_1, false);
1397 		}
1398 	      for (j = 0; j < group_size; ++j)
1399 		{
1400 		  gimple *stmt = stmts[j];
1401 		  if (matches[j] == !swap_not_matching)
1402 		    {
1403 		      /* Avoid swapping operands twice.  */
1404 		      if (gimple_plf (stmt, GF_PLF_1))
1405 			continue;
1406 		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1407 					 gimple_assign_rhs2_ptr (stmt));
1408 		      gimple_set_plf (stmt, GF_PLF_1, true);
1409 		    }
1410 		}
1411 	      /* Verify we swap all duplicates or none.  */
1412 	      if (flag_checking)
1413 		for (j = 0; j < group_size; ++j)
1414 		  {
1415 		    gimple *stmt = stmts[j];
1416 		    gcc_assert (gimple_plf (stmt, GF_PLF_1)
1417 				== (matches[j] == !swap_not_matching));
1418 		  }
1419 
1420 	      /* If we have all children of child built up from scalars then
1421 		 just throw that away and build this node up from scalars.  */
1422 	      if (!SLP_TREE_CHILDREN (child).is_empty ()
1423 		  /* ???  Rejecting patterns this way doesn't work.  We'd have
1424 		     to do extra work to cancel the pattern so the uses see the
1425 		     scalar version.  */
1426 		  && !is_pattern_stmt_p
1427 			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1428 		{
1429 		  unsigned int j;
1430 		  slp_tree grandchild;
1431 
1432 		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1433 		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1434 		      break;
1435 		  if (!grandchild)
1436 		    {
1437 		      /* Roll back.  */
1438 		      this_loads.truncate (old_nloads);
1439 		      this_tree_size = old_tree_size;
1440 		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1441 			vect_free_slp_tree (grandchild);
1442 		      SLP_TREE_CHILDREN (child).truncate (0);
1443 
1444 		      dump_printf_loc (MSG_NOTE, vect_location,
1445 				       "Building parent vector operands from "
1446 				       "scalars instead\n");
1447 		      oprnd_info->def_stmts = vNULL;
1448 		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
1449 		      children.safe_push (child);
1450 		      continue;
1451 		    }
1452 		}
1453 
1454 	      oprnd_info->def_stmts = vNULL;
1455 	      children.safe_push (child);
1456 	      continue;
1457 	    }
1458 
1459 	  ++*npermutes;
1460 	}
1461 
1462 fail:
1463       gcc_assert (child == NULL);
1464       FOR_EACH_VEC_ELT (children, j, child)
1465 	vect_free_slp_tree (child);
1466       vect_free_oprnd_info (oprnds_info);
1467       return NULL;
1468     }
1469 
1470   vect_free_oprnd_info (oprnds_info);
1471 
1472   if (tree_size)
1473     *tree_size += this_tree_size;
1474   *max_nunits = this_max_nunits;
1475   loads->safe_splice (this_loads);
1476 
1477   node = vect_create_new_slp_node (stmts);
1478   SLP_TREE_TWO_OPERATORS (node) = two_operators;
1479   SLP_TREE_CHILDREN (node).splice (children);
1480   return node;
1481 }
1482 
1483 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
1484 
1485 static void
1486 vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
1487 {
1488   int i;
1489   gimple *stmt;
1490   slp_tree child;
1491 
1492   dump_printf_loc (dump_kind, loc, "node%s\n",
1493 		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
1494 		   ? " (external)" : "");
1495   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1496     {
1497       dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
1498       dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1499     }
1500   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1501     vect_print_slp_tree (dump_kind, loc, child);
1502 }
1503 
1504 
1505 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1506    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1507    J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1508    stmts in NODE are to be marked.  */
1509 
1510 static void
1511 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1512 {
1513   int i;
1514   gimple *stmt;
1515   slp_tree child;
1516 
1517   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1518     return;
1519 
1520   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1521     if (j < 0 || i == j)
1522       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1523 
1524   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1525     vect_mark_slp_stmts (child, mark, j);
1526 }
1527 
1528 
1529 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1530 
1531 static void
1532 vect_mark_slp_stmts_relevant (slp_tree node)
1533 {
1534   int i;
1535   gimple *stmt;
1536   stmt_vec_info stmt_info;
1537   slp_tree child;
1538 
1539   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1540     return;
1541 
1542   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1543     {
1544       stmt_info = vinfo_for_stmt (stmt);
1545       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1546                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1547       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1548     }
1549 
1550   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1551     vect_mark_slp_stmts_relevant (child);
1552 }
1553 
1554 
1555 /* Rearrange the statements of NODE according to PERMUTATION.  */
1556 
1557 static void
1558 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1559                           vec<unsigned> permutation)
1560 {
1561   gimple *stmt;
1562   vec<gimple *> tmp_stmts;
1563   unsigned int i;
1564   slp_tree child;
1565 
1566   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1567     vect_slp_rearrange_stmts (child, group_size, permutation);
1568 
1569   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1570   tmp_stmts.create (group_size);
1571   tmp_stmts.quick_grow_cleared (group_size);
1572 
1573   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1574     tmp_stmts[permutation[i]] = stmt;
1575 
1576   SLP_TREE_SCALAR_STMTS (node).release ();
1577   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1578 }
1579 
1580 
1581 /* Attempt to reorder stmts in a reduction chain so that we don't
1582    require any load permutation.  Return true if that was possible,
1583    otherwise return false.  */
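
/* A sketch of the idea (hypothetical permutation): if every load node
   in the instance carries the same load permutation { 2, 3, 0, 1 },
   the scalar stmts of all nodes can be reordered by that permutation
   once, after which the loads are contiguous and no runtime
   permutation is needed.  */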
1584 
1585 static bool
1586 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1587 {
1588   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1589   unsigned int i, j;
1590   unsigned int lidx;
1591   slp_tree node, load;
1592 
1593   /* Compare all the permutation sequences to the first one.  We know
1594      that at least one load is permuted.  */
1595   node = SLP_INSTANCE_LOADS (slp_instn)[0];
1596   if (!node->load_permutation.exists ())
1597     return false;
1598   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1599     {
1600       if (!load->load_permutation.exists ())
1601 	return false;
1602       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1603 	if (lidx != node->load_permutation[j])
1604 	  return false;
1605     }
1606 
1607   /* Check that the loads in the first sequence are different and there
1608      are no gaps between them.  */
1609   auto_sbitmap load_index (group_size);
1610   bitmap_clear (load_index);
1611   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1612     {
1613       if (lidx >= group_size)
1614 	return false;
1615       if (bitmap_bit_p (load_index, lidx))
1616 	return false;
1617 
1618       bitmap_set_bit (load_index, lidx);
1619     }
1620   for (i = 0; i < group_size; i++)
1621     if (!bitmap_bit_p (load_index, i))
1622       return false;
1623 
1624   /* This permutation is valid for reduction.  Since the order of the
1625      statements in the nodes is not important unless they are memory
1626      accesses, we can rearrange the statements in all the nodes
1627      according to the order of the loads.  */
1628   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1629 			    node->load_permutation);
1630 
1631   /* We are done, no actual permutations need to be generated.  */
1632   poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1633   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1634     {
1635       gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1636       first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
1637       /* But we have to keep those permutations that are required because
1638          of handling of gaps.  */
1639       if (known_eq (unrolling_factor, 1U)
1640 	  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1641 	      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
1642 	SLP_TREE_LOAD_PERMUTATION (node).release ();
1643       else
1644 	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1645 	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1646     }
1647 
1648   return true;
1649 }
1650 
1651 /* Check if the required load permutations in the SLP instance
1652    SLP_INSTN are supported.  */
1653 
1654 static bool
1655 vect_supported_load_permutation_p (slp_instance slp_instn)
1656 {
1657   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1658   unsigned int i, j, k, next;
1659   slp_tree node;
1660   gimple *stmt, *load, *next_load;
1661 
1662   if (dump_enabled_p ())
1663     {
1664       dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1665       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1666 	if (node->load_permutation.exists ())
1667 	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1668 	    dump_printf (MSG_NOTE, "%d ", next);
1669 	else
1670 	  for (k = 0; k < group_size; ++k)
1671 	    dump_printf (MSG_NOTE, "%d ", k);
1672       dump_printf (MSG_NOTE, "\n");
1673     }
1674 
1675   /* In case of reduction every load permutation is allowed, since the order
1676      of the reduction statements is not important (as opposed to the case of
1677      grouped stores).  The only condition we need to check is that all the
1678      load nodes are of the same size and have the same permutation (and then
1679      rearrange all the nodes of the SLP instance according to this
1680      permutation).  */
1681 
1682   /* Check that all the load nodes are of the same size.  */
1683   /* ???  Can't we assert this? */
1684   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1685     if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1686       return false;
1687 
1688   node = SLP_INSTANCE_TREE (slp_instn);
1689   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1690 
1691   /* Reduction (there are no data-refs in the root).
1692      In a reduction chain the order of the loads is not important.  */
1693   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1694       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1695     vect_attempt_slp_rearrange_stmts (slp_instn);
1696 
1697   /* In basic block vectorization we allow any subchain of an interleaving
1698      chain.
1699      FORNOW: not supported in loop SLP because of realignment complications.  */
1700   if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1701     {
1702       /* Check whether the loads in an instance form a subchain and thus
1703          no permutation is necessary.  */
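      /* E.g. an SLP node loading a[i+1] and a[i+2] out of a group that
	 starts at a[i] is such a subchain: the elements are consecutive
	 in the interleaving chain, so only the start offset differs and
	 no element shuffling is required.  */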
1704       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1705         {
1706 	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1707 	    continue;
1708 	  bool subchain_p = true;
1709           next_load = NULL;
1710           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1711             {
1712               if (j != 0
1713 		  && (next_load != load
1714 		      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
1715 		{
1716 		  subchain_p = false;
1717 		  break;
1718 		}
1719               next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1720             }
1721 	  if (subchain_p)
1722 	    SLP_TREE_LOAD_PERMUTATION (node).release ();
1723 	  else
1724 	    {
1725 	      stmt_vec_info group_info
1726 		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1727 	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
1728 	      unsigned HOST_WIDE_INT nunits;
1729 	      unsigned k, maxk = 0;
1730 	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1731 		if (k > maxk)
1732 		  maxk = k;
1733 	      /* In BB vectorization we may not actually use a loaded vector
1734 		 accessing elements in excess of GROUP_SIZE.  */
1735 	      tree vectype = STMT_VINFO_VECTYPE (group_info);
1736 	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1737 		  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
1738 		{
1739 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1740 				   "BB vectorization with gaps at the end of "
1741 				   "a load is not supported\n");
1742 		  return false;
1743 		}
1744 
1745 	      /* Verify the permutation can be generated.  */
1746 	      vec<tree> tem;
1747 	      unsigned n_perms;
1748 	      if (!vect_transform_slp_perm_load (node, tem, NULL,
1749 						 1, slp_instn, true, &n_perms))
1750 		{
1751 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1752 				   vect_location,
1753 				   "unsupported load permutation\n");
1754 		  return false;
1755 		}
1756 	    }
1757         }
1758       return true;
1759     }
1760 
1761   /* For loop vectorization verify we can generate the permutation.  Be
1762      conservative about the vectorization factor, there are permutations
1763      that will use three vector inputs only starting from a specific factor
1764      and the vectorization factor is not yet final.
1765      ???  The SLP instance unrolling factor might not be the maximum one.  */
1766   unsigned n_perms;
1767   poly_uint64 test_vf
1768     = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1769 			     LOOP_VINFO_VECT_FACTOR
1770 			     (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
1771   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1772     if (node->load_permutation.exists ()
1773 	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1774 					  slp_instn, true, &n_perms))
1775       return false;
1776 
1777   return true;
1778 }
1779 
1780 
1781 /* Find the last scalar stmt in the SLP tree rooted at NODE.  */
1782 
1783 gimple *
1784 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1785 {
1786   gimple *last = NULL, *stmt;
1787 
1788   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1789     {
1790       stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1791       if (is_pattern_stmt_p (stmt_vinfo))
1792 	last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1793       else
1794 	last = get_later_stmt (stmt, last);
1795     }
1796 
1797   return last;
1798 }
1799 
1800 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
1801 
1802 static void
1803 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1804 			 stmt_vector_for_cost *prologue_cost_vec,
1805 			 stmt_vector_for_cost *body_cost_vec,
1806 			 unsigned ncopies_for_cost,
1807 			 scalar_stmts_set_t* visited)
1808 {
1809   unsigned i, j;
1810   slp_tree child;
1811   gimple *stmt;
1812   stmt_vec_info stmt_info;
1813   tree lhs;
1814 
1815   /* If we already costed the exact same set of scalar stmts we're done.
1816      We share the generated vector stmts for those.  */
1817   if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
1818     return;
1819 
1820   visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());
1821 
1822   /* Recurse down the SLP tree.  */
1823   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1824     if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1825       vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1826 			       body_cost_vec, ncopies_for_cost, visited);
1827 
1828   /* Look at the first scalar stmt to determine the cost.  */
1829   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1830   stmt_info = vinfo_for_stmt (stmt);
1831   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1832     {
1833       vect_memory_access_type memory_access_type
1834 	= (STMT_VINFO_STRIDED_P (stmt_info)
1835 	   ? VMAT_STRIDED_SLP
1836 	   : VMAT_CONTIGUOUS);
1837       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1838 	vect_model_store_cost (stmt_info, ncopies_for_cost,
1839 			       memory_access_type, VLS_STORE,
1840 			       node, prologue_cost_vec, body_cost_vec);
1841       else
1842 	{
1843 	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1844 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1845 	    {
1846 	      /* If the load is permuted then the alignment is determined by
1847 		 the first group element not by the first scalar stmt DR.  */
1848 	      stmt = GROUP_FIRST_ELEMENT (stmt_info);
1849 	      stmt_info = vinfo_for_stmt (stmt);
1850 	      /* Record the cost for the permutation.  */
1851 	      unsigned n_perms;
1852 	      vect_transform_slp_perm_load (node, vNULL, NULL,
1853 					    ncopies_for_cost, instance, true,
1854 					    &n_perms);
1855 	      record_stmt_cost (body_cost_vec, n_perms, vec_perm,
1856 				stmt_info, 0, vect_body);
1857 	      unsigned assumed_nunits
1858 		= vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
1859 	      /* And adjust the number of loads performed.  This handles
1860 	         redundancies as well as loads that are later dead.  */
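	      /* E.g. with GROUP_SIZE 8, an assumed NUNITS of 4 and a
		 permutation using only indices 0..3, the second vector
		 of the group is never used, so just one vector load is
		 accounted here, before scaling by the unrolling
		 factor.  */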
1861 	      auto_sbitmap perm (GROUP_SIZE (stmt_info));
1862 	      bitmap_clear (perm);
1863 	      for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
1864 		bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
1865 	      ncopies_for_cost = 0;
1866 	      bool load_seen = false;
1867 	      for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
1868 		{
1869 		  if (i % assumed_nunits == 0)
1870 		    {
1871 		      if (load_seen)
1872 			ncopies_for_cost++;
1873 		      load_seen = false;
1874 		    }
1875 		  if (bitmap_bit_p (perm, i))
1876 		    load_seen = true;
1877 		}
1878 	      if (load_seen)
1879 		ncopies_for_cost++;
1880 	      gcc_assert (ncopies_for_cost
1881 			  <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
1882 			      + assumed_nunits - 1) / assumed_nunits);
1883 	      poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1884 	      ncopies_for_cost *= estimated_poly_value (uf);
1885 	    }
1886 	  /* Record the cost for the vector loads.  */
1887 	  vect_model_load_cost (stmt_info, ncopies_for_cost,
1888 				memory_access_type, node, prologue_cost_vec,
1889 				body_cost_vec);
1890 	  return;
1891 	}
1892     }
1893   else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1894     {
1895       /* ncopies_for_cost is the number of IVs we generate.  */
1896       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1897 			stmt_info, 0, vect_body);
1898 
1899       /* Prologue cost for the initial values and step vector.  */
1900       record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1901 			CONSTANT_CLASS_P
1902 			  (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1903 			     (stmt_info))
1904 			? vector_load : vec_construct,
1905 			stmt_info, 0, vect_prologue);
1906       record_stmt_cost (prologue_cost_vec, 1,
1907 			CONSTANT_CLASS_P
1908 			  (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1909 			? vector_load : vec_construct,
1910 			stmt_info, 0, vect_prologue);
1911 
1912       /* ???  No easy way to get at the actual number of vector stmts
1913          to be generated and thus the derived IVs.  */
1914     }
1915   else
1916     {
1917       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1918 			stmt_info, 0, vect_body);
1919       if (SLP_TREE_TWO_OPERATORS (node))
1920 	{
1921 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1922 			    stmt_info, 0, vect_body);
1923 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1924 			    stmt_info, 0, vect_body);
1925 	}
1926     }
1927 
1928   /* Push SLP node def-type to stmts.  */
1929   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1930     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1931       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1932 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1933 
1934   /* Scan operands and account for prologue cost of constants/externals.
1935      ???  This over-estimates cost for multiple uses and should be
1936      re-engineered.  */
1937   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1938   lhs = gimple_get_lhs (stmt);
1939   for (i = 0; i < gimple_num_ops (stmt); ++i)
1940     {
1941       tree op = gimple_op (stmt, i);
1942       gimple *def_stmt;
1943       enum vect_def_type dt;
1944       if (!op || op == lhs)
1945 	continue;
1946       if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
1947 	  && (dt == vect_constant_def || dt == vect_external_def))
1948 	{
1949 	  /* Without looking at the actual initializer a vector of
1950 	     constants can be implemented as load from the constant pool.
1951 	     When all elements are the same we can use a splat.  */
1952 	  tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
1953 	  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
1954 	  unsigned num_vects_to_check;
1955 	  unsigned HOST_WIDE_INT const_nunits;
1956 	  unsigned nelt_limit;
1957 	  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
1958 	      && ! multiple_p (const_nunits, group_size))
1959 	    {
1960 	      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
1961 	      nelt_limit = const_nunits;
1962 	    }
1963 	  else
1964 	    {
1965 	      /* If either the vector has variable length or the vectors
1966 	         are composed of repeated whole groups we only need to
1967 		 cost construction once.  All vectors will be the same.  */
1968 	      num_vects_to_check = 1;
1969 	      nelt_limit = group_size;
1970 	    }
1971 	  tree elt = NULL_TREE;
1972 	  unsigned nelt = 0;
1973 	  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
1974 	    {
1975 	      unsigned si = j % group_size;
1976 	      if (nelt == 0)
1977 		elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
1978 	      /* ???  We're just tracking whether all operands of a single
1979 		 vector initializer are the same, ideally we'd check if
1980 		 we emitted the same one already.  */
1981 	      else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
1982 		elt = NULL_TREE;
1983 	      nelt++;
1984 	      if (nelt == nelt_limit)
1985 		{
1986 		  /* ???  We need to pass down stmt_info for a vector type
1987 		     even if it points to the wrong stmt.  */
1988 		  record_stmt_cost (prologue_cost_vec, 1,
1989 				    dt == vect_external_def
1990 				    ? (elt ? scalar_to_vec : vec_construct)
1991 				    : vector_load,
1992 				    stmt_info, 0, vect_prologue);
1993 		  nelt = 0;
1994 		}
1995 	    }
1996 	}
1997     }
1998 
1999   /* Restore stmt def-types.  */
2000   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2001     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2002       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
2003 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
2004 }
2005 
2006 /* Compute the cost for the SLP instance INSTANCE.  */
2007 
2008 static void
2009 vect_analyze_slp_cost (slp_instance instance, void *data, scalar_stmts_set_t *visited)
2010 {
2011   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
2012   unsigned ncopies_for_cost;
2013   stmt_info_for_cost *si;
2014   unsigned i;
2015 
2016   /* Calculate the number of vector stmts to create based on the unrolling
2017      factor (the number of vectors is 1 if NUNITS >= GROUP_SIZE, and
2018      GROUP_SIZE / NUNITS otherwise).  */
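  /* E.g. GROUP_SIZE 6, an assumed VF of 2 and NUNITS 4 give
     least_common_multiple (4, 6 * 2) / 4 == 3 copies.  */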
2019   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
2020   slp_tree node = SLP_INSTANCE_TREE (instance);
2021   stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2022   /* Get the estimated vectorization factor, which is always one for
2023      basic-block vectorization.  */
2024   unsigned int assumed_vf;
2025   if (STMT_VINFO_LOOP_VINFO (stmt_info))
2026     assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
2027   else
2028     assumed_vf = 1;
2029   /* For reductions look at a reduction operand in case the reduction
2030      operation is widening like DOT_PROD or SAD.  */
2031   tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
2032   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2033     {
2034       gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2035       switch (gimple_assign_rhs_code (stmt))
2036 	{
2037 	case DOT_PROD_EXPR:
2038 	case SAD_EXPR:
2039 	  vectype_for_cost = get_vectype_for_scalar_type
2040 	    (TREE_TYPE (gimple_assign_rhs1 (stmt)));
2041 	  break;
2042 	default:;
2043 	}
2044     }
2045   unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
2046   ncopies_for_cost = (least_common_multiple (assumed_nunits,
2047 					     group_size * assumed_vf)
2048 		      / assumed_nunits);
2049 
2050   prologue_cost_vec.create (10);
2051   body_cost_vec.create (10);
2052   vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
2053 			   &prologue_cost_vec, &body_cost_vec,
2054 			   ncopies_for_cost, visited);
2055 
2056   /* Record the prologue costs, which were delayed until we were
2057      sure that SLP was successful.  */
2058   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
2059     {
2060       struct _stmt_vec_info *stmt_info
2061 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2062       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2063 			    si->misalign, vect_prologue);
2064     }
2065 
2066   /* Record the instance's instructions in the target cost model.  */
2067   FOR_EACH_VEC_ELT (body_cost_vec, i, si)
2068     {
2069       struct _stmt_vec_info *stmt_info
2070 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2071       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2072 			    si->misalign, vect_body);
2073     }
2074 
2075   prologue_cost_vec.release ();
2076   body_cost_vec.release ();
2077 }
2078 
2079 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2080    one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
2081    the first GROUP1_SIZE stmts, since stores are consecutive), the second
2082    containing the remainder.
2083    Return the first stmt in the second group.  */
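/* For example, splitting a group of size 7 with GROUP_GAP 0 at
   GROUP1_SIZE 4 leaves a first group of size 4 with GROUP_GAP 3
   (it must skip over the second group) and a second group of size 3
   with GROUP_GAP 4 (it must skip over the first group).  */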
2084 
2085 static gimple *
2086 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2087 {
2088   stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2089   gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2090   gcc_assert (group1_size > 0);
2091   int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2092   gcc_assert (group2_size > 0);
2093   GROUP_SIZE (first_vinfo) = group1_size;
2094 
2095   gimple *stmt = first_stmt;
2096   for (unsigned i = group1_size; i > 1; i--)
2097     {
2098       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2099       gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2100     }
2101   /* STMT is now the last element of the first group.  */
2102   gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2103   GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2104 
2105   GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2106   for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2107     {
2108       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2109       gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2110     }
2111 
2112   /* For the second group, the GROUP_GAP is that before the original group,
2113      plus skipping over the first vector.  */
2114   GROUP_GAP (vinfo_for_stmt (group2)) =
2115     GROUP_GAP (first_vinfo) + group1_size;
2116 
2117   /* GROUP_GAP of the first group now has to skip over the second group too.  */
2118   GROUP_GAP (first_vinfo) += group2_size;
2119 
2120   if (dump_enabled_p ())
2121     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2122 		     group1_size, group2_size);
2123 
2124   return group2;
2125 }
2126 
2127 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2128    statements and a vector of NUNITS elements.  */
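/* E.g. NUNITS 4 and GROUP_SIZE 6 give
   exact_div (common_multiple (4, 6), 6) == 2: two copies of the group
   are needed to fill whole vectors.  */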
2129 
2130 static poly_uint64
2131 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2132 {
2133   return exact_div (common_multiple (nunits, group_size), group_size);
2134 }
2135 
2136 /* Analyze an SLP instance starting from a group of grouped stores.  Call
2137    vect_build_slp_tree to build a tree of packed stmts if possible.
2138    Return FALSE if it's impossible to SLP any stmt in the loop.  */
2139 
2140 static bool
2141 vect_analyze_slp_instance (vec_info *vinfo,
2142 			   gimple *stmt, unsigned max_tree_size)
2143 {
2144   slp_instance new_instance;
2145   slp_tree node;
2146   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2147   tree vectype, scalar_type = NULL_TREE;
2148   gimple *next;
2149   unsigned int i;
2150   vec<slp_tree> loads;
2151   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
2152   vec<gimple *> scalar_stmts;
2153 
2154   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2155     {
2156       if (dr)
2157         {
2158           scalar_type = TREE_TYPE (DR_REF (dr));
2159           vectype = get_vectype_for_scalar_type (scalar_type);
2160         }
2161       else
2162         {
2163           gcc_assert (is_a <loop_vec_info> (vinfo));
2164           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2165         }
2166 
2167       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2168     }
2169   else
2170     {
2171       gcc_assert (is_a <loop_vec_info> (vinfo));
2172       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2173       group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2174     }
2175 
2176   if (!vectype)
2177     {
2178       if (dump_enabled_p ())
2179         {
2180           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2181 			   "Build SLP failed: unsupported data-type ");
2182           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
2183           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2184         }
2185 
2186       return false;
2187     }
2188   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2189 
2190   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
2191   scalar_stmts.create (group_size);
2192   next = stmt;
2193   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2194     {
2195       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
2196       while (next)
2197         {
2198 	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
2199 	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2200 	    scalar_stmts.safe_push (
2201 		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2202 	  else
2203             scalar_stmts.safe_push (next);
2204           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2205         }
2206       /* Mark the first element of the reduction chain as reduction to properly
2207 	 transform the node.  In the reduction analysis phase only the last
2208 	 element of the chain is marked as reduction.  */
2209       if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2210 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2211     }
2212   else
2213     {
2214       /* Collect reduction statements.  */
2215       vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2216       for (i = 0; reductions.iterate (i, &next); i++)
2217 	scalar_stmts.safe_push (next);
2218     }
2219 
2220   loads.create (group_size);
2221 
2222   /* Build the tree for the SLP instance.  */
2223   bool *matches = XALLOCAVEC (bool, group_size);
2224   unsigned npermutes = 0;
2225   bst_fail = new scalar_stmts_set_t ();
2226   poly_uint64 max_nunits = nunits;
2227   node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2228 			      &max_nunits, &loads, matches, &npermutes,
2229 			      NULL, max_tree_size);
2230   delete bst_fail;
2231   if (node != NULL)
2232     {
2233       /* Calculate the unrolling factor based on the smallest type.  */
2234       poly_uint64 unrolling_factor
2235 	= calculate_unrolling_factor (max_nunits, group_size);
2236 
2237       if (maybe_ne (unrolling_factor, 1U)
2238 	  && is_a <bb_vec_info> (vinfo))
2239 	{
2240 	  unsigned HOST_WIDE_INT const_max_nunits;
2241 	  if (!max_nunits.is_constant (&const_max_nunits)
2242 	      || const_max_nunits > group_size)
2243 	    {
2244 	      if (dump_enabled_p ())
2245 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 				 "Build SLP failed: store group "
2247 				 "size not a multiple of the vector size "
2248 				 "in basic block SLP\n");
2249 	      vect_free_slp_tree (node);
2250 	      loads.release ();
2251 	      return false;
2252 	    }
2253 	  /* Fatal mismatch.  */
2254 	  matches[group_size / const_max_nunits * const_max_nunits] = false;
2255 	  vect_free_slp_tree (node);
2256 	  loads.release ();
2257 	}
2258       else
2259 	{
2260       /* Create a new SLP instance.  */
2261       new_instance = XNEW (struct _slp_instance);
2262       SLP_INSTANCE_TREE (new_instance) = node;
2263       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2264       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2265       SLP_INSTANCE_LOADS (new_instance) = loads;
2266 
2267       /* Compute the load permutation.  */
2268       slp_tree load_node;
2269       bool loads_permuted = false;
2270       FOR_EACH_VEC_ELT (loads, i, load_node)
2271 	{
2272 	  vec<unsigned> load_permutation;
2273 	  int j;
2274 	  gimple *load, *first_stmt;
2275 	  bool this_load_permuted = false;
2276 	  load_permutation.create (group_size);
2277 	  first_stmt = GROUP_FIRST_ELEMENT
2278 	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2279 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2280 	    {
2281 	      int load_place = vect_get_place_in_interleaving_chain
2282 				 (load, first_stmt);
2283 	      gcc_assert (load_place != -1);
2284 	      if (load_place != j)
2285 		this_load_permuted = true;
2286 	      load_permutation.safe_push (load_place);
2287 	    }
2288 	  if (!this_load_permuted
2289 	      /* The load requires permutation when unrolling exposes
2290 	         a gap either because the group is larger than the SLP
2291 		 group-size or because there is a gap between the groups.  */
2292 	      && (known_eq (unrolling_factor, 1U)
2293 		  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2294 		      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2295 	    {
2296 	      load_permutation.release ();
2297 	      continue;
2298 	    }
2299 	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2300 	  loads_permuted = true;
2301 	}
2302 
2303       if (loads_permuted)
2304         {
2305           if (!vect_supported_load_permutation_p (new_instance))
2306             {
2307               if (dump_enabled_p ())
2308                 {
2309                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 				   "Build SLP failed: unsupported load "
2311 				   "permutation ");
2312                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2313 				    TDF_SLIM, stmt, 0);
2314                 }
2315               vect_free_slp_instance (new_instance);
2316               return false;
2317             }
2318         }
2319 
2320       /* If the loads and stores can be handled with load/store-lanes
2321 	 instructions do not generate this SLP instance.  */
2322       if (is_a <loop_vec_info> (vinfo)
2323 	  && loads_permuted
2324 	  && dr && vect_store_lanes_supported (vectype, group_size, false))
2325 	{
2326 	  slp_tree load_node;
2327 	  FOR_EACH_VEC_ELT (loads, i, load_node)
2328 	    {
2329 	      gimple *first_stmt = GROUP_FIRST_ELEMENT
2330 		  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2331 	      stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2332 	      /* Use SLP for strided accesses (or if we
2333 		 can't load-lanes).  */
2334 	      if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2335 		  || ! vect_load_lanes_supported
2336 			(STMT_VINFO_VECTYPE (stmt_vinfo),
2337 			 GROUP_SIZE (stmt_vinfo), false))
2338 		break;
2339 	    }
2340 	  if (i == loads.length ())
2341 	    {
2342 	      if (dump_enabled_p ())
2343 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344 				 "Built SLP cancelled: can use "
2345 				 "load/store-lanes\n");
2346 	      vect_free_slp_instance (new_instance);
2347 	      return false;
2348 	    }
2349 	}
2350 
2351       vinfo->slp_instances.safe_push (new_instance);
2352 
2353       if (dump_enabled_p ())
2354 	{
2355 	  dump_printf_loc (MSG_NOTE, vect_location,
2356 			   "Final SLP tree for instance:\n");
2357 	  vect_print_slp_tree (MSG_NOTE, vect_location, node);
2358 	}
2359 
2360       return true;
2361     }
2362     }
2363   else
2364     {
2365       /* Failed to SLP.  */
2366       /* Free the allocated memory.  */
2367       scalar_stmts.release ();
2368       loads.release ();
2369     }
2370 
2371   /* For basic block SLP, try to break the group up into multiples of the
2372      vector size.  */
2373   unsigned HOST_WIDE_INT const_nunits;
2374   if (is_a <bb_vec_info> (vinfo)
2375       && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2376       && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2377       && nunits.is_constant (&const_nunits))
2378     {
2379       /* We consider breaking the group only on VF boundaries from the existing
2380 	 start.  */
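      /* E.g. with GROUP_SIZE 9, CONST_NUNITS 4 and the first mismatch
	 at i == 6, the group is split at element 4 (6 & ~3), the
	 mismatching vector covering elements 4..7 is split off and
	 skipped, and SLP is attempted again for the remainder starting
	 at element 8.  */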
2381       for (i = 0; i < group_size; i++)
2382 	if (!matches[i]) break;
2383 
2384       if (i >= const_nunits && i < group_size)
2385 	{
2386 	  /* Split into two groups at the first vector boundary before i.  */
2387 	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2388 	  unsigned group1_size = i & ~(const_nunits - 1);
2389 
2390 	  gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2391 	  bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2392 	  /* If the first non-match was in the middle of a vector,
2393 	     skip the rest of that vector.  */
2394 	  if (group1_size < i)
2395 	    {
2396 	      i = group1_size + const_nunits;
2397 	      if (i < group_size)
2398 		rest = vect_split_slp_store_group (rest, const_nunits);
2399 	    }
2400 	  if (i < group_size)
2401 	    res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2402 	  return res;
2403 	}
2404       /* Even though the first vector did not all match, we might be able to SLP
2405 	 (some) of the remainder.  FORNOW ignore this possibility.  */
2406     }
2407 
2408   return false;
2409 }
2410 
2411 
2412 /* Check if there are stmts in the loop that can be vectorized using SLP.
2413    Build SLP trees of packed scalar stmts if SLP is possible.  */
2414 
2415 bool
2416 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2417 {
2418   unsigned int i;
2419   gimple *first_element;
2420 
2421   if (dump_enabled_p ())
2422     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2423 
2424   /* Find SLP sequences starting from groups of grouped stores.  */
2425   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2426     vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2427 
2428   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2429     {
2430       if (loop_vinfo->reduction_chains.length () > 0)
2431 	{
2432 	  /* Find SLP sequences starting from reduction chains.  */
2433 	  FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2434 	    if (! vect_analyze_slp_instance (vinfo, first_element,
2435 					     max_tree_size))
2436 	      {
2437 		/* Dissolve reduction chain group.  */
2438 		gimple *next, *stmt = first_element;
2439 		while (stmt)
2440 		  {
2441 		    stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2442 		    next = GROUP_NEXT_ELEMENT (vinfo);
2443 		    GROUP_FIRST_ELEMENT (vinfo) = NULL;
2444 		    GROUP_NEXT_ELEMENT (vinfo) = NULL;
2445 		    stmt = next;
2446 		  }
2447 		STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2448 		  = vect_internal_def;
2449 	      }
2450 	}
2451 
2452       /* Find SLP sequences starting from groups of reductions.  */
2453       if (loop_vinfo->reductions.length () > 1)
2454 	vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2455 				   max_tree_size);
2456     }
2457 
2458   return true;
2459 }
2460 
2461 
2462 /* For each possible SLP instance decide whether to SLP it and calculate the
2463    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
2464    to SLP at least one instance.  */
2465 
2466 bool
2467 vect_make_slp_decision (loop_vec_info loop_vinfo)
2468 {
2469   unsigned int i;
2470   poly_uint64 unrolling_factor = 1;
2471   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2472   slp_instance instance;
2473   int decided_to_slp = 0;
2474 
2475   if (dump_enabled_p ())
2476     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2477                      "\n");
2478 
2479   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2480     {
2481       /* FORNOW: SLP if you can.  */
2482       /* All unroll factors have the form current_vector_size * X for some
2483 	 rational X, so they must have a common multiple.  */
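      /* E.g. instance unrolling factors of 2 and 3 combine to an
	 overall factor of 6.  */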
2484       unrolling_factor
2485 	= force_common_multiple (unrolling_factor,
2486 				 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2487 
2488       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
2489 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2490 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
2491       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2492       decided_to_slp++;
2493     }
2494 
2495   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2496 
2497   if (decided_to_slp && dump_enabled_p ())
2498     {
2499       dump_printf_loc (MSG_NOTE, vect_location,
2500 		       "Decided to SLP %d instances. Unrolling factor ",
2501 		       decided_to_slp);
2502       dump_dec (MSG_NOTE, unrolling_factor);
2503       dump_printf (MSG_NOTE, "\n");
2504     }
2505 
2506   return (decided_to_slp > 0);
2507 }
2508 
2509 
2510 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2511    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
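/* For example, a PURE_SLP stmt whose def is also used by a relevant
   stmt outside of any SLP instance becomes HYBRID: besides the SLP
   vector copies, the loop-based vectorization of the user still needs
   the def.  */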
2512 
2513 static void
2514 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2515 {
2516   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2517   imm_use_iterator imm_iter;
2518   gimple *use_stmt;
2519   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2520   slp_tree child;
2521   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2522   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2523   int j;
2524 
2525   /* Propagate hybrid down the SLP tree.  */
2526   if (stype == hybrid)
2527     ;
2528   else if (HYBRID_SLP_STMT (stmt_vinfo))
2529     stype = hybrid;
2530   else
2531     {
2532       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
2533       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2534       /* If we get a pattern stmt here we have to use the LHS of the
2535          original stmt for immediate uses.  */
2536       if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2537 	  && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2538 	stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2539       tree def;
2540       if (gimple_code (stmt) == GIMPLE_PHI)
2541 	def = gimple_phi_result (stmt);
2542       else
2543 	def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2544       if (def)
2545 	FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2546 	  {
2547 	    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2548 	      continue;
2549 	    use_vinfo = vinfo_for_stmt (use_stmt);
2550 	    if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2551 		&& STMT_VINFO_RELATED_STMT (use_vinfo))
2552 	      use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2553 	    if (!STMT_SLP_TYPE (use_vinfo)
2554 		&& (STMT_VINFO_RELEVANT (use_vinfo)
2555 		    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2556 		&& !(gimple_code (use_stmt) == GIMPLE_PHI
2557 		     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2558 	      {
2559 		if (dump_enabled_p ())
2560 		  {
2561 		    dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2562 				     "def in non-SLP stmt: ");
2563 		    dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2564 		  }
2565 		stype = hybrid;
2566 	      }
2567 	  }
2568     }
2569 
2570   if (stype == hybrid
2571       && !HYBRID_SLP_STMT (stmt_vinfo))
2572     {
2573       if (dump_enabled_p ())
2574 	{
2575 	  dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2576 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2577 	}
2578       STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2579     }
2580 
2581   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2582     if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2583       vect_detect_hybrid_slp_stmts (child, i, stype);
2584 }
2585 
2586 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
2587 
2588 static tree
2589 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2590 {
2591   walk_stmt_info *wi = (walk_stmt_info *)data;
2592   struct loop *loopp = (struct loop *)wi->info;
2593 
2594   if (wi->is_lhs)
2595     return NULL_TREE;
2596 
2597   if (TREE_CODE (*tp) == SSA_NAME
2598       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2599     {
2600       gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2601       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2602 	  && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2603 	{
2604 	  if (dump_enabled_p ())
2605 	    {
2606 	      dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2607 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2608 	    }
2609 	  STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2610 	}
2611     }
2612 
2613   return NULL_TREE;
2614 }
2615 
2616 static tree
2617 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2618 			  walk_stmt_info *)
2619 {
2620   stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2621   /* If the stmt is in a SLP instance then this isn't a reason
2622      to mark use definitions in other SLP instances as hybrid.  */
2623   if (! STMT_SLP_TYPE (use_vinfo)
2624       && (STMT_VINFO_RELEVANT (use_vinfo)
2625 	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2626       && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2627 	    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2628     ;
2629   else
2630     *handled = true;
2631   return NULL_TREE;
2632 }
2633 
2634 /* Find stmts that must be both vectorized and SLPed.  */
2635 
2636 void
2637 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2638 {
2639   unsigned int i;
2640   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2641   slp_instance instance;
2642 
2643   if (dump_enabled_p ())
2644     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2645                      "\n");
2646 
2647   /* First walk all pattern stmts in the loop and mark defs of uses as
2648      hybrid because immediate uses in them are not recorded.  */
2649   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2650     {
2651       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2652       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2653 	   gsi_next (&gsi))
2654 	{
2655 	  gimple *stmt = gsi_stmt (gsi);
2656 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2657 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2658 	    {
2659 	      walk_stmt_info wi;
2660 	      memset (&wi, 0, sizeof (wi));
2661 	      wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2662 	      gimple_stmt_iterator gsi2
2663 		= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2664 	      walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2665 				vect_detect_hybrid_slp_1, &wi);
2666 	      walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2667 			       vect_detect_hybrid_slp_2,
2668 			       vect_detect_hybrid_slp_1, &wi);
2669 	    }
2670 	}
2671     }
2672 
2673   /* Then walk the SLP instance trees marking stmts with uses in
2674      non-SLP stmts as hybrid, also propagating hybrid down the
2675      SLP tree, collecting the above info on-the-fly.  */
2676   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2677     {
2678       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2679 	vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2680 				      i, pure_slp);
2681     }
2682 }
2683 
2684 
2685 /* Initialize a bb_vec_info struct for the statements between
2686    REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */
2687 
2688 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2689 			    gimple_stmt_iterator region_end_in)
2690   : vec_info (vec_info::bb, init_cost (NULL)),
2691     bb (gsi_bb (region_begin_in)),
2692     region_begin (region_begin_in),
2693     region_end (region_end_in)
2694 {
2695   gimple_stmt_iterator gsi;
2696 
2697   for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2698        gsi_next (&gsi))
2699     {
2700       gimple *stmt = gsi_stmt (gsi);
2701       gimple_set_uid (stmt, 0);
2702       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
2703     }
2704 
2705   bb->aux = this;
2706 }
2707 
2708 
2709 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2710    stmts in the basic block.  */
2711 
2712 _bb_vec_info::~_bb_vec_info ()
2713 {
2714   for (gimple_stmt_iterator si = region_begin;
2715        gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2716     {
2717       gimple *stmt = gsi_stmt (si);
2718       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2719 
2720       if (stmt_info)
2721         /* Free stmt_vec_info.  */
2722         free_stmt_vec_info (stmt);
2723 
2724       /* Reset region marker.  */
2725       gimple_set_uid (stmt, -1);
2726     }
2727 
2728   bb->aux = NULL;
2729 }
2730 
2731 
2732 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2733    the subtree.  NODE_INSTANCE is the SLP instance that contains NODE.
2734 
2735    Return true if the operations are supported.  */
2736 
2737 static bool
2738 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2739 				  slp_instance node_instance)
2740 {
2741   bool dummy;
2742   int i, j;
2743   gimple *stmt;
2744   slp_tree child;
2745 
2746   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2747     return true;
2748 
2749   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2750     if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2751       return false;
2752 
2753   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2754   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2755   gcc_assert (stmt_info);
2756   gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2757 
2758   /* For BB vectorization vector types are assigned here.
2759      Memory accesses already got their vector type assigned
2760      in vect_analyze_data_refs.  */
2761   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2762   if (bb_vinfo
2763       && ! STMT_VINFO_DATA_REF (stmt_info))
2764     {
2765       gcc_assert (PURE_SLP_STMT (stmt_info));
2766 
2767       tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2768       if (dump_enabled_p ())
2769 	{
2770 	  dump_printf_loc (MSG_NOTE, vect_location,
2771 			   "get vectype for scalar type:  ");
2772 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2773 	  dump_printf (MSG_NOTE, "\n");
2774 	}
2775 
2776       tree vectype = get_vectype_for_scalar_type (scalar_type);
2777       if (!vectype)
2778 	{
2779 	  if (dump_enabled_p ())
2780 	    {
2781 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2782 			       "not SLPed: unsupported data-type ");
2783 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2784 				 scalar_type);
2785 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2786 	    }
2787 	  return false;
2788 	}
2789 
2790       if (dump_enabled_p ())
2791 	{
2792 	  dump_printf_loc (MSG_NOTE, vect_location, "vectype:  ");
2793 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2794 	  dump_printf (MSG_NOTE, "\n");
2795 	}
2796 
2797       gimple *sstmt;
2798       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2799 	STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2800     }
2801 
2802   /* Calculate the number of vector statements to be created for the
2803      scalar stmts in this node.  For SLP reductions it is equal to the
2804      number of vector statements in the children (which has already been
2805      calculated by the recursive call).  Otherwise it is the number of
2806      scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2807      VF divided by the number of elements in a vector.  */
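  /* E.g. with a VF of 4, GROUP_SIZE 2 and a 4-element vector type this
     is vect_get_num_vectors (4 * 2, vectype) == 2 vector stmts.  */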
2808   if (GROUP_FIRST_ELEMENT (stmt_info)
2809       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2810     SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2811       = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2812   else
2813     {
2814       poly_uint64 vf;
2815       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2816 	vf = loop_vinfo->vectorization_factor;
2817       else
2818 	vf = 1;
2819       unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2820       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2821       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2822 	= vect_get_num_vectors (vf * group_size, vectype);
2823     }
2824 
2825   /* ???  We have to catch the case late where the same first scalar stmt
2826      appears in multiple SLP children with different def types, and fail.
2827      Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2828      necessarily match it when that is vect_internal_def.  */
2829   auto_vec<vect_def_type, 4> dt;
2830   dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2831   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2832     dt[j]
2833       = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]));
2834 
2835   /* Push SLP node def-type to stmt operands.  */
2836   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2837     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2838       STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2839 	= SLP_TREE_DEF_TYPE (child);
2840 
2841   /* Check everything worked out.  */
2842   bool res = true;
2843   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2844    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2845      {
2846        if (STMT_VINFO_DEF_TYPE
2847 	     (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2848 	   != SLP_TREE_DEF_TYPE (child))
2849 	 res = false;
2850      }
2851    else if (STMT_VINFO_DEF_TYPE
2852 	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) != dt[j])
2853      res = false;
2854   if (!res && dump_enabled_p ())
2855     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2856 		     "not vectorized: same operand with different "
2857 		     "def type in stmt.\n");
2858   if (res)
2859     res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2860 
2861   /* Restore def-types.  */
2862   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2863     STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2864       = dt[j];
2865 
2866   return res;
2867 }
2868 
2869 
2870 /* Analyze statements in SLP instances of VINFO.  Return true if the
2871    operations are supported. */
2872 
2873 bool
2874 vect_slp_analyze_operations (vec_info *vinfo)
2875 {
2876   slp_instance instance;
2877   int i;
2878 
2879   if (dump_enabled_p ())
2880     dump_printf_loc (MSG_NOTE, vect_location,
2881 		     "=== vect_slp_analyze_operations ===\n");
2882 
2883   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2884     {
2885       if (!vect_slp_analyze_node_operations (vinfo,
2886 					     SLP_INSTANCE_TREE (instance),
2887 					     instance))
2888         {
2889 	  dump_printf_loc (MSG_NOTE, vect_location,
2890 			   "removing SLP instance operations starting from: ");
2891 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2892 			    SLP_TREE_SCALAR_STMTS
2893 			      (SLP_INSTANCE_TREE (instance))[0], 0);
2894 	  vect_free_slp_instance (instance);
2895           vinfo->slp_instances.ordered_remove (i);
2896 	}
2897       else
2898 	i++;
2899     }
2900 
2901   if (dump_enabled_p ())
2902     dump_printf_loc (MSG_NOTE, vect_location,
2903 		     "=== vect_analyze_slp_cost ===\n");
2904 
2905   /* Compute the costs of the SLP instances.  */
2906   scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
2907   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
2908     vect_analyze_slp_cost (instance, vinfo->target_cost_data, visited);
2909   delete visited;
2910 
2911   return !vinfo->slp_instances.is_empty ();
2912 }
2913 
2914 
2915 /* Compute the scalar cost of the SLP node NODE and its children
2916    and return it.  Do not account defs that are marked in LIFE and
2917    update LIFE according to uses of NODE.  */
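/* E.g. when LIFE[1] is set on entry, the second scalar stmt of NODE is
   known to stay live outside the vectorized region, so neither it nor
   the defs it requires are counted against the scalar cost.  */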
2918 
2919 static unsigned
2920 vect_bb_slp_scalar_cost (basic_block bb,
2921 			 slp_tree node, vec<bool, va_heap> *life)
2922 {
2923   unsigned scalar_cost = 0;
2924   unsigned i;
2925   gimple *stmt;
2926   slp_tree child;
2927 
2928   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2929     {
2930       unsigned stmt_cost;
2931       ssa_op_iter op_iter;
2932       def_operand_p def_p;
2933       stmt_vec_info stmt_info;
2934 
2935       if ((*life)[i])
2936 	continue;
2937 
2938       /* If there is a non-vectorized use of the defs then the scalar
2939          stmt is kept live in which case we do not account it or any
2940 	 required defs in the SLP children in the scalar cost.  This
2941 	 way we make the vectorization more costly when compared to
2942 	 the scalar cost.  */
2943       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2944 	{
2945 	  imm_use_iterator use_iter;
2946 	  gimple *use_stmt;
2947 	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2948 	    if (!is_gimple_debug (use_stmt)
2949 		&& (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2950 					     use_stmt)
2951 		    || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2952 	      {
2953 		(*life)[i] = true;
2954 		BREAK_FROM_IMM_USE_STMT (use_iter);
2955 	      }
2956 	}
2957       if ((*life)[i])
2958 	continue;
2959 
2960       /* Count scalar stmts only once.  */
2961       if (gimple_visited_p (stmt))
2962 	continue;
2963       gimple_set_visited (stmt, true);
2964 
2965       stmt_info = vinfo_for_stmt (stmt);
2966       if (STMT_VINFO_DATA_REF (stmt_info))
2967         {
2968           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2969             stmt_cost = vect_get_stmt_cost (scalar_load);
2970           else
2971             stmt_cost = vect_get_stmt_cost (scalar_store);
2972         }
2973       else
2974         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2975 
2976       scalar_cost += stmt_cost;
2977     }
2978 
2979   auto_vec<bool, 20> subtree_life;
2980   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2981     {
2982       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2983 	{
2984 	  /* Do not directly pass LIFE to the recursive call, copy it to
2985 	     confine changes in the callee to the current child/subtree.  */
2986 	  subtree_life.safe_splice (*life);
2987 	  scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
2988 	  subtree_life.truncate (0);
2989 	}
2990     }
2991 
2992   return scalar_cost;
2993 }
2994 
2995 /* Check if vectorization of the basic block is profitable.  */
2996 
2997 static bool
2998 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2999 {
3000   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3001   slp_instance instance;
3002   int i;
3003   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3004   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3005 
3006   /* Calculate scalar cost.  */
3007   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3008     {
3009       auto_vec<bool, 20> life;
3010       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3011       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
3012 					      SLP_INSTANCE_TREE (instance),
3013 					      &life);
3014     }
3015 
3016   /* Unset visited flag.  */
3017   for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3018        gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3019     gimple_set_visited  (gsi_stmt (gsi), false);
3020 
3021   /* Complete the target-specific cost calculation.  */
3022   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3023 	       &vec_inside_cost, &vec_epilogue_cost);
3024 
3025   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3026 
3027   if (dump_enabled_p ())
3028     {
3029       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3030       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
3031 		   vec_inside_cost);
3032       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
3033       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
3034       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
3035     }
3036 
3037   /* Vectorization is profitable if its cost is less than the cost of the
3038      scalar version.  Note that we err on the vector side for equal cost
3039      because the cost estimate is otherwise quite pessimistic (constant
3040      uses are free on the scalar side but cost a load on the vector side,
3041      for example).  */
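  /* Purely illustrative numbers: with vec_prologue_cost 2, vec_inside_cost
     8, vec_epilogue_cost 0 and scalar_cost 10, the test 2 + 8 > 10 below
     is false, so the block is still vectorized on a cost tie.  */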
3042   if (vec_outside_cost + vec_inside_cost > scalar_cost)
3043     return false;
3044 
3045   return true;
3046 }
3047 
3048 /* Check if the basic block can be vectorized.  Returns a bb_vec_info
3049    if so, and sets FATAL to true if the failure is independent of
3050    current_vector_size.  */
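/* The analysis below proceeds in phases: analyze the data references and
   their accesses, require at least one grouped store, run pattern
   recognition, build the SLP trees, verify alignment and dependences per
   instance, analyze the operations, and finally apply the cost model.  */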
3051 
3052 static bb_vec_info
3053 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
3054 		       gimple_stmt_iterator region_end,
3055 		       vec<data_reference_p> datarefs, int n_stmts,
3056 		       bool &fatal)
3057 {
3058   bb_vec_info bb_vinfo;
3059   slp_instance instance;
3060   int i;
3061   poly_uint64 min_vf = 2;
3062 
3063   /* The first group of checks is independent of the vector size.  */
3064   fatal = true;
3065 
3066   if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3067     {
3068       if (dump_enabled_p ())
3069 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3070 			 "not vectorized: too many instructions in "
3071 			 "basic block.\n");
3072       free_data_refs (datarefs);
3073       return NULL;
3074     }
3075 
3076   bb_vinfo = new _bb_vec_info (region_begin, region_end);
3077   if (!bb_vinfo)
3078     return NULL;
3079 
3080   BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3081 
3082   /* Analyze the data references.  */
3083 
3084   if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
3085     {
3086       if (dump_enabled_p ())
3087         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3088 			 "not vectorized: unhandled data-ref in basic "
3089 			 "block.\n");
3090 
3091       delete bb_vinfo;
3092       return NULL;
3093     }
3094 
3095   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3096     {
3097       if (dump_enabled_p ())
3098         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3099 			 "not vectorized: not enough data-refs in "
3100 			 "basic block.\n");
3101 
3102       delete bb_vinfo;
3103       return NULL;
3104     }
3105 
3106   if (!vect_analyze_data_ref_accesses (bb_vinfo))
3107     {
3108       if (dump_enabled_p ())
3109         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3110 			 "not vectorized: unhandled data access in "
3111 			 "basic block.\n");
3112 
3113       delete bb_vinfo;
3114       return NULL;
3115     }
3116 
3117   /* If there are no grouped stores in the region there is no need
3118      to continue with pattern recog as vect_analyze_slp will fail
3119      anyway.  */
3120   if (bb_vinfo->grouped_stores.is_empty ())
3121     {
3122       if (dump_enabled_p ())
3123 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3124 			 "not vectorized: no grouped stores in "
3125 			 "basic block.\n");
3126 
3127       delete bb_vinfo;
3128       return NULL;
3129     }
3130 
3131   /* The rest of the analysis below depends on the vector size, so failure
     from here on is no longer fatal.  */
3132   fatal = false;
3133 
3134   vect_pattern_recog (bb_vinfo);
3135 
3136   /* Check the SLP opportunities in the basic block, analyze and build SLP
3137      trees.  */
3138   if (!vect_analyze_slp (bb_vinfo, n_stmts))
3139     {
3140       if (dump_enabled_p ())
3141 	{
3142 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3143 			   "Failed to SLP the basic block.\n");
3144 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3145 			   "not vectorized: failed to find SLP opportunities "
3146 			   "in basic block.\n");
3147 	}
3148 
3149       delete bb_vinfo;
3150       return NULL;
3151     }
3152 
3153   vect_record_base_alignments (bb_vinfo);
3154 
3155   /* Analyze and verify the alignment of data references and the
3156      dependence in the SLP instances.  */
3157   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3158     {
3159       if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3160 	  || ! vect_slp_analyze_instance_dependence (instance))
3161 	{
3162 	  dump_printf_loc (MSG_NOTE, vect_location,
3163 			   "removing SLP instance operations starting from: ");
3164 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3165 			    SLP_TREE_SCALAR_STMTS
3166 			      (SLP_INSTANCE_TREE (instance))[0], 0);
3167 	  vect_free_slp_instance (instance);
3168 	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3169 	  continue;
3170 	}
3171 
3172       /* Mark all the statements that we want to vectorize as pure SLP and
3173 	 relevant.  */
3174       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
3175       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3176 
3177       i++;
3178     }
3179   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3180     {
3181       delete bb_vinfo;
3182       return NULL;
3183     }
3184 
3185   if (!vect_slp_analyze_operations (bb_vinfo))
3186     {
3187       if (dump_enabled_p ())
3188         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189 			 "not vectorized: bad operation in basic block.\n");
3190 
3191       delete bb_vinfo;
3192       return NULL;
3193     }
3194 
3195   /* Cost model: check if the vectorization is worthwhile.  */
3196   if (!unlimited_cost_model (NULL)
3197       && !vect_bb_vectorization_profitable_p (bb_vinfo))
3198     {
3199       if (dump_enabled_p ())
3200         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3201 			 "not vectorized: vectorization is not "
3202 			 "profitable.\n");
3203 
3204       delete bb_vinfo;
3205       return NULL;
3206     }
3207 
3208   if (dump_enabled_p ())
3209     dump_printf_loc (MSG_NOTE, vect_location,
3210 		     "Basic block will be vectorized using SLP\n");
3211 
3212   return bb_vinfo;
3213 }
3214 
3215 
3216 /* Main entry for the BB vectorizer.  Analyze and transform BB; return
3217    true if anything in the basic block was vectorized.  */
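/* The BB is processed as a sequence of maximal analyzable regions: the
   scan below ends a region at the first stmt for which
   find_data_references_in_stmt fails (hypothetically, e.g. a call with an
   unanalyzable memory access), analyzes the region preceding it, then
   skips that stmt and starts a new region after it.  */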
3218 
3219 bool
3220 vect_slp_bb (basic_block bb)
3221 {
3222   bb_vec_info bb_vinfo;
3223   gimple_stmt_iterator gsi;
3224   bool any_vectorized = false;
3225   auto_vector_sizes vector_sizes;
3226 
3227   if (dump_enabled_p ())
3228     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
3229 
3230   /* Autodetect first vector size we try.  */
3231   current_vector_size = 0;
3232   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3233   unsigned int next_size = 0;
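  /* For example, on a hypothetical target whose autovectorize_vector_sizes
     hook returns {16, 8, 4}, a region that fails analysis with the
     autodetected size is re-analyzed with each remaining size in turn (see
     the retry logic at the end of the loop below) before being given up
     on.  */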
3234 
3235   gsi = gsi_start_bb (bb);
3236 
3237   poly_uint64 autodetected_vector_size = 0;
3238   while (1)
3239     {
3240       if (gsi_end_p (gsi))
3241 	break;
3242 
3243       gimple_stmt_iterator region_begin = gsi;
3244       vec<data_reference_p> datarefs = vNULL;
3245       int insns = 0;
3246 
3247       for (; !gsi_end_p (gsi); gsi_next (&gsi))
3248 	{
3249 	  gimple *stmt = gsi_stmt (gsi);
3250 	  if (is_gimple_debug (stmt))
3251 	    continue;
3252 	  insns++;
3253 
3254 	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
3255 	    vect_location = gimple_location (stmt);
3256 
3257 	  if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3258 	    break;
3259 	}
3260 
3261       /* Skip leading unhandled stmts.  */
3262       if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3263 	{
3264 	  gsi_next (&gsi);
3265 	  continue;
3266 	}
3267 
3268       gimple_stmt_iterator region_end = gsi;
3269 
3270       bool vectorized = false;
3271       bool fatal = false;
3272       bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3273 					datarefs, insns, fatal);
3274       if (bb_vinfo
3275 	  && dbg_cnt (vect_slp))
3276 	{
3277 	  if (dump_enabled_p ())
3278 	    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3279 
3280 	  vect_schedule_slp (bb_vinfo);
3281 
3282 	  if (dump_enabled_p ())
3283 	    dump_printf_loc (MSG_NOTE, vect_location,
3284 			     "basic block part vectorized\n");
3285 
3286 	  vectorized = true;
3287 	}
3288       delete bb_vinfo;
3289 
3290       any_vectorized |= vectorized;
3291 
3292       if (next_size == 0)
3293 	autodetected_vector_size = current_vector_size;
3294 
3295       if (next_size < vector_sizes.length ()
3296 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
3297 	next_size += 1;
3298 
3299       if (vectorized
3300 	  || next_size == vector_sizes.length ()
3301 	  || known_eq (current_vector_size, 0U)
3302 	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
3303 	     vector sizes will fail, do not bother iterating.  */
3304 	  || fatal)
3305 	{
3306 	  if (gsi_end_p (region_end))
3307 	    break;
3308 
3309 	  /* Skip the unhandled stmt.  */
3310 	  gsi_next (&gsi);
3311 
3312 	  /* And reset vector sizes.  */
3313 	  current_vector_size = 0;
3314 	  next_size = 0;
3315 	}
3316       else
3317 	{
3318 	  /* Try the next biggest vector size.  */
3319 	  current_vector_size = vector_sizes[next_size++];
3320 	  if (dump_enabled_p ())
3321 	    {
3322 	      dump_printf_loc (MSG_NOTE, vect_location,
3323 			       "***** Re-trying analysis with "
3324 			       "vector size ");
3325 	      dump_dec (MSG_NOTE, current_vector_size);
3326 	      dump_printf (MSG_NOTE, "\n");
3327 	    }
3328 
3329 	  /* Start over.  */
3330 	  gsi = region_begin;
3331 	}
3332     }
3333 
3334   return any_vectorized;
3335 }
3336 
3337 
3338 /* Return true if the vector type of the boolean constant that is the
3339    OPNUM operand of statement STMT is a boolean vector.  */
3340 
3341 static bool
3342 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3343 {
3344   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3345   enum tree_code code = gimple_expr_code (stmt);
3346   tree op, vectype;
3347   gimple *def_stmt;
3348   enum vect_def_type dt;
3349 
3350   /* For comparisons and COND_EXPRs the type is chosen depending
3351      on the other comparison operand.  */
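  /* E.g., for the illustrative statement mask_1 = a_2 < b_3, the vector
     type for an invariant operand is taken from the other comparison
     operand rather than from mask_1's boolean vectype.  */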
3352   if (TREE_CODE_CLASS (code) == tcc_comparison)
3353     {
3354       if (opnum)
3355 	op = gimple_assign_rhs1 (stmt);
3356       else
3357 	op = gimple_assign_rhs2 (stmt);
3358 
3359       if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3360 			       &dt, &vectype))
3361 	gcc_unreachable ();
3362 
3363       return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3364     }
3365 
3366   if (code == COND_EXPR)
3367     {
3368       tree cond = gimple_assign_rhs1 (stmt);
3369 
3370       if (TREE_CODE (cond) == SSA_NAME)
3371 	op = cond;
3372       else if (opnum)
3373 	op = TREE_OPERAND (cond, 1);
3374       else
3375 	op = TREE_OPERAND (cond, 0);
3376 
3377       if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3378 			       &dt, &vectype))
3379 	gcc_unreachable ();
3380 
3381       return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3382     }
3383 
3384   return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3385 }
3386 
3387 /* Build a variable-length vector in which the elements in ELTS are repeated
3388    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
3389    RESULTS and add any new instructions to SEQ.
3390 
3391    The approach we use is:
3392 
3393    (1) Find a vector mode VM with integer elements of mode IM.
3394 
3395    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3396        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
3397        from small vectors to IM.
3398 
3399    (3) Duplicate each ELTS'[I] into a vector of mode VM.
3400 
3401    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3402        correct byte contents.
3403 
3404    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3405 
3406    We try to find the largest IM for which this sequence works, in order
3407    to cut down on the number of interleaves.  */
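/* A hypothetical illustration: for ELTS = {a,b,c,d} of 32-bit ints with
   IM = DI, step (2) view-converts the pairs {a,b} and {c,d} to two DI
   values, step (3) duplicates each into a VM vector {ab,ab,...} and
   {cd,cd,...}, and step (4) interleaves those into {ab,cd,ab,cd,...},
   whose byte contents are the required a,b,c,d,a,b,c,d,...  */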
3408 
3409 void
3410 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3411 			  unsigned int nresults, vec<tree> &results)
3412 {
3413   unsigned int nelts = elts.length ();
3414   tree element_type = TREE_TYPE (vector_type);
3415 
3416   /* (1) Find a vector mode VM with integer elements of mode IM.  */
3417   unsigned int nvectors = 1;
3418   tree new_vector_type;
3419   tree permutes[2];
3420   if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3421 				       &nvectors, &new_vector_type,
3422 				       permutes))
3423     gcc_unreachable ();
3424 
3425   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
3426   unsigned int partial_nelts = nelts / nvectors;
3427   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3428 
3429   tree_vector_builder partial_elts;
3430   auto_vec<tree, 32> pieces (nvectors * 2);
3431   pieces.quick_grow (nvectors * 2);
3432   for (unsigned int i = 0; i < nvectors; ++i)
3433     {
3434       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3435 	     ELTS' has mode IM.  */
3436       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3437       for (unsigned int j = 0; j < partial_nelts; ++j)
3438 	partial_elts.quick_push (elts[i * partial_nelts + j]);
3439       tree t = gimple_build_vector (seq, &partial_elts);
3440       t = gimple_build (seq, VIEW_CONVERT_EXPR,
3441 			TREE_TYPE (new_vector_type), t);
3442 
3443       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
3444       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3445     }
3446 
3447   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3448 	 correct byte contents.
3449 
3450      We need to repeat the following operation log2(nvectors) times:
3451 
3452 	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3453 	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3454 
3455      However, if each input repeats every N elements and the VF is
3456      a multiple of N * 2, the HI result is the same as the LO.  */
3457   unsigned int in_start = 0;
3458   unsigned int out_start = nvectors;
3459   unsigned int hi_start = nvectors / 2;
3460   /* A bound on the number of outputs needed to produce NRESULTS results
3461      in the final iteration.  */
3462   unsigned int noutputs_bound = nvectors * nresults;
3463   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3464     {
3465       noutputs_bound /= 2;
3466       unsigned int limit = MIN (noutputs_bound, nvectors);
3467       for (unsigned int i = 0; i < limit; ++i)
3468 	{
3469 	  if ((i & 1) != 0
3470 	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3471 			     2 * in_repeat))
3472 	    {
3473 	      pieces[out_start + i] = pieces[out_start + i - 1];
3474 	      continue;
3475 	    }
3476 
3477 	  tree output = make_ssa_name (new_vector_type);
3478 	  tree input1 = pieces[in_start + (i / 2)];
3479 	  tree input2 = pieces[in_start + (i / 2) + hi_start];
3480 	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3481 					       input1, input2,
3482 					       permutes[i & 1]);
3483 	  gimple_seq_add_stmt (seq, stmt);
3484 	  pieces[out_start + i] = output;
3485 	}
3486       std::swap (in_start, out_start);
3487     }
3488 
3489   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
3490   results.reserve (nresults);
3491   for (unsigned int i = 0; i < nresults; ++i)
3492     if (i < nvectors)
3493       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3494 					pieces[in_start + i]));
3495     else
3496       results.quick_push (results[i - nvectors]);
3497 }
3498 
3499 
3500 /* For constant and loop invariant defs of SLP_NODE this function returns
3501    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3502    OP_NUM determines whether we gather defs for operand 0 or operand 1 of
3503    the RHS of the scalar stmts.  NUMBER_OF_VECTORS is the number of vector
3504    defs to create.  */
3506 
3507 static void
3508 vect_get_constant_vectors (tree op, slp_tree slp_node,
3509                            vec<tree> *vec_oprnds,
3510 			   unsigned int op_num, unsigned int number_of_vectors)
3511 {
3512   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3513   gimple *stmt = stmts[0];
3514   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3515   unsigned HOST_WIDE_INT nunits;
3516   tree vec_cst;
3517   unsigned j, number_of_places_left_in_vector;
3518   tree vector_type;
3519   tree vop;
3520   int group_size = stmts.length ();
3521   unsigned int vec_num, i;
3522   unsigned number_of_copies = 1;
3523   vec<tree> voprnds;
3524   voprnds.create (number_of_vectors);
3525   bool constant_p, is_store;
3526   tree neutral_op = NULL;
3527   enum tree_code code = gimple_expr_code (stmt);
3528   gimple_seq ctor_seq = NULL;
3529   auto_vec<tree, 16> permute_results;
3530 
3531   /* Check if vector type is a boolean vector.  */
3532   if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3533       && vect_mask_constant_operand_p (stmt, op_num))
3534     vector_type
3535       = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3536   else
3537     vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3538 
3539   if (STMT_VINFO_DATA_REF (stmt_vinfo))
3540     {
3541       is_store = true;
3542       op = gimple_assign_rhs1 (stmt);
3543     }
3544   else
3545     is_store = false;
3546 
3547   gcc_assert (op);
3548 
3549   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3550      created vectors. It is greater than 1 if unrolling is performed.
3551 
3552      For example, we have two scalar operands, s1 and s2 (e.g., group of
3553      strided accesses of size two), while NUNITS is four (i.e., four scalars
3554      of this type can be packed in a vector).  The output vector will contain
3555      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
3556      will be 2).
3557 
3558      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3559      containing the operands.
3560 
3561      For example, NUNITS is four as before, and the group size is 8
3562      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
3563      {s5, s6, s7, s8}.  */
3564 
3565   /* When using duplicate_and_interleave, we just need one element for
3566      each scalar statement.  */
3567   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3568     nunits = group_size;
3569 
3570   number_of_copies = nunits * number_of_vectors / group_size;
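  /* E.g., with NUNITS 4, NUMBER_OF_VECTORS 1 and GROUP_SIZE 2 this gives
     4 * 1 / 2 = 2 copies, matching the {s1, s2, s1, s2} example above.  */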
3571 
3572   number_of_places_left_in_vector = nunits;
3573   constant_p = true;
3574   tree_vector_builder elts (vector_type, nunits, 1);
3575   elts.quick_grow (nunits);
3576   bool place_after_defs = false;
3577   for (j = 0; j < number_of_copies; j++)
3578     {
3579       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3580         {
3581           if (is_store)
3582             op = gimple_assign_rhs1 (stmt);
3583           else
3584 	    {
3585 	      switch (code)
3586 		{
3587 		  case COND_EXPR:
3588 		    {
3589 		      tree cond = gimple_assign_rhs1 (stmt);
3590 		      if (TREE_CODE (cond) == SSA_NAME)
3591 			op = gimple_op (stmt, op_num + 1);
3592 		      else if (op_num == 0 || op_num == 1)
3593 			op = TREE_OPERAND (cond, op_num);
3594 		      else
3595 			{
3596 			  if (op_num == 2)
3597 			    op = gimple_assign_rhs2 (stmt);
3598 			  else
3599 			    op = gimple_assign_rhs3 (stmt);
3600 			}
3601 		    }
3602 		    break;
3603 
3604 		  case CALL_EXPR:
3605 		    op = gimple_call_arg (stmt, op_num);
3606 		    break;
3607 
3608 		  case LSHIFT_EXPR:
3609 		  case RSHIFT_EXPR:
3610 		  case LROTATE_EXPR:
3611 		  case RROTATE_EXPR:
3612 		    op = gimple_op (stmt, op_num + 1);
3613 		    /* Unlike the other binary operators, shifts/rotates have
3614 		       the shift count being int, instead of the same type as
3615 		       the lhs, so make sure the scalar is the right type if
3616 		       we are dealing with vectors of
3617 		       long long/long/short/char.  */
3618 		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3619 		      op = fold_convert (TREE_TYPE (vector_type), op);
3620 		    break;
3621 
3622 		  default:
3623 		    op = gimple_op (stmt, op_num + 1);
3624 		    break;
3625 		}
3626 	    }
3627 
3628           /* Create 'vect_ = {op0,op1,...,opn}'.  */
3629           number_of_places_left_in_vector--;
3630 	  tree orig_op = op;
3631 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3632 	    {
3633 	      if (CONSTANT_CLASS_P (op))
3634 		{
3635 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3636 		    {
3637 		      /* Can't use VIEW_CONVERT_EXPR for booleans because
3638 			 of possibly different sizes of scalar value and
3639 			 vector element.  */
3640 		      if (integer_zerop (op))
3641 			op = build_int_cst (TREE_TYPE (vector_type), 0);
3642 		      else if (integer_onep (op))
3643 			op = build_all_ones_cst (TREE_TYPE (vector_type));
3644 		      else
3645 			gcc_unreachable ();
3646 		    }
3647 		  else
3648 		    op = fold_unary (VIEW_CONVERT_EXPR,
3649 				     TREE_TYPE (vector_type), op);
3650 		  gcc_assert (op && CONSTANT_CLASS_P (op));
3651 		}
3652 	      else
3653 		{
3654 		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3655 		  gimple *init_stmt;
3656 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3657 		    {
3658 		      tree true_val
3659 			= build_all_ones_cst (TREE_TYPE (vector_type));
3660 		      tree false_val
3661 			= build_zero_cst (TREE_TYPE (vector_type));
3662 		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3663 		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3664 						       op, true_val,
3665 						       false_val);
3666 		    }
3667 		  else
3668 		    {
3669 		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3670 				   op);
3671 		      init_stmt
3672 			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3673 					       op);
3674 		    }
3675 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
3676 		  op = new_temp;
3677 		}
3678 	    }
3679 	  elts[number_of_places_left_in_vector] = op;
3680 	  if (!CONSTANT_CLASS_P (op))
3681 	    constant_p = false;
3682 	  if (TREE_CODE (orig_op) == SSA_NAME
3683 	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3684 	      && STMT_VINFO_BB_VINFO (stmt_vinfo)
3685 	      && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3686 		  == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3687 	    place_after_defs = true;
3688 
3689           if (number_of_places_left_in_vector == 0)
3690             {
3691 	      if (constant_p
3692 		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3693 		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3694 		vec_cst = gimple_build_vector (&ctor_seq, &elts);
3695 	      else
3696 		{
3697 		  if (vec_oprnds->is_empty ())
3698 		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
3699 					      number_of_vectors,
3700 					      permute_results);
3701 		  vec_cst = permute_results[number_of_vectors - j - 1];
3702 		}
3703 	      tree init;
3704 	      gimple_stmt_iterator gsi;
3705 	      if (place_after_defs)
3706 		{
3707 		  gsi = gsi_for_stmt
3708 		          (vect_find_last_scalar_stmt_in_slp (slp_node));
3709 		  init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3710 		}
3711 	      else
3712 		init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3713 	      if (ctor_seq != NULL)
3714 		{
3715 		  gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3716 		  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3717 		  ctor_seq = NULL;
3718 		}
3719 	      voprnds.quick_push (init);
3720 	      place_after_defs = false;
3721               number_of_places_left_in_vector = nunits;
3722 	      constant_p = true;
3723 	      elts.new_vector (vector_type, nunits, 1);
3724 	      elts.quick_grow (nunits);
3725             }
3726         }
3727     }
3728 
3729   /* Since the vectors are created in reverse order, we should reverse
3730      them.  */
3731   vec_num = voprnds.length ();
3732   for (j = vec_num; j != 0; j--)
3733     {
3734       vop = voprnds[j - 1];
3735       vec_oprnds->quick_push (vop);
3736     }
3737 
3738   voprnds.release ();
3739 
3740   /* If VF is greater than the unrolling factor needed for the SLP
3741      group of stmts, NUMBER_OF_VECTORS to be created is greater than
3742      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3743      to replicate the vectors.  */
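  /* E.g., if NUMBER_OF_VECTORS is 4 but only {V0, V1} were created above,
     the loop below extends the result to {V0, V1, V0, V1}, or to copies
     of the neutral vector when NEUTRAL_OP is set.  */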
3744   while (number_of_vectors > vec_oprnds->length ())
3745     {
3746       tree neutral_vec = NULL;
3747 
3748       if (neutral_op)
3749         {
3750           if (!neutral_vec)
3751 	    neutral_vec = build_vector_from_val (vector_type, neutral_op);
3752 
3753           vec_oprnds->quick_push (neutral_vec);
3754         }
3755       else
3756         {
3757           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3758             vec_oprnds->quick_push (vop);
3759         }
3760     }
3761 }
3762 
3763 
3764 /* Get the vectorized definitions from SLP_NODE, which contains the
3765    corresponding vectorized def-stmts.  */
3766 
3767 static void
3768 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3769 {
3770   tree vec_oprnd;
3771   gimple *vec_def_stmt;
3772   unsigned int i;
3773 
3774   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3775 
3776   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3777     {
3778       gcc_assert (vec_def_stmt);
3779       if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3780 	vec_oprnd = gimple_phi_result (vec_def_stmt);
3781       else
3782 	vec_oprnd = gimple_get_lhs (vec_def_stmt);
3783       vec_oprnds->quick_push (vec_oprnd);
3784     }
3785 }
3786 
3787 
3788 /* Get vectorized definitions for SLP_NODE.
3789    If the scalar definitions are loop invariants or constants, collect them and
3790    call vect_get_constant_vectors() to create vector stmts.
3791    Otherwise, the def-stmts must already be vectorized and the vectorized
3792    stmts must be stored in the corresponding child of SLP_NODE, and we call
3793    vect_get_slp_vect_defs () to retrieve them.  */
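/* For instance (illustrative), for a group of stmts a[i] = b[i] + x the
   defs for operand 0 come already vectorized from the child node holding
   the b[i] loads, whereas operand 1 (the invariant x) has no child and
   gets its vectors built by vect_get_constant_vectors.  */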
3794 
3795 void
3796 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3797 		   vec<vec<tree> > *vec_oprnds)
3798 {
3799   gimple *first_stmt;
3800   int number_of_vects = 0, i;
3801   unsigned int child_index = 0;
3802   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3803   slp_tree child = NULL;
3804   vec<tree> vec_defs;
3805   tree oprnd;
3806   bool vectorized_defs;
3807 
3808   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3809   FOR_EACH_VEC_ELT (ops, i, oprnd)
3810     {
3811       /* For each operand we check if it has vectorized definitions in a child
3812 	 node or we need to create them (for invariants and constants).  We
3813 	 check if the LHS of the first stmt of the next child matches OPRND.
3814 	 If it does, we found the correct child.  Otherwise, we call
3815 	 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
3816 	 this child node is checked again for the next operand.  */
3817       vectorized_defs = false;
3818       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3819         {
3820           child = SLP_TREE_CHILDREN (slp_node)[child_index];
3821 
3822 	  /* We have to check both pattern and original def, if available.  */
3823 	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3824 	    {
3825 	      gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3826 	      gimple *related
3827 		= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3828 	      tree first_def_op;
3829 
3830 	      if (gimple_code (first_def) == GIMPLE_PHI)
3831 		first_def_op = gimple_phi_result (first_def);
3832 	      else
3833 		first_def_op = gimple_get_lhs (first_def);
3834 	      if (operand_equal_p (oprnd, first_def_op, 0)
3835 		  || (related
3836 		      && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3837 		{
3838 		  /* The number of vector defs is determined by the number of
3839 		     vector statements in the node from which we get those
3840 		     statements.  */
3841 		  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3842 		  vectorized_defs = true;
3843 		  child_index++;
3844 		}
3845 	    }
3846 	  else
3847 	    child_index++;
3848         }
3849 
3850       if (!vectorized_defs)
3851         {
3852           if (i == 0)
3853             {
3854               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3855               /* Number of vector stmts was calculated according to LHS in
3856                  vect_schedule_slp_instance (), fix it by replacing LHS with
3857                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
3858                  details.  */
3859               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3860                                              &rhs_size_unit);
3861               if (rhs_size_unit != lhs_size_unit)
3862                 {
3863                   number_of_vects *= rhs_size_unit;
3864                   number_of_vects /= lhs_size_unit;
3865                 }
3866             }
3867         }
3868 
3869       /* Allocate memory for vectorized defs.  */
3870       vec_defs = vNULL;
3871       vec_defs.create (number_of_vects);
3872 
3873       /* For reduction defs we call vect_get_constant_vectors (), since we are
3874          looking for initial loop invariant values.  */
3875       if (vectorized_defs)
3876         /* The defs are already vectorized.  */
3877 	vect_get_slp_vect_defs (child, &vec_defs);
3878       else
3879 	/* Build vectors from scalar defs.  */
3880 	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3881 				   number_of_vects);
3882 
3883       vec_oprnds->quick_push (vec_defs);
3884     }
3885 }
3886 
3887 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3888    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3889    permute statements for the SLP node NODE of the SLP instance
3890    SLP_NODE_INSTANCE.  */
3891 
3892 bool
3893 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3894 			      gimple_stmt_iterator *gsi, poly_uint64 vf,
3895 			      slp_instance slp_node_instance, bool analyze_only,
3896 			      unsigned *n_perms)
3897 {
3898   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3899   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3900   tree mask_element_type = NULL_TREE, mask_type;
3901   int vec_index = 0;
3902   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3903   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3904   unsigned int mask_element;
3905   machine_mode mode;
3906   unsigned HOST_WIDE_INT nunits, const_vf;
3907 
3908   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3909     return false;
3910 
3911   stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3912 
3913   mode = TYPE_MODE (vectype);
3914 
3915   /* At the moment, all permutations are represented using per-element
3916      indices, so we can't cope with variable vector lengths or
3917      vectorization factors.  */
3918   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3919       || !vf.is_constant (&const_vf))
3920     return false;
3921 
3922   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3923      same size as the vector element being permuted.  */
3924   mask_element_type = lang_hooks.types.type_for_mode
3925     (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3926   mask_type = get_vectype_for_scalar_type (mask_element_type);
3927   vec_perm_builder mask (nunits, nunits, 1);
3928   mask.quick_grow (nunits);
3929   vec_perm_indices indices;
3930 
3931   /* Initialize the vect stmts of NODE to properly insert the generated
3932      stmts later.  */
3933   if (! analyze_only)
3934     for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3935 	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3936       SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3937 
3938   /* Generate permutation masks for every NODE. Number of masks for each NODE
3939      is equal to GROUP_SIZE.
3940      E.g., we have a group of three nodes with three loads from the same
3941      location in each node, and the vector size is 4.  I.e., we have an
3942      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3943      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3944      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3945      ...
3946 
3947      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3948      The last mask is illegal since we assume two operands for a permute
3949      operation, and the mask element values can't be outside that range.
3950      Hence, the last mask must be converted into {2,5,5,5}.
3951      For the first two permutations we need the first and the second input
3952      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3953      we need the second and the third vectors: {b1,c1,a2,b2} and
3954      {c2,a3,b3,c3}.  */
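  /* Concretely, the loops below compute for each scalar position
     i = LOAD_PERMUTATION[k] + j * GROUP_SIZE and select input vector
     i / nunits, lane i % nunits (biased by nunits when the lane comes
     from the second input).  For the a's above (load permutation {0,0,0},
     group size 3, nunits 4) this yields i = 0,0,0,3,3,3,6,6,6,9,9,9.  */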
3955 
3956   int vect_stmts_counter = 0;
3957   unsigned int index = 0;
3958   int first_vec_index = -1;
3959   int second_vec_index = -1;
3960   bool noop_p = true;
3961   *n_perms = 0;
3962 
3963   for (unsigned int j = 0; j < const_vf; j++)
3964     {
3965       for (int k = 0; k < group_size; k++)
3966 	{
3967 	  unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3968 			    + j * STMT_VINFO_GROUP_SIZE (stmt_info));
3969 	  vec_index = i / nunits;
3970 	  mask_element = i % nunits;
3971 	  if (vec_index == first_vec_index
3972 	      || first_vec_index == -1)
3973 	    {
3974 	      first_vec_index = vec_index;
3975 	    }
3976 	  else if (vec_index == second_vec_index
3977 		   || second_vec_index == -1)
3978 	    {
3979 	      second_vec_index = vec_index;
3980 	      mask_element += nunits;
3981 	    }
3982 	  else
3983 	    {
3984 	      if (dump_enabled_p ())
3985 		{
3986 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3987 				   "permutation requires at "
3988 				   "least three vectors ");
3989 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3990 				    stmt, 0);
3991 		}
3992 	      gcc_assert (analyze_only);
3993 	      return false;
3994 	    }
3995 
3996 	  gcc_assert (mask_element < 2 * nunits);
3997 	  if (mask_element != index)
3998 	    noop_p = false;
3999 	  mask[index++] = mask_element;
4000 
4001 	  if (index == nunits && !noop_p)
4002 	    {
4003 	      indices.new_vector (mask, 2, nunits);
4004 	      if (!can_vec_perm_const_p (mode, indices))
4005 		{
4006 		  if (dump_enabled_p ())
4007 		    {
4008 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4009 				       vect_location,
4010 				       "unsupported vect permute { ");
4011 		      for (i = 0; i < nunits; ++i)
4012 			{
4013 			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4014 			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4015 			}
4016 		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4017 		    }
4018 		  gcc_assert (analyze_only);
4019 		  return false;
4020 		}
4021 
4022 	      ++*n_perms;
4023 	    }
4024 
4025 	  if (index == nunits)
4026 	    {
4027 	      if (!analyze_only)
4028 		{
4029 		  tree mask_vec = NULL_TREE;
4030 
4031 		  if (! noop_p)
4032 		    mask_vec = vec_perm_indices_to_tree (mask_type, indices);
4033 
4034 		  if (second_vec_index == -1)
4035 		    second_vec_index = first_vec_index;
4036 
4037 		  /* Generate the permute statement if necessary.  */
4038 		  tree first_vec = dr_chain[first_vec_index];
4039 		  tree second_vec = dr_chain[second_vec_index];
4040 		  gimple *perm_stmt;
4041 		  if (! noop_p)
4042 		    {
4043 		      tree perm_dest
4044 			= vect_create_destination_var (gimple_assign_lhs (stmt),
4045 						       vectype);
4046 		      perm_dest = make_ssa_name (perm_dest);
4047 		      perm_stmt = gimple_build_assign (perm_dest,
4048 						       VEC_PERM_EXPR,
4049 						       first_vec, second_vec,
4050 						       mask_vec);
4051 		      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4052 		    }
4053 		  else
4054 		    /* If mask was NULL_TREE generate the requested
4055 		       identity transform.  */
4056 		    perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4057 
4058 		  /* Store the vector statement in NODE.  */
4059 		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4060 		}
4061 
4062 	      index = 0;
4063 	      first_vec_index = -1;
4064 	      second_vec_index = -1;
4065 	      noop_p = true;
4066 	    }
4067 	}
4068     }
4069 
4070   return true;
4071 }
4072 
4073 typedef hash_map <vec <gimple *>, slp_tree,
4074 		  simple_hashmap_traits <bst_traits, slp_tree> >
4075   scalar_stmts_to_slp_tree_map_t;
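/* Maps the scalar stmts of an already-vectorized SLP node to that node so
   that nodes with the same scalar stmts are vectorized only once; see
   vect_schedule_slp_instance below.  */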
4076 
4077 /* Vectorize SLP instance tree in postorder.  */
4078 
4079 static bool
4080 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
4081 			    scalar_stmts_to_slp_tree_map_t *bst_map)
4082 {
4083   gimple *stmt;
4084   bool grouped_store, is_store;
4085   gimple_stmt_iterator si;
4086   stmt_vec_info stmt_info;
4087   unsigned int group_size;
4088   tree vectype;
4089   int i, j;
4090   slp_tree child;
4091 
4092   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4093     return false;
4094 
4095   /* See if we have already vectorized the same set of stmts and reuse their
4096      vectorized stmts.  */
4097   if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
4098     {
4099       SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
4100       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4101 	= SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
4102       return false;
4103     }
4104 
4105   bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
4106   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4107     vect_schedule_slp_instance (child, instance, bst_map);
4108 
4109   /* Push SLP node def-type to stmts.  */
4110   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4111     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4112       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4113 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
4114 
4115   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
4116   stmt_info = vinfo_for_stmt (stmt);
4117 
4118   /* VECTYPE is the type of the destination.  */
4119   vectype = STMT_VINFO_VECTYPE (stmt_info);
4120   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4121   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4122 
4123   if (!SLP_TREE_VEC_STMTS (node).exists ())
4124     SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4125 
4126   if (dump_enabled_p ())
4127     {
4128       dump_printf_loc (MSG_NOTE, vect_location,
4129 		       "------>vectorizing SLP node starting from: ");
4130       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4131     }
4132 
4133   /* Vectorized stmts go before the last scalar stmt, which is where
4134      all uses are ready.  */
4135   si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
4136 
4137   /* Mark the first element of the reduction chain as reduction to properly
4138      transform the node.  In the analysis phase only the last element of the
4139      chain is marked as reduction.  */
4140   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
4141       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
4142     {
4143       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
4144       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4145     }
4146 
4147   /* Handle two-operation SLP nodes by vectorizing the group with
4148      both operations and then performing a merge.  */
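  /* E.g. (illustrative), for a group {+, -, +, -} with V4SI vectors the
     mask below is {0,1,0,1}; the group is vectorized once as all-plus (V0)
     and once as all-minus (V1), and the merge mask becomes {0,5,2,7},
     selecting even lanes from V0 and odd lanes from V1.  */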
4149   if (SLP_TREE_TWO_OPERATORS (node))
4150     {
4151       enum tree_code code0 = gimple_assign_rhs_code (stmt);
4152       enum tree_code ocode = ERROR_MARK;
4153       gimple *ostmt;
4154       vec_perm_builder mask (group_size, group_size, 1);
4155       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
4156 	if (gimple_assign_rhs_code (ostmt) != code0)
4157 	  {
4158 	    mask.quick_push (1);
4159 	    ocode = gimple_assign_rhs_code (ostmt);
4160 	  }
4161 	else
4162 	  mask.quick_push (0);
4163       if (ocode != ERROR_MARK)
4164 	{
4165 	  vec<gimple *> v0;
4166 	  vec<gimple *> v1;
4167 	  unsigned j;
4168 	  tree tmask = NULL_TREE;
4169 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4170 	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
4171 	  SLP_TREE_VEC_STMTS (node).truncate (0);
4172 	  gimple_assign_set_rhs_code (stmt, ocode);
4173 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4174 	  gimple_assign_set_rhs_code (stmt, code0);
4175 	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
4176 	  SLP_TREE_VEC_STMTS (node).truncate (0);
4177 	  tree meltype = build_nonstandard_integer_type
4178 	      (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4179 	  tree mvectype = get_same_sized_vectype (meltype, vectype);
4180 	  unsigned k = 0, l;
4181 	  for (j = 0; j < v0.length (); ++j)
4182 	    {
4183 	      /* Enforced by vect_build_slp_tree, which rejects variable-length
4184 		 vectors for SLP_TREE_TWO_OPERATORS.  */
4185 	      unsigned int const_nunits = nunits.to_constant ();
4186 	      tree_vector_builder melts (mvectype, const_nunits, 1);
4187 	      for (l = 0; l < const_nunits; ++l)
4188 		{
4189 		  if (k >= group_size)
4190 		    k = 0;
4191 		  tree t = build_int_cst (meltype,
4192 					  mask[k++] * const_nunits + l);
4193 		  melts.quick_push (t);
4194 		}
4195 	      tmask = melts.build ();
4196 
4197 	      /* ???  Not all targets support a VEC_PERM_EXPR with a
4198 	         constant mask that would translate to a vec_merge RTX
4199 		 (with their vec_perm_const_ok).  We can either not
4200 		 vectorize in that case or let veclower do its job.
4201 		 Unfortunately that isn't too great and at least for
4202 		 plus/minus we'd eventually like to match targets
4203 		 vector addsub instructions.  */
4204 	      gimple *vstmt;
4205 	      vstmt = gimple_build_assign (make_ssa_name (vectype),
4206 					   VEC_PERM_EXPR,
4207 					   gimple_assign_lhs (v0[j]),
4208 					   gimple_assign_lhs (v1[j]), tmask);
4209 	      vect_finish_stmt_generation (stmt, vstmt, &si);
4210 	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
4211 	    }
4212 	  v0.release ();
4213 	  v1.release ();
4214 	  return false;
4215 	}
4216     }
4217   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4218 
4219   /* Restore stmt def-types.  */
4220   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4221     if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4222       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4223 	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
4224 
4225   return is_store;
4226 }
4227 
4228 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4229    For loop vectorization this is done in vectorizable_call, but for SLP
4230    it needs to be deferred until end of vect_schedule_slp, because multiple
4231    SLP instances may refer to the same scalar stmt.  */
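/* E.g. (illustrative), a pure-SLP scalar call x_1 = __builtin_powf (a_2,
   b_3) is replaced below by x_1 = 0.0f; all of x_1's uses were vectorized,
   so the now-dead assignment can be cleaned up later.  */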
4232 
4233 static void
4234 vect_remove_slp_scalar_calls (slp_tree node)
4235 {
4236   gimple *stmt, *new_stmt;
4237   gimple_stmt_iterator gsi;
4238   int i;
4239   slp_tree child;
4240   tree lhs;
4241   stmt_vec_info stmt_info;
4242 
4243   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4244     return;
4245 
4246   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4247     vect_remove_slp_scalar_calls (child);
4248 
4249   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4250     {
4251       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
4252 	continue;
4253       stmt_info = vinfo_for_stmt (stmt);
4254       if (stmt_info == NULL
4255 	  || is_pattern_stmt_p (stmt_info)
4256 	  || !PURE_SLP_STMT (stmt_info))
4257 	continue;
4258       lhs = gimple_call_lhs (stmt);
4259       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4260       set_vinfo_for_stmt (new_stmt, stmt_info);
4261       set_vinfo_for_stmt (stmt, NULL);
4262       STMT_VINFO_STMT (stmt_info) = new_stmt;
4263       gsi = gsi_for_stmt (stmt);
4264       gsi_replace (&gsi, new_stmt, false);
4265       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4266     }
4267 }
4268 
4269 /* Generate vector code for all SLP instances in the loop/basic block.  */
4270 
4271 bool
4272 vect_schedule_slp (vec_info *vinfo)
4273 {
4274   vec<slp_instance> slp_instances;
4275   slp_instance instance;
4276   unsigned int i;
4277   bool is_store = false;
4278 
4280   scalar_stmts_to_slp_tree_map_t *bst_map
4281     = new scalar_stmts_to_slp_tree_map_t ();
4282   slp_instances = vinfo->slp_instances;
4283   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4284     {
4285       /* Schedule the tree of INSTANCE.  */
4286       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4287                                              instance, bst_map);
4288       if (dump_enabled_p ())
4289 	dump_printf_loc (MSG_NOTE, vect_location,
4290                          "vectorizing stmts using SLP.\n");
4291     }
4292   delete bst_map;
4293 
4294   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4295     {
4296       slp_tree root = SLP_INSTANCE_TREE (instance);
4297       gimple *store;
4298       unsigned int j;
4299       gimple_stmt_iterator gsi;
4300 
4301       /* Remove scalar call stmts.  Do not do this for basic-block
4302 	 vectorization as not all uses may be vectorized.
4303 	 ???  Why should this be necessary?  DCE should be able to
4304 	 remove the stmts itself.
4305 	 ???  For BB vectorization we can as well remove scalar
4306 	 stmts starting from the SLP tree root if they have no
4307 	 uses.  */
4308       if (is_a <loop_vec_info> (vinfo))
4309 	vect_remove_slp_scalar_calls (root);
4310 
4311       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
4312                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4313         {
4314           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
4315             break;
4316 
4317           if (is_pattern_stmt_p (vinfo_for_stmt (store)))
4318             store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
4319           /* Free the attached stmt_vec_info and remove the stmt.  */
4320           gsi = gsi_for_stmt (store);
4321 	  unlink_stmt_vdef (store);
4322           gsi_remove (&gsi, true);
4323 	  release_defs (store);
4324           free_stmt_vec_info (store);
4325         }
4326     }
4327 
4328   return is_store;
4329 }
4330