/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "tree.h"
#include "stor-layout.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"

/* Extract the location of the basic block in the source code.
   Return the location of the first statement in BB that has one,
   or UNKNOWN_LOCATION if no statement does.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */
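/* For example (illustrative, not taken from a testcase), for the
   isomorphic pair of stmts
     x0_1 = a0_2 + b0_3;
     x1_4 = a1_5 + b1_6;
   the new node records both stmts and reserves two child slots, one
   per addition operand.  */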

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
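/* For instance (illustrative), in the chain of loads from a[0], a[1],
   a[2], a[3], the stmt reading a[2] is at place 2, counting from
   FIRST_STMT at place 0.  */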

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
	return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  */
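/* For example (illustrative), for the group
     x0_1 = a0_2 + 1;
     x1_3 = a1_4 + 1;
   operand 0 collects the two internal def-stmts of a0_2 and a1_4,
   while operand 1 is the constant 1 in both lanes, so both operands
   match the first stmt of the group.  */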

static bool
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int op_idx = 1;
  tree compare_rhs = NULL_TREE;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        number_of_oprnds++;
    }
  else
    return false;

  for (i = 0; i < number_of_oprnds; i++)
    {
      if (compare_rhs)
	{
	  oprnd = compare_rhs;
	  compare_rhs = NULL_TREE;
	}
      else
        oprnd = gimple_op (stmt, op_idx++);

      oprnd_info = (*oprnds_info)[i];

      if (COMPARISON_CLASS_P (oprnd))
        {
          compare_rhs = TREE_OPERAND (oprnd, 1);
          oprnd = TREE_OPERAND (oprnd, 0);
	}

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
			       &def, &dt)
	  || (!def_stmt && dt != vect_constant_def))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't find def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return false;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
	      || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
		  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      return false;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.\n");
              return false;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "unsupported defining stmt:\n");
                return false;
            }
        }

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
		&& !((oprnd_info->first_dt == vect_external_def
		      || oprnd_info->first_dt == vect_constant_def)
		     && (dt == vect_external_def
			 || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
				       TREE_TYPE (oprnd))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return false;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
        case vect_reduction_def:
	  break;

	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return false;
	}
    }

  return true;
}


/* Verify that the scalar stmts STMTS are isomorphic, do not require
   data permutation, and are of supported types of operation.  Return
   true if so; otherwise return false and indicate in *MATCHES which
   stmts are not isomorphic to the first one.  If MATCHES[0] is false
   then this indicates the comparison could not be carried out or the
   stmts will never be vectorized by SLP.  */
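/* For example (illustrative), the pair
     x0_1 = a0_2 * y_3;
     x1_4 = a1_5 * y_3;
   is isomorphic (same operation on compatible types), whereas a pair
   mixing a multiplication with an addition is not, and would get its
   MATCHES entry cleared.  */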

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
		       vec<gimple> stmts, unsigned int group_size,
		       unsigned nops, unsigned int *max_nunits,
		       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

       if (is_gimple_assign (stmt)
	   && gimple_assign_rhs_code (stmt) == COND_EXPR
           && (cond = gimple_assign_rhs1 (stmt))
           && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: condition is not "
			       "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (is_gimple_call (stmt))
	{
	  rhs_code = CALL_EXPR;
	  if (gimple_call_internal_p (stmt)
	      || gimple_call_tail_p (stmt)
	      || gimple_call_noreturn_p (stmt)
	      || !gimple_call_nothrow_p (stmt)
	      || gimple_call_chain (stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      gimple first_stmt = stmts[0];
	      if (gimple_call_num_args (stmt) != nops
		  || !operand_equal_p (gimple_call_fn (first_stmt),
				       gimple_call_fn (stmt), 0)
		  || gimple_call_fntype (first_stmt)
		     != gimple_call_fntype (stmt))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      unsigned unrolling_factor
		= least_common_multiple
		    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
		 and no gap between the groups when we need to load
		 multiple groups at once.
		 ???  We should enhance this to only disallow gaps
		 inside vectors.  */
              if ((unrolling_factor > 1
		   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
		   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
		  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
		      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: grouped "
				       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
		  /* Fatal mismatch.  */
		  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
	      unsigned ncopies
		= vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
		       - GROUP_GAP (vinfo_for_stmt (stmt)))
		      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: the number "
				       "of interleaved loads is greater than "
				       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
		  /* Fatal mismatch.  */
		  matches[0] = false;
                  return false;
                }

	      old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
		      /* Mismatch.  */
		      continue;
                    }
                }
              else
                prev_first_load = first_load;

	      /* In some cases a group of loads is just the same load
		 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: unsupported "
					   "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
		      /* Fatal mismatch.  */
		      matches[0] = false;
                      return false;
                    }
                }
           }
        } /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && rhs_code != COND_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
		  /* Mismatch.  */
		  continue;
		}
            }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Return TRUE on success, or FALSE if the def-stmts are not isomorphic,
   require data permutation, or are of an unsupported type of operation;
   in that case MATCHES records which scalar stmts failed to match the
   first one.  */
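/* For example (illustrative), for the store group
     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
   the resulting tree has the two stores in the root node, one child
   node holding the two additions, and below it two leaf nodes for the
   grouped loads from b[] and c[].  */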

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     unsigned max_tree_size)
{
  unsigned nops, i, this_npermutes = 0, this_tree_size = 0;
  gimple stmt;

  if (!matches)
    matches = XALLOCAVEC (bool, group_size);
  if (!npermutes)
    npermutes = &this_npermutes;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
			      max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
					stmt, (i == 0), &oprnds_info))
	{
	  vect_free_oprnd_info (oprnds_info);
	  return false;
	}
    }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
	{
	  vect_free_oprnd_info (oprnds_info);
	  return false;
	}

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
	{
	  vect_free_oprnd_info (oprnds_info);
	  return false;
	}

      bool *matches = XALLOCAVEC (bool, group_size);
      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
			       group_size, max_nunits, loads,
			       vectorization_factor, matches,
			       npermutes, &this_tree_size, max_tree_size))
	{
	  oprnd_info->def_stmts = vNULL;
	  SLP_TREE_CHILDREN (*node).quick_push (child);
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted, try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ???  For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt)
	  && commutative_tree_code (gimple_assign_rhs_code (stmt))
	  /* Do so only if the number of unsuccessful permutes was no
	     more than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
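	  /* For example (illustrative), with GROUP_SIZE 2 and the
	     scalar stmts
	       x0_1 = a0_2 + b0_3;
	       x1_4 = b1_5 + a1_6;
	     lane 1 of operand zero fails to match; swapping the
	     mismatched lane gives the operand vectors { a0_2, a1_6 }
	     and { b0_3, b1_5 }, from which valid SLP children can be
	     built.  */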
	  /* Roll back.  */
	  *max_nunits = old_max_nunits;
	  loads->truncate (old_nloads);
	  /* Swap mismatched definition stmts.  */
	  for (unsigned j = 0; j < group_size; ++j)
	    if (!matches[j])
	      {
		gimple tem = oprnds_info[0]->def_stmts[j];
		oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
		oprnds_info[1]->def_stmts[j] = tem;
	      }
	  /* And try again ... */
	  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
				   group_size, max_nunits, loads,
				   vectorization_factor,
				   matches, npermutes, &this_tree_size,
				   max_tree_size))
	    {
	      oprnd_info->def_stmts = vNULL;
	      SLP_TREE_CHILDREN (*node).quick_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */
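/* For example (illustrative), with PERMUTATION = { 1, 0 } the two
   scalar stmts of NODE (and, via the recursion below, of every child
   node) trade places: the stmt at index I moves to index
   PERMUTATION[I].  */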

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */
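/* For instance (illustrative), a load node whose stmts access the
   elements of their interleaving chain in the order a[1], a[0], a[3],
   a[2] carries the load permutation 1 0 3 2.  */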

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
	return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
	{
	  if (!load->load_permutation.exists ())
	    return false;
	  FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	    if (lidx != node->load_permutation[j])
	      return false;
	}

      /* Check that the loads in the first sequence are different and there
	 are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
	{
	  if (bitmap_bit_p (load_index, lidx))
	    {
	      sbitmap_free (load_index);
	      return false;
	    }
	  bitmap_set_bit (load_index, lidx);
	}
      for (i = 0; i < group_size; i++)
	if (!bitmap_bit_p (load_index, i))
	  {
	    sbitmap_free (load_index);
	    return false;
	  }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
	 statements in the nodes is not important unless they are memory
	 accesses, we can rearrange the statements in all the nodes
	 according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
				node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
	 form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
		return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
	 ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  first_load = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
	    {
	      dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
	      if (vect_supportable_dr_alignment (dr, false)
		  == dr_unaligned_unsupported)
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				       vect_location,
				       "unsupported unaligned load ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	    }
	}

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
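  /* For example (illustrative), with GROUP_SIZE 2 and two load nodes,
     the permutations { 0, 0 } in one node and { 1, 1 } in the other
     are accepted: each node reads one distinct element of the chain,
     repeated in all lanes.  */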
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
	{
	  sbitmap_free (load_index);
	  return false;
	}
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
	if (k != lidx)
	  {
	    sbitmap_free (load_index);
	    return false;
	  }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
	sbitmap_free (load_index);
	return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load
	      (node, vNULL, NULL,
	       SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
			 slp_instance instance, slp_tree node,
			 stmt_vector_for_cost *prologue_cost_vec,
			 unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
			     instance, child, prologue_cost_vec,
			     ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	vect_model_store_cost (stmt_info, ncopies_for_cost, false,
			       vect_uninitialized_def,
			       node, prologue_cost_vec, body_cost_vec);
      else
	{
	  int i;
	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
	  vect_model_load_cost (stmt_info, ncopies_for_cost, false,
				node, prologue_cost_vec, body_cost_vec);
	  /* If the load is permuted record the cost for the permutation.
	     ???  Loads from multiple chains are let through here only
	     for a single special case involving complex numbers where
	     in the end no permutation is necessary.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
	    if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
		 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
		&& vect_get_place_in_interleaving_chain
		     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
	      {
		record_stmt_cost (body_cost_vec, group_size, vec_perm,
				  stmt_info, 0, vect_body);
		break;
	      }
	}
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
		      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
	continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
			      &def_stmt, &def, &dt)
	  && (dt == vect_constant_def || dt == vect_external_def))
	record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
			  stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
		       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (the number of vector stmts is 1 if NUNITS >= GROUP_SIZE, and
     GROUP_SIZE / NUNITS otherwise).  */
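  /* For instance (illustrative), NUNITS 4 and GROUP_SIZE 6 give
     least_common_multiple (4, 6) / 4 = 3 vector stmts per scalar
     stmt.  */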
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
			   instance, SLP_INSTANCE_TREE (instance),
			   &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
		: BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
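  /* E.g. (illustrative), NUNITS 4 and GROUP_SIZE 2 give an unrolling
     factor of least_common_multiple (4, 2) / 2 = 2: two copies of the
     group are needed to fill one vector.  */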
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required in basic"
			 " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
	    scalar_stmts.safe_push (
		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
	  else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
	scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
			   &max_nunits, &loads,
			   vectorization_factor, NULL, NULL, NULL,
			   max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unrolling required in basic"
			     " block SLP\n");
	  vect_free_slp_tree (node);
	  loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
      SLP_INSTANCE_LOADS (new_instance) = loads;
      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;

      /* Compute the load permutation.  */
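      /* A sketch of what the loop below records (example values only):
	 if a load node reads a[1], a[0], a[3], a[2] and the first element
	 of the interleaving chain is the load of a[0], the permutation
	 pushed below is {1, 0, 3, 2} and LOADS_PERMUTED is set.  */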
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
	{
	  vec<unsigned> load_permutation;
	  int j;
	  gimple load, first_stmt;
	  bool this_load_permuted = false;
	  load_permutation.create (group_size);
	  first_stmt = GROUP_FIRST_ELEMENT
	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
	    {
	      int load_place
		= vect_get_place_in_interleaving_chain (load, first_stmt);
	      gcc_assert (load_place != -1);
	      if (load_place != j)
		this_load_permuted = true;
	      load_permutation.safe_push (load_place);
	    }
	  if (!this_load_permuted)
	    {
	      load_permutation.release ();
	      continue;
	    }
	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
	  loads_permuted = true;
	}

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported load "
				   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }

          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
	    = vect_find_first_load_in_slp_instance (new_instance);
        }

      /* Compute the costs of this SLP instance.  */
      vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
			     new_instance, TYPE_VECTOR_SUBPARTS (vectype));

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
	vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}


/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
		  unsigned max_tree_size)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
				   max_tree_size))
      ok = true;

  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Failed to SLP the basic block.\n");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
				       max_tree_size))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
				    max_tree_size))
    ok = true;

  return true;
}


/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
                     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
	unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Decided to SLP %d instances. Unrolling factor %d\n",
		     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}


/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
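/* E.g. (a hypothetical illustration): if x_1 = y_2 + z_3 is PURE_SLP but
   x_1 is also used by a stmt that is only loop-vectorized, x_1 must be
   computed by both the SLP and the loop-based vectorized code, so its
   node is marked HYBRID below.  */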

static void
vect_detect_hybrid_slp_stmts (slp_tree node)
{
  int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
  gimple stmt = stmts[0];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = NULL;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
  basic_block bb = NULL;

  if (!node)
    return;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);
  else
    bb = BB_VINFO_BB (bb_vinfo);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
	&& TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
	if (gimple_bb (use_stmt)
            && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
		 || bb == gimple_bb (use_stmt))
	    && (stmt_vinfo = vinfo_for_stmt (use_stmt))
	    && !STMT_SLP_TYPE (stmt_vinfo)
            && (STMT_VINFO_RELEVANT (stmt_vinfo)
                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))
		|| (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
		    && STMT_VINFO_RELATED_STMT (stmt_vinfo)
		    && !STMT_SLP_TYPE (vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo)))))
	    && !(gimple_code (use_stmt) == GIMPLE_PHI
                 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
                  == vect_reduction_def))
	  vect_mark_slp_stmts (node, hybrid, i);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_detect_hybrid_slp_stmts (child);
}


/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
                     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
}


/* Create and initialize a new bb_vec_info struct for BB, as well as
   stmt_vec_info structs for all the stmts in it.  */

static bb_vec_info
new_bb_vec_info (basic_block bb)
{
  bb_vec_info res = NULL;
  gimple_stmt_iterator gsi;

  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
  BB_VINFO_BB (res) = bb;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
    }

  BB_VINFO_GROUPED_STORES (res).create (10);
  BB_VINFO_SLP_INSTANCES (res).create (2);
  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);

  bb->aux = res;
  return res;
}


/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

static void
destroy_bb_vec_info (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  basic_block bb;
  gimple_stmt_iterator si;
  unsigned i;

  if (!bb_vinfo)
    return;

  bb = BB_VINFO_BB (bb_vinfo);

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
        /* Free stmt_vec_info.  */
        free_stmt_vec_info (stmt);
    }

  vect_destroy_datarefs (NULL, bb_vinfo);
  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_free_slp_instance (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  free (bb_vinfo);
  bb->aux = NULL;
}


/* Analyze statements contained in SLP tree node after recursively analyzing
   the subtree.  Return TRUE if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
{
  bool dummy;
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
      return false;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);
      gcc_assert (PURE_SLP_STMT (stmt_info));

      if (!vect_analyze_stmt (stmt, &dummy, node))
        return false;
    }

  return true;
}


/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
   operations are supported.  */

static bool
vect_slp_analyze_operations (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;

  for (i = 0; slp_instances.iterate (i, &instance); )
    {
      if (!vect_slp_analyze_node_operations (bb_vinfo,
                                             SLP_INSTANCE_TREE (instance)))
        {
	  vect_free_slp_instance (instance);
          slp_instances.ordered_remove (i);
	}
      else
        i++;
    }

  if (!slp_instances.length ())
    return false;

  return true;
}


/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */

static unsigned
vect_bb_slp_scalar_cost (basic_block bb,
			 slp_tree node, vec<bool, va_heap> *life)
{
  unsigned scalar_cost = 0;
  unsigned i;
  gimple stmt;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      unsigned stmt_cost;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      stmt_vec_info stmt_info;

      if ((*life)[i])
	continue;

      /* If there is a non-vectorized use of the defs then the scalar
         stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple use_stmt;
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt)
		&& (gimple_code (use_stmt) == GIMPLE_PHI
		    || gimple_bb (use_stmt) != bb
		    || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
	      {
		(*life)[i] = true;
		BREAK_FROM_IMM_USE_STMT (use_iter);
	      }
	}
      if ((*life)[i])
	continue;

      stmt_info = vinfo_for_stmt (stmt);
      if (STMT_VINFO_DATA_REF (stmt_info))
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
            stmt_cost = vect_get_stmt_cost (scalar_load);
          else
            stmt_cost = vect_get_stmt_cost (scalar_store);
        }
      else
        stmt_cost = vect_get_stmt_cost (scalar_stmt);

      scalar_cost += stmt_cost;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);

  return scalar_cost;
}

/* Check if vectorization of the basic block is profitable.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i, j;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
  void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
  stmt_vec_info stmt_info = NULL;
  stmt_vector_for_cost body_cost_vec;
  stmt_info_for_cost *ci;

  /* Calculate vector costs.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);

      FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
	{
	  stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
	  (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
				stmt_info, ci->misalign, vect_body);
	}
    }

  /* Calculate scalar cost.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
      scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
					      SLP_INSTANCE_TREE (instance),
					      &life);
    }

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
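  /* For instance (hypothetical numbers): a prologue cost of 2 and an
     epilogue cost of 1 give VEC_OUTSIDE_COST == 3; if VEC_INSIDE_COST is
     10 and SCALAR_COST is 16, then 3 + 10 < 16 and the check below deems
     vectorization profitable.  */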

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its cost is less than the cost of the
     scalar version.  */
  if (vec_outside_cost + vec_inside_cost >= scalar_cost)
    return false;

  return true;
}

/* Check if the basic block can be vectorized.  */

static bb_vec_info
vect_slp_analyze_bb_1 (basic_block bb)
{
  bb_vec_info bb_vinfo;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  int i;
  int min_vf = 2;
  unsigned n_stmts = 0;

  bb_vinfo = new_bb_vec_info (bb);
  if (!bb_vinfo)
    return NULL;

  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: not enough data-refs in "
			 "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  vect_pattern_recog (NULL, bb_vinfo);

  if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad data alignment in basic "
			 "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: failed to find SLP opportunities "
			 "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);

  /* Mark all the statements that we want to vectorize as pure SLP and
     relevant.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
    }

  /* Mark all the statements that we do not want to vectorize.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
      if (STMT_SLP_TYPE (vinfo) != pure_slp)
	STMT_VINFO_VECTORIZABLE (vinfo) = false;
    }

  /* Analyze dependences.  At this point all stmts not participating in
     vectorization have to be marked.  Dependence analysis assumes
     that we either vectorize all SLP instances or none at all.  */
  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data dependence "
			 "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported alignment in basic "
                         "block.\n");
      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization is not "
			 "profitable.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Basic block will be vectorized using SLP\n");

  return bb_vinfo;
}


bb_vec_info
vect_slp_analyze_bb (basic_block bb)
{
  bb_vec_info bb_vinfo;
  int insns = 0;
  gimple_stmt_iterator gsi;
  unsigned int vector_sizes;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      if (!is_gimple_debug (stmt)
          && !gimple_nop_p (stmt)
          && gimple_code (stmt) != GIMPLE_LABEL)
        insns++;
    }

  if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: too many instructions in "
			 "basic block.\n");

      return NULL;
    }

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
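  /* VECTOR_SIZES is a bitmask of vector sizes the target supports.  The
     loop below always retries with the largest remaining size, e.g. (for
     a hypothetical target advertising 32- and 16-byte vectors) a failed
     32-byte analysis is followed by a 16-byte one.  */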

  while (1)
    {
      bb_vinfo = vect_slp_analyze_bb_1 (bb);
      if (bb_vinfo)
        return bb_vinfo;

      destroy_bb_vec_info (bb_vinfo);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with "
			 "vector size %d\n", current_vector_size);
    }
}


/* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
   the number of created vector stmts depends on the unrolling factor).
   However, the actual number of vector stmts for every SLP node depends on
   VF which is set later in vect_analyze_operations ().  Hence, SLP costs
   should be updated.  In this function we assume that the inside costs
   calculated in vect_model_xxx_cost are linear in ncopies.  */
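/* A sketch with illustrative numbers: if VF is 8 and an instance was built
   with unrolling factor 2, NCOPIES below is 8 / 2 == 4, so every cost
   recorded for the instance body is accounted four times in the target
   cost model.  */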

void
vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
{
  unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  stmt_vector_for_cost body_cost_vec;
  stmt_info_for_cost *si;
  void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_update_slp_costs_according_to_vf ===\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* We assume that costs are linear in ncopies.  */
      int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Record the instance's instructions in the target cost model.
	 This was delayed until here because the count of instructions
	 isn't known beforehand.  */
      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);

      FOR_EACH_VEC_ELT (body_cost_vec, j, si)
	(void) add_stmt_cost (data, si->count * ncopies, si->kind,
			      vinfo_for_stmt (si->stmt), si->misalign,
			      vect_body);
    }
}


/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
   scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
   REDUC_INDEX is the index of the reduction operand in the statements, unless
   it is -1.  */

static void
vect_get_constant_vectors (tree op, slp_tree slp_node,
                           vec<tree> *vec_oprnds,
			   unsigned int op_num, unsigned int number_of_vectors,
                           int reduc_index)
{
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = stmts[0];
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  unsigned nunits;
  tree vec_cst;
  tree *elts;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = stmts.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  vec<tree> voprnds;
  voprnds.create (number_of_vectors);
  bool constant_p, is_store;
  tree neutral_op = NULL;
  enum tree_code code = gimple_expr_code (stmt);
  gimple def_stmt;
  struct loop *loop;
  gimple_seq ctor_seq = NULL;

  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
      && reduc_index != -1)
    {
      op_num = reduc_index - 1;
      op = gimple_op (stmt, reduc_index);
      /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
         we need either neutral operands or the original operands.  See
         get_initial_def_for_reduction() for details.  */
      switch (code)
        {
          case WIDEN_SUM_EXPR:
          case DOT_PROD_EXPR:
          case PLUS_EXPR:
          case MINUS_EXPR:
          case BIT_IOR_EXPR:
          case BIT_XOR_EXPR:
             if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
               neutral_op = build_real (TREE_TYPE (op), dconst0);
             else
               neutral_op = build_int_cst (TREE_TYPE (op), 0);

             break;

          case MULT_EXPR:
             if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
               neutral_op = build_real (TREE_TYPE (op), dconst1);
             else
               neutral_op = build_int_cst (TREE_TYPE (op), 1);

             break;

          case BIT_AND_EXPR:
            neutral_op = build_int_cst (TREE_TYPE (op), -1);
            break;

	  /* For MIN/MAX we don't have an easy neutral operand but
	     the initial values can be used fine here.  Only for
	     a reduction chain we have to force a neutral element.  */
	  case MAX_EXPR:
	  case MIN_EXPR:
	    if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
	      neutral_op = NULL;
	    else
	      {
		def_stmt = SSA_NAME_DEF_STMT (op);
		loop = (gimple_bb (stmt))->loop_father;
		neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
						    loop_preheader_edge (loop));
	      }
	    break;

          default:
            neutral_op = NULL;
        }
    }

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    is_store = false;

  gcc_assert (op);

  if (CONSTANT_CLASS_P (op))
    constant_p = true;
  else
    constant_p = false;

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
  gcc_assert (vector_type);
  nunits = TYPE_VECTOR_SUBPARTS (vector_type);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  elts = XALLOCAVEC (tree, nunits);
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
	    {
	      switch (code)
		{
		  case COND_EXPR:
		    if (op_num == 0 || op_num == 1)
		      {
			tree cond = gimple_assign_rhs1 (stmt);
			op = TREE_OPERAND (cond, op_num);
		      }
		    else
		      {
			if (op_num == 2)
			  op = gimple_assign_rhs2 (stmt);
			else
			  op = gimple_assign_rhs3 (stmt);
		      }
		    break;

		  case CALL_EXPR:
		    op = gimple_call_arg (stmt, op_num);
		    break;

		  case LSHIFT_EXPR:
		  case RSHIFT_EXPR:
		  case LROTATE_EXPR:
		  case RROTATE_EXPR:
		    op = gimple_op (stmt, op_num + 1);
		    /* Unlike the other binary operators, shifts/rotates have
		       the shift count being int, instead of the same type as
		       the lhs, so make sure the scalar is the right type if
		       we are dealing with vectors of
		       long long/long/short/char.  */
		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
		      op = fold_convert (TREE_TYPE (vector_type), op);
		    break;

		  default:
		    op = gimple_op (stmt, op_num + 1);
		    break;
		}
	    }

          if (reduc_index != -1)
            {
              loop = (gimple_bb (stmt))->loop_father;
              def_stmt = SSA_NAME_DEF_STMT (op);

              gcc_assert (loop);

              /* Get the def before the loop.  In reduction chain we have only
                 one initial value.  */
              if ((j != (number_of_copies - 1)
                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                       && i != 0))
                  && neutral_op)
                op = neutral_op;
              else
                op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
                                            loop_preheader_edge (loop));
            }

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  op = fold_unary (VIEW_CONVERT_EXPR,
				   TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  tree new_temp
		    = make_ssa_name (TREE_TYPE (vector_type), NULL);
		  gimple init_stmt;
		  op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
			       op);
		  init_stmt
		    = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
						    new_temp, op, NULL_TREE);
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

	      if (constant_p)
		vec_cst = build_vector (vector_type, elts);
	      else
		{
		  vec<constructor_elt, va_gc> *v;
		  unsigned k;
		  vec_alloc (v, nunits);
		  for (k = 0; k < nunits; ++k)
		    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
		  vec_cst = build_constructor (vector_type, v);
		}
              voprnds.quick_push (vect_init_vector (stmt, vec_cst,
						    vector_type, NULL));
	      if (ctor_seq != NULL)
		{
		  gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
		  gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
		  gsi_insert_seq_before_without_update (&gsi, ctor_seq,
							GSI_SAME_STMT);
		  ctor_seq = NULL;
		}
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      vec_oprnds->quick_push (vop);
    }

  voprnds.release ();

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
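  /* E.g. (illustrative values): if NUMBER_OF_VECTORS is 4 but only two
     vectors were created above, the loop below pushes either two neutral
     vectors (for reductions with a neutral element) or the two existing
     vectors over again.  */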
  while (number_of_vectors > vec_oprnds->length ())
    {
      tree neutral_vec = NULL;

      if (neutral_op)
        {
          if (!neutral_vec)
	    neutral_vec = build_vector_from_val (vector_type, neutral_op);

          vec_oprnds->quick_push (neutral_vec);
        }
      else
        {
          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
            vec_oprnds->quick_push (vop);
        }
    }
}


/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());

  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      vec_oprnds->quick_push (vec_oprnd);
    }
}


/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
		   vec<vec<tree> > *vec_oprnds, int reduc_index)
{
  gimple first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
	 node or we need to create them (for invariants and constants).  We
	 check if the LHS of the first stmt of the next child matches OPRND.
	 If it does, we found the correct child.  Otherwise, we call
	 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
	 to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

	  /* We have to check both pattern and original def, if available.  */
	  gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
	  gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));

	  if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
	      || (related
		  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
	    {
	      /* The number of vector defs is determined by the number of
		 vector statements in the node from which we get those
		 statements.  */
	      number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
	      vectorized_defs = true;
	      child_index++;
	    }
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
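              /* For instance (hypothetical stmts): for a group of
                 conversions like int_x = (int) short_y, LHS_SIZE_UNIT is 4
                 and RHS_SIZE_UNIT is 2, so the adjustment below halves
                 NUMBER_OF_VECTS, scaling by rhs_size_unit / lhs_size_unit.  */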
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs && reduc_index == -1)
        /* The defs are already vectorized.  */
	vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects, reduc_index);

      vec_oprnds->quick_push (vec_defs);

      /* For reductions, we only need initial values.  */
      if (reduc_index != -1)
        return;
    }
}


/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
   building a vector of type MASK_TYPE from it) and two input vectors placed in
   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
   shifting by STRIDE elements of DR_CHAIN for every copy.
   (STRIDE is the number of vectorized stmts for NODE divided by the number of
   copies).
   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
   the created stmts must be inserted.  */
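/* For example (illustrative numbers only): with four vector stmts for NODE
   and NCOPIES == 2, STRIDE is 2; the first copy permutes DR_CHAIN[0] and
   DR_CHAIN[1], and the second copy permutes DR_CHAIN[2] and DR_CHAIN[3].  */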

static inline void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                           tree mask, int first_vec_indx, int second_vec_indx,
                           gimple_stmt_iterator *gsi, slp_tree node,
                           tree vectype, vec<tree> dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree perm_dest;
  gimple perm_stmt = NULL;
  stmt_vec_info next_stmt_info;
  int i, stride;
  tree first_vec, second_vec, data_ref;

  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  for (i = SLP_TREE_VEC_STMTS (node).length ();
       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
    SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = dr_chain[first_vec_indx];
      second_vec = dr_chain[second_vec_indx];

      /* Generate the permute statement.  */
      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
						first_vec, second_vec, mask);
      data_ref = make_ssa_name (perm_dest, perm_stmt);
      gimple_set_lhs (perm_stmt, data_ref);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);

      /* Store the vector statement in NODE.  */
      SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;

      first_vec_indx += stride;
      second_vec_indx += stride;
    }

  /* Mark the scalar stmt as vectorized.  */
  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}


/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
   return in CURRENT_MASK_ELEMENT its equivalent in target specific
   representation.  Check that the mask is valid and return FALSE if not.
   Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
   the next vector, i.e., the current first vector is not needed.  */

static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
		       unsigned char *mask, int *current_mask_element,
                       bool *need_next_vector, int *number_of_mask_fixes,
                       bool *mask_fixed, bool *needs_first_vector)
{
  int i;

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

  if (*current_mask_element < mask_nunits)
    *needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "permutation requires at least two vectors ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  /* The mask requires the next vector.  */
  if (*current_mask_element >= mask_nunits * 2)
    {
      if (*needs_first_vector || *mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector. In both cases, this permutation needs three
             vectors.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "permutation requires at "
			       "least three vectors ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask. Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}


/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  enum machine_mode mode;

  mode = TYPE_MODE (vectype);

  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
		(int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
     unrolling factor.  */
  orig_vec_stmts_num = group_size *
                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* Number of copies is determined by the final vectorization factor
     relatively to SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

    {
      scalar_index = 0;
      index = 0;
      vect_stmts_counter = 0;
      vec_index = 0;
      first_vec_index = vec_index++;
      if (only_one_vec)
        second_vec_index = first_vec_index;
      else
        second_vec_index = vec_index++;

      for (j = 0; j < unroll_factor; j++)
        {
          for (k = 0; k < group_size; k++)
            {
	      i = SLP_TREE_LOAD_PERMUTATION (node)[k];
              first_mask_element = i + j * group_size;
              if (!vect_get_mask_element (stmt, first_mask_element, 0,
					  nunits, only_one_vec, index,
					  mask, &current_mask_element,
					  &need_next_vector,
					  &number_of_mask_fixes, &mask_fixed,
					  &needs_first_vector))
		return false;
	      mask[index++] = current_mask_element;

              if (index == nunits)
                {
		  index = 0;
		  if (!can_vec_perm_p (mode, false, mask))
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "unsupported vect permute { ");
			  for (i = 0; i < nunits; ++i)
			    dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
					 mask[i]);
			  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
			}
		      return false;
		    }

                  if (!analyze_only)
                    {
		      int l;
		      tree mask_vec, *mask_elts;
		      mask_elts = XALLOCAVEC (tree, nunits);
		      for (l = 0; l < nunits; ++l)
			mask_elts[l] = build_int_cst (mask_element_type,
						      mask[l]);
		      mask_vec = build_vector (mask_type, mask_elts);

		      if (need_next_vector)
                        {
                          first_vec_index = second_vec_index;
                          second_vec_index = vec_index;
                        }

                      next_scalar_stmt
			  = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
                               mask_vec, first_vec_index, second_vec_index,
			       gsi, node, vectype, dr_chain,
			       ncopies, vect_stmts_counter++);
                    }
                }
            }
        }
    }

  return true;
}


/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  Number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
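  /* For instance (hypothetical values): with GROUP_SIZE == 2, a
     vectorization factor of 4 and NUNITS == 4, VEC_STMTS_SIZE is
     (4 * 2) / 4 == 2 vector stmts for the node.  */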

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
	last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }
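
  /* At the last scalar store of the group all the stored values are
     already defined, so the single vector store that replaces the group
     can safely be emitted there; e.g. for the group a[0] = x_1;
     a[1] = y_2; it is emitted just before a[1] = y_2.  */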

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}


/* Replace the scalar calls in SLP node NODE with assignments of zero to
   their lhs.  For loop vectorization this is done in vectorizable_call,
   but for SLP it needs to be deferred until the end of vect_schedule_slp,
   because multiple SLP instances may refer to the same scalar stmt.  */
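
/* For example, a call x_1 = foo (a_2, b_3) all of whose uses have been
   vectorized is replaced by x_1 = 0 (a zero constant of the type of the
   lhs), which is then trivially dead.  */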

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
	continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
	  || is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}


/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }
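
  /* For basic-block SLP there is no containing loop to unroll, so a
     vectorization factor of 1 is used; each SLP node then produces
     GROUP_SIZE / NUNITS vector stmts (see vect_schedule_slp_instance).  */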

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ???  Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ???  For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (loop_vinfo)
	vect_remove_slp_scalar_calls (root);

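      /* Remove the scalar stores of the group: the vector store emitted
	 during scheduling has replaced them, but being stores to memory
	 they are not removed by DCE.  */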
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
	  unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
	  release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}


/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}