/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if it succeeds and UNKNOWN_LOC otherwise.  */

LOC
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOC;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOC)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOC;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree ((slp_tree) child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
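      /* The condition of a COND_EXPR is an embedded comparison carrying
	 two operands of its own, so count one extra operand for it.  */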
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_def_type = NULL_TREE;
      oprnd_info->first_const_oprnd = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  */

static bool
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             slp_tree slp_node, gimple stmt,
			     int ncopies_for_cost, bool first,
                             vec<slp_oprnd_info> *oprnds_info,
			     stmt_vector_for_cost *prologue_cost_vec,
			     stmt_vector_for_cost *body_cost_vec)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def, def_op0 = NULL_TREE;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  enum vect_def_type dt_op0 = vect_uninitialized_def;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree lhs = gimple_get_lhs (stmt);
  struct loop *loop = NULL;
  enum tree_code rhs_code;
  bool different_types = false;
  bool pattern = false;
  slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
  int op_idx = 1;
  tree compare_rhs = NULL_TREE;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
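      /* Gimple operands 0, 1 and 2 of a call hold the lhs, the function
         and the static chain; the first actual argument is operand 3.  */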
      op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        number_of_oprnds++;
    }
  else
    return false;

  for (i = 0; i < number_of_oprnds; i++)
    {
      if (compare_rhs)
	{
	  oprnd = compare_rhs;
	  compare_rhs = NULL_TREE;
	}
      else
        oprnd = gimple_op (stmt, op_idx++);

      oprnd_info = (*oprnds_info)[i];

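      /* A comparison feeding a COND_EXPR is split here: handle its first
         operand now and queue the second one for the next iteration.  */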
      if (COMPARISON_CLASS_P (oprnd))
        {
          compare_rhs = TREE_OPERAND (oprnd, 1);
          oprnd = TREE_OPERAND (oprnd, 0);
	}

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
			       &def, &dt)
	  || (!def_stmt && dt != vect_constant_def))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't find def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	    }

	  return false;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
	      || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
		  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		}

	      return false;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.");
              return false;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "unsupported defining stmt: ");
                return false;
            }
        }

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  if (def)
	    {
	      oprnd_info->first_def_type = TREE_TYPE (def);
	      oprnd_info->first_const_oprnd = NULL_TREE;
	    }
	  else
            {
              oprnd_info->first_def_type = NULL_TREE;
              oprnd_info->first_const_oprnd = oprnd;
            }

	  if (i == 0)
	    {
	      def_op0 = def;
	      dt_op0 = dt;
	      /* Analyze costs (for the first stmt of the group only).  */
	      if (REFERENCE_CLASS_P (lhs))
		/* Store.  */
                vect_model_store_cost (stmt_info, ncopies_for_cost, false,
				       dt, slp_node, prologue_cost_vec,
				       body_cost_vec);
	      else
		{
		  enum vect_def_type dts[2];
		  dts[0] = dt;
		  dts[1] = vect_uninitialized_def;
		  /* Not memory operation (we don't call this function for
		     loads).  */
		  vect_model_simple_cost (stmt_info, ncopies_for_cost, dts,
					  prologue_cost_vec, body_cost_vec);
		}
	    }
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def))
               || (oprnd_info->first_def_type != NULL_TREE
		   && def
		   && !types_compatible_p (oprnd_info->first_def_type,
					   TREE_TYPE (def))))
	       || (!def
		   && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
					   TREE_TYPE (oprnd)))
	       || different_types)
	    {
	      if (number_of_oprnds != 2)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different types ");

		  return false;
                }

	      /* Try to swap operands in case of binary operation.  */
              if (i == 0)
                different_types = true;
              else
		{
		  oprnd0_info = (*oprnds_info)[0];
		  if (is_gimple_assign (stmt)
		      && (rhs_code = gimple_assign_rhs_code (stmt))
		      && TREE_CODE_CLASS (rhs_code) == tcc_binary
		      && commutative_tree_code (rhs_code)
		      && oprnd0_info->first_dt == dt
		      && oprnd_info->first_dt == dt_op0
		      && def_op0 && def
		      && !(oprnd0_info->first_def_type
			   && !types_compatible_p (oprnd0_info->first_def_type,
			                           TREE_TYPE (def)))
                      && !(oprnd_info->first_def_type
                           && !types_compatible_p (oprnd_info->first_def_type,
                                                   TREE_TYPE (def_op0))))
                    {
                      if (dump_enabled_p ())
	                {
			  dump_printf_loc (MSG_NOTE, vect_location,
					   "Swapping operands of ");
		          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
			}

		      swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
	                                  gimple_assign_rhs2_ptr (stmt));
		    }
                  else
                    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: different types ");

		      return false;
		    }
		}
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
        case vect_reduction_def:
	  break;

	case vect_internal_def:
          if (different_types)
            {
	      oprnd0_info = (*oprnds_info)[0];
	      oprnd1_info = (*oprnds_info)[1];
              if (i == 0)
                oprnd1_info->def_stmts.quick_push (def_stmt);
              else
                oprnd0_info->def_stmts.quick_push (def_stmt);
            }
	  else
	    oprnd_info->def_stmts.quick_push (def_stmt);

	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
	    }

	  return false;
	}
    }

  return true;
}


/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise, return
   TRUE.  */
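
/* For example (an illustrative sketch with made-up variables), the group
   of isomorphic stores

       a[0] = b[0] + c[0];
       a[1] = b[1] + c[1];
       a[2] = b[2] + c[2];
       a[3] = b[3] + c[3];

   becomes the root SLP node, and the defs of the four additions are
   collected recursively into child nodes.  */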

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size, int *outside_cost,
                     int ncopies_for_cost, unsigned int *max_nunits,
                     vec<int> *load_permutation,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor, bool *loads_permuted,
		     stmt_vector_for_cost *prologue_cost_vec,
		     stmt_vector_for_cost *body_cost_vec)
{
  unsigned int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool stop_recursion = false, need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  unsigned int ncopies;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  bool permutation = false;
  unsigned int load_place;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  vec<slp_oprnd_info> oprnds_info;
  unsigned int nops;
  slp_oprnd_info oprnd_info;
  tree cond;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return false;

  oprnds_info = vect_create_oprnd_info (nops, group_size);

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

	  vect_free_oprnd_info (oprnds_info);
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  vect_free_oprnd_info (oprnds_info);
	  return false;
	}

       if (is_gimple_assign (stmt)
	   && gimple_assign_rhs_code (stmt) == COND_EXPR
           && (cond = gimple_assign_rhs1 (stmt))
           && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: condition is not "
			       "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

	  vect_free_oprnd_info (oprnds_info);
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 scalar_type);
            }

	  vect_free_oprnd_info (oprnds_info);
          return false;
        }

      /* In case of multiple types we need to detect the smallest type; the
	 smallest scalar type gives the largest number of vector subparts.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);

      if (is_gimple_call (stmt))
	{
	  rhs_code = CALL_EXPR;
	  if (gimple_call_internal_p (stmt)
	      || gimple_call_tail_p (stmt)
	      || gimple_call_noreturn_p (stmt)
	      || !gimple_call_nothrow_p (stmt)
	      || gimple_call_chain (stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      vect_free_oprnd_info (oprnds_info);
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.");
	  	      vect_free_oprnd_info (oprnds_info);
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.");
	  	      vect_free_oprnd_info (oprnds_info);
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      vect_free_oprnd_info (oprnds_info);
	      return false;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      vect_free_oprnd_info (oprnds_info);
	      return false;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      gimple first_stmt = stmts[0];
	      if (gimple_call_num_args (stmt) != nops
		  || !operand_equal_p (gimple_call_fn (first_stmt),
				       gimple_call_fn (stmt), 0)
		  || gimple_call_fntype (first_stmt)
		     != gimple_call_fntype (stmt))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }

		  vect_free_oprnd_info (oprnds_info);
		  return false;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
						stmt, ncopies_for_cost,
						(i == 0), &oprnds_info,
						prologue_cost_vec,
						body_cost_vec))
		{
	  	  vect_free_oprnd_info (oprnds_info);
 		  return false;
		}
	    }
	  else
	    {
	      /* Load.  */
              /* FORNOW: Check that there is no gap between the loads.  */
              if ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: grouped "
				       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                    }

	  	  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

              /* Check that the size of the interleaved loads group is not
                 greater than the SLP group size.  */
              if (loop_vinfo
                  && GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: the number "
				       "of interleaved loads is greater than "
				       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                    }

	  	  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

	      old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  The only exception is complex
                     numbers.  */
                  if (prev_first_load != first_load
                      && rhs_code != REALPART_EXPR
                      && rhs_code != IMAGPART_EXPR)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
                        }

	  	      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                }
              else
                prev_first_load = first_load;

	      /* In some cases a group of loads is just the same load
		 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: unsupported "
					   "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
                        }

	  	      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }

                  /* Analyze costs (for the first stmt in the group).  */
                  vect_model_load_cost (vinfo_for_stmt (stmt),
                                        ncopies_for_cost, false, *node,
					prologue_cost_vec, body_cost_vec);
                }

              /* Store the place of this load in the interleaving chain.  In
                 case that permutation is needed we later decide if a specific
                 permutation is supported.  */
              load_place = vect_get_place_in_interleaving_chain (stmt,
                                                                 first_load);
              if (load_place != i)
                permutation = true;

              load_permutation->safe_push (load_place);

              /* We stop building the tree when we reach a group of loads.  */
              stop_recursion = true;
             continue;
           }
        } /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* FORNOW: Not grouped loads are not supported.  */
	      vect_free_oprnd_info (oprnds_info);
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && rhs_code != COND_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      vect_free_oprnd_info (oprnds_info);
	      return false;
	    }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                    }

		  vect_free_oprnd_info (oprnds_info);
                  return false;
		}
            }

	  /* Find the def-stmts.  */
	  if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
					    ncopies_for_cost, (i == 0),
					    &oprnds_info, prologue_cost_vec,
					    body_cost_vec))
	    {
	      vect_free_oprnd_info (oprnds_info);
	      return false;
	    }
	}
    }

  /* Grouped loads were reached - stop the recursion.  */
  if (stop_recursion)
    {
      loads->safe_push (*node);
      if (permutation)
        {
	  gimple first_stmt = stmts[0];
          *loads_permuted = true;
	  (void) record_stmt_cost (body_cost_vec, group_size, vec_perm,
				   vinfo_for_stmt (first_stmt), 0, vect_body);
        }
      else
        {
          /* We don't check complex-number chains here; set LOADS_PERMUTED
             so that they are checked later in
             vect_supported_load_permutation_p.  */
          if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
            *loads_permuted = true;
        }

      vect_free_oprnd_info (oprnds_info);
      return true;
    }

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child
          || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
				   outside_cost, ncopies_for_cost,
				   max_nunits, load_permutation, loads,
				   vectorization_factor, loads_permuted,
				   prologue_cost_vec, body_cost_vec))
        {
	  if (child)
	    oprnd_info->def_stmts = vNULL;
	  vect_free_slp_tree (child);
	  vect_free_oprnd_info (oprnds_info);
   	  return false;
	}

      oprnd_info->def_stmts.create (0);
      SLP_TREE_CHILDREN (*node).quick_push (child);
    }

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, (slp_tree) child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts ((slp_tree) child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant ((slp_tree) child);
}


/* Check if the permutation required by the SLP INSTANCE is supported.
   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */

static bool
vect_supported_slp_permutation_p (slp_instance instance)
{
  slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  vec<slp_tree> sorted_loads = vNULL;
  int index;
  slp_tree *tmp_loads = NULL;
  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
  slp_tree load;

  /* FORNOW: The only supported loads permutation is loads from the same
     location in all the loads in the node, when the data-refs in
     nodes of LOADS constitute an interleaving chain.
     Sort the nodes according to the order of accesses in the chain.  */
  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
  for (i = 0, j = 0;
       SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
       && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
       i += group_size, j++)
    {
      gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
      /* Check that the loads are all in the same interleaving chain.  */
      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unsupported data "
			       "permutation ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				scalar_stmt, 0);
            }

          free (tmp_loads);
          return false;
        }

      tmp_loads[index] = load;
    }

  sorted_loads.create (group_size);
  for (i = 0; i < group_size; i++)
     sorted_loads.safe_push (tmp_loads[i]);

  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_LOADS (instance) = sorted_loads;
  free (tmp_loads);

  if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
                                     instance, true))
    return false;

  return true;
}


/* Rearrange the statements of NODE according to PERMUTATION.
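   For example (an illustrative case), the permutation {1, 0, 3, 2} moves
   the stmt at index I to position PERMUTATION[I], swapping each adjacent
   pair of statements.  */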

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<int> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int index, i;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts ((slp_tree) child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);

  for (i = 0; i < group_size; i++)
    tmp_stmts.safe_push (NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      index = permutation[i];
      tmp_stmts[index] = stmt;
    }

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutation is supported.
   LOAD_PERMUTATION contains a list of indices of the loads.
   In SLP this permutation is relative to the order of grouped stores that are
   the base of the SLP instance.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                   vec<int> load_permutation)
{
  int i = 0, j, prev = -1, next, k, number_of_groups;
  bool supported, bad_permutation = false;
  sbitmap load_index;
  slp_tree node, other_complex_node;
  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
  unsigned complex_numbers = 0;
  struct data_reference *dr;
  bb_vec_info bb_vinfo;

  /* FORNOW: permutations are only supported in SLP.  */
  if (!slp_instn)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (load_permutation, i, next)
        dump_printf (MSG_NOTE, "%d ", next);
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
        return false;

      stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      if (is_gimple_assign (stmt)
          && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
              || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
        complex_numbers++;
    }

  /* Complex operands can be swapped as follows:
      real_c = real_b + real_a;
      imag_c = imag_a + imag_b;
     i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of
     {real_a, imag_a} and {real_b, imag_b}.  We check here that if interleaving
     chains are mixed, they match the above pattern.  */
  if (complex_numbers)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
            {
              if (j == 0)
                first = stmt;
              else
                {
                  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
                    {
                      if (complex_numbers != 2)
                        return false;

                      if (i == 0)
                        k = 1;
                      else
                        k = 0;

                      other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
                      other_node_first =
                                SLP_TREE_SCALAR_STMTS (other_complex_node)[0];

                      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                          != other_node_first)
                       return false;
                    }
                }
            }
        }
    }

  /* We verified that this case is OK, so there is no need to proceed with
     permutation tests.  */
  if (complex_numbers == 2
      && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
    {
      SLP_INSTANCE_LOADS (slp_instn).release ();
      SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
      return true;
    }

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
     instance; not all the loads belong to the same node or interleaving
     group.  Hence, we need to divide them into groups according to
     GROUP_SIZE.  */
  number_of_groups = load_permutation.length () / group_size;

  /* Reduction (there are no data-refs in the root).
     In a reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      int first_group_load_index;

      /* Compare all the permutation sequences to the first one.  */
      for (i = 1; i < number_of_groups; i++)
        {
          k = 0;
          for (j = i * group_size; j < i * group_size + group_size; j++)
            {
              next = load_permutation[j];
              first_group_load_index = load_permutation[k];

              if (next != first_group_load_index)
                {
                  bad_permutation = true;
                  break;
                }

              k++;
            }

          if (bad_permutation)
            break;
        }

      if (!bad_permutation)
        {
          /* Check that the loads in the first sequence are different and there
             are no gaps between them.  */
          load_index = sbitmap_alloc (group_size);
          bitmap_clear (load_index);
          for (k = 0; k < group_size; k++)
            {
              first_group_load_index = load_permutation[k];
              if (bitmap_bit_p (load_index, first_group_load_index))
                {
                  bad_permutation = true;
                  break;
                }

              bitmap_set_bit (load_index, first_group_load_index);
            }

          if (!bad_permutation)
            for (k = 0; k < group_size; k++)
              if (!bitmap_bit_p (load_index, k))
                {
                  bad_permutation = true;
                  break;
                }

          sbitmap_free (load_index);
        }

      if (!bad_permutation)
        {
          /* This permutation is valid for reduction.  Since the order of the
             statements in the nodes is not important unless they are memory
             accesses, we can rearrange the statements in all the nodes
             according to the order of the loads.  */
          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                    load_permutation);
          SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
          return true;
        }
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
  bad_permutation = false;
  /* Check that for every node in the instance the loads form a subchain.  */
  if (bb_vinfo)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          first_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (!first_load)
                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
              else if (first_load
                         != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
                {
                  bad_permutation = true;
	          break;
	        }

              if (j != 0 && next_load != load)
                {
                  bad_permutation = true;
                  break;
                }

              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }

          if (bad_permutation)
            break;
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.  */
      if (!bad_permutation)
        {
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
            {
              first_load = SLP_TREE_SCALAR_STMTS (node)[0];
              if (first_load
                    != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
                {
                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
                  if (vect_supportable_dr_alignment (dr, false)
 	               == dr_unaligned_unsupported)
                    {
   		      if (dump_enabled_p ())
		        {
  	                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "unsupported unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    first_load, 0);
                        }
  		      bad_permutation = true;
                      break;
                    }
	        }
            }

          if (!bad_permutation)
            {
              SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
              return true;
    	    }
        }
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).
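     For example (an illustrative case), with GROUP_SIZE 2 the supported
     form is {0, 0, 1, 1}: both stmts of the first load node read element
     0 of the interleaving chain and both stmts of the second read
     element 1.  */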
  if (load_permutation.length ()
      != (unsigned int) (group_size * group_size))
    return false;

  supported = true;
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  for (j = 0; j < group_size; j++)
    {
      for (i = j * group_size, k = 0;
           load_permutation.iterate (i, &next) && k < group_size;
           i++, k++)
       {
         if (i != j * group_size && next != prev)
          {
            supported = false;
            break;
          }

         prev = next;
       }

      if (bitmap_bit_p (load_index, prev))
        {
          supported = false;
          break;
        }

      bitmap_set_bit (load_index, prev);
    }

  for (j = 0; j < group_size; j++)
    if (!bitmap_bit_p (load_index, j))
      {
	sbitmap_free (load_index);
	return false;
      }

  sbitmap_free (load_index);

  if (supported && i == group_size * group_size
      && vect_supported_slp_permutation_p (slp_instn))
    return true;

  return false;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}


/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int outside_cost = 0, ncopies_for_cost, i;
  unsigned int max_nunits = 0;
  vec<int> load_permutation;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  bool loads_permuted = false;
  vec<gimple> scalar_stmts;
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  stmt_info_for_cost *si;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required in basic"
			 " block SLP");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
	    scalar_stmts.safe_push (
		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
	  else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
	scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).
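     For example (an illustrative case), NUNITS 4 with GROUP_SIZE 6 gives
     an unrolling factor of 2 and hence 2 * 6 / 4 = 3 copies for the cost
     model.  */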
  ncopies_for_cost = unrolling_factor * group_size / nunits;

  load_permutation.create (group_size * group_size);
  loads.create (group_size);
  prologue_cost_vec.create (10);
  body_cost_vec.create (10);

  /* Build the tree for the SLP instance.  */
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &outside_cost, ncopies_for_cost,
			   &max_nunits, &load_permutation, &loads,
			   vectorization_factor, &loads_permuted,
			   &prologue_cost_vec, &body_cost_vec))
    {
      void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
		    : BB_VINFO_TARGET_COST_DATA (bb_vinfo));

      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unrolling required in basic"
			     " block SLP");
	  vect_free_slp_tree (node);
	  body_cost_vec.release ();
	  prologue_cost_vec.release ();
	  load_permutation.release ();
	  loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = body_cost_vec;
      SLP_INSTANCE_LOADS (new_instance) = loads;
      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
      SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance, group_size,
                                                  load_permutation))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported load "
				   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_slp_instance (new_instance);
	      prologue_cost_vec.release ();
              return false;
            }

          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
             = vect_find_first_load_in_slp_instance (new_instance);
        }
      else
        SLP_INSTANCE_LOAD_PERMUTATION (new_instance).release ();

      /* Record the prologue costs, which were delayed until we were
	 sure that SLP was successful.  Unlike the body costs, we know
	 the final values now regardless of the loop vectorization factor.  */
      FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_prologue);
	}

      prologue_cost_vec.release ();

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
	vect_print_slp_tree (MSG_NOTE, node);
1683 
1684       return true;
1685     }
1686   else
1687     {
1688       body_cost_vec.release ();
1689       prologue_cost_vec.release ();
1690     }
1691 
1692   /* Failed to SLP.  */
1693   /* Free the allocated memory.  */
1694   vect_free_slp_tree (node);
1695   load_permutation.release ();
1696   loads.release ();
1697 
1698   return false;
1699 }
1700 
1701 
1702 /* Check if there are stmts in the loop that can be vectorized using SLP.
1703    Build SLP trees of packed scalar stmts if SLP is possible.  */
1704 
1705 bool
1706 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
1707 {
1708   unsigned int i;
1709   vec<gimple> grouped_stores;
1710   vec<gimple> reductions = vNULL;
1711   vec<gimple> reduc_chains = vNULL;
1712   gimple first_element;
1713   bool ok = false;
1714 
1715   if (dump_enabled_p ())
1716     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===");
1717 
1718   if (loop_vinfo)
1719     {
1720       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1721       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1722       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1723     }
1724   else
1725     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1726 
1727   /* Find SLP sequences starting from groups of grouped stores.  */
1728   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1729     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
1730       ok = true;
1731 
1732   if (bb_vinfo && !ok)
1733     {
1734       if (dump_enabled_p ())
1735         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1736 			 "Failed to SLP the basic block.");
1737 
1738       return false;
1739     }
1740 
1741   if (loop_vinfo
1742       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1743     {
1744       /* Find SLP sequences starting from reduction chains.  */
1745       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1746         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
1747           ok = true;
1748         else
1749           return false;
1750 
1751       /* Don't try to vectorize SLP reductions if a reduction chain was
1752          detected.  */
1753       return ok;
1754     }
1755 
1756   /* Find SLP sequences starting from groups of reductions.  */
1757   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1758       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
1759     ok = true;
1760 
1761   return true;
1762 }
1763 
1764 
1765 /* For each possible SLP instance decide whether to SLP it and calculate the
1766    overall unrolling factor needed to SLP the loop.  Return TRUE if decided
1767    to SLP at least one instance.  */
1768 
1769 bool
1770 vect_make_slp_decision (loop_vec_info loop_vinfo)
1771 {
1772   unsigned int i, unrolling_factor = 1;
1773   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1774   slp_instance instance;
1775   int decided_to_slp = 0;
1776 
1777   if (dump_enabled_p ())
1778     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ===");
1779 
1780   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1781     {
1782       /* FORNOW: SLP if you can.  */
1783       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1784 	unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1785 
1786       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1787 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1788 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1789       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1790       decided_to_slp++;
1791     }
1792 
1793   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1794 
1795   if (decided_to_slp && dump_enabled_p ())
1796     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
1797 		     "Decided to SLP %d instances. Unrolling factor %d",
1798 		     decided_to_slp, unrolling_factor);
1799 
1800   return (decided_to_slp > 0);
1801 }
1802 
1803 
1804 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1805    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
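/* For example (a hypothetical scenario): a stmt in an SLP instance whose
   result is also used by a relevant stmt outside any SLP instance must
   provide a vector def to the loop vectorizer as well, so it is marked
   HYBRID and handled by both vectorization schemes.  */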
1806 
1807 static void
1808 vect_detect_hybrid_slp_stmts (slp_tree node)
1809 {
1810   int i;
1811   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
1812   gimple stmt = stmts[0];
1813   imm_use_iterator imm_iter;
1814   gimple use_stmt;
1815   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1816   slp_void_p child;
1817   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1818   struct loop *loop = NULL;
1819   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
1820   basic_block bb = NULL;
1821 
1822   if (!node)
1823     return;
1824 
1825   if (loop_vinfo)
1826     loop = LOOP_VINFO_LOOP (loop_vinfo);
1827   else
1828     bb = BB_VINFO_BB (bb_vinfo);
1829 
1830   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1831     if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
1832 	&& TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1833       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1834 	if (gimple_bb (use_stmt)
1835             && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1836 		 || bb == gimple_bb (use_stmt))
1837 	    && (stmt_vinfo = vinfo_for_stmt (use_stmt))
1838 	    && !STMT_SLP_TYPE (stmt_vinfo)
1839             && (STMT_VINFO_RELEVANT (stmt_vinfo)
1840                 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
1841 	    && !(gimple_code (use_stmt) == GIMPLE_PHI
1842                  && STMT_VINFO_DEF_TYPE (stmt_vinfo)
1843                   == vect_reduction_def))
1844 	  vect_mark_slp_stmts (node, hybrid, i);
1845 
1846   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1847     vect_detect_hybrid_slp_stmts ((slp_tree) child);
1848 }
1849 
1850 
1851 /* Find stmts that must be both vectorized and SLPed.  */
1852 
1853 void
1854 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1855 {
1856   unsigned int i;
1857   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1858   slp_instance instance;
1859 
1860   if (dump_enabled_p ())
1861     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===");
1862 
1863   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1864     vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
1865 }
1866 
1867 
1868 /* Create and initialize a new bb_vec_info struct for BB, as well as
1869    stmt_vec_info structs for all the stmts in it.  */
1870 
1871 static bb_vec_info
1872 new_bb_vec_info (basic_block bb)
1873 {
1874   bb_vec_info res = NULL;
1875   gimple_stmt_iterator gsi;
1876 
1877   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1878   BB_VINFO_BB (res) = bb;
1879 
1880   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1881     {
1882       gimple stmt = gsi_stmt (gsi);
1883       gimple_set_uid (stmt, 0);
1884       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1885     }
1886 
1887   BB_VINFO_GROUPED_STORES (res).create (10);
1888   BB_VINFO_SLP_INSTANCES (res).create (2);
1889   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1890 
1891   bb->aux = res;
1892   return res;
1893 }
1894 
1895 
1896 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1897    stmts in the basic block.  */
1898 
1899 static void
1900 destroy_bb_vec_info (bb_vec_info bb_vinfo)
1901 {
1902   vec<slp_instance> slp_instances;
1903   slp_instance instance;
1904   basic_block bb;
1905   gimple_stmt_iterator si;
1906   unsigned i;
1907 
1908   if (!bb_vinfo)
1909     return;
1910 
1911   bb = BB_VINFO_BB (bb_vinfo);
1912 
1913   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1914     {
1915       gimple stmt = gsi_stmt (si);
1916       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1917 
1918       if (stmt_info)
1919         /* Free stmt_vec_info.  */
1920         free_stmt_vec_info (stmt);
1921     }
1922 
1923   free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
1924   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
1925   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
1926   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1927   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1928     vect_free_slp_instance (instance);
1929   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
1930   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1931   free (bb_vinfo);
1932   bb->aux = NULL;
1933 }
1934 
1935 
1936 /* Analyze the statements contained in SLP tree NODE after recursively
1937    analyzing its subtrees.  Return TRUE if the operations are supported.  */
1938 
1939 static bool
1940 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
1941 {
1942   bool dummy;
1943   int i;
1944   gimple stmt;
1945   slp_void_p child;
1946 
1947   if (!node)
1948     return true;
1949 
1950   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1951     if (!vect_slp_analyze_node_operations (bb_vinfo, (slp_tree) child))
1952       return false;
1953 
1954   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1955     {
1956       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1957       gcc_assert (stmt_info);
1958       gcc_assert (PURE_SLP_STMT (stmt_info));
1959 
1960       if (!vect_analyze_stmt (stmt, &dummy, node))
1961         return false;
1962     }
1963 
1964   return true;
1965 }
1966 
1967 
1968 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
1969    operations are supported. */
1970 
1971 static bool
1972 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
1973 {
1974   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1975   slp_instance instance;
1976   int i;
1977 
1978   for (i = 0; slp_instances.iterate (i, &instance); )
1979     {
1980       if (!vect_slp_analyze_node_operations (bb_vinfo,
1981                                              SLP_INSTANCE_TREE (instance)))
1982         {
1983  	  vect_free_slp_instance (instance);
1984           slp_instances.ordered_remove (i);
1985 	}
1986       else
1987         i++;
1988     }
1989 
1990   if (!slp_instances.length ())
1991     return false;
1992 
1993   return true;
1994 }
1995 
1996 /* Check if vectorization of the basic block is profitable.  */
1997 
1998 static bool
1999 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2000 {
2001   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2002   slp_instance instance;
2003   int i, j;
2004   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2005   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2006   unsigned int stmt_cost;
2007   gimple stmt;
2008   gimple_stmt_iterator si;
2009   basic_block bb = BB_VINFO_BB (bb_vinfo);
2010   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2011   stmt_vec_info stmt_info = NULL;
2012   stmt_vector_for_cost body_cost_vec;
2013   stmt_info_for_cost *ci;
2014 
2015   /* Calculate vector costs.  */
2016   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2017     {
2018       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2019 
2020       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2021 	{
2022 	  stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2023 	  (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2024 				stmt_info, ci->misalign, vect_body);
2025 	}
2026     }
2027 
2028   /* Calculate scalar cost.  */
2029   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2030     {
2031       stmt = gsi_stmt (si);
2032       stmt_info = vinfo_for_stmt (stmt);
2033 
2034       if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
2035           || !PURE_SLP_STMT (stmt_info))
2036         continue;
2037 
2038       if (STMT_VINFO_DATA_REF (stmt_info))
2039         {
2040           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2041             stmt_cost = vect_get_stmt_cost (scalar_load);
2042           else
2043             stmt_cost = vect_get_stmt_cost (scalar_store);
2044         }
2045       else
2046         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2047 
2048       scalar_cost += stmt_cost;
2049     }
2050 
2051   /* Complete the target-specific cost calculation.  */
2052   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2053 	       &vec_inside_cost, &vec_epilogue_cost);
2054 
2055   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
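  /* E.g. (illustrative numbers): a prologue cost of 2 and an epilogue cost
     of 1 give VEC_OUTSIDE_COST == 3; with VEC_INSIDE_COST == 4 the block is
     considered profitable below only if SCALAR_COST is at least 8.  */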
2056 
2057   if (dump_enabled_p ())
2058     {
2059       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2060       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2061 		   vec_inside_cost);
2062       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2063       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2064       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d", scalar_cost);
2065     }
2066 
2067   /* Vectorization is profitable if its cost is less than the cost of the
2068      scalar version.  */
2069   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2070     return false;
2071 
2072   return true;
2073 }
2074 
2075 /* Check if the basic block can be vectorized.  */
2076 
2077 static bb_vec_info
2078 vect_slp_analyze_bb_1 (basic_block bb)
2079 {
2080   bb_vec_info bb_vinfo;
2081   vec<ddr_p> ddrs;
2082   vec<slp_instance> slp_instances;
2083   slp_instance instance;
2084   int i;
2085   int min_vf = 2;
2086   int max_vf = MAX_VECTORIZATION_FACTOR;
2087 
2088   bb_vinfo = new_bb_vec_info (bb);
2089   if (!bb_vinfo)
2090     return NULL;
2091 
2092   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
2093     {
2094       if (dump_enabled_p ())
2095         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 			 "not vectorized: unhandled data-ref in basic "
2097 			 "block.\n");
2098 
2099       destroy_bb_vec_info (bb_vinfo);
2100       return NULL;
2101     }
2102 
2103   ddrs = BB_VINFO_DDRS (bb_vinfo);
2104   if (!ddrs.length ())
2105     {
2106       if (dump_enabled_p ())
2107         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 			 "not vectorized: not enough data-refs in "
2109 			 "basic block.\n");
2110 
2111       destroy_bb_vec_info (bb_vinfo);
2112       return NULL;
2113     }
2114 
2115   vect_pattern_recog (NULL, bb_vinfo);
2116 
2117   if (!vect_analyze_data_ref_dependences (NULL, bb_vinfo, &max_vf)
2118        || min_vf > max_vf)
2119      {
2120        if (dump_enabled_p ())
2121 	 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2122 			  "not vectorized: unhandled data dependence "
2123 			  "in basic block.\n");
2124 
2125        destroy_bb_vec_info (bb_vinfo);
2126        return NULL;
2127      }
2128 
2129   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2130     {
2131       if (dump_enabled_p ())
2132         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133 			 "not vectorized: bad data alignment in basic "
2134 			 "block.\n");
2135 
2136       destroy_bb_vec_info (bb_vinfo);
2137       return NULL;
2138     }
2139 
2140   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2141     {
2142      if (dump_enabled_p ())
2143        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 			"not vectorized: unhandled data access in "
2145 			"basic block.\n");
2146 
2147       destroy_bb_vec_info (bb_vinfo);
2148       return NULL;
2149     }
2150 
2151   /* Check the SLP opportunities in the basic block, analyze and build SLP
2152      trees.  */
2153   if (!vect_analyze_slp (NULL, bb_vinfo))
2154     {
2155       if (dump_enabled_p ())
2156         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2157 			 "not vectorized: failed to find SLP opportunities "
2158 			 "in basic block.\n");
2159 
2160       destroy_bb_vec_info (bb_vinfo);
2161       return NULL;
2162     }
2163 
2164   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2165 
2166   /* Mark all the statements that we want to vectorize as pure SLP and
2167      relevant.  */
2168   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2169     {
2170       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2171       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2172     }
2173 
2174   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2175     {
2176       if (dump_enabled_p ())
2177         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178                          "not vectorized: unsupported alignment in basic "
2179                          "block.\n");
2180       destroy_bb_vec_info (bb_vinfo);
2181       return NULL;
2182     }
2183 
2184   if (!vect_slp_analyze_operations (bb_vinfo))
2185     {
2186       if (dump_enabled_p ())
2187         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2188 			 "not vectorized: bad operation in basic block.\n");
2189 
2190       destroy_bb_vec_info (bb_vinfo);
2191       return NULL;
2192     }
2193 
2194   /* Cost model: check if the vectorization is worthwhile.  */
2195   if (flag_vect_cost_model
2196       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2197     {
2198       if (dump_enabled_p ())
2199         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2200 			 "not vectorized: vectorization is not "
2201 			 "profitable.\n");
2202 
2203       destroy_bb_vec_info (bb_vinfo);
2204       return NULL;
2205     }
2206 
2207   if (dump_enabled_p ())
2208     dump_printf_loc (MSG_NOTE, vect_location,
2209 		     "Basic block will be vectorized using SLP\n");
2210 
2211   return bb_vinfo;
2212 }
2213 
2214 
2215 bb_vec_info
2216 vect_slp_analyze_bb (basic_block bb)
2217 {
2218   bb_vec_info bb_vinfo;
2219   int insns = 0;
2220   gimple_stmt_iterator gsi;
2221   unsigned int vector_sizes;
2222 
2223   if (dump_enabled_p ())
2224     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2225 
2226   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2227     {
2228       gimple stmt = gsi_stmt (gsi);
2229       if (!is_gimple_debug (stmt)
2230           && !gimple_nop_p (stmt)
2231           && gimple_code (stmt) != GIMPLE_LABEL)
2232         insns++;
2233     }
2234 
2235   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2236     {
2237       if (dump_enabled_p ())
2238         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 			 "not vectorized: too many instructions in "
2240 			 "basic block.\n");
2241 
2242       return NULL;
2243     }
2244 
2245   /* Autodetect first vector size we try.  */
2246   current_vector_size = 0;
2247   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2248 
2249   while (1)
2250     {
2251       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2252       if (bb_vinfo)
2253         return bb_vinfo;
2254 
2255       destroy_bb_vec_info (bb_vinfo);
2256 
2257       vector_sizes &= ~current_vector_size;
2258       if (vector_sizes == 0
2259           || current_vector_size == 0)
2260         return NULL;
2261 
2262       /* Try the next biggest vector size.  */
2263       current_vector_size = 1 << floor_log2 (vector_sizes);
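      /* E.g. (illustrative target): if the remaining sizes are 32 and 16
         bytes, VECTOR_SIZES == 0x30 and 1 << floor_log2 (0x30) == 32, so
         a 32-byte vector size is tried before a 16-byte one.  */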
2264       if (dump_enabled_p ())
2265         dump_printf_loc (MSG_NOTE, vect_location,
2266 			 "***** Re-trying analysis with "
2267 			 "vector size %d\n", current_vector_size);
2268     }
2269 }
2270 
2271 
2272 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2273    the number of created vector stmts depends on the unrolling factor).
2274    However, the actual number of vector stmts for every SLP node depends on
2275    VF, which is set later in vect_analyze_operations ().  Hence, SLP costs
2276    should be updated.  In this function we assume that the inside costs
2277    calculated in vect_model_xxx_cost are linear in ncopies.  */
2278 
2279 void
2280 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2281 {
2282   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2283   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2284   slp_instance instance;
2285   stmt_vector_for_cost body_cost_vec;
2286   stmt_info_for_cost *si;
2287   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2288 
2289   if (dump_enabled_p ())
2290     dump_printf_loc (MSG_NOTE, vect_location,
2291 		     "=== vect_update_slp_costs_according_to_vf ===");
2292 
2293   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2294     {
2295       /* We assume that costs are linear in ncopies.  */
2296       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
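      /* E.g. (illustrative values): VF == 8 and an instance unrolling
         factor of 2 give NCOPIES == 4, so each recorded body cost below is
         counted four times.  */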
2297 
2298       /* Record the instance's instructions in the target cost model.
2299 	 This was delayed until here because the count of instructions
2300 	 isn't known beforehand.  */
2301       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2302 
2303       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2304 	(void) add_stmt_cost (data, si->count * ncopies, si->kind,
2305 			      vinfo_for_stmt (si->stmt), si->misalign,
2306 			      vect_body);
2307     }
2308 }
2309 
2310 
2311 /* For constant and loop invariant defs of SLP_NODE this function returns
2312    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2313    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2314    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2315    REDUC_INDEX is the index of the reduction operand in the statements, unless
2316    it is -1.  */
2317 
2318 static void
2319 vect_get_constant_vectors (tree op, slp_tree slp_node,
2320                            vec<tree> *vec_oprnds,
2321 			   unsigned int op_num, unsigned int number_of_vectors,
2322                            int reduc_index)
2323 {
2324   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2325   gimple stmt = stmts[0];
2326   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2327   unsigned nunits;
2328   tree vec_cst;
2329   tree *elts;
2330   unsigned j, number_of_places_left_in_vector;
2331   tree vector_type;
2332   tree vop;
2333   int group_size = stmts.length ();
2334   unsigned int vec_num, i;
2335   unsigned number_of_copies = 1;
2336   vec<tree> voprnds;
2337   voprnds.create (number_of_vectors);
2338   bool constant_p, is_store;
2339   tree neutral_op = NULL;
2340   enum tree_code code = gimple_expr_code (stmt);
2341   gimple def_stmt;
2342   struct loop *loop;
2343   gimple_seq ctor_seq = NULL;
2344 
2345   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2346       && reduc_index != -1)
2347     {
2348       op_num = reduc_index - 1;
2349       op = gimple_op (stmt, reduc_index);
2350       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2351          we need either neutral operands or the original operands.  See
2352          get_initial_def_for_reduction() for details.  */
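      /* E.g., the neutral operand is 0 for the additive codes below and 1
         for MULT_EXPR, so the extra copies do not change the reduction
         result; MIN_EXPR/MAX_EXPR have no neutral element and reuse the
         initial value from the loop preheader instead.  */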
2353       switch (code)
2354         {
2355           case WIDEN_SUM_EXPR:
2356           case DOT_PROD_EXPR:
2357           case PLUS_EXPR:
2358           case MINUS_EXPR:
2359           case BIT_IOR_EXPR:
2360           case BIT_XOR_EXPR:
2361              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2362                neutral_op = build_real (TREE_TYPE (op), dconst0);
2363              else
2364                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2365 
2366              break;
2367 
2368           case MULT_EXPR:
2369              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2370                neutral_op = build_real (TREE_TYPE (op), dconst1);
2371              else
2372                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2373 
2374              break;
2375 
2376           case BIT_AND_EXPR:
2377             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2378             break;
2379 
2380           case MAX_EXPR:
2381           case MIN_EXPR:
2382             def_stmt = SSA_NAME_DEF_STMT (op);
2383             loop = (gimple_bb (stmt))->loop_father;
2384             neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2385                                                 loop_preheader_edge (loop));
2386             break;
2387 
2388           default:
2389             neutral_op = NULL;
2390         }
2391     }
2392 
2393   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2394     {
2395       is_store = true;
2396       op = gimple_assign_rhs1 (stmt);
2397     }
2398   else
2399     is_store = false;
2400 
2401   gcc_assert (op);
2402 
2403   if (CONSTANT_CLASS_P (op))
2404     constant_p = true;
2405   else
2406     constant_p = false;
2407 
2408   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2409   gcc_assert (vector_type);
2410   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2411 
2412   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2413      created vectors. It is greater than 1 if unrolling is performed.
2414 
2415      For example, we have two scalar operands, s1 and s2 (e.g., group of
2416      strided accesses of size two), while NUNITS is four (i.e., four scalars
2417      of this type can be packed in a vector).  The output vector will contain
2418      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2419      will be 2).
2420 
2421      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2422      containing the operands.
2423 
2424      For example, NUNITS is four as before, and the group size is 8
2425      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2426      {s5, s6, s7, s8}.  */
2427 
2428   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2429 
2430   number_of_places_left_in_vector = nunits;
2431   elts = XALLOCAVEC (tree, nunits);
2432   for (j = 0; j < number_of_copies; j++)
2433     {
2434       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2435         {
2436           if (is_store)
2437             op = gimple_assign_rhs1 (stmt);
2438           else
2439 	    {
2440 	      switch (code)
2441 		{
2442 		  case COND_EXPR:
2443 		    if (op_num == 0 || op_num == 1)
2444 		      {
2445 			tree cond = gimple_assign_rhs1 (stmt);
2446 			op = TREE_OPERAND (cond, op_num);
2447 		      }
2448 		    else
2449 		      {
2450 			if (op_num == 2)
2451 			  op = gimple_assign_rhs2 (stmt);
2452 			else
2453 			  op = gimple_assign_rhs3 (stmt);
2454 		      }
2455 		    break;
2456 
2457 		  case CALL_EXPR:
2458 		    op = gimple_call_arg (stmt, op_num);
2459 		    break;
2460 
2461 		  case LSHIFT_EXPR:
2462 		  case RSHIFT_EXPR:
2463 		  case LROTATE_EXPR:
2464 		  case RROTATE_EXPR:
2465 		    op = gimple_op (stmt, op_num + 1);
2466 		    /* Unlike the other binary operators, shifts/rotates have
2467 		       the shift count being int, instead of the same type as
2468 		       the lhs, so make sure the scalar is the right type if
2469 		       we are dealing with vectors of
2470 		       long long/long/short/char.  */
2471 		    if (op_num == 1 && constant_p)
2472 		      op = fold_convert (TREE_TYPE (vector_type), op);
2473 		    break;
2474 
2475 		  default:
2476 		    op = gimple_op (stmt, op_num + 1);
2477 		    break;
2478 		}
2479 	    }
2480 
2481           if (reduc_index != -1)
2482             {
2483               loop = (gimple_bb (stmt))->loop_father;
2484               def_stmt = SSA_NAME_DEF_STMT (op);
2485 
2486               gcc_assert (loop);
2487 
2488               /* Get the def before the loop.  In a reduction chain we have
2489                  only one initial value.  */
2490               if ((j != (number_of_copies - 1)
2491                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2492                        && i != 0))
2493                   && neutral_op)
2494                 op = neutral_op;
2495               else
2496                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2497                                             loop_preheader_edge (loop));
2498             }
2499 
2500           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2501           number_of_places_left_in_vector--;
2502 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2503 	    {
2504 	      if (constant_p)
2505 		{
2506 		  op = fold_unary (VIEW_CONVERT_EXPR,
2507 				   TREE_TYPE (vector_type), op);
2508 		  gcc_assert (op && CONSTANT_CLASS_P (op));
2509 		}
2510 	      else
2511 		{
2512 		  tree new_temp
2513 		    = make_ssa_name (TREE_TYPE (vector_type), NULL);
2514 		  gimple init_stmt;
2515 		  op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2516 			       op);
2517 		  init_stmt
2518 		    = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2519 						    new_temp, op, NULL_TREE);
2520 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
2521 		  op = new_temp;
2522 		}
2523 	    }
2524 	  elts[number_of_places_left_in_vector] = op;
2525 
2526           if (number_of_places_left_in_vector == 0)
2527             {
2528               number_of_places_left_in_vector = nunits;
2529 
2530 	      if (constant_p)
2531 		vec_cst = build_vector (vector_type, elts);
2532 	      else
2533 		{
2534 		  vec<constructor_elt, va_gc> *v;
2535 		  unsigned k;
2536 		  vec_alloc (v, nunits);
2537 		  for (k = 0; k < nunits; ++k)
2538 		    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2539 		  vec_cst = build_constructor (vector_type, v);
2540 		}
2541               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2542 						    vector_type, NULL));
2543 	      if (ctor_seq != NULL)
2544 		{
2545 		  gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2546 		  gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2547 		  gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2548 							GSI_SAME_STMT);
2549 		  ctor_seq = NULL;
2550 		}
2551             }
2552         }
2553     }
2554 
2555   /* Since the vectors are created in reverse order, reverse them here so
2556      that VEC_OPRNDS holds them in the original order.  */
2557   vec_num = voprnds.length ();
2558   for (j = vec_num; j != 0; j--)
2559     {
2560       vop = voprnds[j - 1];
2561       vec_oprnds->quick_push (vop);
2562     }
2563 
2564   voprnds.release ();
2565 
2566   /* In case VF is greater than the unrolling factor needed for the SLP
2567      group of stmts, NUMBER_OF_VECTORS to be created is greater than
2568      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2569      to replicate the vectors.  */
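  /* E.g. (illustrative): if two vectors were created but NUMBER_OF_VECTORS
     is 4, each of them is pushed once more in order; for reductions a
     vector of NEUTRAL_OP is pushed instead, so the extra copies cannot
     change the reduction result.  */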
2570   while (number_of_vectors > vec_oprnds->length ())
2571     {
2572       tree neutral_vec = NULL;
2573 
2574       if (neutral_op)
2575         {
2576           if (!neutral_vec)
2577 	    neutral_vec = build_vector_from_val (vector_type, neutral_op);
2578 
2579           vec_oprnds->quick_push (neutral_vec);
2580         }
2581       else
2582         {
2583           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2584             vec_oprnds->quick_push (vop);
2585         }
2586     }
2587 }
2588 
2589 
2590 /* Get vectorized definitions from SLP_NODE that contains corresponding
2591    vectorized def-stmts.  */
2592 
2593 static void
2594 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2595 {
2596   tree vec_oprnd;
2597   gimple vec_def_stmt;
2598   unsigned int i;
2599 
2600   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2601 
2602   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2603     {
2604       gcc_assert (vec_def_stmt);
2605       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2606       vec_oprnds->quick_push (vec_oprnd);
2607     }
2608 }
2609 
2610 
2611 /* Get vectorized definitions for SLP_NODE.
2612    If the scalar definitions are loop invariants or constants, collect them and
2613    call vect_get_constant_vectors() to create vector stmts.
2614    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2615    must be stored in the corresponding child of SLP_NODE, and we call
2616    vect_get_slp_vect_defs () to retrieve them.  */
2617 
2618 void
2619 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2620 		   vec<vec<tree> > *vec_oprnds, int reduc_index)
2621 {
2622   gimple first_stmt;
2623   int number_of_vects = 0, i;
2624   unsigned int child_index = 0;
2625   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2626   slp_tree child = NULL;
2627   vec<tree> vec_defs;
2628   tree oprnd;
2629   bool vectorized_defs;
2630 
2631   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2632   FOR_EACH_VEC_ELT (ops, i, oprnd)
2633     {
2634       /* For each operand we check if it has vectorized definitions in a child
2635 	 node or we need to create them (for invariants and constants).  We
2636 	 check if the LHS of the first stmt of the next child matches OPRND.
2637 	 If it does, we found the correct child.  Otherwise, we call
2638 	 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
2639 	 to check this child node again for the next operand.  */
2640       vectorized_defs = false;
2641       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2642         {
2643           child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
2644 
2645 	  /* We have to check both pattern and original def, if available.  */
2646 	  gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2647 	  gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2648 
2649 	  if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2650 	      || (related
2651 		  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2652 	    {
2653 	      /* The number of vector defs is determined by the number of
2654 		 vector statements in the node from which we get those
2655 		 statements.  */
2656 	      number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2657 	      vectorized_defs = true;
2658 	      child_index++;
2659 	    }
2660         }
2661 
2662       if (!vectorized_defs)
2663         {
2664           if (i == 0)
2665             {
2666               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2667               /* Number of vector stmts was calculated according to LHS in
2668                  vect_schedule_slp_instance (), fix it by replacing LHS with
2669                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
2670                  details.  */
2671               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2672                                              &rhs_size_unit);
2673               if (rhs_size_unit != lhs_size_unit)
2674                 {
2675                   number_of_vects *= rhs_size_unit;
2676                   number_of_vects /= lhs_size_unit;
2677                 }
2678             }
2679         }
2680 
2681       /* Allocate memory for vectorized defs.  */
2682       vec_defs = vNULL;
2683       vec_defs.create (number_of_vects);
2684 
2685       /* For reduction defs we call vect_get_constant_vectors (), since we are
2686          looking for initial loop invariant values.  */
2687       if (vectorized_defs && reduc_index == -1)
2688         /* The defs are already vectorized.  */
2689 	vect_get_slp_vect_defs (child, &vec_defs);
2690       else
2691         /* Build vectors from scalar defs.  */
2692 	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2693                                    number_of_vects, reduc_index);
2694 
2695       vec_oprnds->quick_push (vec_defs);
2696 
2697       /* For reductions, we only need initial values.  */
2698       if (reduc_index != -1)
2699         return;
2700     }
2701 }
2702 
2703 
2704 /* Create NCOPIES permutation statements using the mask vector MASK of type
2705    MASK_TYPE, and two input vectors placed in DR_CHAIN at FIRST_VEC_INDX and
2706    SECOND_VEC_INDX for the first copy, shifting by STRIDE elements of DR_CHAIN
2707    for every copy.
2708    (STRIDE is the number of vectorized stmts for NODE divided by the number of
2709    copies).
2710    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2711    where the created stmts must be inserted.  */
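
/* E.g. (illustrative values): SLP_TREE_NUMBER_OF_VEC_STMTS == 6 and
   NCOPIES == 2 give STRIDE == 3; the copies created for
   VECT_STMTS_COUNTER == K are stored at slots K and K + 3, and the input
   vector indexes advance by 3 between the copies.  */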
2712 
2713 static inline void
2714 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2715                            tree mask, int first_vec_indx, int second_vec_indx,
2716                            gimple_stmt_iterator *gsi, slp_tree node,
2717                            tree vectype, vec<tree> dr_chain,
2718                            int ncopies, int vect_stmts_counter)
2719 {
2720   tree perm_dest;
2721   gimple perm_stmt = NULL;
2722   stmt_vec_info next_stmt_info;
2723   int i, stride;
2724   tree first_vec, second_vec, data_ref;
2725 
2726   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2727 
2728   /* Initialize the vect stmts of NODE to properly insert the generated
2729      stmts later.  */
2730   for (i = SLP_TREE_VEC_STMTS (node).length ();
2731        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2732     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2733 
2734   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2735   for (i = 0; i < ncopies; i++)
2736     {
2737       first_vec = dr_chain[first_vec_indx];
2738       second_vec = dr_chain[second_vec_indx];
2739 
2740       /* Generate the permute statement.  */
2741       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2742 						first_vec, second_vec, mask);
2743       data_ref = make_ssa_name (perm_dest, perm_stmt);
2744       gimple_set_lhs (perm_stmt, data_ref);
2745       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2746 
2747       /* Store the vector statement in NODE.  */
2748       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2749 
2750       first_vec_indx += stride;
2751       second_vec_indx += stride;
2752     }
2753 
2754   /* Mark the scalar stmt as vectorized.  */
2755   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2756   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2757 }
2758 
2759 
2760 /* Given FIRST_MASK_ELEMENT, the mask element in element representation,
2761    return in CURRENT_MASK_ELEMENT its equivalent in target-specific
2762    representation.  Check that the mask is valid and return FALSE if not.
2763    Set NEED_NEXT_VECTOR to TRUE if the permutation requires moving to the
2764    next vector, i.e., the current first vector is not needed.  */
2765 
2766 static bool
2767 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2768                        int mask_nunits, bool only_one_vec, int index,
2769 		       unsigned char *mask, int *current_mask_element,
2770                        bool *need_next_vector, int *number_of_mask_fixes,
2771                        bool *mask_fixed, bool *needs_first_vector)
2772 {
2773   int i;
2774 
2775   /* Convert to target specific representation.  */
2776   *current_mask_element = first_mask_element + m;
2777   /* Adjust the value in case it's a mask for second and third vectors.  */
2778   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2779 
2780   if (*current_mask_element < mask_nunits)
2781     *needs_first_vector = true;
2782 
2783   /* We have only one input vector to permute but the mask accesses values in
2784      the next vector as well.  */
2785   if (only_one_vec && *current_mask_element >= mask_nunits)
2786     {
2787       if (dump_enabled_p ())
2788         {
2789           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2790 			   "permutation requires at least two vectors ");
2791           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2792         }
2793 
2794       return false;
2795     }
2796 
2797   /* The mask requires the next vector.  */
2798   if (*current_mask_element >= mask_nunits * 2)
2799     {
2800       if (*needs_first_vector || *mask_fixed)
2801         {
2802           /* We either need the first vector too or have already moved to the
2803              next vector. In both cases, this permutation needs three
2804              vectors.  */
2805           if (dump_enabled_p ())
2806             {
2807               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2808 			       "permutation requires at "
2809 			       "least three vectors ");
2810               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2811             }
2812 
2813           return false;
2814         }
2815 
2816       /* We move to the next vector, dropping the first one and working with
2817          the second and the third - we need to adjust the values of the mask
2818          accordingly.  */
2819       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2820 
2821       for (i = 0; i < index; i++)
2822         mask[i] -= mask_nunits * *number_of_mask_fixes;
2823 
2824       (*number_of_mask_fixes)++;
2825       *mask_fixed = true;
2826     }
2827 
2828   *need_next_vector = *mask_fixed;
2829 
2830   /* This was the last element of this mask. Start a new one.  */
2831   if (index == mask_nunits - 1)
2832     {
2833       *number_of_mask_fixes = 1;
2834       *mask_fixed = false;
2835       *needs_first_vector = false;
2836     }
2837 
2838   return true;
2839 }
2840 
2841 
2842 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2843    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2844    permute statements for SLP_NODE_INSTANCE.  */
2845 bool
2846 vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
2847                               gimple_stmt_iterator *gsi, int vf,
2848                               slp_instance slp_node_instance, bool analyze_only)
2849 {
2850   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2851   tree mask_element_type = NULL_TREE, mask_type;
2852   int i, j, k, nunits, vec_index = 0, scalar_index;
2853   slp_tree node;
2854   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2855   gimple next_scalar_stmt;
2856   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2857   int first_mask_element;
2858   int index, unroll_factor, current_mask_element, ncopies;
2859   unsigned char *mask;
2860   bool only_one_vec = false, need_next_vector = false;
2861   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2862   int number_of_mask_fixes = 1;
2863   bool mask_fixed = false;
2864   bool needs_first_vector = false;
2865   enum machine_mode mode;
2866 
2867   mode = TYPE_MODE (vectype);
2868 
2869   if (!can_vec_perm_p (mode, false, NULL))
2870     {
2871       if (dump_enabled_p ())
2872         {
2873           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2874 			   "no vect permute for ");
2875           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2876         }
2877       return false;
2878     }
2879 
2880   /* The generic VEC_PERM_EXPR code always uses an integral type of the
2881      same size as the vector element being permuted.  */
2882   mask_element_type = lang_hooks.types.type_for_mode
2883 		(int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2884   mask_type = get_vectype_for_scalar_type (mask_element_type);
2885   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2886   mask = XALLOCAVEC (unsigned char, nunits);
2887   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2888 
2889   /* The number of vector stmts to generate based only on the
2890      SLP_NODE_INSTANCE unrolling factor.  */
2891   orig_vec_stmts_num = group_size *
2892                 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2893   if (orig_vec_stmts_num == 1)
2894     only_one_vec = true;
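  /* E.g. (illustrative, matching the example below): GROUP_SIZE == 3, an
     unrolling factor of 4 and NUNITS == 4 give 3 * 4 / 4 == 3 original
     vector stmts, so more than one input vector is needed.  */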
2895 
2896   /* The number of copies is determined by the final vectorization factor
2897      relative to the SLP_NODE_INSTANCE unrolling factor.  */
2898   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2899 
2900   /* Generate permutation masks for every NODE. Number of masks for each NODE
2901      is equal to GROUP_SIZE.
2902      E.g., we have a group of three nodes with three loads from the same
2903      location in each node, and the vector size is 4.  I.e., we have an
2904      a0b0c0a1b1c1... sequence and we need to create the following vectors:
2905      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2906      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2907      ...
2908 
2909      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2910      The last mask is illegal since a permute operation takes only two input
2911      vectors, so mask element values must be smaller than 2 * NUNITS (8 here).
2912      Hence, the last mask must be converted into {2,5,5,5}.
2913      For the first two permutations we need the first and the second input
2914      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2915      we need the second and the third vectors: {b1,c1,a2,b2} and
2916      {c2,a3,b3,c3}.  */
2917 
2918   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
2919     {
2920       scalar_index = 0;
2921       index = 0;
2922       vect_stmts_counter = 0;
2923       vec_index = 0;
2924       first_vec_index = vec_index++;
2925       if (only_one_vec)
2926         second_vec_index = first_vec_index;
2927       else
2928         second_vec_index = vec_index++;
2929 
2930       for (j = 0; j < unroll_factor; j++)
2931         {
2932           for (k = 0; k < group_size; k++)
2933             {
2934               first_mask_element = i + j * group_size;
2935               if (!vect_get_mask_element (stmt, first_mask_element, 0,
2936 					  nunits, only_one_vec, index,
2937 					  mask, &current_mask_element,
2938 					  &need_next_vector,
2939 					  &number_of_mask_fixes, &mask_fixed,
2940 					  &needs_first_vector))
2941 		return false;
2942 	      mask[index++] = current_mask_element;
2943 
2944               if (index == nunits)
2945                 {
2946 		  tree mask_vec, *mask_elts;
2947 		  int l;
2948 
2949 		  if (!can_vec_perm_p (mode, false, mask))
2950 		    {
2951 		      if (dump_enabled_p ())
2952 			{
2953 			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2954 					   vect_location,
2955 					   "unsupported vect permute { ");
2956 			  for (i = 0; i < nunits; ++i)
2957 			    dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
2958 					 mask[i]);
2959 			  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2960 			}
2961 		      return false;
2962 		    }
2963 
2964 		  mask_elts = XALLOCAVEC (tree, nunits);
2965 		  for (l = 0; l < nunits; ++l)
2966 		    mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
2967 		  mask_vec = build_vector (mask_type, mask_elts);
2968 		  index = 0;
2969 
2970                   if (!analyze_only)
2971                     {
2972                       if (need_next_vector)
2973                         {
2974                           first_vec_index = second_vec_index;
2975                           second_vec_index = vec_index;
2976                         }
2977 
2978                       next_scalar_stmt
2979 			  = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
2980 
2981                       vect_create_mask_and_perm (stmt, next_scalar_stmt,
2982                                mask_vec, first_vec_index, second_vec_index,
2983 			       gsi, node, vectype, dr_chain,
2984 			       ncopies, vect_stmts_counter++);
2985                     }
2986                 }
2987             }
2988         }
2989     }
2990 
2991   return true;
2992 }
2993 
2994 
2995 
2996 /* Vectorize SLP instance tree in postorder.  */
2997 
2998 static bool
2999 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3000                             unsigned int vectorization_factor)
3001 {
3002   gimple stmt;
3003   bool grouped_store, is_store;
3004   gimple_stmt_iterator si;
3005   stmt_vec_info stmt_info;
3006   unsigned int vec_stmts_size, nunits, group_size;
3007   tree vectype;
3008   int i;
3009   slp_tree loads_node;
3010   slp_void_p child;
3011 
3012   if (!node)
3013     return false;
3014 
3015   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3016     vect_schedule_slp_instance ((slp_tree) child, instance,
3017                                 vectorization_factor);
3018 
3019   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3020   stmt_info = vinfo_for_stmt (stmt);
3021 
3022   /* VECTYPE is the type of the destination.  */
3023   vectype = STMT_VINFO_VECTYPE (stmt_info);
3024   nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3025   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3026 
3027   /* For each SLP instance calculate number of vector stmts to be created
3028      for the scalar stmts in each node of the SLP tree.  Number of vector
3029      elements in one vector iteration is the number of scalar elements in
3030      one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3031      size.  */
3032   vec_stmts_size = (vectorization_factor * group_size) / nunits;
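  /* E.g. (illustrative values): GROUP_SIZE == 4, VECTORIZATION_FACTOR == 4
     and NUNITS == 8 give (4 * 4) / 8 == 2 vector stmts for this node.  */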
3033 
3034   /* In case of load permutation we have to allocate vectorized statements for
3035      all the nodes that participate in that permutation.  */
3036   if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3037     {
3038       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
3039         {
3040           if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
3041             {
3042               SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
3043               SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
3044             }
3045         }
3046     }
3047 
3048   if (!SLP_TREE_VEC_STMTS (node).exists ())
3049     {
3050       SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3051       SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3052     }
3053 
3054   if (dump_enabled_p ())
3055     {
3056       dump_printf_loc (MSG_NOTE, vect_location,
3057 		       "------>vectorizing SLP node starting from: ");
3058       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3059     }
3060 
3061   /* Loads should be inserted before the first load.  */
3062   if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3063       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3064       && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3065       && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3066     si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3067   else if (is_pattern_stmt_p (stmt_info))
3068     si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3069   else
3070     si = gsi_for_stmt (stmt);
3071 
3072   /* Stores should be inserted just before the last store.  */
3073   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3074       && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3075     {
3076       gimple last_store = vect_find_last_store_in_slp_instance (instance);
3077       if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3078        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3079       si = gsi_for_stmt (last_store);
3080     }
3081 
3082   /* Mark the first element of the reduction chain as reduction to properly
3083      transform the node.  In the analysis phase only the last element of the
3084      chain is marked as reduction.  */
3085   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3086       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3087     {
3088       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3089       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3090     }
3091 
3092   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3093   return is_store;
3094 }

/* Replace the scalar calls in SLP node NODE with assignments of zero to
   their lhs.  For loop vectorization this is done in vectorizable_call, but
   for SLP it needs to be deferred until the end of vect_schedule_slp,
   because multiple SLP instances may refer to the same scalar stmt.  */
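/* An illustrative GIMPLE sketch (not taken from a testcase): a scalar call

     x_1 = foo (a_2, b_3);

   whose result is only consumed by vectorized stmts is rewritten to

     x_1 = 0;

   which keeps the SSA name x_1 defined while letting the now-dead
   assignment be cleaned up later.  */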

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_void_p child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls ((slp_tree) child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
	continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
	  || is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

/* Generate vector code for all SLP instances in the loop/basic block.  */
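/* (For loop vectorization this is reached from vect_transform_loop; for
   basic-block vectorization from vect_slp_transform_bb below.)  */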

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  slp_tree loads_node;
  unsigned int i, j, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);

      /* Clear STMT_VINFO_VEC_STMT of all loads.  With loads shared
	 between SLP instances we would otherwise fail to properly
	 initialize the vectorized SLP stmts and confuse different load
	 permutations.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
	STMT_VINFO_VEC_STMT
	  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ???  Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ???  For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (loop_vinfo)
	vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
	  unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
	  release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}


/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}