1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
60 
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 			      tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 	{
75 	  if (dump_enabled_p ())
76 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 			     "no array mode for %s[%wu]\n",
78 			     GET_MODE_NAME (mode), count);
79 	  return false;
80 	}
81     }
82 
83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84     {
85       if (dump_enabled_p ())
86 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87                          "cannot use %s<%s><%s>\n", name,
88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89       return false;
90     }
91 
92   if (dump_enabled_p ())
93     dump_printf_loc (MSG_NOTE, vect_location,
94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95                      GET_MODE_NAME (mode));
96 
97   return true;
98 }
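
/* Editorial illustration (a sketch, not part of the original sources): the
   kind of loop for which this query is made with COUNT == 3, assuming a
   target with load-lanes support:

     for (int i = 0; i < n; i++)
       a[i] = b[3 * i] + b[3 * i + 1] + b[3 * i + 2];

   The three interleaved loads from B form a group of size 3; if the target
   provides an array mode for three vectors and a vec_load_lanes handler
   (for example LD3 on AArch64), the group can be loaded with a single
   lane-wise load instead of separate loads and permutes.  */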
99 
100 
101 /* Return the smallest scalar part of STMT_INFO.
102    This is used to determine the vectype of the stmt.  We generally set the
103    vectype according to the type of the result (lhs).  For stmts whose
104    result-type is different than the type of the arguments (e.g., demotion,
105    promotion), vectype will be reset appropriately (later).  Note that we have
106    to visit the smallest datatype in this function, because that determines the
107    VF.  If the smallest datatype in the loop is present only as the rhs of a
108    promotion operation - we'd miss it.
109    Such a case, where a variable of this datatype does not appear in the lhs
110    anywhere in the loop, can only occur if it's an invariant: e.g.:
111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
112    invariant motion.  However, we cannot rely on invariant motion to always
113    take invariants out of the loop, and so in the case of promotion we also
114    have to check the rhs.
115    SCALAR_TYPE is the currently known smallest scalar type for the access;
116    the result is either SCALAR_TYPE or a smaller type found in STMT_INFO.  */
117 
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
120 {
121   HOST_WIDE_INT lhs, rhs;
122 
123   /* During the analysis phase, this function is called on arbitrary
124      statements that might not have scalar results.  */
125   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126     return scalar_type;
127 
128   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 
130   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131   if (assign)
132     {
133       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134       if (gimple_assign_cast_p (assign)
135 	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 	  || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 	  || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
142 	{
143 	  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 
145 	  rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 	  if (rhs < lhs)
147 	    scalar_type = rhs_type;
148 	}
149     }
150   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
151     {
152       unsigned int i = 0;
153       if (gimple_call_internal_p (call))
154 	{
155 	  internal_fn ifn = gimple_call_internal_fn (call);
156 	  if (internal_load_fn_p (ifn))
157 	    /* For loads the LHS type does the trick.  */
158 	    i = ~0U;
159 	  else if (internal_store_fn_p (ifn))
160 	    {
161 	      /* For stores use the type of the stored value.  */
162 	      i = internal_fn_stored_value_index (ifn);
163 	      scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 	      i = ~0U;
165 	    }
166 	  else if (internal_fn_mask_index (ifn) == 0)
167 	    i = 1;
168 	}
169       if (i < gimple_call_num_args (call))
170 	{
171 	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
173 	    {
174 	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 	      if (rhs < lhs)
176 		scalar_type = rhs_type;
177 	    }
178 	}
179     }
180 
181   return scalar_type;
182 }
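
/* Editorial illustration (not from the original sources): for a widening
   assignment such as

     short *b;
     int *a;
     for (int i = 0; i < n; i++)
       a[i] = (int) b[i];

   the statement's lhs type is 'int' but the smallest scalar type involved
   is 'short', and it is the narrower 'short' elements that determine the
   vectorization factor, so the function above returns the rhs type.  */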
183 
184 
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186    tested at run-time.  Return TRUE if DDR was successfully inserted.
187    Return false if versioning is not supported.  */
188 
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
191 {
192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
193 
194   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195     return opt_result::failure_at (vect_location,
196 				   "will not create alias checks, as"
197 				   " --param vect-max-version-for-alias-checks"
198 				   " == 0\n");
199 
200   opt_result res
201     = runtime_alias_check_p (ddr, loop,
202 			     optimize_loop_nest_for_speed_p (loop));
203   if (!res)
204     return res;
205 
206   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207   return opt_result::success ();
208 }
209 
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
211 
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
214 {
215   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216   for (unsigned int i = 0; i < checks.length (); ++i)
217     if (checks[i] == value)
218       return;
219 
220   if (dump_enabled_p ())
221     dump_printf_loc (MSG_NOTE, vect_location,
222 		     "need run-time check that %T is nonzero\n",
223 		     value);
224   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 }
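
/* Editorial note (an interpretation, not from the original sources): the
   nonzero checks recorded above are used, for example, for accesses such
   as a[i * s] whose step S is only known at run time.  A dependence
   distance of zero between two such accesses is only harmless if S is in
   fact nonzero; if S were zero, every iteration would touch the same
   element.  Recording S here lets the loop-versioning code test S != 0
   before entering the vectorized loop.  */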
226 
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229    DR_INFO_B.  At least one of the accesses is a write.  */
230 
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
233 {
234   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
236 
237   /* Single statements are always kept in their original order.  */
238   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240     return true;
241 
242   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
243      emitted at the position of the first scalar load.
244      Stores in a group are emitted at the position of the last scalar store.
245      Compute that position and check whether the resulting order matches
246      the current one.  */
247   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248   if (il_a)
249     {
250       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 	     s = DR_GROUP_NEXT_ELEMENT (s))
253 	  il_a = get_later_stmt (il_a, s);
254       else /* DR_IS_READ */
255 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 	     s = DR_GROUP_NEXT_ELEMENT (s))
257 	  if (get_later_stmt (il_a, s) == il_a)
258 	    il_a = s;
259     }
260   else
261     il_a = stmtinfo_a;
262   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263   if (il_b)
264     {
265       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 	     s = DR_GROUP_NEXT_ELEMENT (s))
268 	  il_b = get_later_stmt (il_b, s);
269       else /* DR_IS_READ */
270 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 	     s = DR_GROUP_NEXT_ELEMENT (s))
272 	  if (get_later_stmt (il_b, s) == il_b)
273 	    il_b = s;
274     }
275   else
276     il_b = stmtinfo_b;
277   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 }
280 
281 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
282    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283    distances.  These distances are conservatively correct but they don't
284    reflect a guaranteed dependence.
285 
286    Return true if this function does all the work necessary to avoid
287    an alias or false if the caller should use the dependence distances
288    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
289    the depth of the loop described by LOOP_VINFO and the other arguments
290    are as for vect_analyze_data_ref_dependence.  */
291 
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 				       loop_vec_info loop_vinfo,
295 				       int loop_depth, unsigned int *max_vf)
296 {
297   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
299     {
300       int dist = dist_v[loop_depth];
301       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
302 	{
303 	  /* If the user asserted safelen >= DIST consecutive iterations
304 	     can be executed concurrently, assume independence.
305 
306 	     ??? An alternative would be to add the alias check even
307 	     in this case, and vectorize the fallback loop with the
308 	     maximum VF set to safelen.  However, if the user has
309 	     explicitly given a length, it's less likely that that
310 	     would be a win.  */
311 	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
312 	    {
313 	      if ((unsigned int) loop->safelen < *max_vf)
314 		*max_vf = loop->safelen;
315 	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 	      continue;
317 	    }
318 
319 	  /* For dependence distances of 2 or more, we have the option
320 	     of limiting VF or checking for an alias at runtime.
321 	     Prefer to check at runtime if we can, to avoid limiting
322 	     the VF unnecessarily when the bases are in fact independent.
323 
324 	     Note that the alias checks will be removed if the VF ends up
325 	     being small enough.  */
326 	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 	}
332     }
333   return true;
334 }
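
/* Editorial illustration (a sketch of the trade-off described above, not
   from the original sources): for a function like

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i - 2] + 1;
     }

   A and B might point into the same array, giving a conservative
   dependence distance of 2, or into unrelated objects, in which case there
   is no dependence at all.  Rather than unconditionally limiting the
   vectorization factor to 2, the code above prefers to emit a runtime
   alias check and keep the full VF when the bases turn out to be
   independent; with "#pragma omp simd safelen(N)" and a distance not
   exceeding N the accesses are simply assumed independent.  */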
335 
336 
337 /* Function vect_analyze_data_ref_dependence.
338 
339    Return an opt_result that fails if there (might) exist a dependence
340    between memory reference DRA and memory reference DRB that would
341    prevent vectorization.  Otherwise return success; this includes the
342    case in which the dependence exists but versioning for alias can
343    check it at run time.  Adjust *MAX_VF according to the maximum
344    vectorization factor the data dependence allows.  */
345 
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 				  loop_vec_info loop_vinfo,
349 				  unsigned int *max_vf)
350 {
351   unsigned int i;
352   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353   struct data_reference *dra = DDR_A (ddr);
354   struct data_reference *drb = DDR_B (ddr);
355   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359   lambda_vector dist_v;
360   unsigned int loop_depth;
361 
362   /* If user asserted safelen consecutive iterations can be
363      executed concurrently, assume independence.  */
364   auto apply_safelen = [&]()
365     {
366       if (loop->safelen >= 2)
367 	{
368 	  if ((unsigned int) loop->safelen < *max_vf)
369 	    *max_vf = loop->safelen;
370 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
371 	  return true;
372 	}
373       return false;
374     };
375 
376   /* In loop analysis all data references should be vectorizable.  */
377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379     gcc_unreachable ();
380 
381   /* Independent data accesses.  */
382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383     return opt_result::success ();
384 
385   if (dra == drb
386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387     return opt_result::success ();
388 
389   /* We do not have to consider dependences between accesses that belong
390      to the same group, unless the stride could be smaller than the
391      group size.  */
392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396     return opt_result::success ();
397 
398   /* Even if we have an anti-dependence, the vectorized loop covers at
399      least two scalar iterations, so there is always also a true dependence.
400      As the vectorizer does not re-order loads and stores we can ignore
401      the anti-dependence if TBAA can disambiguate both DRs, similarly to the
402      case with known negative distance anti-dependences (positive
403      distance anti-dependences would violate TBAA constraints).  */
404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 				 get_alias_set (DR_REF (drb))))
408     return opt_result::success ();
409 
410   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
411       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
412     {
413       if (apply_safelen ())
414 	return opt_result::success ();
415 
416       return opt_result::failure_at
417 	(stmtinfo_a->stmt,
418 	 "possible alias involving gather/scatter between %T and %T\n",
419 	 DR_REF (dra), DR_REF (drb));
420     }
421 
422   /* Unknown data dependence.  */
423   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
424     {
425       if (apply_safelen ())
426 	return opt_result::success ();
427 
428       if (dump_enabled_p ())
429 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
430 			 "versioning for alias required: "
431 			 "can't determine dependence between %T and %T\n",
432 			 DR_REF (dra), DR_REF (drb));
433 
434       /* Add to list of ddrs that need to be tested at run-time.  */
435       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436     }
437 
438   /* Known data dependence.  */
439   if (DDR_NUM_DIST_VECTS (ddr) == 0)
440     {
441       if (apply_safelen ())
442 	return opt_result::success ();
443 
444       if (dump_enabled_p ())
445 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
446 			 "versioning for alias required: "
447 			 "bad dist vector for %T and %T\n",
448 			 DR_REF (dra), DR_REF (drb));
449       /* Add to list of ddrs that need to be tested at run-time.  */
450       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451     }
452 
453   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454 
455   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
456       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
457 						loop_depth, max_vf))
458     return opt_result::success ();
459 
460   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461     {
462       int dist = dist_v[loop_depth];
463 
464       if (dump_enabled_p ())
465 	dump_printf_loc (MSG_NOTE, vect_location,
466                          "dependence distance  = %d.\n", dist);
467 
468       if (dist == 0)
469 	{
470 	  if (dump_enabled_p ())
471 	    dump_printf_loc (MSG_NOTE, vect_location,
472 			     "dependence distance == 0 between %T and %T\n",
473 			     DR_REF (dra), DR_REF (drb));
474 
475 	  /* When we perform grouped accesses and perform implicit CSE
476 	     by detecting equal accesses and doing disambiguation with
477 	     runtime alias tests like for
478 	        .. = a[i];
479 		.. = a[i+1];
480 		a[i] = ..;
481 		a[i+1] = ..;
482 		*p = ..;
483 		.. = a[i];
484 		.. = a[i+1];
485 	     where we will end up loading { a[i], a[i+1] } once, make
486 	     sure that inserting group loads before the first load and
487 	     stores after the last store will do the right thing.
488 	     Similar for groups like
489 	        a[i] = ...;
490 		... = a[i];
491 		a[i+1] = ...;
492 	     where loads from the group interleave with the store.  */
493 	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
494 	    return opt_result::failure_at (stmtinfo_a->stmt,
495 					   "READ_WRITE dependence"
496 					   " in interleaving.\n");
497 
498 	  if (loop->safelen < 2)
499 	    {
500 	      tree indicator = dr_zero_step_indicator (dra);
501 	      if (!indicator || integer_zerop (indicator))
502 		return opt_result::failure_at (stmtinfo_a->stmt,
503 					       "access also has a zero step\n");
504 	      else if (TREE_CODE (indicator) != INTEGER_CST)
505 		vect_check_nonzero_value (loop_vinfo, indicator);
506 	    }
507 	  continue;
508 	}
509 
510       if (dist > 0 && DDR_REVERSED_P (ddr))
511 	{
512 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
513 	     reversed (to make distance vector positive), and the actual
514 	     distance is negative.  */
515 	  if (dump_enabled_p ())
516 	    dump_printf_loc (MSG_NOTE, vect_location,
517 	                     "dependence distance negative.\n");
518 	  /* When doing outer loop vectorization, we need to check if there is
519 	     a backward dependence at the inner loop level if the dependence
520 	     at the outer loop is reversed.  See PR81740.  */
521 	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
522 	      || nested_in_vect_loop_p (loop, stmtinfo_b))
523 	    {
524 	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
525 							 DDR_LOOP_NEST (ddr));
526 	      if (dist_v[inner_depth] < 0)
527 		return opt_result::failure_at (stmtinfo_a->stmt,
528 					       "not vectorized, dependence "
529 					       "between data-refs %T and %T\n",
530 					       DR_REF (dra), DR_REF (drb));
531 	    }
532 	  /* Record a negative dependence distance to later limit the
533 	     amount of stmt copying / unrolling we can perform.
534 	     Only need to handle read-after-write dependence.  */
535 	  if (DR_IS_READ (drb)
536 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
537 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
538 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
539 	  continue;
540 	}
541 
542       unsigned int abs_dist = abs (dist);
543       if (abs_dist >= 2 && abs_dist < *max_vf)
544 	{
545 	  /* The dependence distance requires reduction of the maximal
546 	     vectorization factor.  */
547 	  *max_vf = abs_dist;
548 	  if (dump_enabled_p ())
549 	    dump_printf_loc (MSG_NOTE, vect_location,
550 	                     "adjusting maximal vectorization factor to %i\n",
551 	                     *max_vf);
552 	}
553 
554       if (abs_dist >= *max_vf)
555 	{
556 	  /* Dependence distance does not create dependence, as far as
557 	     vectorization is concerned, in this case.  */
558 	  if (dump_enabled_p ())
559 	    dump_printf_loc (MSG_NOTE, vect_location,
560 	                     "dependence distance >= VF.\n");
561 	  continue;
562 	}
563 
564       return opt_result::failure_at (stmtinfo_a->stmt,
565 				     "not vectorized, possible dependence "
566 				     "between data-refs %T and %T\n",
567 				     DR_REF (dra), DR_REF (drb));
568     }
569 
570   return opt_result::success ();
571 }
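
/* Editorial illustration (not from the original sources): in

     for (int i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the value read in iteration I is written back for use by iteration
   I + 3, a dependence distance of 3.  Up to three consecutive iterations
   can still execute in parallel, so the loop remains vectorizable provided
   *MAX_VF is reduced to 3, which is what the distance handling above does;
   only if the distance were 1 (or the dependence unanalyzable, with no
   safelen assertion and no possible runtime alias check) would the
   function fail.  */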
572 
573 /* Function vect_analyze_data_ref_dependences.
574 
575    Examine all the data references in the loop, and make sure there do not
576    exist any data dependences between them.  Set *MAX_VF according to
577    the maximum vectorization factor the data dependences allow.  */
578 
579 opt_result
580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
581 				   unsigned int *max_vf)
582 {
583   unsigned int i;
584   struct data_dependence_relation *ddr;
585 
586   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 
588   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589     {
590       LOOP_VINFO_DDRS (loop_vinfo)
591 	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
592 		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
593       /* We do not need read-read dependences.  */
594       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
595 					  &LOOP_VINFO_DDRS (loop_vinfo),
596 					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
597 					  false);
598       gcc_assert (res);
599     }
600 
601   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
602 
603   /* For epilogues we either have no aliases or alias versioning
604      was applied to original loop.  Therefore we may just get max_vf
605      using VF of original loop.  */
606   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
607     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
608   else
609     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
610       {
611 	opt_result res
612 	  = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
613 	if (!res)
614 	  return res;
615       }
616 
617   return opt_result::success ();
618 }
619 
620 
621 /* Function vect_slp_analyze_data_ref_dependence.
622 
623    Return TRUE if there (might) exist a dependence between a memory-reference
624    DRA and a memory-reference DRB for VINFO that would prevent
625    vectorization.  Return FALSE if the accesses are known to be
626    independent or the dependence can safely be ignored.  */
627 
628 static bool
629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
630 				      struct data_dependence_relation *ddr)
631 {
632   struct data_reference *dra = DDR_A (ddr);
633   struct data_reference *drb = DDR_B (ddr);
634   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
635   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
636 
637   /* We need to check dependences of statements marked as unvectorizable
638      as well; they can still prohibit vectorization.  */
639 
640   /* Independent data accesses.  */
641   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
642     return false;
643 
644   if (dra == drb)
645     return false;
646 
647   /* Read-read is OK.  */
648   if (DR_IS_READ (dra) && DR_IS_READ (drb))
649     return false;
650 
651   /* If dra and drb are part of the same interleaving chain consider
652      them independent.  */
653   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
654       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
655 	  == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
656     return false;
657 
658   /* Unknown data dependence.  */
659   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
660     {
661       if  (dump_enabled_p ())
662 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 			 "can't determine dependence between %T and %T\n",
664 			 DR_REF (dra), DR_REF (drb));
665     }
666   else if (dump_enabled_p ())
667     dump_printf_loc (MSG_NOTE, vect_location,
668 		     "determined dependence between %T and %T\n",
669 		     DR_REF (dra), DR_REF (drb));
670 
671   return true;
672 }
673 
674 
675 /* Analyze dependences involved in the transform of SLP NODE.  STORES
676    contains the vector of scalar stores of this instance if we are
677    disambiguating the loads, and LAST_STORE_INFO the last of those stores.  */
678 
679 static bool
680 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
681 				   vec<stmt_vec_info> stores,
682 				   stmt_vec_info last_store_info)
683 {
684   /* This walks over all stmts involved in the SLP load/store done
685      in NODE verifying we can sink them up to the last stmt in the
686      group.  */
687   if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
688     {
689       stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
690       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
691 	{
692 	  stmt_vec_info access_info
693 	    = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
694 	  if (access_info == last_access_info)
695 	    continue;
696 	  data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
697 	  ao_ref ref;
698 	  bool ref_initialized_p = false;
699 	  for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
700 	       gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
701 	    {
702 	      gimple *stmt = gsi_stmt (gsi);
703 	      if (! gimple_vuse (stmt))
704 		continue;
705 
706 	      /* If we couldn't record a (single) data reference for this
707 		 stmt we have to resort to the alias oracle.  */
708 	      stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
709 	      data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
710 	      if (!dr_b)
711 		{
712 		  /* We are moving a store - this means
713 		     we cannot use TBAA for disambiguation.  */
714 		  if (!ref_initialized_p)
715 		    ao_ref_init (&ref, DR_REF (dr_a));
716 		  if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
717 		      || ref_maybe_used_by_stmt_p (stmt, &ref, false))
718 		    return false;
719 		  continue;
720 		}
721 
722 	      bool dependent = false;
723 	      /* If we run into a store of this same instance (we've just
724 		 marked those) then delay dependence checking until we run
725 		 into the last store because this is where it will have
726 		 been sunk to (and we verify if we can do that as well).  */
727 	      if (gimple_visited_p (stmt))
728 		{
729 		  if (stmt_info != last_store_info)
730 		    continue;
731 
732 		  for (stmt_vec_info &store_info : stores)
733 		    {
734 		      data_reference *store_dr
735 			= STMT_VINFO_DATA_REF (store_info);
736 		      ddr_p ddr = initialize_data_dependence_relation
737 				    (dr_a, store_dr, vNULL);
738 		      dependent
739 			= vect_slp_analyze_data_ref_dependence (vinfo, ddr);
740 		      free_dependence_relation (ddr);
741 		      if (dependent)
742 			break;
743 		    }
744 		}
745 	      else
746 		{
747 		  ddr_p ddr = initialize_data_dependence_relation (dr_a,
748 								   dr_b, vNULL);
749 		  dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
750 		  free_dependence_relation (ddr);
751 		}
752 	      if (dependent)
753 		return false;
754 	    }
755 	}
756     }
757   else /* DR_IS_READ */
758     {
759       stmt_vec_info first_access_info
760 	= vect_find_first_scalar_stmt_in_slp (node);
761       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
762 	{
763 	  stmt_vec_info access_info
764 	    = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
765 	  if (access_info == first_access_info)
766 	    continue;
767 	  data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
768 	  ao_ref ref;
769 	  bool ref_initialized_p = false;
770 	  for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
771 	       gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
772 	    {
773 	      gimple *stmt = gsi_stmt (gsi);
774 	      if (! gimple_vdef (stmt))
775 		continue;
776 
777 	      /* If we couldn't record a (single) data reference for this
778 		 stmt we have to resort to the alias oracle.  */
779 	      stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
780 	      data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
781 
782 	      /* We are hoisting a load - this means we can use
783 		 TBAA for disambiguation.  */
784 	      if (!ref_initialized_p)
785 		ao_ref_init (&ref, DR_REF (dr_a));
786 	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
787 		{
788 		  if (!dr_b)
789 		    return false;
790 		  /* Resort to dependence checking below.  */
791 		}
792 	      else
793 		/* No dependence.  */
794 		continue;
795 
796 	      bool dependent = false;
797 	      /* If we run into a store of this same instance (we've just
798 		 marked those) then delay dependence checking until we run
799 		 into the last store because this is where it will have
800 		 been sunk to (and we verify if we can do that as well).  */
801 	      if (gimple_visited_p (stmt))
802 		{
803 		  if (stmt_info != last_store_info)
804 		    continue;
805 
806 		  for (stmt_vec_info &store_info : stores)
807 		    {
808 		      data_reference *store_dr
809 			= STMT_VINFO_DATA_REF (store_info);
810 		      ddr_p ddr = initialize_data_dependence_relation
811 				    (dr_a, store_dr, vNULL);
812 		      dependent
813 			= vect_slp_analyze_data_ref_dependence (vinfo, ddr);
814 		      free_dependence_relation (ddr);
815 		      if (dependent)
816 			break;
817 		    }
818 		}
819 	      else
820 		{
821 		  ddr_p ddr = initialize_data_dependence_relation (dr_a,
822 								   dr_b, vNULL);
823 		  dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
824 		  free_dependence_relation (ddr);
825 		}
826 	      if (dependent)
827 		return false;
828 	    }
829 	}
830     }
831   return true;
832 }
833 
834 
835 /* Function vect_slp_analyze_instance_dependence.
836 
837    Examine all the data references in SLP instance INSTANCE and make sure
838    there do not exist any data dependences between them that would prevent
839    vectorization.  Return TRUE if that is the case.  */
840 
841 bool
842 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
843 {
844   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
845 
846   /* The stores of this instance are at the root of the SLP tree.  */
847   slp_tree store = NULL;
848   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
849     store = SLP_INSTANCE_TREE (instance);
850 
851   /* Verify we can sink stores to the vectorized stmt insert location.  */
852   stmt_vec_info last_store_info = NULL;
853   if (store)
854     {
855       if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
856 	return false;
857 
858       /* Mark stores in this instance and remember the last one.  */
859       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
860       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
861 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
862     }
863 
864   bool res = true;
865 
866   /* Verify we can sink loads to the vectorized stmt insert location,
867      special-casing stores of this instance.  */
868   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
869     if (! vect_slp_analyze_node_dependences (vinfo, load,
870 					     store
871 					     ? SLP_TREE_SCALAR_STMTS (store)
872 					     : vNULL, last_store_info))
873       {
874 	res = false;
875 	break;
876       }
877 
878   /* Unset the visited flag.  */
879   if (store)
880     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
881       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
882 
883   return res;
884 }
885 
886 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
887    applied.  */
888 
889 int
890 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
891 {
892   HOST_WIDE_INT diff = 0;
893   /* Alignment is only analyzed for the first element of a DR group,
894      use that but adjust misalignment by the offset of the access.  */
895   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
896     {
897       dr_vec_info *first_dr
898 	= STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
899       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
900 	 INTEGER_CSTs and the first element in the group has the lowest
901 	 address.  */
902       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
903 	      - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
904       gcc_assert (diff >= 0);
905       dr_info = first_dr;
906     }
907 
908   int misalign = dr_info->misalignment;
909   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
910   if (misalign == DR_MISALIGNMENT_UNKNOWN)
911     return misalign;
912 
913   /* If the access is only aligned for a vector type with smaller alignment
914      requirement the access has unknown misalignment.  */
915   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
916 		targetm.vectorize.preferred_vector_alignment (vectype)))
917     return DR_MISALIGNMENT_UNKNOWN;
918 
919   /* Apply the offset from the DR group start and the externally supplied
920      offset which can for example result from a negative stride access.  */
921   poly_int64 misalignment = misalign + diff + offset;
922 
923   /* vect_compute_data_ref_alignment will have ensured that target_alignment
924      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
925   unsigned HOST_WIDE_INT target_alignment_c
926     = dr_info->target_alignment.to_constant ();
927   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
928     return DR_MISALIGNMENT_UNKNOWN;
929   return misalign;
930 }
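
/* Editorial illustration (not from the original sources): consider a group
   of two 4-byte accesses a[2*i] and a[2*i + 1] with a 16-byte target
   alignment.  Only the first element of the group has its alignment
   analyzed; if a[0] sits 8 bytes past a 16-byte boundary, dr_misalignment
   returns 8 for the first access and 8 + 4 = 12 for the second, the DIFF
   adjustment above accounting for the second element's offset within the
   group.  */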
931 
932 /* Record the base alignment guarantee given by DRB, which occurs
933    in STMT_INFO.  */
934 
935 static void
936 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
937 			    innermost_loop_behavior *drb)
938 {
939   bool existed;
940   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
941     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
942   if (!existed || entry.second->base_alignment < drb->base_alignment)
943     {
944       entry = std::make_pair (stmt_info, drb);
945       if (dump_enabled_p ())
946 	dump_printf_loc (MSG_NOTE, vect_location,
947 			 "recording new base alignment for %T\n"
948 			 "  alignment:    %d\n"
949 			 "  misalignment: %d\n"
950 			 "  based on:     %G",
951 			 drb->base_address,
952 			 drb->base_alignment,
953 			 drb->base_misalignment,
954 			 stmt_info->stmt);
955     }
956 }
957 
958 /* If the region we're going to vectorize is reached, all unconditional
959    data references occur at least once.  We can therefore pool the base
960    alignment guarantees from each unconditional reference.  Do this by
961    going through all the data references in VINFO and checking whether
962    the containing statement makes the reference unconditionally.  If so,
963    record the alignment of the base address in VINFO so that it can be
964    used for all other references with the same base.  */
965 
966 void
967 vect_record_base_alignments (vec_info *vinfo)
968 {
969   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
970   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
971   for (data_reference *dr : vinfo->shared->datarefs)
972     {
973       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
974       stmt_vec_info stmt_info = dr_info->stmt;
975       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
976 	  && STMT_VINFO_VECTORIZABLE (stmt_info)
977 	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
978 	{
979 	  vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
980 
981 	  /* If DR is nested in the loop that is being vectorized, we can also
982 	     record the alignment of the base wrt the outer loop.  */
983 	  if (loop && nested_in_vect_loop_p (loop, stmt_info))
984 	    vect_record_base_alignment
985 	      (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
986 	}
987     }
988 }
989 
990 /* Function vect_compute_data_ref_alignment
991 
992    Compute the misalignment of the data reference DR_INFO when vectorizing
993    with VECTYPE.
994 
995    Output:
996    1. initialized misalignment info for DR_INFO
997 
998    FOR NOW: No analysis is actually performed. Misalignment is calculated
999    only for trivial cases. TODO.  */
1000 
1001 static void
1002 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1003 				 tree vectype)
1004 {
1005   stmt_vec_info stmt_info = dr_info->stmt;
1006   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1007   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1008   class loop *loop = NULL;
1009   tree ref = DR_REF (dr_info->dr);
1010 
1011   if (dump_enabled_p ())
1012     dump_printf_loc (MSG_NOTE, vect_location,
1013                      "vect_compute_data_ref_alignment:\n");
1014 
1015   if (loop_vinfo)
1016     loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 
1018   /* Initialize misalignment to unknown.  */
1019   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1020 
1021   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1022     return;
1023 
1024   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1025   bool step_preserves_misalignment_p;
1026 
1027   poly_uint64 vector_alignment
1028     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1029 		 BITS_PER_UNIT);
1030   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1031 
1032   /* If the main loop has peeled for alignment we have no way of knowing
1033      whether the data accesses in the epilogues are aligned.  We can't at
1034      compile time answer the question whether we have entered the main loop or
1035      not.  Fixes PR 92351.  */
1036   if (loop_vinfo)
1037     {
1038       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1039       if (orig_loop_vinfo
1040 	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1041 	return;
1042     }
1043 
1044   unsigned HOST_WIDE_INT vect_align_c;
1045   if (!vector_alignment.is_constant (&vect_align_c))
1046     return;
1047 
1048   /* No step for BB vectorization.  */
1049   if (!loop)
1050     {
1051       gcc_assert (integer_zerop (drb->step));
1052       step_preserves_misalignment_p = true;
1053     }
1054 
1055   /* In case the dataref is in an inner-loop of the loop that is being
1056      vectorized (LOOP), we use the base and misalignment information
1057      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1058      stays the same throughout the execution of the inner-loop, which is why
1059      we have to check that the stride of the dataref in the inner-loop is a
1060      multiple of the vector alignment.  */
1061   else if (nested_in_vect_loop_p (loop, stmt_info))
1062     {
1063       step_preserves_misalignment_p
1064 	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1065 
1066       if (dump_enabled_p ())
1067 	{
1068 	  if (step_preserves_misalignment_p)
1069 	    dump_printf_loc (MSG_NOTE, vect_location,
1070 			     "inner step divides the vector alignment.\n");
1071 	  else
1072 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1073 			     "inner step doesn't divide the vector"
1074 			     " alignment.\n");
1075 	}
1076     }
1077 
1078   /* Similarly we can only use base and misalignment information relative to
1079      an innermost loop if the misalignment stays the same throughout the
1080      execution of the loop.  As above, this is the case if the step of the
1081      dataref over one vector iteration is a multiple of the alignment.  */
1082   else
1083     {
1084       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1085       step_preserves_misalignment_p
1086 	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1087 
1088       if (!step_preserves_misalignment_p && dump_enabled_p ())
1089 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090 			 "step doesn't divide the vector alignment.\n");
1091     }
1092 
1093   unsigned int base_alignment = drb->base_alignment;
1094   unsigned int base_misalignment = drb->base_misalignment;
1095 
1096   /* Calculate the maximum of the pooled base address alignment and the
1097      alignment that we can compute for DR itself.  */
1098   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1099     = base_alignments->get (drb->base_address);
1100   if (entry
1101       && base_alignment < (*entry).second->base_alignment
1102       && (loop_vinfo
1103 	  || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1104 			      gimple_bb (entry->first->stmt))
1105 	      && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1106 		  || (entry->first->dr_aux.group <= dr_info->group)))))
1107     {
1108       base_alignment = entry->second->base_alignment;
1109       base_misalignment = entry->second->base_misalignment;
1110     }
1111 
1112   if (drb->offset_alignment < vect_align_c
1113       || !step_preserves_misalignment_p
1114       /* We need to know whether the step wrt the vectorized loop is
1115 	 negative when computing the starting misalignment below.  */
1116       || TREE_CODE (drb->step) != INTEGER_CST)
1117     {
1118       if (dump_enabled_p ())
1119 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120 			 "Unknown alignment for access: %T\n", ref);
1121       return;
1122     }
1123 
1124   if (base_alignment < vect_align_c)
1125     {
1126       unsigned int max_alignment;
1127       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1128       if (max_alignment < vect_align_c
1129 	  || !vect_can_force_dr_alignment_p (base,
1130 					     vect_align_c * BITS_PER_UNIT))
1131 	{
1132 	  if (dump_enabled_p ())
1133 	    dump_printf_loc (MSG_NOTE, vect_location,
1134 			     "can't force alignment of ref: %T\n", ref);
1135 	  return;
1136 	}
1137 
1138       /* Force the alignment of the decl.
1139 	 NOTE: This is the only change to the code we make during
1140 	 the analysis phase, before deciding to vectorize the loop.  */
1141       if (dump_enabled_p ())
1142 	dump_printf_loc (MSG_NOTE, vect_location,
1143 			 "force alignment of %T\n", ref);
1144 
1145       dr_info->base_decl = base;
1146       dr_info->base_misaligned = true;
1147       base_misalignment = 0;
1148     }
1149   poly_int64 misalignment
1150     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1151 
1152   unsigned int const_misalignment;
1153   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1154     {
1155       if (dump_enabled_p ())
1156 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 			 "Non-constant misalignment for access: %T\n", ref);
1158       return;
1159     }
1160 
1161   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1162 
1163   if (dump_enabled_p ())
1164     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 		     "misalign = %d bytes of ref %T\n",
1166 		     const_misalignment, ref);
1167 
1168   return;
1169 }
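
/* Editorial illustration (a sketch, not from the original sources): for a
   loop storing to a file-scope array such as

     static short a[N];
     for (int i = 0; i < n; i++)
       a[i] = 0;

   the array's natural alignment may be below the preferred vector
   alignment.  When the base is a decl whose alignment the compiler is
   allowed to increase, vect_can_force_dr_alignment_p succeeds, the code
   above records base_decl and base_misaligned, and the decl is re-aligned
   later if the loop is actually vectorized, turning the access into an
   aligned one.  */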
1170 
1171 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1172    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1173    is made aligned via peeling.  */
1174 
1175 static bool
1176 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1177 					 dr_vec_info *dr_peel_info)
1178 {
1179   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1180 		  DR_TARGET_ALIGNMENT (dr_info)))
1181     {
1182       poly_offset_int diff
1183 	= (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1184 	   - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1185       if (known_eq (diff, 0)
1186 	  || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1187 	return true;
1188     }
1189   return false;
1190 }
1191 
1192 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1193    aligned via peeling.  */
1194 
1195 static bool
1196 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1197 				 dr_vec_info *dr_peel_info)
1198 {
1199   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1200 			DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1201       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1202 			   DR_OFFSET (dr_peel_info->dr), 0)
1203       || !operand_equal_p (DR_STEP (dr_info->dr),
1204 			   DR_STEP (dr_peel_info->dr), 0))
1205     return false;
1206 
1207   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 }
1209 
1210 /* Compute the value for dr_info->misalign so that the access appears
1211    aligned.  This is used by peeling to compensate for the offset that
1212    dr_misalignment applies for negative-step accesses.  */
1213 
1214 int
1215 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1216 {
1217   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1218     return 0;
1219 
1220   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1221   poly_int64 misalignment
1222     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1223        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1224 
1225   unsigned HOST_WIDE_INT target_alignment_c;
1226   int misalign;
1227   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1228       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1229     return DR_MISALIGNMENT_UNKNOWN;
1230   return misalign;
1231 }
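
/* Editorial illustration (not from the original sources): with a
   four-element vector of 4-byte ints and a 16-byte target alignment, a
   negative-step access that is to count as "aligned" must start at the
   last element of an aligned vector, i.e. (4 - 1) * 4 = 12 bytes past a
   16-byte boundary, so the function above returns 12 rather than 0.  */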
1232 
1233 /* Function vect_update_misalignment_for_peel.
1234    Sets DR_INFO's misalignment
1235    - to 0 if it has the same alignment as DR_PEEL_INFO,
1236    - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1237    - to -1 (unknown) otherwise.
1238 
1239    DR_INFO - the data reference whose misalignment is to be adjusted.
1240    DR_PEEL_INFO - the data reference whose misalignment is being made
1241 		  zero in the vector loop by the peel.
1242    NPEEL - the number of iterations in the peel loop if the misalignment
1243            of DR_PEEL_INFO is known at compile time.  */
1244 
1245 static void
1246 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1247 				   dr_vec_info *dr_peel_info, int npeel)
1248 {
1249   /* If DR_INFO is aligned whenever DR_PEEL_INFO is, then mark it so.  */
1250   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1251     {
1252       SET_DR_MISALIGNMENT (dr_info,
1253 			   vect_dr_misalign_for_aligned_access (dr_peel_info));
1254       return;
1255     }
1256 
1257   unsigned HOST_WIDE_INT alignment;
1258   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1259       && known_alignment_for_access_p (dr_info,
1260 				       STMT_VINFO_VECTYPE (dr_info->stmt))
1261       && known_alignment_for_access_p (dr_peel_info,
1262 				       STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1263     {
1264       int misal = dr_info->misalignment;
1265       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1266       misal &= alignment - 1;
1267       set_dr_misalignment (dr_info, misal);
1268       return;
1269     }
1270 
1271   if (dump_enabled_p ())
1272     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1273 		     "to unknown (-1).\n");
1274   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1275 }
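
/* Editorial illustration (not from the original sources): assume a 16-byte
   target alignment, a data reference with a 4-byte step and a known
   misalignment of 8, and NPEEL == 2 chosen to align some other reference.
   The update above computes (8 + 2 * 4) & 15 == 0, so this reference also
   becomes aligned; with NPEEL == 1 it would instead be left with
   misalignment (8 + 4) & 15 == 12.  */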
1276 
1277 /* Return true if alignment is relevant for DR_INFO.  */
1278 
1279 static bool
1280 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1281 {
1282   stmt_vec_info stmt_info = dr_info->stmt;
1283 
1284   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1285     return false;
1286 
1287   /* For interleaving, only the alignment of the first access matters.  */
1288   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1289       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1290     return false;
1291 
1292   /* Scatter-gather and invariant accesses continue to address individual
1293      scalars, so vector-level alignment is irrelevant.  */
1294   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1295       || integer_zerop (DR_STEP (dr_info->dr)))
1296     return false;
1297 
1298   /* Strided accesses perform only component accesses, alignment is
1299      irrelevant for them.  */
1300   if (STMT_VINFO_STRIDED_P (stmt_info)
1301       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1302     return false;
1303 
1304   return true;
1305 }
1306 
1307 /* Given a memory reference EXP, return whether its alignment is less
1308    than its size.  */
1309 
1310 static bool
1311 not_size_aligned (tree exp)
1312 {
1313   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1314     return true;
1315 
1316   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1317 	  > get_object_alignment (exp));
1318 }
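
/* Editorial illustration (not from the original sources): a 4-byte integer
   field in

     struct __attribute__ ((packed)) s { char c; int i; };

   has size 32 bits but only byte alignment, so a reference to it is not
   size-aligned and counts as a packed access for the target hook queried
   below.  */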
1319 
1320 /* Function vector_alignment_reachable_p
1321 
1322    Return true if vector alignment for DR_INFO is reachable by peeling
1323    a few loop iterations.  Return false otherwise.  */
1324 
1325 static bool
1326 vector_alignment_reachable_p (dr_vec_info *dr_info)
1327 {
1328   stmt_vec_info stmt_info = dr_info->stmt;
1329   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1330 
1331   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1332     {
1333       /* For interleaved accesses we peel only if the number of iterations
1334 	 in the prolog loop (VF - misalignment in elements) is a multiple
1335 	 of the number of interleaved accesses.  */
1336       int elem_size, mis_in_elements;
1337 
1338       /* FORNOW: handle only known alignment.  */
1339       if (!known_alignment_for_access_p (dr_info, vectype))
1340 	return false;
1341 
1342       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1343       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1344       elem_size = vector_element_size (vector_size, nelements);
1345       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1346 
1347       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1348 	return false;
1349     }
1350 
1351   /* If misalignment is known at the compile time then allow peeling
1352      only if natural alignment is reachable through peeling.  */
1353   if (known_alignment_for_access_p (dr_info, vectype)
1354       && !aligned_access_p (dr_info, vectype))
1355     {
1356       HOST_WIDE_INT elmsize =
1357 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1358       if (dump_enabled_p ())
1359 	{
1360 	  dump_printf_loc (MSG_NOTE, vect_location,
1361 	                   "data size = %wd. misalignment = %d.\n", elmsize,
1362 			   dr_misalignment (dr_info, vectype));
1363 	}
1364       if (dr_misalignment (dr_info, vectype) % elmsize)
1365 	{
1366 	  if (dump_enabled_p ())
1367 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368 	                     "data size does not divide the misalignment.\n");
1369 	  return false;
1370 	}
1371     }
1372 
1373   if (!known_alignment_for_access_p (dr_info, vectype))
1374     {
1375       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1376       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1377       if (dump_enabled_p ())
1378 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379 	                 "Unknown misalignment, %snaturally aligned\n",
1380 			 is_packed ? "not " : "");
1381       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382     }
1383 
1384   return true;
1385 }
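
/* Editorial illustration (not from the original sources): with 16-byte
   vectors of 4-byte ints and a known misalignment of 8 bytes, peeling two
   scalar iterations moves the access to a 16-byte boundary, so alignment
   is reachable.  A misalignment of 2 bytes, by contrast, is not a multiple
   of the 4-byte element size and can never be removed by peeling whole
   iterations, which is what the "data size does not divide the
   misalignment" check above rejects.  */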
1386 
1387 
1388 /* Calculate the cost of the memory access represented by DR_INFO.  */
1389 
1390 static void
1391 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1392 			   dr_alignment_support alignment_support_scheme,
1393 			   int misalignment,
1394 			   unsigned int *inside_cost,
1395                            unsigned int *outside_cost,
1396 			   stmt_vector_for_cost *body_cost_vec,
1397 			   stmt_vector_for_cost *prologue_cost_vec)
1398 {
1399   stmt_vec_info stmt_info = dr_info->stmt;
1400   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1401   int ncopies;
1402 
1403   if (PURE_SLP_STMT (stmt_info))
1404     ncopies = 1;
1405   else
1406     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1407 
1408   if (DR_IS_READ (dr_info->dr))
1409     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1410 			misalignment, true, inside_cost,
1411 			outside_cost, prologue_cost_vec, body_cost_vec, false);
1412   else
1413     vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1414 			 misalignment, inside_cost, body_cost_vec);
1415 
1416   if (dump_enabled_p ())
1417     dump_printf_loc (MSG_NOTE, vect_location,
1418                      "vect_get_data_access_cost: inside_cost = %d, "
1419                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1420 }
1421 
1422 
1423 typedef struct _vect_peel_info
1424 {
1425   dr_vec_info *dr_info;
1426   int npeel;
1427   unsigned int count;
1428 } *vect_peel_info;
1429 
1430 typedef struct _vect_peel_extended_info
1431 {
1432   vec_info *vinfo;
1433   struct _vect_peel_info peel_info;
1434   unsigned int inside_cost;
1435   unsigned int outside_cost;
1436 } *vect_peel_extended_info;
1437 
1438 
1439 /* Peeling hashtable helpers.  */
1440 
1441 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1442 {
1443   static inline hashval_t hash (const _vect_peel_info *);
1444   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 };
1446 
1447 inline hashval_t
1448 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1449 {
1450   return (hashval_t) peel_info->npeel;
1451 }
1452 
1453 inline bool
1454 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1455 {
1456   return (a->npeel == b->npeel);
1457 }
1458 
1459 
1460 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1461 
1462 static void
1463 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1464 			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1465 			  int npeel, bool supportable_if_not_aligned)
1466 {
1467   struct _vect_peel_info elem, *slot;
1468   _vect_peel_info **new_slot;
1469 
1470   elem.npeel = npeel;
1471   slot = peeling_htab->find (&elem);
1472   if (slot)
1473     slot->count++;
1474   else
1475     {
1476       slot = XNEW (struct _vect_peel_info);
1477       slot->npeel = npeel;
1478       slot->dr_info = dr_info;
1479       slot->count = 1;
1480       new_slot = peeling_htab->find_slot (slot, INSERT);
1481       *new_slot = slot;
1482     }
1483 
1484   /* If this DR is not supported with unknown misalignment then bias
1485      this slot when the cost model is disabled.  */
1486   if (!supportable_if_not_aligned
1487       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1488     slot->count += VECT_MAX_COST;
1489 }
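
/* Illustrative example (assumed numbers, not part of the algorithm): if three
   relevant DRs each become aligned by peeling 2 scalar iterations and one DR
   by peeling 5, the table built by the function above ends up with the
   entries {npeel 2 -> count 3, npeel 5 -> count 1}.  With the cost model
   disabled, a DR that is unsupportable when unaligned additionally adds
   VECT_MAX_COST to its slot's count, so vect_peeling_hash_get_most_frequent
   below will prefer aligning it.  */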
1490 
1491 
1492 /* Traverse peeling hash table to find peeling option that aligns maximum
1493    number of data accesses.  */
1494 
1495 int
1496 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1497 				     _vect_peel_extended_info *max)
1498 {
1499   vect_peel_info elem = *slot;
1500 
1501   if (elem->count > max->peel_info.count
1502       || (elem->count == max->peel_info.count
1503           && max->peel_info.npeel > elem->npeel))
1504     {
1505       max->peel_info.npeel = elem->npeel;
1506       max->peel_info.count = elem->count;
1507       max->peel_info.dr_info = elem->dr_info;
1508     }
1509 
1510   return 1;
1511 }
1512 
1513 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1514    data access costs for all data refs.  If DR0_INFO's misalignment is
1515    unknown, npeel is computed at runtime, but DR0_INFO's misalignment will
1516    be zero after peeling.  */
1517 
1518 static void
1519 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1520 				dr_vec_info *dr0_info,
1521 				unsigned int *inside_cost,
1522 				unsigned int *outside_cost,
1523 				stmt_vector_for_cost *body_cost_vec,
1524 				stmt_vector_for_cost *prologue_cost_vec,
1525 				unsigned int npeel)
1526 {
1527   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1528 
1529   bool dr0_alignment_known_p
1530     = (dr0_info
1531        && known_alignment_for_access_p (dr0_info,
1532 					STMT_VINFO_VECTYPE (dr0_info->stmt)));
1533 
1534   for (data_reference *dr : datarefs)
1535     {
1536       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1537       if (!vect_relevant_for_alignment_p (dr_info))
1538 	continue;
1539 
1540       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1541       dr_alignment_support alignment_support_scheme;
1542       int misalignment;
1543       unsigned HOST_WIDE_INT alignment;
1544 
1545       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1546 					    size_zero_node) < 0;
1547       poly_int64 off = 0;
1548       if (negative)
1549 	off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1550 	       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1551 
1552       if (npeel == 0)
1553 	misalignment = dr_misalignment (dr_info, vectype, off);
1554       else if (dr_info == dr0_info
1555 	       || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1556 	misalignment = 0;
1557       else if (!dr0_alignment_known_p
1558 	       || !known_alignment_for_access_p (dr_info, vectype)
1559 	       || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1560 	misalignment = DR_MISALIGNMENT_UNKNOWN;
1561       else
1562 	{
1563 	  misalignment = dr_misalignment (dr_info, vectype, off);
1564 	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1565 	  misalignment &= alignment - 1;
1566 	}
1567       alignment_support_scheme
1568 	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1569 					 misalignment);
1570 
1571       vect_get_data_access_cost (loop_vinfo, dr_info,
1572 				 alignment_support_scheme, misalignment,
1573 				 inside_cost, outside_cost,
1574 				 body_cost_vec, prologue_cost_vec);
1575     }
1576 }
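
/* A worked example of the known-alignment update in the function above
   (assumed numbers, for illustration only): with a byte step of 4, a
   constant target alignment of 16 and a current misalignment of 12 bytes,
   peeling npeel = 1 scalar iteration gives (12 + 1 * 4) & (16 - 1) = 0,
   i.e. that access becomes aligned.  */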
1577 
1578 /* Traverse peeling hash table and calculate cost for each peeling option.
1579    Find the one with the lowest cost.  */
1580 
1581 int
1582 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1583 				   _vect_peel_extended_info *min)
1584 {
1585   vect_peel_info elem = *slot;
1586   int dummy;
1587   unsigned int inside_cost = 0, outside_cost = 0;
1588   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1589   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1590 		       epilogue_cost_vec;
1591 
1592   prologue_cost_vec.create (2);
1593   body_cost_vec.create (2);
1594   epilogue_cost_vec.create (2);
1595 
1596   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1597 				  &outside_cost, &body_cost_vec,
1598 				  &prologue_cost_vec, elem->npeel);
1599 
1600   body_cost_vec.release ();
1601 
1602   outside_cost += vect_get_known_peeling_cost
1603     (loop_vinfo, elem->npeel, &dummy,
1604      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1605      &prologue_cost_vec, &epilogue_cost_vec);
1606 
1607   /* Prologue and epilogue costs are added to the target model later.
1608      These costs depend only on the scalar iteration cost, the
1609      number of peeling iterations finally chosen, and the number of
1610      misaligned statements.  So discard the information found here.  */
1611   prologue_cost_vec.release ();
1612   epilogue_cost_vec.release ();
1613 
1614   if (inside_cost < min->inside_cost
1615       || (inside_cost == min->inside_cost
1616 	  && outside_cost < min->outside_cost))
1617     {
1618       min->inside_cost = inside_cost;
1619       min->outside_cost = outside_cost;
1620       min->peel_info.dr_info = elem->dr_info;
1621       min->peel_info.npeel = elem->npeel;
1622       min->peel_info.count = elem->count;
1623     }
1624 
1625   return 1;
1626 }
1627 
1628 
1629 /* Choose best peeling option by traversing peeling hash table and either
1630    choosing an option with the lowest cost (if cost model is enabled) or the
1631    option that aligns as many accesses as possible.  */
1632 
1633 static struct _vect_peel_extended_info
1634 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1635 				       loop_vec_info loop_vinfo)
1636 {
1637    struct _vect_peel_extended_info res;
1638 
1639    res.peel_info.dr_info = NULL;
1640    res.vinfo = loop_vinfo;
1641 
1642    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1643      {
1644        res.inside_cost = INT_MAX;
1645        res.outside_cost = INT_MAX;
1646        peeling_htab->traverse <_vect_peel_extended_info *,
1647 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1648      }
1649    else
1650      {
1651        res.peel_info.count = 0;
1652        peeling_htab->traverse <_vect_peel_extended_info *,
1653 	   		       vect_peeling_hash_get_most_frequent> (&res);
1654        res.inside_cost = 0;
1655        res.outside_cost = 0;
1656      }
1657 
1658    return res;
1659 }
1660 
1661 /* Return true if the new peeling NPEEL is supported.  */
1662 
1663 static bool
1664 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1665 			  unsigned npeel)
1666 {
1667   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1668   enum dr_alignment_support supportable_dr_alignment;
1669 
1670   bool dr0_alignment_known_p
1671     = known_alignment_for_access_p (dr0_info,
1672 				    STMT_VINFO_VECTYPE (dr0_info->stmt));
1673 
1674   /* Ensure that all data refs can be vectorized after the peel.  */
1675   for (data_reference *dr : datarefs)
1676     {
1677       if (dr == dr0_info->dr)
1678 	continue;
1679 
1680       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1681       if (!vect_relevant_for_alignment_p (dr_info)
1682 	  || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1683 	continue;
1684 
1685       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1686       int misalignment;
1687       unsigned HOST_WIDE_INT alignment;
1688       if (!dr0_alignment_known_p
1689 	  || !known_alignment_for_access_p (dr_info, vectype)
1690 	  || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1691 	misalignment = DR_MISALIGNMENT_UNKNOWN;
1692       else
1693 	{
1694 	  misalignment = dr_misalignment (dr_info, vectype);
1695 	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1696 	  misalignment &= alignment - 1;
1697 	}
1698       supportable_dr_alignment
1699 	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1700 					 misalignment);
1701       if (supportable_dr_alignment == dr_unaligned_unsupported)
1702 	return false;
1703     }
1704 
1705   return true;
1706 }
1707 
1708 /* Compare two data-references DRA and DRB to group them into chunks
1709    with related alignment.  */
1710 
1711 static int
1712 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1713 {
1714   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1715   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1716   int cmp;
1717 
1718   /* Stabilize sort.  */
1719   if (dra == drb)
1720     return 0;
1721 
1722   /* Ordering of DRs according to base.  */
1723   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1724 			       DR_BASE_ADDRESS (drb));
1725   if (cmp != 0)
1726     return cmp;
1727 
1728   /* And according to DR_OFFSET.  */
1729   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1730   if (cmp != 0)
1731     return cmp;
1732 
1733   /* And after step.  */
1734   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1735   if (cmp != 0)
1736     return cmp;
1737 
1738   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
1739   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1740   if (cmp == 0)
1741     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1742   return cmp;
1743 }
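
/* For example (illustration only): the accesses a[i], a[i+1] and b[i] are
   sorted so that the two references based on 'a' (equal DR_BASE_ADDRESS,
   DR_OFFSET and DR_STEP, differing only in DR_INIT) become adjacent while
   'b[i]' lands in a separate subgroup; the caller below relies on this
   adjacency when computing the number of same-aligned references.  */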
1744 
1745 /* Function vect_enhance_data_refs_alignment
1746 
1747    This pass will use loop versioning and loop peeling in order to enhance
1748    the alignment of data references in the loop.
1749 
1750    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1751    original loop is to be vectorized.  Any other loops that are created by
1752    the transformations performed in this pass are not supposed to be
1753    vectorized.  This restriction will be relaxed.
1754 
1755    This pass will require a cost model to guide it whether to apply peeling
1756    or versioning or a combination of the two.  For example, the scheme that
1757    Intel uses when given a loop with several memory accesses is as follows:
1758    choose one memory access ('p') whose alignment you want to force by doing
1759    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1760    other accesses are not necessarily aligned, or (2) use loop versioning to
1761    generate one loop in which all accesses are aligned, and another loop in
1762    which only 'p' is necessarily aligned.
1763 
1764    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1765    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1766    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1767 
1768    Devising a cost model is the most critical aspect of this work.  It will
1769    guide us on which access to peel for, whether to use loop versioning, how
1770    many versions to create, etc.  The cost model will probably consist of
1771    generic considerations as well as target specific considerations (on
1772    powerpc for example, misaligned stores are more painful than misaligned
1773    loads).
1774 
1775    Here are the general steps involved in alignment enhancements:
1776 
1777      -- original loop, before alignment analysis:
1778 	for (i=0; i<N; i++){
1779 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1780 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1781 	}
1782 
1783      -- After vect_compute_data_refs_alignment:
1784 	for (i=0; i<N; i++){
1785 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1786 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1787 	}
1788 
1789      -- Possibility 1: we do loop versioning:
1790      if (p is aligned) {
1791 	for (i=0; i<N; i++){	# loop 1A
1792 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1793 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1794 	}
1795      }
1796      else {
1797 	for (i=0; i<N; i++){	# loop 1B
1798 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1799 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1800 	}
1801      }
1802 
1803      -- Possibility 2: we do loop peeling:
1804      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1805 	x = q[i];
1806 	p[i] = y;
1807      }
1808      for (i = 3; i < N; i++){	# loop 2A
1809 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1810 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1811      }
1812 
1813      -- Possibility 3: combination of loop peeling and versioning:
1814      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1815 	x = q[i];
1816 	p[i] = y;
1817      }
1818      if (p is aligned) {
1819 	for (i = 3; i<N; i++){	# loop 3A
1820 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1821 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1822 	}
1823      }
1824      else {
1825 	for (i = 3; i<N; i++){	# loop 3B
1826 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1827 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1828 	}
1829      }
1830 
1831      These loops are later passed to loop_transform to be vectorized.  The
1832      vectorizer will use the alignment information to guide the transformation
1833      (whether to generate regular loads/stores, or with special handling for
1834      misalignment).  */
1835 
1836 opt_result
1837 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1838 {
1839   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1840   dr_vec_info *first_store = NULL;
1841   dr_vec_info *dr0_info = NULL;
1842   struct data_reference *dr;
1843   unsigned int i;
1844   bool do_peeling = false;
1845   bool do_versioning = false;
1846   unsigned int npeel = 0;
1847   bool one_misalignment_known = false;
1848   bool one_misalignment_unknown = false;
1849   bool one_dr_unsupportable = false;
1850   dr_vec_info *unsupportable_dr_info = NULL;
1851   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1852   hash_table<peel_info_hasher> peeling_htab (1);
1853 
1854   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1855 
1856   /* Reset data so we can safely be called multiple times.  */
1857   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1858   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1859 
1860   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1861     return opt_result::success ();
1862 
1863   /* Sort the vector of datarefs so DRs that have the same or dependent
1864      alignment are next to each other.  */
1865   auto_vec<data_reference_p> datarefs
1866     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1867   datarefs.qsort (dr_align_group_sort_cmp);
1868 
1869   /* For each dataref, compute the number of other DRs that would become
1870      aligned if we peeled the loop to align that dataref.  */
1871   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1872   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1873   unsigned i0;
1874   for (i0 = 0; i0 < datarefs.length (); ++i0)
1875     if (DR_BASE_ADDRESS (datarefs[i0]))
1876       break;
1877   for (i = i0 + 1; i <= datarefs.length (); ++i)
1878     {
1879       if (i == datarefs.length ()
1880 	  || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1881 			       DR_BASE_ADDRESS (datarefs[i]), 0)
1882 	  || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1883 			       DR_OFFSET (datarefs[i]), 0)
1884 	  || !operand_equal_p (DR_STEP (datarefs[i0]),
1885 			       DR_STEP (datarefs[i]), 0))
1886 	{
1887 	  /* The subgroup [i0, i-1] now only differs in DR_INIT and
1888 	     possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1889 	     will get known misalignment if we align one of the refs
1890 	     with the largest DR_TARGET_ALIGNMENT.  */
1891 	  for (unsigned j = i0; j < i; ++j)
1892 	    {
1893 	      dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1894 	      for (unsigned k = i0; k < i; ++k)
1895 		{
1896 		  if (k == j)
1897 		    continue;
1898 		  dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1899 		  if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1900 							       dr_infoj))
1901 		    n_same_align_refs[j]++;
1902 		}
1903 	    }
1904 	  i0 = i;
1905 	}
1906     }
1907 
1908   /* While cost model enhancements are expected in the future, the high level
1909      view of the code at this time is as follows:
1910 
1911      A) If there is a misaligned access then see if peeling to align
1912         this access can make all data references satisfy
1913         vect_supportable_dr_alignment.  If so, update data structures
1914         as needed and return true.
1915 
1916      B) If peeling wasn't possible and there is a data reference with an
1917         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1918         then see if loop versioning checks can be used to make all data
1919         references satisfy vect_supportable_dr_alignment.  If so, update
1920         data structures as needed and return true.
1921 
1922      C) If neither peeling nor versioning were successful then return false if
1923         any data reference does not satisfy vect_supportable_dr_alignment.
1924 
1925      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1926 
1927      Note, Possibility 3 above (which is peeling and versioning together) is not
1928      being done at this time.  */
1929 
1930   /* (1) Peeling to force alignment.  */
1931 
1932   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1933      Considerations:
1934      + How many accesses will become aligned due to the peeling
1935      - How many accesses will become unaligned due to the peeling,
1936        and the cost of misaligned accesses.
1937      - The cost of peeling (the extra runtime checks, the increase
1938        in code size).  */
1939 
1940   FOR_EACH_VEC_ELT (datarefs, i, dr)
1941     {
1942       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1943       if (!vect_relevant_for_alignment_p (dr_info))
1944 	continue;
1945 
1946       stmt_vec_info stmt_info = dr_info->stmt;
1947       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1948       do_peeling = vector_alignment_reachable_p (dr_info);
1949       if (do_peeling)
1950         {
1951 	  if (known_alignment_for_access_p (dr_info, vectype))
1952             {
1953 	      unsigned int npeel_tmp = 0;
1954 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1955 						    size_zero_node) < 0;
1956 
1957 	      /* If known_alignment_for_access_p then we have set
1958 	         DR_MISALIGNMENT which is only done if we know it at compile
1959 	         time, so it is safe to assume target alignment is constant.
1960 	       */
1961 	      unsigned int target_align =
1962 		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1963 	      unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1964 	      poly_int64 off = 0;
1965 	      if (negative)
1966 		off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1967 	      unsigned int mis = dr_misalignment (dr_info, vectype, off);
1968 	      mis = negative ? mis : -mis;
1969 	      if (mis != 0)
1970 		npeel_tmp = (mis & (target_align - 1)) / dr_size;
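	      /* Worked example (assumed numbers, for illustration only):
		 with a 16-byte target alignment, 4-byte elements and a
		 misalignment of 4 bytes, mis = -4, so
		 npeel_tmp = ((-4) & 15) / 4 = 3; the loop below then also
		 records the options 3 + 4, 3 + 8, ... while they do not
		 exceed nscalars.  */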
1971 
1972               /* For multiple types, it is possible that the bigger type access
1973                  will have more than one peeling option.  E.g., a loop with two
1974                  types: one of size (vector size / 4), and the other one of
1975                  size (vector size / 8).  The vectorization factor will be 8.  If both
1976                  accesses are misaligned by 3, the first one needs one scalar
1977                  iteration to be aligned, and the second one needs 5.  But the
1978 		 first one will be aligned also by peeling 5 scalar
1979                  iterations, and in that case both accesses will be aligned.
1980                  Hence, besides the immediate peeling amount, we also want
1981                  to try adding full vector sizes, as long as we don't exceed
1982                  the vectorization factor.
1983                  We do this automatically for the cost model, since we calculate
1984 		 cost for every peeling option.  */
1985 	      poly_uint64 nscalars = npeel_tmp;
1986               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1987 		{
1988 		  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989 		  nscalars = (STMT_SLP_TYPE (stmt_info)
1990 			      ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991 		}
1992 
1993 	      /* Save info about DR in the hash table.  Also include peeling
1994 		 amounts according to the explanation above.  Indicate
1995 		 the alignment status when the ref is not aligned.
1996 		 ???  Rather than using unknown alignment here we should
1997 		 prune all entries from the peeling hashtable which cause
1998 		 DRs to be not supported.  */
1999 	      bool supportable_if_not_aligned
2000 		= vect_supportable_dr_alignment
2001 		    (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2002 	      while (known_le (npeel_tmp, nscalars))
2003                 {
2004                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2005 					    dr_info, npeel_tmp,
2006 					    supportable_if_not_aligned);
2007 		  npeel_tmp += MAX (1, target_align / dr_size);
2008                 }
2009 
2010 	      one_misalignment_known = true;
2011             }
2012           else
2013             {
2014               /* If we don't know any misalignment values, we prefer
2015                  peeling for the data-ref that has the maximum number of data-refs
2016                  with the same alignment, unless the target prefers to align
2017                  stores over loads.  */
2018 	      unsigned same_align_drs = n_same_align_refs[i];
2019 	      if (!dr0_info
2020 		  || dr0_same_align_drs < same_align_drs)
2021 		{
2022 		  dr0_same_align_drs = same_align_drs;
2023 		  dr0_info = dr_info;
2024 		}
2025 	      /* For data-refs with the same number of related
2026 		 accesses prefer the one where the misalign
2027 		 computation will be invariant in the outermost loop.  */
2028 	      else if (dr0_same_align_drs == same_align_drs)
2029 		{
2030 		  class loop *ivloop0, *ivloop;
2031 		  ivloop0 = outermost_invariant_loop_for_expr
2032 		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
2033 		  ivloop = outermost_invariant_loop_for_expr
2034 		    (loop, DR_BASE_ADDRESS (dr));
2035 		  if ((ivloop && !ivloop0)
2036 		      || (ivloop && ivloop0
2037 			  && flow_loop_nested_p (ivloop, ivloop0)))
2038 		    dr0_info = dr_info;
2039 		}
2040 
2041 	      one_misalignment_unknown = true;
2042 
2043 	      /* Check for data refs with unsupportable alignment that
2044 	         can be peeled.  */
2045 	      enum dr_alignment_support supportable_dr_alignment
2046 		= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2047 						 DR_MISALIGNMENT_UNKNOWN);
2048 	      if (supportable_dr_alignment == dr_unaligned_unsupported)
2049 		{
2050 		  one_dr_unsupportable = true;
2051 		  unsupportable_dr_info = dr_info;
2052 		}
2053 
2054 	      if (!first_store && DR_IS_WRITE (dr))
2055 		{
2056 		  first_store = dr_info;
2057 		  first_store_same_align_drs = same_align_drs;
2058 		}
2059             }
2060         }
2061       else
2062         {
2063 	  if (!aligned_access_p (dr_info, vectype))
2064             {
2065               if (dump_enabled_p ())
2066                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067                                  "vector alignment may not be reachable\n");
2068               break;
2069             }
2070         }
2071     }
2072 
2073   /* Check if we can possibly peel the loop.  */
2074   if (!vect_can_advance_ivs_p (loop_vinfo)
2075       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2076       || loop->inner)
2077     do_peeling = false;
2078 
2079   struct _vect_peel_extended_info peel_for_known_alignment;
2080   struct _vect_peel_extended_info peel_for_unknown_alignment;
2081   struct _vect_peel_extended_info best_peel;
2082 
2083   peel_for_unknown_alignment.inside_cost = INT_MAX;
2084   peel_for_unknown_alignment.outside_cost = INT_MAX;
2085   peel_for_unknown_alignment.peel_info.count = 0;
2086 
2087   if (do_peeling
2088       && one_misalignment_unknown)
2089     {
2090       /* Check whether the target prefers aligning stores over loads, i.e., if
2091          misaligned stores are more expensive than misaligned loads (taking
2092          drs with same alignment into account).  */
2093       unsigned int load_inside_cost = 0;
2094       unsigned int load_outside_cost = 0;
2095       unsigned int store_inside_cost = 0;
2096       unsigned int store_outside_cost = 0;
2097       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2098 
2099       stmt_vector_for_cost dummy;
2100       dummy.create (2);
2101       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2102 				      &load_inside_cost,
2103 				      &load_outside_cost,
2104 				      &dummy, &dummy, estimated_npeels);
2105       dummy.release ();
2106 
2107       if (first_store)
2108 	{
2109 	  dummy.create (2);
2110 	  vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2111 					  &store_inside_cost,
2112 					  &store_outside_cost,
2113 					  &dummy, &dummy,
2114 					  estimated_npeels);
2115 	  dummy.release ();
2116 	}
2117       else
2118 	{
2119 	  store_inside_cost = INT_MAX;
2120 	  store_outside_cost = INT_MAX;
2121 	}
2122 
2123       if (load_inside_cost > store_inside_cost
2124 	  || (load_inside_cost == store_inside_cost
2125 	      && load_outside_cost > store_outside_cost))
2126 	{
2127 	  dr0_info = first_store;
2128 	  dr0_same_align_drs = first_store_same_align_drs;
2129 	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
2130 	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
2131 	}
2132       else
2133 	{
2134 	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
2135 	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136 	}
2137 
2138       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2139       prologue_cost_vec.create (2);
2140       epilogue_cost_vec.create (2);
2141 
2142       int dummy2;
2143       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2144 	(loop_vinfo, estimated_npeels, &dummy2,
2145 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2146 	 &prologue_cost_vec, &epilogue_cost_vec);
2147 
2148       prologue_cost_vec.release ();
2149       epilogue_cost_vec.release ();
2150 
2151       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152     }
2153 
2154   peel_for_unknown_alignment.peel_info.npeel = 0;
2155   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2156 
2157   best_peel = peel_for_unknown_alignment;
2158 
2159   peel_for_known_alignment.inside_cost = INT_MAX;
2160   peel_for_known_alignment.outside_cost = INT_MAX;
2161   peel_for_known_alignment.peel_info.count = 0;
2162   peel_for_known_alignment.peel_info.dr_info = NULL;
2163 
2164   if (do_peeling && one_misalignment_known)
2165     {
2166       /* Peeling is possible, but there is no data access that is not supported
2167          unless aligned.  So we try to choose the best possible peeling from
2168 	 the hash table.  */
2169       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2170 	(&peeling_htab, loop_vinfo);
2171     }
2172 
2173   /* Compare costs of peeling for known and unknown alignment. */
2174   if (peel_for_known_alignment.peel_info.dr_info != NULL
2175       && peel_for_unknown_alignment.inside_cost
2176       >= peel_for_known_alignment.inside_cost)
2177     {
2178       best_peel = peel_for_known_alignment;
2179 
2180       /* If the best peeling for known alignment has NPEEL == 0, perform no
2181          peeling at all except if there is an unsupportable dr that we can
2182          align.  */
2183       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2184 	do_peeling = false;
2185     }
2186 
2187   /* If there is an unsupportable data ref, prefer this over all choices so far
2188      since we'd have to discard a chosen peeling except when it accidentally
2189      aligned the unsupportable data ref.  */
2190   if (one_dr_unsupportable)
2191     dr0_info = unsupportable_dr_info;
2192   else if (do_peeling)
2193     {
2194       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2195 	 TODO: Use nopeel_outside_cost or get rid of it?  */
2196       unsigned nopeel_inside_cost = 0;
2197       unsigned nopeel_outside_cost = 0;
2198 
2199       stmt_vector_for_cost dummy;
2200       dummy.create (2);
2201       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2202 				      &nopeel_outside_cost, &dummy, &dummy, 0);
2203       dummy.release ();
2204 
2205       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2206 	 costs will be recorded.  */
2207       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2208       prologue_cost_vec.create (2);
2209       epilogue_cost_vec.create (2);
2210 
2211       int dummy2;
2212       nopeel_outside_cost += vect_get_known_peeling_cost
2213 	(loop_vinfo, 0, &dummy2,
2214 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2215 	 &prologue_cost_vec, &epilogue_cost_vec);
2216 
2217       prologue_cost_vec.release ();
2218       epilogue_cost_vec.release ();
2219 
2220       npeel = best_peel.peel_info.npeel;
2221       dr0_info = best_peel.peel_info.dr_info;
2222 
2223       /* If doing no peeling is not more expensive than the best peeling we
2224 	 have so far, don't perform any peeling.  */
2225       if (nopeel_inside_cost <= best_peel.inside_cost)
2226 	do_peeling = false;
2227     }
2228 
2229   if (do_peeling)
2230     {
2231       stmt_vec_info stmt_info = dr0_info->stmt;
2232       if (known_alignment_for_access_p (dr0_info,
2233 					STMT_VINFO_VECTYPE (stmt_info)))
2234         {
2235 	  bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2236 						size_zero_node) < 0;
2237           if (!npeel)
2238             {
2239               /* Since it's known at compile time, compute the number of
2240                  iterations in the peeled loop (the peeling factor) for use in
2241                  updating DR_MISALIGNMENT values.  The peeling factor is the
2242                  vectorization factor minus the misalignment as an element
2243                  count.  */
2244 	      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2245 	      poly_int64 off = 0;
2246 	      if (negative)
2247 		off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2248 		       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2249 	      unsigned int mis
2250 		= dr_misalignment (dr0_info, vectype, off);
2251 	      mis = negative ? mis : -mis;
2252 	      /* If known_alignment_for_access_p then we have set
2253 	         DR_MISALIGNMENT which is only done if we know it at compile
2254 	         time, so it is safe to assume target alignment is constant.
2255 	       */
2256 	      unsigned int target_align =
2257 		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2258 	      npeel = ((mis & (target_align - 1))
2259 		       / vect_get_scalar_dr_size (dr0_info));
2260             }
2261 
2262 	  /* For interleaved data access every iteration accesses all the
2263 	     members of the group, therefore we divide the number of iterations
2264 	     by the group size.  */
2265 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2266 	    npeel /= DR_GROUP_SIZE (stmt_info);
2267 
2268           if (dump_enabled_p ())
2269             dump_printf_loc (MSG_NOTE, vect_location,
2270                              "Try peeling by %d\n", npeel);
2271         }
2272 
2273       /* Ensure that all datarefs can be vectorized after the peel.  */
2274       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2275 	do_peeling = false;
2276 
2277       /* Check if all datarefs are supportable and log.  */
2278       if (do_peeling
2279 	  && npeel == 0
2280 	  && known_alignment_for_access_p (dr0_info,
2281 					   STMT_VINFO_VECTYPE (stmt_info)))
2282 	return opt_result::success ();
2283 
2284       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2285       if (do_peeling)
2286         {
2287           unsigned max_allowed_peel
2288 	    = param_vect_max_peeling_for_alignment;
2289 	  if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2290 	    max_allowed_peel = 0;
2291           if (max_allowed_peel != (unsigned)-1)
2292             {
2293               unsigned max_peel = npeel;
2294               if (max_peel == 0)
2295                 {
2296 		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2297 		  unsigned HOST_WIDE_INT target_align_c;
2298 		  if (target_align.is_constant (&target_align_c))
2299 		    max_peel =
2300 		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2301 		  else
2302 		    {
2303 		      do_peeling = false;
2304 		      if (dump_enabled_p ())
2305 			dump_printf_loc (MSG_NOTE, vect_location,
2306 			  "Disable peeling, max peels set and vector"
2307 			  " alignment unknown\n");
2308 		    }
2309                 }
2310               if (max_peel > max_allowed_peel)
2311                 {
2312                   do_peeling = false;
2313                   if (dump_enabled_p ())
2314                     dump_printf_loc (MSG_NOTE, vect_location,
2315                         "Disable peeling, max peels reached: %d\n", max_peel);
2316                 }
2317             }
2318         }
2319 
2320       /* Cost model #2 - if peeling may result in a remaining loop not
2321 	 iterating enough to be vectorized then do not peel.  Since this
2322 	 is a cost heuristic rather than a correctness decision, use the
2323 	 most likely runtime value for variable vectorization factors.  */
2324       if (do_peeling
2325 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2326 	{
2327 	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2328 	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2329 	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2330 	      < assumed_vf + max_peel)
2331 	    do_peeling = false;
2332 	}
2333 
2334       if (do_peeling)
2335         {
2336           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2337              If the misalignment of DR_i is identical to that of dr0 then set
2338              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2339              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2340              by the peeling factor times the element size of DR_i (MOD the
2341              vectorization factor times the size).  Otherwise, the
2342              misalignment of DR_i must be set to unknown.  */
2343 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2344 	    if (dr != dr0_info->dr)
2345 	      {
2346 		dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2347 		if (!vect_relevant_for_alignment_p (dr_info))
2348 		  continue;
2349 
2350 		vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351 	      }
2352 
2353           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2354           if (npeel)
2355             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2356           else
2357 	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2358 	  SET_DR_MISALIGNMENT (dr0_info,
2359 			       vect_dr_misalign_for_aligned_access (dr0_info));
2360 	  if (dump_enabled_p ())
2361             {
2362               dump_printf_loc (MSG_NOTE, vect_location,
2363                                "Alignment of access forced using peeling.\n");
2364               dump_printf_loc (MSG_NOTE, vect_location,
2365                                "Peeling for alignment will be applied.\n");
2366             }
2367 
2368 	  /* The inside-loop cost will be accounted for in vectorizable_load
2369 	     and vectorizable_store correctly with adjusted alignments.
2370 	     Drop the body_cost_vec on the floor here.  */
2371 	  return opt_result::success ();
2372         }
2373     }
2374 
2375   /* (2) Versioning to force alignment.  */
2376 
2377   /* Try versioning if:
2378      1) optimize loop for speed and the cost-model is not cheap
2379      2) there is at least one unsupported misaligned data ref with an unknown
2380         misalignment, and
2381      3) all misaligned data refs with a known misalignment are supported, and
2382      4) the number of runtime alignment checks is within reason.  */
2383 
2384   do_versioning
2385     = (optimize_loop_nest_for_speed_p (loop)
2386        && !loop->inner /* FORNOW */
2387        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2388 
2389   if (do_versioning)
2390     {
2391       FOR_EACH_VEC_ELT (datarefs, i, dr)
2392         {
2393 	  dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2394 	  if (!vect_relevant_for_alignment_p (dr_info))
2395 	    continue;
2396 
2397 	  stmt_vec_info stmt_info = dr_info->stmt;
2398 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2399 	    {
2400 	      do_versioning = false;
2401 	      break;
2402 	    }
2403 
2404 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2405 	  bool negative = tree_int_cst_compare (DR_STEP (dr),
2406 						size_zero_node) < 0;
2407 	  poly_int64 off = 0;
2408 	  if (negative)
2409 	    off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2410 		   * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2411 	  int misalignment;
2412 	  if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2413 	    continue;
2414 
2415 	  enum dr_alignment_support supportable_dr_alignment
2416 	    = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2417 					     misalignment);
2418 	  if (supportable_dr_alignment == dr_unaligned_unsupported)
2419             {
2420 	      if (misalignment != DR_MISALIGNMENT_UNKNOWN
2421 		  || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2422 		      >= (unsigned) param_vect_max_version_for_alignment_checks))
2423                 {
2424                   do_versioning = false;
2425                   break;
2426                 }
2427 
2428 	      /* At present we don't support versioning for alignment
2429 		 with variable VF, since there's no guarantee that the
2430 		 VF is a power of two.  We could relax this if we added
2431 		 a way of enforcing a power-of-two size.  */
2432 	      unsigned HOST_WIDE_INT size;
2433 	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2434 		{
2435 		  do_versioning = false;
2436 		  break;
2437 		}
2438 
2439 	      /* Forcing alignment in the first iteration is no good if
2440 		 we don't keep it across iterations.  For now, just disable
2441 		 versioning in this case.
2442 		 ?? We could actually unroll the loop to achieve the required
2443 		 overall step alignment, and forcing the alignment could be
2444 		 done by doing some iterations of the non-vectorized loop.  */
2445 	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2446 			       * DR_STEP_ALIGNMENT (dr),
2447 			       DR_TARGET_ALIGNMENT (dr_info)))
2448 		{
2449 		  do_versioning = false;
2450 		  break;
2451 		}
2452 
2453               /* The rightmost bits of an aligned address must be zeros.
2454                  Construct the mask needed for this test.  For example,
2455                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2456                  mask must be 15 = 0xf. */
2457 	      int mask = size - 1;
2458 
2459 	      /* FORNOW: use the same mask to test all potentially unaligned
2460 		 references in the loop.  */
2461 	      if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2462 		  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2463 		{
2464 		  do_versioning = false;
2465 		  break;
2466 		}
2467 
2468               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2469 	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2470             }
2471         }
2472 
2473       /* Versioning requires at least one misaligned data reference.  */
2474       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2475         do_versioning = false;
2476       else if (!do_versioning)
2477         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478     }
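
  /* A sketch of the runtime guard (assumed shape; the actual check is
     emitted later by the loop versioning machinery, not here): for a vector
     mode of 16 bytes the recorded mask is 15 and the test conceptually
     becomes

	 if ((((uintptr_t) p | (uintptr_t) q) & 15) == 0)
	   ... loop version with all accesses assumed aligned ...
	 else
	   ... unversioned loop ...

     where p and q stand for the addresses of the data references whose
     statements were pushed onto LOOP_VINFO_MAY_MISALIGN_STMTS above.  */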
2479 
2480   if (do_versioning)
2481     {
2482       const vec<stmt_vec_info> &may_misalign_stmts
2483 	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2484       stmt_vec_info stmt_info;
2485 
2486       /* It can now be assumed that the data references in the statements
2487          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2488          of the loop being vectorized.  */
2489       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2490         {
2491 	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2492 	  SET_DR_MISALIGNMENT (dr_info,
2493 			       vect_dr_misalign_for_aligned_access (dr_info));
2494 	  if (dump_enabled_p ())
2495             dump_printf_loc (MSG_NOTE, vect_location,
2496                              "Alignment of access forced using versioning.\n");
2497         }
2498 
2499       if (dump_enabled_p ())
2500         dump_printf_loc (MSG_NOTE, vect_location,
2501                          "Versioning for alignment will be applied.\n");
2502 
2503       /* Peeling and versioning can't be done together at this time.  */
2504       gcc_assert (! (do_peeling && do_versioning));
2505 
2506       return opt_result::success ();
2507     }
2508 
2509   /* This point is reached if neither peeling nor versioning is being done.  */
2510   gcc_assert (! (do_peeling || do_versioning));
2511 
2512   return opt_result::success ();
2513 }
2514 
2515 
2516 /* Function vect_analyze_data_refs_alignment
2517 
2518    Analyze the alignment of the data-references in the loop.
2519    Return FALSE if a data reference is found that cannot be vectorized.  */
2520 
2521 opt_result
2522 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2523 {
2524   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2525 
2526   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2527   struct data_reference *dr;
2528   unsigned int i;
2529 
2530   vect_record_base_alignments (loop_vinfo);
2531   FOR_EACH_VEC_ELT (datarefs, i, dr)
2532     {
2533       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2534       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2535 	{
2536 	  if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2537 	      && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2538 	    continue;
2539 	  vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2540 					   STMT_VINFO_VECTYPE (dr_info->stmt));
2541 	}
2542     }
2543 
2544   return opt_result::success ();
2545 }
2546 
2547 
2548 /* Analyze alignment of DRs of stmts in NODE.  */
2549 
2550 static bool
2551 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2552 {
2553   /* Alignment is maintained in the first element of the group.  */
2554   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2555   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2556   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2557   tree vectype = SLP_TREE_VECTYPE (node);
2558   poly_uint64 vector_alignment
2559     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2560 		 BITS_PER_UNIT);
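  /* For example (illustration only): a preferred vector alignment of 128
     bits yields a 16-byte VECTOR_ALIGNMENT here.  */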
2561   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2562     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2563   /* Re-analyze alignment when we're facing a vectorization with a bigger
2564      alignment requirement.  */
2565   else if (known_lt (dr_info->target_alignment, vector_alignment))
2566     {
2567       poly_uint64 old_target_alignment = dr_info->target_alignment;
2568       int old_misalignment = dr_info->misalignment;
2569       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2570       /* But keep knowledge about a smaller alignment.  */
2571       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2572 	  && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2573 	{
2574 	  dr_info->target_alignment = old_target_alignment;
2575 	  dr_info->misalignment = old_misalignment;
2576 	}
2577     }
2578   /* If we ever face unordered target alignments, the first one analyzed wins
2579      and the other will be treated as unknown by dr_misalignment.  */
2580   return true;
2581 }
2582 
2583 /* Function vect_slp_analyze_instance_alignment
2584 
2585    Analyze the alignment of the data-references in the SLP instance.
2586    Return FALSE if a data reference is found that cannot be vectorized.  */
2587 
2588 bool
2589 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2590 						slp_instance instance)
2591 {
2592   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2593 
2594   slp_tree node;
2595   unsigned i;
2596   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2597     if (! vect_slp_analyze_node_alignment (vinfo, node))
2598       return false;
2599 
2600   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2601       && ! vect_slp_analyze_node_alignment
2602 	     (vinfo, SLP_INSTANCE_TREE (instance)))
2603     return false;
2604 
2605   return true;
2606 }
2607 
2608 
2609 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2610    accesses of legal size, step, etc.  Detect gaps, single element
2611    interleaving, and other special cases. Set grouped access info.
2612    Collect groups of strided stores for further use in SLP analysis.
2613    Worker for vect_analyze_group_access.  */
2614 
2615 static bool
2616 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2617 {
2618   data_reference *dr = dr_info->dr;
2619   tree step = DR_STEP (dr);
2620   tree scalar_type = TREE_TYPE (DR_REF (dr));
2621   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2622   stmt_vec_info stmt_info = dr_info->stmt;
2623   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2624   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2625   HOST_WIDE_INT dr_step = -1;
2626   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2627   bool slp_impossible = false;
2628 
2629   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2630      size of the interleaving group (including gaps).  */
2631   if (tree_fits_shwi_p (step))
2632     {
2633       dr_step = tree_to_shwi (step);
2634       /* Check that STEP is a multiple of type size.  Otherwise there is
2635          a non-element-sized gap at the end of the group which we
2636 	 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2637 	 ???  As we can handle non-constant step fine here we should
2638 	 simply remove uses of DR_GROUP_GAP between the last and first
2639 	 element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2640 	 simply not include that gap.  */
2641       if ((dr_step % type_size) != 0)
2642 	{
2643 	  if (dump_enabled_p ())
2644 	    dump_printf_loc (MSG_NOTE, vect_location,
2645 			     "Step %T is not a multiple of the element size"
2646 			     " for %T\n",
2647 			     step, DR_REF (dr));
2648 	  return false;
2649 	}
2650       groupsize = absu_hwi (dr_step) / type_size;
2651     }
2652   else
2653     groupsize = 0;
2654 
2655   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2656   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2657     {
2658       /* Check if this DR is a part of interleaving, and is a single
2659 	 element of the group that is accessed in the loop.  */
2660 
2661       /* Gaps are supported only for loads. STEP must be a multiple of the type
2662 	 size.  */
2663       if (DR_IS_READ (dr)
2664 	  && (dr_step % type_size) == 0
2665 	  && groupsize > 0
2666 	  /* This could be UINT_MAX but as we are generating code in a very
2667 	     inefficient way we have to cap earlier.
2668 	     See PR91403 for example.  */
2669 	  && groupsize <= 4096)
2670 	{
2671 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2672 	  DR_GROUP_SIZE (stmt_info) = groupsize;
2673 	  DR_GROUP_GAP (stmt_info) = groupsize - 1;
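	  /* For example (illustration only): a load of a[4*i] with a 4-byte
	     element type has step 16, so groupsize = 4 and this records a
	     single-element group with DR_GROUP_GAP = 3.  */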
2674 	  if (dump_enabled_p ())
2675 	    dump_printf_loc (MSG_NOTE, vect_location,
2676 			     "Detected single element interleaving %T"
2677 			     " step %T\n",
2678 			     DR_REF (dr), step);
2679 
2680 	  return true;
2681 	}
2682 
2683       if (dump_enabled_p ())
2684 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2685 			 "not consecutive access %G", stmt_info->stmt);
2686 
2687       if (bb_vinfo)
2688 	{
2689 	  /* Mark the statement as unvectorizable.  */
2690 	  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2691 	  return true;
2692 	}
2693 
2694       if (dump_enabled_p ())
2695 	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2696       STMT_VINFO_STRIDED_P (stmt_info) = true;
2697       return true;
2698     }
2699 
2700   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2701     {
2702       /* First stmt in the interleaving chain. Check the chain.  */
2703       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2704       struct data_reference *data_ref = dr;
2705       unsigned int count = 1;
2706       tree prev_init = DR_INIT (data_ref);
2707       HOST_WIDE_INT diff, gaps = 0;
2708 
2709       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2710       while (next)
2711         {
2712           /* We never have the same DR multiple times.  */
2713           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2714 				DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2715 
2716 	  data_ref = STMT_VINFO_DATA_REF (next);
2717 
2718 	  /* All group members have the same STEP by construction.  */
2719 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2720 
2721           /* Check that the distance between two accesses is equal to the type
2722              size. Otherwise, we have gaps.  */
2723           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2724 		  - TREE_INT_CST_LOW (prev_init)) / type_size;
2725 	  if (diff < 1 || diff > UINT_MAX)
2726 	    {
2727 	      /* For artificial testcases with array accesses with large
2728 		 constant indices we can run into overflow issues which
2729 		 can end up fooling the groupsize constraint below so
2730 		 check the individual gaps (which are represented as
2731 		 unsigned int) as well.  */
2732 	      if (dump_enabled_p ())
2733 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2734 				 "interleaved access with gap larger "
2735 				 "than representable\n");
2736 	      return false;
2737 	    }
2738 	  if (diff != 1)
2739 	    {
2740 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2741 	      slp_impossible = true;
2742 	      if (DR_IS_WRITE (data_ref))
2743 		{
2744                   if (dump_enabled_p ())
2745                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2746                                      "interleaved store with gaps\n");
2747 		  return false;
2748 		}
2749 
2750               gaps += diff - 1;
2751 	    }
2752 
2753 	  last_accessed_element += diff;
2754 
2755           /* Store the gap from the previous member of the group. If there is no
2756              gap in the access, DR_GROUP_GAP is always 1.  */
2757 	  DR_GROUP_GAP (next) = diff;
2758 
2759 	  prev_init = DR_INIT (data_ref);
2760 	  next = DR_GROUP_NEXT_ELEMENT (next);
2761 	  /* Count the number of data-refs in the chain.  */
2762 	  count++;
2763         }
2764 
2765       if (groupsize == 0)
2766         groupsize = count + gaps;
2767 
2768       /* This could be UINT_MAX but as we are generating code in a very
2769          inefficient way we have to cap earlier.  See PR78699 for example.  */
2770       if (groupsize > 4096)
2771 	{
2772 	  if (dump_enabled_p ())
2773 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2774 			     "group is too large\n");
2775 	  return false;
2776 	}
2777 
2778       /* Check that the size of the interleaving is equal to count for stores,
2779          i.e., that there are no gaps.  */
2780       if (groupsize != count
2781 	  && !DR_IS_READ (dr))
2782         {
2783 	  groupsize = count;
2784 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
2785 	}
2786 
2787       /* If there is a gap after the last load in the group it is the
2788 	 difference between the groupsize and the last accessed
2789 	 element.
2790 	 When there is no gap, this difference should be 0.  */
2791       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
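      /* For example (illustration only): two loads a[4*i] and a[4*i+1] with
	 a 4-byte element type and step 16 give groupsize = 4 and
	 last_accessed_element = 2, so the trailing DR_GROUP_GAP recorded
	 above is 4 - 2 = 2.  */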
2792 
2793       DR_GROUP_SIZE (stmt_info) = groupsize;
2794       if (dump_enabled_p ())
2795 	{
2796 	  dump_printf_loc (MSG_NOTE, vect_location,
2797 			   "Detected interleaving ");
2798 	  if (DR_IS_READ (dr))
2799 	    dump_printf (MSG_NOTE, "load ");
2800 	  else if (STMT_VINFO_STRIDED_P (stmt_info))
2801 	    dump_printf (MSG_NOTE, "strided store ");
2802 	  else
2803 	    dump_printf (MSG_NOTE, "store ");
2804 	  dump_printf (MSG_NOTE, "of size %u\n",
2805 		       (unsigned)groupsize);
2806 	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2807 	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2808 	  while (next)
2809 	    {
2810 	      if (DR_GROUP_GAP (next) != 1)
2811 		dump_printf_loc (MSG_NOTE, vect_location,
2812 				 "\t<gap of %d elements>\n",
2813 				 DR_GROUP_GAP (next) - 1);
2814 	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2815 	      next = DR_GROUP_NEXT_ELEMENT (next);
2816 	    }
2817 	  if (DR_GROUP_GAP (stmt_info) != 0)
2818 	    dump_printf_loc (MSG_NOTE, vect_location,
2819 			     "\t<gap of %d elements>\n",
2820 			     DR_GROUP_GAP (stmt_info));
2821 	}
2822 
2823       /* SLP: create an SLP data structure for every interleaving group of
2824 	 stores for further analysis in vect_analyse_slp.  */
2825       if (DR_IS_WRITE (dr) && !slp_impossible)
2826 	{
2827 	  if (loop_vinfo)
2828 	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2829 	  if (bb_vinfo)
2830 	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2831 	}
2832     }
2833 
2834   return true;
2835 }
2836 
2837 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2838    accesses of legal size, step, etc.  Detect gaps, single element
2839    interleaving, and other special cases. Set grouped access info.
2840    Collect groups of strided stores for further use in SLP analysis.  */
2841 
2842 static bool
2843 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2844 {
2845   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2846     {
2847       /* Dissolve the group if present.  */
2848       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2849       while (stmt_info)
2850 	{
2851 	  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2852 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2853 	  DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2854 	  stmt_info = next;
2855 	}
2856       return false;
2857     }
2858   return true;
2859 }
2860 
2861 /* Analyze the access pattern of the data-reference DR_INFO.
2862    In case of non-consecutive accesses call vect_analyze_group_access() to
2863    analyze groups of accesses.  */
2864 
2865 static bool
2866 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2867 {
2868   data_reference *dr = dr_info->dr;
2869   tree step = DR_STEP (dr);
2870   tree scalar_type = TREE_TYPE (DR_REF (dr));
2871   stmt_vec_info stmt_info = dr_info->stmt;
2872   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2873   class loop *loop = NULL;
2874 
2875   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2876     return true;
2877 
2878   if (loop_vinfo)
2879     loop = LOOP_VINFO_LOOP (loop_vinfo);
2880 
2881   if (loop_vinfo && !step)
2882     {
2883       if (dump_enabled_p ())
2884 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885 	                 "bad data-ref access in loop\n");
2886       return false;
2887     }
2888 
2889   /* Allow loads with zero step in inner-loop vectorization.  */
2890   if (loop_vinfo && integer_zerop (step))
2891     {
2892       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2893       if (!nested_in_vect_loop_p (loop, stmt_info))
2894 	return DR_IS_READ (dr);
2895       /* Allow references with zero step for outer loops marked
2896 	 with pragma omp simd only - it guarantees absence of
2897 	 loop-carried dependencies between inner loop iterations.  */
2898       if (loop->safelen < 2)
2899 	{
2900 	  if (dump_enabled_p ())
2901 	    dump_printf_loc (MSG_NOTE, vect_location,
2902 			     "zero step in inner loop of nest\n");
2903 	  return false;
2904 	}
2905     }
2906 
2907   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2908     {
2909       /* Interleaved accesses are not yet supported within outer-loop
2910         vectorization for references in the inner-loop.  */
2911       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2912 
2913       /* For the rest of the analysis we use the outer-loop step.  */
2914       step = STMT_VINFO_DR_STEP (stmt_info);
2915       if (integer_zerop (step))
2916 	{
2917 	  if (dump_enabled_p ())
2918 	    dump_printf_loc (MSG_NOTE, vect_location,
2919 	                     "zero step in outer loop.\n");
2920 	  return DR_IS_READ (dr);
2921 	}
2922     }
2923 
2924   /* Consecutive?  */
2925   if (TREE_CODE (step) == INTEGER_CST)
2926     {
2927       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2928       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2929 	  || (dr_step < 0
2930 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2931 	{
2932 	  /* Mark that it is not interleaving.  */
2933 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2934 	  return true;
2935 	}
2936     }
2937 
2938   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2939     {
2940       if (dump_enabled_p ())
2941 	dump_printf_loc (MSG_NOTE, vect_location,
2942 	                 "grouped access in outer loop.\n");
2943       return false;
2944     }
2945 
2946 
2947   /* Assume this is a DR handled by the non-constant strided load case.  */
2948   if (TREE_CODE (step) != INTEGER_CST)
2949     return (STMT_VINFO_STRIDED_P (stmt_info)
2950 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2951 		|| vect_analyze_group_access (vinfo, dr_info)));
2952 
2953   /* Not a consecutive access - check if it is part of an interleaving group.  */
2954   return vect_analyze_group_access (vinfo, dr_info);
2955 }
2956 
2957 /* qsort comparator for data-references DRA and DRB, ordering them so that
2958    references which may belong to the same group become adjacent.  */
2959 
2960 static int
2961 dr_group_sort_cmp (const void *dra_, const void *drb_)
2962 {
2963   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2964   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2965   data_reference_p dra = dra_info->dr;
2966   data_reference_p drb = drb_info->dr;
2967   int cmp;
2968 
2969   /* Stabilize sort.  */
2970   if (dra == drb)
2971     return 0;
2972 
2973   /* DRs with different group IDs never belong to the same group.  */
2974   if (dra_info->group != drb_info->group)
2975     return dra_info->group < drb_info->group ? -1 : 1;
2976 
2977   /* Ordering of DRs according to base.  */
2978   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2979 			       DR_BASE_ADDRESS (drb));
2980   if (cmp != 0)
2981     return cmp;
2982 
2983   /* And according to DR_OFFSET.  */
2984   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2985   if (cmp != 0)
2986     return cmp;
2987 
2988   /* Put reads before writes.  */
2989   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2990     return DR_IS_READ (dra) ? -1 : 1;
2991 
2992   /* Then sort after access size.  */
2993   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2994 			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2995   if (cmp != 0)
2996     return cmp;
2997 
2998   /* And after step.  */
2999   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3000   if (cmp != 0)
3001     return cmp;
3002 
3003   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3004   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3005   if (cmp == 0)
3006     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3007   return cmp;
3008 }
3009 
3010 /* If OP is the result of a conversion, return the unconverted value,
3011    otherwise return null.  */
3012 
3013 static tree
3014 strip_conversion (tree op)
3015 {
3016   if (TREE_CODE (op) != SSA_NAME)
3017     return NULL_TREE;
3018   gimple *stmt = SSA_NAME_DEF_STMT (op);
3019   if (!is_gimple_assign (stmt)
3020       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3021     return NULL_TREE;
3022   return gimple_assign_rhs1 (stmt);
3023 }
3024 
3025 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3026    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3027    be grouped in SLP mode.  */
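/* Example (illustrative only, names are made up): two internal calls
     .MASK_LOAD (ptr_a, align, mask) and .MASK_LOAD (ptr_b, align, mask)
   that use the same mask (possibly modulo conversions of the mask) can be
   grouped, whereas a masked together with an unmasked access, or two
   masked accesses with unrelated masks, cannot.  */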
3028 
3029 static bool
3030 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3031 		   bool allow_slp_p)
3032 {
3033   if (gimple_assign_single_p (stmt1_info->stmt))
3034     return gimple_assign_single_p (stmt2_info->stmt);
3035 
3036   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3037   if (call1 && gimple_call_internal_p (call1))
3038     {
3039       /* Check for two masked loads or two masked stores.  */
3040       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3041       if (!call2 || !gimple_call_internal_p (call2))
3042 	return false;
3043       internal_fn ifn = gimple_call_internal_fn (call1);
3044       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3045 	return false;
3046       if (ifn != gimple_call_internal_fn (call2))
3047 	return false;
3048 
3049       /* Check that the masks are the same.  Cope with casts of masks,
3050 	 like those created by build_mask_conversion.  */
3051       tree mask1 = gimple_call_arg (call1, 2);
3052       tree mask2 = gimple_call_arg (call2, 2);
3053       if (!operand_equal_p (mask1, mask2, 0)
3054           && (ifn == IFN_MASK_STORE || !allow_slp_p))
3055 	{
3056 	  mask1 = strip_conversion (mask1);
3057 	  if (!mask1)
3058 	    return false;
3059 	  mask2 = strip_conversion (mask2);
3060 	  if (!mask2)
3061 	    return false;
3062 	  if (!operand_equal_p (mask1, mask2, 0))
3063 	    return false;
3064 	}
3065       return true;
3066     }
3067 
3068   return false;
3069 }
3070 
3071 /* Function vect_analyze_data_ref_accesses.
3072 
3073    Analyze the access pattern of all the data references in the loop.
3074 
3075    FORNOW: the only access pattern that is considered vectorizable is a
3076 	   simple step 1 (consecutive) access.
3077 
3078    FORNOW: handle only arrays and pointer accesses.  */
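/* Illustrative example (assumed, not a testcase): in

     for (i = 0; i < n; i++)
       q[i] = p[2*i] + p[2*i+1];

   the two loads from P share base, offset, step and size, their DR_INITs
   differ by exactly one element, and they are linked into a single
   interleaving group of size 2 whose DR_GROUP_FIRST_ELEMENT is the load
   of p[2*i].  */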
3079 
3080 opt_result
3081 vect_analyze_data_ref_accesses (vec_info *vinfo,
3082 				vec<int> *dataref_groups)
3083 {
3084   unsigned int i;
3085   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3086 
3087   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3088 
3089   if (datarefs.is_empty ())
3090     return opt_result::success ();
3091 
3092   /* Sort the array of datarefs to make building the interleaving chains
3093      linear.  Don't modify the original vector's order; it is needed for
3094      determining what dependencies are reversed.  */
3095   vec<dr_vec_info *> datarefs_copy;
3096   datarefs_copy.create (datarefs.length ());
3097   for (unsigned i = 0; i < datarefs.length (); i++)
3098     {
3099       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3100       /* If the caller computed DR grouping use that, otherwise group by
3101 	 basic blocks.  */
3102       if (dataref_groups)
3103 	dr_info->group = (*dataref_groups)[i];
3104       else
3105 	dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3106       datarefs_copy.quick_push (dr_info);
3107     }
3108   datarefs_copy.qsort (dr_group_sort_cmp);
3109   hash_set<stmt_vec_info> to_fixup;
3110 
3111   /* Build the interleaving chains.  */
3112   for (i = 0; i < datarefs_copy.length () - 1;)
3113     {
3114       dr_vec_info *dr_info_a = datarefs_copy[i];
3115       data_reference_p dra = dr_info_a->dr;
3116       int dra_group_id = dr_info_a->group;
3117       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3118       stmt_vec_info lastinfo = NULL;
3119       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3120 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3121 	{
3122 	  ++i;
3123 	  continue;
3124 	}
3125       for (i = i + 1; i < datarefs_copy.length (); ++i)
3126 	{
3127 	  dr_vec_info *dr_info_b = datarefs_copy[i];
3128 	  data_reference_p drb = dr_info_b->dr;
3129 	  int drb_group_id = dr_info_b->group;
3130 	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3131 	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3132 	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3133 	    break;
3134 
3135 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
3136 	     accesses, same accesses) can lead to a group being artificially
3137 	     split here as we don't just skip over those.  If it really
3138 	     matters we can push those to a worklist and re-iterate
3139 	     over them.  Then we can just skip ahead to the next DR here.  */
3140 
3141 	  /* DRs in a different DR group should not be put into the same
3142 	     interleaving group.  */
3143 	  if (dra_group_id != drb_group_id)
3144 	    break;
3145 
3146 	  /* Check that the data-refs have same first location (except init)
3147 	     and they are both either store or load (not load and store,
3148 	     not masked loads or stores).  */
3149 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
3150 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3151 					DR_BASE_ADDRESS (drb)) != 0
3152 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3153 	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3154 	    break;
3155 
3156 	  /* Check that the data-refs have the same constant size.  */
3157 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3158 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3159 	  if (!tree_fits_uhwi_p (sza)
3160 	      || !tree_fits_uhwi_p (szb)
3161 	      || !tree_int_cst_equal (sza, szb))
3162 	    break;
3163 
3164 	  /* Check that the data-refs have the same step.  */
3165 	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3166 	    break;
3167 
3168 	  /* Check the types are compatible.
3169 	     ???  We don't distinguish this during sorting.  */
3170 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3171 				   TREE_TYPE (DR_REF (drb))))
3172 	    break;
3173 
3174 	  /* Check that the DR_INITs are compile-time constants.  */
3175 	  if (!tree_fits_shwi_p (DR_INIT (dra))
3176 	      || !tree_fits_shwi_p (DR_INIT (drb)))
3177 	    break;
3178 
3179 	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
3180 	     just hold extra information.  */
3181 	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3182 	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3183 	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3184 	    break;
3185 
3186 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3187 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3188 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3189 	  HOST_WIDE_INT init_prev
3190 	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3191 	  gcc_assert (init_a <= init_b
3192 		      && init_a <= init_prev
3193 		      && init_prev <= init_b);
3194 
3195 	  /* Do not place the same access in the interleaving chain twice.  */
3196 	  if (init_b == init_prev)
3197 	    {
3198 	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3199 			  < gimple_uid (DR_STMT (drb)));
3200 	      /* Simply link in duplicates and fix up the chain below.  */
3201 	    }
3202 	  else
3203 	    {
3204 	      /* If init_b == init_a + the size of the type * k, we have an
3205 		 interleaving, and DRA is accessed before DRB.  */
3206 	      unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3207 	      if (type_size_a == 0
3208 		  || (((unsigned HOST_WIDE_INT)init_b - init_a)
3209 		      % type_size_a != 0))
3210 		break;
3211 
3212 	      /* If we have a store, the accesses are adjacent.  This splits
3213 		 groups into chunks we support (we don't support vectorization
3214 		 of stores with gaps).  */
3215 	      if (!DR_IS_READ (dra)
3216 		  && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3217 		      != type_size_a))
3218 		break;
3219 
3220 	      /* If the step (when constant and nonzero) is smaller than the
3221 		 difference between the data-refs' inits, this splits groups
3222 		 into suitable sizes.  */
3223 	      if (tree_fits_shwi_p (DR_STEP (dra)))
3224 		{
3225 		  unsigned HOST_WIDE_INT step
3226 		    = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3227 		  if (step != 0
3228 		      && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3229 		    break;
3230 		}
3231 	    }
3232 
3233 	  if (dump_enabled_p ())
3234 	    dump_printf_loc (MSG_NOTE, vect_location,
3235 			     DR_IS_READ (dra)
3236 			     ? "Detected interleaving load %T and %T\n"
3237 			     : "Detected interleaving store %T and %T\n",
3238 			     DR_REF (dra), DR_REF (drb));
3239 
3240 	  /* Link the found element into the group list.  */
3241 	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3242 	    {
3243 	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3244 	      lastinfo = stmtinfo_a;
3245 	    }
3246 	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3247 	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3248 	  lastinfo = stmtinfo_b;
3249 
3250 	  STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3251 	    = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3252 
3253 	  if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3254 	    dump_printf_loc (MSG_NOTE, vect_location,
3255 			     "Load suitable for SLP vectorization only.\n");
3256 
3257 	  if (init_b == init_prev
3258 	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3259 	      && dump_enabled_p ())
3260 	    dump_printf_loc (MSG_NOTE, vect_location,
3261 			     "Queuing group with duplicate access for fixup\n");
3262 	}
3263     }
3264 
3265   /* Fix up groups with duplicate entries by splitting them.  */
3266   while (1)
3267     {
3268       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3269       if (!(it != to_fixup.end ()))
3270 	break;
3271       stmt_vec_info grp = *it;
3272       to_fixup.remove (grp);
3273 
3274       /* Find the earliest duplicate group member.  */
3275       unsigned first_duplicate = -1u;
3276       stmt_vec_info next, g = grp;
3277       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3278 	{
3279 	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3280 				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3281 	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3282 	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3283 	  g = next;
3284 	}
3285       if (first_duplicate == -1U)
3286 	continue;
3287 
3288       /* Then move all stmts after the first duplicate to a new group.
3289          Note this is a heuristic but one with the property that *it
3290 	 is fixed up completely.  */
3291       g = grp;
3292       stmt_vec_info newgroup = NULL, ng = grp;
3293       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3294 	{
3295 	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3296 	    {
3297 	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3298 	      if (!newgroup)
3299 		newgroup = next;
3300 	      else
3301 		DR_GROUP_NEXT_ELEMENT (ng) = next;
3302 	      ng = next;
3303 	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3304 	    }
3305 	  else
3306 	    g = DR_GROUP_NEXT_ELEMENT (g);
3307 	}
3308       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3309 
3310       /* Fixup the new group which still may contain duplicates.  */
3311       to_fixup.add (newgroup);
3312     }
3313 
3314   dr_vec_info *dr_info;
3315   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3316     {
3317       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3318 	  && !vect_analyze_data_ref_access (vinfo, dr_info))
3319 	{
3320 	  if (dump_enabled_p ())
3321 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322 			     "not vectorized: complicated access pattern.\n");
3323 
3324 	  if (is_a <bb_vec_info> (vinfo))
3325 	    {
3326 	      /* Mark the statement as not vectorizable.  */
3327 	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3328 	      continue;
3329 	    }
3330 	  else
3331 	    {
3332 	      datarefs_copy.release ();
3333 	      return opt_result::failure_at (dr_info->stmt->stmt,
3334 					     "not vectorized:"
3335 					     " complicated access pattern.\n");
3336 	    }
3337 	}
3338     }
3339 
3340   datarefs_copy.release ();
3341   return opt_result::success ();
3342 }
3343 
3344 /* Function vect_vfa_segment_size.
3345 
3346    Input:
3347      DR_INFO: The data reference.
3348      LENGTH_FACTOR: segment length to consider.
3349 
3350    Return a value suitable for the dr_with_seg_len::seg_len field.
3351    This is the "distance travelled" by the pointer from the first
3352    iteration in the segment to the last.  Note that it does not include
3353    the size of the access; in effect it only describes the first byte.  */
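/* Worked example (illustrative values): for a data-ref with DR_STEP 4 and
   LENGTH_FACTOR 8 the segment length is 4 * (8 - 1) = 28 bytes; the size
   of the access itself is tracked separately in
   dr_with_seg_len::access_size.  */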
3354 
3355 static tree
3356 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3357 {
3358   length_factor = size_binop (MINUS_EXPR,
3359 			      fold_convert (sizetype, length_factor),
3360 			      size_one_node);
3361   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3362 		     length_factor);
3363 }
3364 
3365 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3366    gives the worst-case number of bytes covered by the segment.  */
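/* Illustrative example (assumed values): for the first statement of a group
   of 4-byte elements with DR_GROUP_SIZE 4 and DR_GROUP_GAP 1 the access
   size is 4 * (4 - 1) = 12 bytes; for dr_explicit_realign_optimized a
   further (vector size - element size) bytes are added, since a full
   vector's worth may be accessed.  */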
3367 
3368 static unsigned HOST_WIDE_INT
3369 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3370 {
3371   stmt_vec_info stmt_vinfo = dr_info->stmt;
3372   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3373   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3374   unsigned HOST_WIDE_INT access_size = ref_size;
3375   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3376     {
3377       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3378       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3379     }
3380   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3381   int misalignment;
3382   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3383       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3384       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3385 	  == dr_explicit_realign_optimized))
3386     {
3387       /* We might access a full vector's worth.  */
3388       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3389     }
3390   return access_size;
3391 }
3392 
3393 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3394    describes.  */
3395 
3396 static unsigned int
3397 vect_vfa_align (dr_vec_info *dr_info)
3398 {
3399   return dr_alignment (dr_info->dr);
3400 }
3401 
3402 /* Function vect_compile_time_alias.
3403 
3404    Given data references A and B with equal base and offset, see whether
3405    the alias relation can be decided at compilation time.  Return 1 if
3406    it can and the references alias, 0 if it can and the references do
3407    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3408    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3409    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
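/* Worked example (illustrative values, positive steps): if A starts at
   DR_INIT 0 with segment length 12 and access size 4 it covers [0, 16),
   and if B starts at DR_INIT 32 with the same lengths it covers [32, 48);
   the ranges cannot overlap, so the result is 0.  */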
3410 
3411 static int
3412 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3413 			 tree segment_length_a, tree segment_length_b,
3414 			 unsigned HOST_WIDE_INT access_size_a,
3415 			 unsigned HOST_WIDE_INT access_size_b)
3416 {
3417   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3418   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3419   poly_uint64 const_length_a;
3420   poly_uint64 const_length_b;
3421 
3422   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3423      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3424      [a, a+12) */
3425   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3426     {
3427       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3428       offset_a -= const_length_a;
3429     }
3430   else
3431     const_length_a = tree_to_poly_uint64 (segment_length_a);
3432   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3433     {
3434       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3435       offset_b -= const_length_b;
3436     }
3437   else
3438     const_length_b = tree_to_poly_uint64 (segment_length_b);
3439 
3440   const_length_a += access_size_a;
3441   const_length_b += access_size_b;
3442 
3443   if (ranges_known_overlap_p (offset_a, const_length_a,
3444 			      offset_b, const_length_b))
3445     return 1;
3446 
3447   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3448 			       offset_b, const_length_b))
3449     return 0;
3450 
3451   return -1;
3452 }
3453 
3454 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3455    in DDR is >= VF.  */
3456 
3457 static bool
3458 dependence_distance_ge_vf (data_dependence_relation *ddr,
3459 			   unsigned int loop_depth, poly_uint64 vf)
3460 {
3461   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3462       || DDR_NUM_DIST_VECTS (ddr) == 0)
3463     return false;
3464 
3465   /* If the dependence is exact, we should have limited the VF instead.  */
3466   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3467 
3468   unsigned int i;
3469   lambda_vector dist_v;
3470   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3471     {
3472       HOST_WIDE_INT dist = dist_v[loop_depth];
3473       if (dist != 0
3474 	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3475 	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3476 	return false;
3477     }
3478 
3479   if (dump_enabled_p ())
3480     dump_printf_loc (MSG_NOTE, vect_location,
3481 		     "dependence distance between %T and %T is >= VF\n",
3482 		     DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3483 
3484   return true;
3485 }
3486 
3487 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3488 
3489 static void
3490 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3491 {
3492   dump_printf (dump_kind, "%s (%T) >= ",
3493 	       lower_bound.unsigned_p ? "unsigned" : "abs",
3494 	       lower_bound.expr);
3495   dump_dec (dump_kind, lower_bound.min_value);
3496 }
3497 
3498 /* Record that the vectorized loop requires the vec_lower_bound described
3499    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3500 
3501 static void
3502 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3503 			poly_uint64 min_value)
3504 {
3505   vec<vec_lower_bound> &lower_bounds
3506     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3507   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3508     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3509       {
3510 	unsigned_p &= lower_bounds[i].unsigned_p;
3511 	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3512 	if (lower_bounds[i].unsigned_p != unsigned_p
3513 	    || maybe_lt (lower_bounds[i].min_value, min_value))
3514 	  {
3515 	    lower_bounds[i].unsigned_p = unsigned_p;
3516 	    lower_bounds[i].min_value = min_value;
3517 	    if (dump_enabled_p ())
3518 	      {
3519 		dump_printf_loc (MSG_NOTE, vect_location,
3520 				 "updating run-time check to ");
3521 		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3522 		dump_printf (MSG_NOTE, "\n");
3523 	      }
3524 	  }
3525 	return;
3526       }
3527 
3528   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3529   if (dump_enabled_p ())
3530     {
3531       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3532       dump_lower_bound (MSG_NOTE, lower_bound);
3533       dump_printf (MSG_NOTE, "\n");
3534     }
3535   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3536 }
3537 
3538 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3539    will span fewer than GAP bytes.  */
3540 
3541 static bool
3542 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3543 		  poly_int64 gap)
3544 {
3545   stmt_vec_info stmt_info = dr_info->stmt;
3546   HOST_WIDE_INT count
3547     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3548   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3549     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3550   return (estimated_poly_value (gap)
3551 	  <= count * vect_get_scalar_dr_size (dr_info));
3552 }
3553 
3554 /* Return true if we know that there is no alias between DR_INFO_A and
3555    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3556    When returning true, set *LOWER_BOUND_OUT to this N.  */
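/* Illustrative example (assumed values): for accesses a[i] and a[i + 4]
   with 4-byte elements the DR_INITs are 0 and 16, so *LOWER_BOUND_OUT is
   set to 16 + 4 = 20 and the accesses cannot alias once abs (DR_STEP) is
   known to be at least 20.  */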
3557 
3558 static bool
3559 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3560 				poly_uint64 *lower_bound_out)
3561 {
3562   /* Check that there is a constant gap of known sign between DR_A
3563      and DR_B.  */
3564   data_reference *dr_a = dr_info_a->dr;
3565   data_reference *dr_b = dr_info_b->dr;
3566   poly_int64 init_a, init_b;
3567   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3568       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3569       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3570       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3571       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3572       || !ordered_p (init_a, init_b))
3573     return false;
3574 
3575   /* Sort DR_A and DR_B by the address they access.  */
3576   if (maybe_lt (init_b, init_a))
3577     {
3578       std::swap (init_a, init_b);
3579       std::swap (dr_info_a, dr_info_b);
3580       std::swap (dr_a, dr_b);
3581     }
3582 
3583   /* If the two accesses could be dependent within a scalar iteration,
3584      make sure that we'd retain their order.  */
3585   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3586       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3587     return false;
3588 
3589   /* There is no alias if abs (DR_STEP) is greater than or equal to
3590      the bytes spanned by the combination of the two accesses.  */
3591   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3592   return true;
3593 }
3594 
3595 /* Function vect_prune_runtime_alias_test_list.
3596 
3597    Prune a list of ddrs to be tested at run-time by versioning for alias.
3598    Merge several alias checks into one if possible.
3599    Return FALSE if the resulting list of ddrs is longer than allowed by
3600    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3601 
3602 opt_result
3603 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3604 {
3605   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3606   hash_set <tree_pair_hash> compared_objects;
3607 
3608   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3609   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3610     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3611   const vec<vec_object_pair> &check_unequal_addrs
3612     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3613   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3614   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3615 
3616   ddr_p ddr;
3617   unsigned int i;
3618   tree length_factor;
3619 
3620   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3621 
3622   /* Step values are irrelevant for aliasing if the number of vector
3623      iterations is equal to the number of scalar iterations (which can
3624      happen for fully-SLP loops).  */
3625   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3626 
3627   if (!vf_one_p)
3628     {
3629       /* Convert the checks for nonzero steps into bound tests.  */
3630       tree value;
3631       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3632 	vect_check_lower_bound (loop_vinfo, value, true, 1);
3633     }
3634 
3635   if (may_alias_ddrs.is_empty ())
3636     return opt_result::success ();
3637 
3638   comp_alias_ddrs.create (may_alias_ddrs.length ());
3639 
3640   unsigned int loop_depth
3641     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3642 			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3643 
3644   /* First, we collect all data ref pairs for aliasing checks.  */
3645   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3646     {
3647       poly_uint64 lower_bound;
3648       tree segment_length_a, segment_length_b;
3649       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3650       unsigned int align_a, align_b;
3651 
3652       /* Ignore the alias if the VF we chose ended up being no greater
3653 	 than the dependence distance.  */
3654       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3655 	continue;
3656 
3657       if (DDR_OBJECT_A (ddr))
3658 	{
3659 	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3660 	  if (!compared_objects.add (new_pair))
3661 	    {
3662 	      if (dump_enabled_p ())
3663 		dump_printf_loc (MSG_NOTE, vect_location,
3664 				 "checking that %T and %T"
3665 				 " have different addresses\n",
3666 				 new_pair.first, new_pair.second);
3667 	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3668 	    }
3669 	  continue;
3670 	}
3671 
3672       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3673       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3674 
3675       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3676       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3677 
3678       bool preserves_scalar_order_p
3679 	= vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3680       bool ignore_step_p
3681 	  = (vf_one_p
3682 	     && (preserves_scalar_order_p
3683 		 || operand_equal_p (DR_STEP (dr_info_a->dr),
3684 				     DR_STEP (dr_info_b->dr))));
3685 
3686       /* Skip the pair if inter-iteration dependencies are irrelevant
3687 	 and intra-iteration dependencies are guaranteed to be honored.  */
3688       if (ignore_step_p
3689 	  && (preserves_scalar_order_p
3690 	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3691 						 &lower_bound)))
3692 	{
3693 	  if (dump_enabled_p ())
3694 	    dump_printf_loc (MSG_NOTE, vect_location,
3695 			     "no need for alias check between "
3696 			     "%T and %T when VF is 1\n",
3697 			     DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3698 	  continue;
3699 	}
3700 
3701       /* See whether we can handle the alias using a bounds check on
3702 	 the step, and whether that's likely to be the best approach.
3703 	 (It might not be, for example, if the minimum step is much larger
3704 	 than the number of bytes handled by one vector iteration.)  */
3705       if (!ignore_step_p
3706 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3707 	  && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3708 					     &lower_bound)
3709 	  && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3710 	      || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3711 	{
3712 	  bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3713 	  if (dump_enabled_p ())
3714 	    {
3715 	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3716 			       "%T and %T when the step %T is outside ",
3717 			       DR_REF (dr_info_a->dr),
3718 			       DR_REF (dr_info_b->dr),
3719 			       DR_STEP (dr_info_a->dr));
3720 	      if (unsigned_p)
3721 		dump_printf (MSG_NOTE, "[0");
3722 	      else
3723 		{
3724 		  dump_printf (MSG_NOTE, "(");
3725 		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3726 		}
3727 	      dump_printf (MSG_NOTE, ", ");
3728 	      dump_dec (MSG_NOTE, lower_bound);
3729 	      dump_printf (MSG_NOTE, ")\n");
3730 	    }
3731 	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3732 				  unsigned_p, lower_bound);
3733 	  continue;
3734 	}
3735 
3736       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3737       if (dr_group_first_a)
3738 	{
3739 	  stmt_info_a = dr_group_first_a;
3740 	  dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741 	}
3742 
3743       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3744       if (dr_group_first_b)
3745 	{
3746 	  stmt_info_b = dr_group_first_b;
3747 	  dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748 	}
3749 
3750       if (ignore_step_p)
3751 	{
3752 	  segment_length_a = size_zero_node;
3753 	  segment_length_b = size_zero_node;
3754 	}
3755       else
3756 	{
3757 	  if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3758 				DR_STEP (dr_info_b->dr), 0))
3759 	    length_factor = scalar_loop_iters;
3760 	  else
3761 	    length_factor = size_int (vect_factor);
3762 	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3763 	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3764 	}
3765       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3766       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3767       align_a = vect_vfa_align (dr_info_a);
3768       align_b = vect_vfa_align (dr_info_b);
3769 
3770       /* See whether the alias is known at compilation time.  */
3771       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3772 			   DR_BASE_ADDRESS (dr_info_b->dr), 0)
3773 	  && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3774 			      DR_OFFSET (dr_info_b->dr), 0)
3775 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3776 	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3777 	  && poly_int_tree_p (segment_length_a)
3778 	  && poly_int_tree_p (segment_length_b))
3779 	{
3780 	  int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3781 					     segment_length_a,
3782 					     segment_length_b,
3783 					     access_size_a,
3784 					     access_size_b);
3785 	  if (res >= 0 && dump_enabled_p ())
3786 	    {
3787 	      dump_printf_loc (MSG_NOTE, vect_location,
3788 			       "can tell at compile time that %T and %T",
3789 			       DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3790 	      if (res == 0)
3791 		dump_printf (MSG_NOTE, " do not alias\n");
3792 	      else
3793 		dump_printf (MSG_NOTE, " alias\n");
3794 	    }
3795 
3796 	  if (res == 0)
3797 	    continue;
3798 
3799 	  if (res == 1)
3800 	    return opt_result::failure_at (stmt_info_b->stmt,
3801 					   "not vectorized:"
3802 					   " compilation time alias: %G%G",
3803 					   stmt_info_a->stmt,
3804 					   stmt_info_b->stmt);
3805 	}
3806 
3807       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3808 			    access_size_a, align_a);
3809       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3810 			    access_size_b, align_b);
3811       /* Canonicalize the order to be the one that's needed for accurate
3812 	 RAW, WAR and WAW flags, in cases where the data references are
3813 	 well-ordered.  The order doesn't really matter otherwise,
3814 	 but we might as well be consistent.  */
3815       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3816 	std::swap (dr_a, dr_b);
3817 
3818       dr_with_seg_len_pair_t dr_with_seg_len_pair
3819 	(dr_a, dr_b, (preserves_scalar_order_p
3820 		      ? dr_with_seg_len_pair_t::WELL_ORDERED
3821 		      : dr_with_seg_len_pair_t::REORDERED));
3822 
3823       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824     }
3825 
3826   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3827 
3828   unsigned int count = (comp_alias_ddrs.length ()
3829 			+ check_unequal_addrs.length ());
3830 
3831   if (count
3832       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3833 	  == VECT_COST_MODEL_VERY_CHEAP))
3834     return opt_result::failure_at
3835       (vect_location, "would need a runtime alias check\n");
3836 
3837   if (dump_enabled_p ())
3838     dump_printf_loc (MSG_NOTE, vect_location,
3839 		     "improved number of alias checks from %d to %d\n",
3840 		     may_alias_ddrs.length (), count);
3841   unsigned limit = param_vect_max_version_for_alias_checks;
3842   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3843     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3844   if (count > limit)
3845     return opt_result::failure_at
3846       (vect_location,
3847        "number of versioning for alias run-time tests exceeds %d "
3848        "(--param vect-max-version-for-alias-checks)\n", limit);
3849 
3850   return opt_result::success ();
3851 }
3852 
3853 /* Check whether we can use an internal function for a gather load
3854    or scatter store.  READ_P is true for loads and false for stores.
3855    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3856    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3857    is the type of the offset that is being applied to the invariant
3858    base address.  SCALE is the amount by which the offset should
3859    be multiplied *after* it has been converted to address width.
3860 
3861    Return true if the function is supported, storing the function id in
3862    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
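/* The retry loop below widens the offset type when the target rejects the
   requested combination: the precision is doubled and the query repeated
   (e.g., purely illustratively, a 16-bit offset may be retried as 32-bit
   and then 64-bit) until the precision reaches both POINTER_SIZE and the
   element width, at which point the function fails.  */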
3863 
3864 bool
3865 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3866 			  tree vectype, tree memory_type, tree offset_type,
3867 			  int scale, internal_fn *ifn_out,
3868 			  tree *offset_vectype_out)
3869 {
3870   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3871   unsigned int element_bits = vector_element_bits (vectype);
3872   if (element_bits != memory_bits)
3873     /* For now the vector elements must be the same width as the
3874        memory elements.  */
3875     return false;
3876 
3877   /* Work out which function we need.  */
3878   internal_fn ifn, alt_ifn;
3879   if (read_p)
3880     {
3881       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3882       alt_ifn = IFN_MASK_GATHER_LOAD;
3883     }
3884   else
3885     {
3886       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3887       alt_ifn = IFN_MASK_SCATTER_STORE;
3888     }
3889 
3890   for (;;)
3891     {
3892       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3893       if (!offset_vectype)
3894 	return false;
3895 
3896       /* Test whether the target supports this combination.  */
3897       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3898 						  offset_vectype, scale))
3899 	{
3900 	  *ifn_out = ifn;
3901 	  *offset_vectype_out = offset_vectype;
3902 	  return true;
3903 	}
3904       else if (!masked_p
3905 	       && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3906 							  memory_type,
3907 							  offset_vectype,
3908 							  scale))
3909 	{
3910 	  *ifn_out = alt_ifn;
3911 	  *offset_vectype_out = offset_vectype;
3912 	  return true;
3913 	}
3914 
3915       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3916 	  && TYPE_PRECISION (offset_type) >= element_bits)
3917 	return false;
3918 
3919       offset_type = build_nonstandard_integer_type
3920 	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3921     }
3922 }
3923 
3924 /* STMT_INFO is a call to an internal gather load or scatter store function.
3925    Describe the operation in INFO.  */
3926 
3927 static void
3928 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3929 				   gather_scatter_info *info)
3930 {
3931   gcall *call = as_a <gcall *> (stmt_info->stmt);
3932   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3933   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3934 
3935   info->ifn = gimple_call_internal_fn (call);
3936   info->decl = NULL_TREE;
3937   info->base = gimple_call_arg (call, 0);
3938   info->offset = gimple_call_arg (call, 1);
3939   info->offset_dt = vect_unknown_def_type;
3940   info->offset_vectype = NULL_TREE;
3941   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3942   info->element_type = TREE_TYPE (vectype);
3943   info->memory_type = TREE_TYPE (DR_REF (dr));
3944 }
3945 
3946 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3947    gather load or scatter store.  Describe the operation in *INFO if so.  */
3948 
3949 bool
3950 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3951 			   gather_scatter_info *info)
3952 {
3953   HOST_WIDE_INT scale = 1;
3954   poly_int64 pbitpos, pbitsize;
3955   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3956   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3957   tree offtype = NULL_TREE;
3958   tree decl = NULL_TREE, base, off;
3959   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3960   tree memory_type = TREE_TYPE (DR_REF (dr));
3961   machine_mode pmode;
3962   int punsignedp, reversep, pvolatilep = 0;
3963   internal_fn ifn;
3964   tree offset_vectype;
3965   bool masked_p = false;
3966 
3967   /* See whether this is already a call to a gather/scatter internal function.
3968      If not, see whether it's a masked load or store.  */
3969   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3970   if (call && gimple_call_internal_p (call))
3971     {
3972       ifn = gimple_call_internal_fn (call);
3973       if (internal_gather_scatter_fn_p (ifn))
3974 	{
3975 	  vect_describe_gather_scatter_call (stmt_info, info);
3976 	  return true;
3977 	}
3978       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3979     }
3980 
3981   /* True if we should aim to use internal functions rather than
3982      built-in functions.  */
3983   bool use_ifn_p = (DR_IS_READ (dr)
3984 		    ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3985 		    : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3986 
3987   base = DR_REF (dr);
3988   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3989      see if we can use the def stmt of the address.  */
3990   if (masked_p
3991       && TREE_CODE (base) == MEM_REF
3992       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3993       && integer_zerop (TREE_OPERAND (base, 1))
3994       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3995     {
3996       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3997       if (is_gimple_assign (def_stmt)
3998 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3999 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4000     }
4001 
4002   /* The gather and scatter builtins need addresses of the form
4003      loop_invariant + vector * {1, 2, 4, 8}
4004      or
4005      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4006      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4007      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4008      multiplications and additions in it.  To get a vector, we need
4009      a single SSA_NAME that will be defined in the loop and will
4010      contain everything that is not loop invariant and that can be
4011      vectorized.  The following code attempts to find such a preexisting
4012      SSA_NAME OFF and put the loop invariants into a tree BASE
4013      that can be gimplified before the loop.  */
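  /* As a purely illustrative example, for a gather like a[b[i]] with 4-byte
     elements OFF typically ends up being the SSA_NAME holding (a possibly
     sign-extended copy of) b[i], SCALE becomes 4 via the MULT_EXPR handling
     below, and BASE collects &a plus any constant byte offset.  */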
4014   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4015 			      &punsignedp, &reversep, &pvolatilep);
4016   if (reversep)
4017     return false;
4018 
4019   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4020 
4021   if (TREE_CODE (base) == MEM_REF)
4022     {
4023       if (!integer_zerop (TREE_OPERAND (base, 1)))
4024 	{
4025 	  if (off == NULL_TREE)
4026 	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4027 	  else
4028 	    off = size_binop (PLUS_EXPR, off,
4029 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
4030 	}
4031       base = TREE_OPERAND (base, 0);
4032     }
4033   else
4034     base = build_fold_addr_expr (base);
4035 
4036   if (off == NULL_TREE)
4037     off = size_zero_node;
4038 
4039   /* If base is not loop invariant, then either off is 0 and we start with
4040      just the constant offset in the loop invariant BASE, continuing with
4041      base as OFF, or we give up.
4042      We could handle that case by gimplifying the addition of base + off
4043      into some SSA_NAME and using that as off, but for now punt.  */
4044   if (!expr_invariant_in_loop_p (loop, base))
4045     {
4046       if (!integer_zerop (off))
4047 	return false;
4048       off = base;
4049       base = size_int (pbytepos);
4050     }
4051   /* Otherwise put base + constant offset into the loop invariant BASE
4052      and continue with OFF.  */
4053   else
4054     {
4055       base = fold_convert (sizetype, base);
4056       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4057     }
4058 
4059   /* OFF at this point may be either an SSA_NAME or some tree expression
4060      from get_inner_reference.  Try to peel off loop invariants from it
4061      into BASE as long as possible.  */
4062   STRIP_NOPS (off);
4063   while (offtype == NULL_TREE)
4064     {
4065       enum tree_code code;
4066       tree op0, op1, add = NULL_TREE;
4067 
4068       if (TREE_CODE (off) == SSA_NAME)
4069 	{
4070 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4071 
4072 	  if (expr_invariant_in_loop_p (loop, off))
4073 	    return false;
4074 
4075 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4076 	    break;
4077 
4078 	  op0 = gimple_assign_rhs1 (def_stmt);
4079 	  code = gimple_assign_rhs_code (def_stmt);
4080 	  op1 = gimple_assign_rhs2 (def_stmt);
4081 	}
4082       else
4083 	{
4084 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4085 	    return false;
4086 	  code = TREE_CODE (off);
4087 	  extract_ops_from_tree (off, &code, &op0, &op1);
4088 	}
4089       switch (code)
4090 	{
4091 	case POINTER_PLUS_EXPR:
4092 	case PLUS_EXPR:
4093 	  if (expr_invariant_in_loop_p (loop, op0))
4094 	    {
4095 	      add = op0;
4096 	      off = op1;
4097 	    do_add:
4098 	      add = fold_convert (sizetype, add);
4099 	      if (scale != 1)
4100 		add = size_binop (MULT_EXPR, add, size_int (scale));
4101 	      base = size_binop (PLUS_EXPR, base, add);
4102 	      continue;
4103 	    }
4104 	  if (expr_invariant_in_loop_p (loop, op1))
4105 	    {
4106 	      add = op1;
4107 	      off = op0;
4108 	      goto do_add;
4109 	    }
4110 	  break;
4111 	case MINUS_EXPR:
4112 	  if (expr_invariant_in_loop_p (loop, op1))
4113 	    {
4114 	      add = fold_convert (sizetype, op1);
4115 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
4116 	      off = op0;
4117 	      goto do_add;
4118 	    }
4119 	  break;
4120 	case MULT_EXPR:
4121 	  if (scale == 1 && tree_fits_shwi_p (op1))
4122 	    {
4123 	      int new_scale = tree_to_shwi (op1);
4124 	      /* Only treat this as a scaling operation if the target
4125 		 supports it for at least some offset type.  */
4126 	      if (use_ifn_p
4127 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4128 						masked_p, vectype, memory_type,
4129 						signed_char_type_node,
4130 						new_scale, &ifn,
4131 						&offset_vectype)
4132 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4133 						masked_p, vectype, memory_type,
4134 						unsigned_char_type_node,
4135 						new_scale, &ifn,
4136 						&offset_vectype))
4137 		break;
4138 	      scale = new_scale;
4139 	      off = op0;
4140 	      continue;
4141 	    }
4142 	  break;
4143 	case SSA_NAME:
4144 	  off = op0;
4145 	  continue;
4146 	CASE_CONVERT:
4147 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
4148 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4149 	    break;
4150 
4151 	  /* Don't include the conversion if the target is happy with
4152 	     the current offset type.  */
4153 	  if (use_ifn_p
4154 	      && !POINTER_TYPE_P (TREE_TYPE (off))
4155 	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4156 					   masked_p, vectype, memory_type,
4157 					   TREE_TYPE (off), scale, &ifn,
4158 					   &offset_vectype))
4159 	    break;
4160 
4161 	  if (TYPE_PRECISION (TREE_TYPE (op0))
4162 	      == TYPE_PRECISION (TREE_TYPE (off)))
4163 	    {
4164 	      off = op0;
4165 	      continue;
4166 	    }
4167 
4168 	  /* Include the conversion if it is widening and we're using
4169 	     the IFN path, or the target can handle the converted-from
4170 	     offset, or the current size is not already the same as the
4171 	     data vector element size.  */
4172 	  if ((TYPE_PRECISION (TREE_TYPE (op0))
4173 	       < TYPE_PRECISION (TREE_TYPE (off)))
4174 	      && (use_ifn_p
4175 		  || (DR_IS_READ (dr)
4176 		      ? (targetm.vectorize.builtin_gather
4177 			 && targetm.vectorize.builtin_gather (vectype,
4178 							      TREE_TYPE (op0),
4179 							      scale))
4180 		      : (targetm.vectorize.builtin_scatter
4181 			 && targetm.vectorize.builtin_scatter (vectype,
4182 							       TREE_TYPE (op0),
4183 							       scale)))
4184 		  || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4185 				       TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4186 	    {
4187 	      off = op0;
4188 	      offtype = TREE_TYPE (off);
4189 	      STRIP_NOPS (off);
4190 	      continue;
4191 	    }
4192 	  break;
4193 	default:
4194 	  break;
4195 	}
4196       break;
4197     }
4198 
4199   /* If at the end OFF still isn't an SSA_NAME or isn't
4200      defined in the loop, punt.  */
4201   if (TREE_CODE (off) != SSA_NAME
4202       || expr_invariant_in_loop_p (loop, off))
4203     return false;
4204 
4205   if (offtype == NULL_TREE)
4206     offtype = TREE_TYPE (off);
4207 
4208   if (use_ifn_p)
4209     {
4210       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4211 				     vectype, memory_type, offtype, scale,
4212 				     &ifn, &offset_vectype))
4213 	ifn = IFN_LAST;
4214       decl = NULL_TREE;
4215     }
4216   else
4217     {
4218       if (DR_IS_READ (dr))
4219 	{
4220 	  if (targetm.vectorize.builtin_gather)
4221 	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4222 	}
4223       else
4224 	{
4225 	  if (targetm.vectorize.builtin_scatter)
4226 	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4227 	}
4228       ifn = IFN_LAST;
4229       /* The offset vector type will be read from DECL when needed.  */
4230       offset_vectype = NULL_TREE;
4231     }
4232 
4233   info->ifn = ifn;
4234   info->decl = decl;
4235   info->base = base;
4236   info->offset = off;
4237   info->offset_dt = vect_unknown_def_type;
4238   info->offset_vectype = offset_vectype;
4239   info->scale = scale;
4240   info->element_type = TREE_TYPE (vectype);
4241   info->memory_type = memory_type;
4242   return true;
4243 }
4244 
4245 /* Find the data references in STMT, analyze them with respect to LOOP and
4246    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4247    be handled.  */
4248 
4249 opt_result
4250 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4251 			       vec<data_reference_p> *datarefs,
4252 			       vec<int> *dataref_groups, int group_id)
4253 {
4254   /* We can ignore clobbers for dataref analysis - they are removed during
4255      loop vectorization and BB vectorization checks dependences with a
4256      stmt walk.  */
4257   if (gimple_clobber_p (stmt))
4258     return opt_result::success ();
4259 
4260   if (gimple_has_volatile_ops (stmt))
4261     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4262 				   stmt);
4263 
4264   if (stmt_can_throw_internal (cfun, stmt))
4265     return opt_result::failure_at (stmt,
4266 				   "not vectorized:"
4267 				   " statement can throw an exception: %G",
4268 				   stmt);
4269 
4270   auto_vec<data_reference_p, 2> refs;
4271   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4272   if (!res)
4273     return res;
4274 
4275   if (refs.is_empty ())
4276     return opt_result::success ();
4277 
4278   if (refs.length () > 1)
4279     {
4280       while (!refs.is_empty ())
4281 	free_data_ref (refs.pop ());
4282       return opt_result::failure_at (stmt,
4283 				     "not vectorized: more than one "
4284 				     "data ref in stmt: %G", stmt);
4285     }
4286 
4287   data_reference_p dr = refs.pop ();
4288   if (gcall *call = dyn_cast <gcall *> (stmt))
4289     if (!gimple_call_internal_p (call)
4290 	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4291 	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4292       {
4293 	free_data_ref (dr);
4294 	return opt_result::failure_at (stmt,
4295 				       "not vectorized: dr in a call %G", stmt);
4296       }
4297 
4298   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4299       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4300     {
4301       free_data_ref (dr);
4302       return opt_result::failure_at (stmt,
4303 				     "not vectorized:"
4304 				     " statement is bitfield access %G", stmt);
4305     }
4306 
4307   if (DR_BASE_ADDRESS (dr)
4308       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4309     {
4310       free_data_ref (dr);
4311       return opt_result::failure_at (stmt,
4312 				     "not vectorized:"
4313 				     " base addr of dr is a constant\n");
4314     }
4315 
4316   /* Check whether this may be a SIMD lane access and adjust the
4317      DR to make it easier for us to handle it.  */
4318   if (loop
4319       && loop->simduid
4320       && (!DR_BASE_ADDRESS (dr)
4321 	  || !DR_OFFSET (dr)
4322 	  || !DR_INIT (dr)
4323 	  || !DR_STEP (dr)))
4324     {
4325       struct data_reference *newdr
4326 	= create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4327 			   DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4328       if (DR_BASE_ADDRESS (newdr)
4329 	  && DR_OFFSET (newdr)
4330 	  && DR_INIT (newdr)
4331 	  && DR_STEP (newdr)
4332 	  && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4333 	  && integer_zerop (DR_STEP (newdr)))
4334 	{
4335 	  tree base_address = DR_BASE_ADDRESS (newdr);
4336 	  tree off = DR_OFFSET (newdr);
4337 	  tree step = ssize_int (1);
4338 	  if (integer_zerop (off)
4339 	      && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4340 	    {
4341 	      off = TREE_OPERAND (base_address, 1);
4342 	      base_address = TREE_OPERAND (base_address, 0);
4343 	    }
4344 	  STRIP_NOPS (off);
4345 	  if (TREE_CODE (off) == MULT_EXPR
4346 	      && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4347 	    {
4348 	      step = TREE_OPERAND (off, 1);
4349 	      off = TREE_OPERAND (off, 0);
4350 	      STRIP_NOPS (off);
4351 	    }
4352 	  if (CONVERT_EXPR_P (off)
4353 	      && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4354 		  < TYPE_PRECISION (TREE_TYPE (off))))
4355 	    off = TREE_OPERAND (off, 0);
4356 	  if (TREE_CODE (off) == SSA_NAME)
4357 	    {
4358 	      gimple *def = SSA_NAME_DEF_STMT (off);
4359 	      /* Look through widening conversion.  */
4360 	      if (is_gimple_assign (def)
4361 		  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4362 		{
4363 		  tree rhs1 = gimple_assign_rhs1 (def);
4364 		  if (TREE_CODE (rhs1) == SSA_NAME
4365 		      && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4366 		      && (TYPE_PRECISION (TREE_TYPE (off))
4367 			  > TYPE_PRECISION (TREE_TYPE (rhs1))))
4368 		    def = SSA_NAME_DEF_STMT (rhs1);
4369 		}
4370 	      if (is_gimple_call (def)
4371 		  && gimple_call_internal_p (def)
4372 		  && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4373 		{
4374 		  tree arg = gimple_call_arg (def, 0);
4375 		  tree reft = TREE_TYPE (DR_REF (newdr));
4376 		  gcc_assert (TREE_CODE (arg) == SSA_NAME);
4377 		  arg = SSA_NAME_VAR (arg);
4378 		  if (arg == loop->simduid
4379 		      /* For now.  */
4380 		      && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4381 		    {
4382 		      DR_BASE_ADDRESS (newdr) = base_address;
4383 		      DR_OFFSET (newdr) = ssize_int (0);
4384 		      DR_STEP (newdr) = step;
4385 		      DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4386 		      DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4387 		      /* Mark as simd-lane access.  */
4388 		      tree arg2 = gimple_call_arg (def, 1);
4389 		      newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4390 		      free_data_ref (dr);
4391 		      datarefs->safe_push (newdr);
4392 		      if (dataref_groups)
4393 			dataref_groups->safe_push (group_id);
4394 		      return opt_result::success ();
4395 		    }
4396 		}
4397 	    }
4398 	}
4399       free_data_ref (newdr);
4400     }
4401 
4402   datarefs->safe_push (dr);
4403   if (dataref_groups)
4404     dataref_groups->safe_push (group_id);
4405   return opt_result::success ();
4406 }
4407 
4408 /* Function vect_analyze_data_refs.
4409 
4410   Find all the data references in the loop or basic block.
4411 
4412    The general structure of the analysis of data refs in the vectorizer is as
4413    follows:
4414    1- vect_analyze_data_refs(loop/bb): find and analyze all the data
4415       references in the loop/bb, check that their analysis succeeded,
4416       and determine their vector types.
4417    2- vect_analyze_data_ref_dependences(): apply dependence testing (DDRs).
4418    3- vect_analyze_data_refs_alignment(): check the alignment of the refs.
4419    4- vect_analyze_data_ref_accesses(): check the access pattern (step) of the refs.
4420 
4421 */
4422 
4423 opt_result
4424 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4425 {
4426   class loop *loop = NULL;
4427   unsigned int i;
4428   struct data_reference *dr;
4429   tree scalar_type;
4430 
4431   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4432 
4433   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4434     loop = LOOP_VINFO_LOOP (loop_vinfo);
4435 
4436   /* Go through the data-refs, check that the analysis succeeded.  Update
4437      pointer from stmt_vec_info struct to DR and vectype.  */
4438 
4439   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4440   FOR_EACH_VEC_ELT (datarefs, i, dr)
4441     {
4442       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4443       poly_uint64 vf;
4444 
4445       gcc_assert (DR_REF (dr));
4446       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4447       gcc_assert (!stmt_info->dr_aux.dr);
4448       stmt_info->dr_aux.dr = dr;
4449       stmt_info->dr_aux.stmt = stmt_info;
4450 
4451       /* Check that analysis of the data-ref succeeded.  */
4452       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4453 	  || !DR_STEP (dr))
4454         {
4455 	  bool maybe_gather
4456 	    = DR_IS_READ (dr)
4457 	      && !TREE_THIS_VOLATILE (DR_REF (dr));
4458 	  bool maybe_scatter
4459 	    = DR_IS_WRITE (dr)
4460 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4461 	      && (targetm.vectorize.builtin_scatter != NULL
4462 		  || supports_vec_scatter_store_p ());
4463 
4464 	  /* If target supports vector gather loads or scatter stores,
4465 	     see if they can be used.  */
4466 	  if (is_a <loop_vec_info> (vinfo)
4467 	      && !nested_in_vect_loop_p (loop, stmt_info))
4468 	    {
4469 	      if (maybe_gather || maybe_scatter)
4470 		{
4471 		  if (maybe_gather)
4472 		    gatherscatter = GATHER;
4473 		  else
4474 		    gatherscatter = SCATTER;
4475 		}
4476 	    }
4477 
4478 	  if (gatherscatter == SG_NONE)
4479 	    {
4480 	      if (dump_enabled_p ())
4481 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4482 				 "not vectorized: data ref analysis "
4483 				 "failed %G", stmt_info->stmt);
4484 	      if (is_a <bb_vec_info> (vinfo))
4485 		{
4486 		  /* In BB vectorization the ref can still participate
4487 		     in dependence analysis, we just can't vectorize it.  */
4488 		  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4489 		  continue;
4490 		}
4491 	      return opt_result::failure_at (stmt_info->stmt,
4492 					     "not vectorized:"
4493 					     " data ref analysis failed: %G",
4494 					     stmt_info->stmt);
4495 	    }
4496         }
4497 
4498       /* See if this was detected as SIMD lane access.  */
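      /* vect_find_stmt_data_reference encodes the lane access as
	 -1 - kind in dr->aux, where kind is the second argument of the
	 .GOMP_SIMD_LANE call, hence the four values checked below.  */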
4499       if (dr->aux == (void *)-1
4500 	  || dr->aux == (void *)-2
4501 	  || dr->aux == (void *)-3
4502 	  || dr->aux == (void *)-4)
4503 	{
4504 	  if (nested_in_vect_loop_p (loop, stmt_info))
4505 	    return opt_result::failure_at (stmt_info->stmt,
4506 					   "not vectorized:"
4507 					   " data ref analysis failed: %G",
4508 					   stmt_info->stmt);
4509 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4510 	    = -(uintptr_t) dr->aux;
4511 	}
4512 
4513       tree base = get_base_address (DR_REF (dr));
4514       if (base && VAR_P (base) && DECL_NONALIASED (base))
4515 	{
4516           if (dump_enabled_p ())
4517 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4518 			     "not vectorized: base object not addressable "
4519 			     "for stmt: %G", stmt_info->stmt);
4520           if (is_a <bb_vec_info> (vinfo))
4521 	    {
4522 	      /* In BB vectorization the ref can still participate
4523 	         in dependence analysis, we just can't vectorize it.  */
4524 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4525 	      continue;
4526 	    }
4527 	  return opt_result::failure_at (stmt_info->stmt,
4528 					 "not vectorized: base object not"
4529 					 " addressable for stmt: %G",
4530 					 stmt_info->stmt);
4531 	}
4532 
4533       if (is_a <loop_vec_info> (vinfo)
4534 	  && DR_STEP (dr)
4535 	  && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4536 	{
4537 	  if (nested_in_vect_loop_p (loop, stmt_info))
4538 	    return opt_result::failure_at (stmt_info->stmt,
4539 					   "not vectorized: "
4540 					   "not suitable for strided load %G",
4541 					   stmt_info->stmt);
4542 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4543 	}
4544 
4545       /* Update DR field in stmt_vec_info struct.  */
4546 
4547       /* If the dataref is in an inner-loop of the loop that is considered for
4548 	 vectorization, we also want to analyze the access relative to
4549 	 the outer-loop (DR contains information only relative to the
4550 	 inner-most enclosing loop).  We do that by building a reference to the
4551 	 first location accessed by the inner-loop, and analyze it relative to
4552 	 the outer-loop.  */
4553       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4554 	{
4555 	  /* Build a reference to the first location accessed by the
4556 	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4557 	     this address must be invariant in the inner loop, so we
4558 	     can consider it as being used in the outer loop.  */
4559 	  tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4560 	  tree offset = unshare_expr (DR_OFFSET (dr));
4561 	  tree init = unshare_expr (DR_INIT (dr));
4562 	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4563 					  init, offset);
4564 	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4565 	  tree init_ref = build_fold_indirect_ref (init_addr);
4566 
4567 	  if (dump_enabled_p ())
4568 	    dump_printf_loc (MSG_NOTE, vect_location,
4569 			     "analyze in outer loop: %T\n", init_ref);
4570 
4571 	  opt_result res
4572 	    = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4573 				    init_ref, loop, stmt_info->stmt);
4574 	  if (!res)
4575 	    /* dr_analyze_innermost already explained the failure.  */
4576 	    return res;
4577 
4578           if (dump_enabled_p ())
4579 	    dump_printf_loc (MSG_NOTE, vect_location,
4580 			     "\touter base_address: %T\n"
4581 			     "\touter offset from base address: %T\n"
4582 			     "\touter constant offset from base address: %T\n"
4583 			     "\touter step: %T\n"
4584 			     "\touter base alignment: %d\n\n"
4585 			     "\touter base misalignment: %d\n"
4586 			     "\touter offset alignment: %d\n"
4587 			     "\touter step alignment: %d\n",
4588 			     STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4589 			     STMT_VINFO_DR_OFFSET (stmt_info),
4590 			     STMT_VINFO_DR_INIT (stmt_info),
4591 			     STMT_VINFO_DR_STEP (stmt_info),
4592 			     STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4593 			     STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4594 			     STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4595 			     STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4596 	}
4597 
4598       /* Set vectype for STMT.  */
4599       scalar_type = TREE_TYPE (DR_REF (dr));
4600       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4601       if (!vectype)
4602         {
4603           if (dump_enabled_p ())
4604             {
4605               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4606                                "not vectorized: no vectype for stmt: %G",
4607 			       stmt_info->stmt);
4608               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4609               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4610                                  scalar_type);
4611               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4612             }
4613 
4614           if (is_a <bb_vec_info> (vinfo))
4615 	    {
4616 	      /* No vector type is fine, the ref can still participate
4617 	         in dependence analysis, we just can't vectorize it.  */
4618 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4619 	      continue;
4620 	    }
4621 	  if (fatal)
4622 	    *fatal = false;
4623 	  return opt_result::failure_at (stmt_info->stmt,
4624 					 "not vectorized:"
4625 					 " no vectype for stmt: %G"
4626 					 " scalar_type: %T\n",
4627 					 stmt_info->stmt, scalar_type);
4628         }
4629       else
4630 	{
4631 	  if (dump_enabled_p ())
4632 	    dump_printf_loc (MSG_NOTE, vect_location,
4633 			     "got vectype for stmt: %G%T\n",
4634 			     stmt_info->stmt, vectype);
4635 	}
4636 
4637       /* Adjust the minimal vectorization factor according to the
4638 	 vector type.  */
4639       vf = TYPE_VECTOR_SUBPARTS (vectype);
4640       *min_vf = upper_bound (*min_vf, vf);
4641 
4642       /* Leave the BB vectorizer to pick the vector type later, based on
4643 	 the final dataref group size and SLP node size.  */
4644       if (is_a <loop_vec_info> (vinfo))
4645 	STMT_VINFO_VECTYPE (stmt_info) = vectype;
4646 
4647       if (gatherscatter != SG_NONE)
4648 	{
4649 	  gather_scatter_info gs_info;
4650 	  if (!vect_check_gather_scatter (stmt_info,
4651 					  as_a <loop_vec_info> (vinfo),
4652 					  &gs_info)
4653 	      || !get_vectype_for_scalar_type (vinfo,
4654 					       TREE_TYPE (gs_info.offset)))
4655 	    {
4656 	      if (fatal)
4657 		*fatal = false;
4658 	      return opt_result::failure_at
4659 			(stmt_info->stmt,
4660 			 (gatherscatter == GATHER)
4661 			 ? "not vectorized: not suitable for gather load %G"
4662 			 : "not vectorized: not suitable for scatter store %G",
4663 			 stmt_info->stmt);
4664 	    }
4665 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4666 	}
4667     }
4668 
4669   /* We used to stop processing and prune the list here.  Verify we no
4670      longer need to.  */
4671   gcc_assert (i == datarefs.length ());
4672 
4673   return opt_result::success ();
4674 }
4675 
4676 
4677 /* Function vect_get_new_vect_var.
4678 
4679    Returns a decl for a new variable.  The current naming scheme prepends
4680    a prefix determined by VAR_KIND ("vect", "stmp", "mask" or "vectp")
4681    to NAME if NAME is provided, and otherwise uses the prefix alone as
4682    the name.  */
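/* For example (illustrative; the uniquifying suffix that create_tmp_reg adds
   to the IL name is omitted):

     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")

   creates a temporary register declaration named "vectp_a", while passing
   NULL for NAME yields one named just "vectp".  */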
4683 
4684 tree
4685 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4686 {
4687   const char *prefix;
4688   tree new_vect_var;
4689 
4690   switch (var_kind)
4691   {
4692   case vect_simple_var:
4693     prefix = "vect";
4694     break;
4695   case vect_scalar_var:
4696     prefix = "stmp";
4697     break;
4698   case vect_mask_var:
4699     prefix = "mask";
4700     break;
4701   case vect_pointer_var:
4702     prefix = "vectp";
4703     break;
4704   default:
4705     gcc_unreachable ();
4706   }
4707 
4708   if (name)
4709     {
4710       char* tmp = concat (prefix, "_", name, NULL);
4711       new_vect_var = create_tmp_reg (type, tmp);
4712       free (tmp);
4713     }
4714   else
4715     new_vect_var = create_tmp_reg (type, prefix);
4716 
4717   return new_vect_var;
4718 }
4719 
4720 /* Like vect_get_new_vect_var but return an SSA name.  */
4721 
4722 tree
4723 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4724 {
4725   const char *prefix;
4726   tree new_vect_var;
4727 
4728   switch (var_kind)
4729   {
4730   case vect_simple_var:
4731     prefix = "vect";
4732     break;
4733   case vect_scalar_var:
4734     prefix = "stmp";
4735     break;
4736   case vect_pointer_var:
4737     prefix = "vectp";
4738     break;
4739   default:
4740     gcc_unreachable ();
4741   }
4742 
4743   if (name)
4744     {
4745       char* tmp = concat (prefix, "_", name, NULL);
4746       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4747       free (tmp);
4748     }
4749   else
4750     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4751 
4752   return new_vect_var;
4753 }
4754 
4755 /* Duplicate points-to info on NAME from DR_INFO.  */
4756 
4757 static void
4758 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4759 {
4760   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4761   /* DR_PTR_INFO is for a base SSA name, not including constant or
4762      variable offsets in the ref so its alignment info does not apply.  */
4763   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4764 }
4765 
4766 /* Function vect_create_addr_base_for_vector_ref.
4767 
4768    Create an expression that computes the address of the first memory location
4769    that will be accessed for a data reference.
4770 
4771    Input:
4772    STMT_INFO: The statement containing the data reference.
4773    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4774    OFFSET: Optional.  If supplied, it is added to the initial address.
4775    LOOP:    Specify the loop nest relative to which the address should be computed.
4776             For example, when the dataref is in an inner-loop nested in an
4777 	    outer-loop that is now being vectorized, LOOP can be either the
4778 	    outer-loop, or the inner-loop.  The first memory location accessed
4779 	    by the following dataref ('in' points to short):
4780 
4781 		for (i=0; i<N; i++)
4782 		   for (j=0; j<M; j++)
4783 		     s += in[i+j]
4784 
4785 	    is as follows:
4786 	    if LOOP=i_loop:	&in		(relative to i_loop)
4787 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4788 
4789    Output:
4790    1. Return an SSA_NAME whose value is the address of the memory location of
4791       the first vector of the data reference.
4792    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4793       these statement(s) which define the returned SSA_NAME.
4794 
4795    FORNOW: We are only handling array accesses with step 1.  */
4796 
4797 tree
4798 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4799 				      gimple_seq *new_stmt_list,
4800 				      tree offset)
4801 {
4802   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4803   struct data_reference *dr = dr_info->dr;
4804   const char *base_name;
4805   tree addr_base;
4806   tree dest;
4807   gimple_seq seq = NULL;
4808   tree vect_ptr_type;
4809   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4810   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4811 
4812   tree data_ref_base = unshare_expr (drb->base_address);
4813   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4814   tree init = unshare_expr (drb->init);
4815 
4816   if (loop_vinfo)
4817     base_name = get_name (data_ref_base);
4818   else
4819     {
4820       base_offset = ssize_int (0);
4821       init = ssize_int (0);
4822       base_name = get_name (DR_REF (dr));
4823     }
4824 
4825   /* Create base_offset */
4826   base_offset = size_binop (PLUS_EXPR,
4827 			    fold_convert (sizetype, base_offset),
4828 			    fold_convert (sizetype, init));
4829 
4830   if (offset)
4831     {
4832       offset = fold_convert (sizetype, offset);
4833       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4834 				 base_offset, offset);
4835     }
4836 
4837   /* base + base_offset */
4838   if (loop_vinfo)
4839     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4840   else
4841     addr_base = build1 (ADDR_EXPR,
4842 			build_pointer_type (TREE_TYPE (DR_REF (dr))),
4843 			/* Strip zero offset components since we don't need
4844 			   them and they can confuse late diagnostics if
4845 			   we CSE them wrongly.  See PR106904 for example.  */
4846 			unshare_expr (strip_zero_offset_components
4847 								(DR_REF (dr))));
4848 
4849   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4850   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4851   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4852   gimple_seq_add_seq (new_stmt_list, seq);
4853 
4854   if (DR_PTR_INFO (dr)
4855       && TREE_CODE (addr_base) == SSA_NAME
4856       /* We should only duplicate pointer info to newly created SSA names.  */
4857       && SSA_NAME_VAR (addr_base) == dest)
4858     {
4859       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4860       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4861     }
4862 
4863   if (dump_enabled_p ())
4864     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4865 
4866   return addr_base;
4867 }
4868 
4869 
4870 /* Function vect_create_data_ref_ptr.
4871 
4872    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4873    location accessed in the loop by STMT_INFO, along with the def-use update
4874    chain to appropriately advance the pointer through the loop iterations.
4875    Also set aliasing information for the pointer.  This pointer is used by
4876    the callers to this function to create a memory reference expression for
4877    vector load/store access.
4878 
4879    Input:
4880    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4881          GIMPLE_ASSIGN <name, data-ref> or
4882 	 GIMPLE_ASSIGN <data-ref, name>.
4883    2. AGGR_TYPE: the type of the reference, which should be either a vector
4884         or an array.
4885    3. AT_LOOP: the loop where the vector memref is to be created.
4886    4. OFFSET (optional): a byte offset to be added to the initial address
4887 	accessed by the data-ref in STMT_INFO.
4888    5. BSI: location where the new stmts are to be placed if there is no loop.
4889    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4890         pointing to the initial address.
4891    7. IV_STEP (optional, defaults to NULL): the amount that should be added
4892 	to the IV during each iteration of the loop.  NULL says to move
4893 	by one copy of AGGR_TYPE up or down, depending on the step of the
4894 	data reference.
4895 
4896    Output:
4897    1. Declare a new ptr to vector_type, and have it point to the base of the
4898       data reference (the initial address accessed by the data reference).
4899       For example, for vector of type V8HI, the following code is generated:
4900 
4901       v8hi *ap;
4902       ap = (v8hi *)initial_address;
4903 
4904       if OFFSET is not supplied:
4905          initial_address = &a[init];
4906       if OFFSET is supplied:
4907 	 initial_address = &a[init] + OFFSET;
4910 
4911       Return the initial_address in INITIAL_ADDRESS.
4912 
4913    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4914       update the pointer in each iteration of the loop.
4915 
4916       Return the increment stmt that updates the pointer in PTR_INCR.
4917 
4918    3. Return the pointer.  */
4919 
4920 tree
4921 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4922 			  tree aggr_type, class loop *at_loop, tree offset,
4923 			  tree *initial_address, gimple_stmt_iterator *gsi,
4924 			  gimple **ptr_incr, bool only_init,
4925 			  tree iv_step)
4926 {
4927   const char *base_name;
4928   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4929   class loop *loop = NULL;
4930   bool nested_in_vect_loop = false;
4931   class loop *containing_loop = NULL;
4932   tree aggr_ptr_type;
4933   tree aggr_ptr;
4934   tree new_temp;
4935   gimple_seq new_stmt_list = NULL;
4936   edge pe = NULL;
4937   basic_block new_bb;
4938   tree aggr_ptr_init;
4939   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4940   struct data_reference *dr = dr_info->dr;
4941   tree aptr;
4942   gimple_stmt_iterator incr_gsi;
4943   bool insert_after;
4944   tree indx_before_incr, indx_after_incr;
4945   gimple *incr;
4946   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4947 
4948   gcc_assert (iv_step != NULL_TREE
4949 	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4950 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4951 
4952   if (loop_vinfo)
4953     {
4954       loop = LOOP_VINFO_LOOP (loop_vinfo);
4955       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4956       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4957       pe = loop_preheader_edge (loop);
4958     }
4959   else
4960     {
4961       gcc_assert (bb_vinfo);
4962       only_init = true;
4963       *ptr_incr = NULL;
4964     }
4965 
4966   /* Create an expression for the first address accessed by this load
4967      in LOOP.  */
4968   base_name = get_name (DR_BASE_ADDRESS (dr));
4969 
4970   if (dump_enabled_p ())
4971     {
4972       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4973       dump_printf_loc (MSG_NOTE, vect_location,
4974                        "create %s-pointer variable to type: %T",
4975 		       get_tree_code_name (TREE_CODE (aggr_type)),
4976 		       aggr_type);
4977       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4978         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4979       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4980         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4981       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4982         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4983       else
4984         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4985       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4986     }
4987 
4988   /* (1) Create the new aggregate-pointer variable.
4989      Vector and array types inherit the alias set of their component
4990      type by default so we need to use a ref-all pointer if the data
4991      reference does not conflict with the created aggregated data
4992      reference because it is not addressable.  */
4993   bool need_ref_all = false;
4994   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4995 			      get_alias_set (DR_REF (dr))))
4996     need_ref_all = true;
4997   /* Likewise for any of the data references in the stmt group.  */
4998   else if (DR_GROUP_SIZE (stmt_info) > 1)
4999     {
5000       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5001       do
5002 	{
5003 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5004 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5005 				      get_alias_set (DR_REF (sdr))))
5006 	    {
5007 	      need_ref_all = true;
5008 	      break;
5009 	    }
5010 	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5011 	}
5012       while (sinfo);
5013     }
5014   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5015 					       need_ref_all);
5016   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5017 
5018 
5019   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5020      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5021      def-use update cycles for the pointer: one relative to the outer-loop
5022      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5023      to the inner-loop (which is the inner-most loop containing the dataref),
5024      and this is done by step (5) below.
5025 
5026      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5027      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5028      redundant.  Steps (3),(4) create the following:
5029 
5030 	vp0 = &base_addr;
5031 	LOOP:	vp1 = phi(vp0,vp2)
5032 		...
5033 		...
5034 		vp2 = vp1 + step
5035 		goto LOOP
5036 
5037      If there is an inner-loop nested in loop, then step (5) will also be
5038      applied, and an additional update in the inner-loop will be created:
5039 
5040 	vp0 = &base_addr;
5041 	LOOP:   vp1 = phi(vp0,vp2)
5042 		...
5043         inner:     vp3 = phi(vp1,vp4)
5044 	           vp4 = vp3 + inner_step
5045 	           if () goto inner
5046 		...
5047 		vp2 = vp1 + step
5048 		if () goto LOOP   */
5049 
5050   /* (2) Calculate the initial address of the aggregate-pointer, and set
5051      the aggregate-pointer to point to it before the loop.  */
5052 
5053   /* Create &(base[init_val] + offset) in the loop preheader.  */
5054 
5055   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5056 						   stmt_info, &new_stmt_list,
5057 						   offset);
5058   if (new_stmt_list)
5059     {
5060       if (pe)
5061         {
5062           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5063           gcc_assert (!new_bb);
5064         }
5065       else
5066         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5067     }
5068 
5069   *initial_address = new_temp;
5070   aggr_ptr_init = new_temp;
5071 
5072   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5073      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5074      inner-loop nested in LOOP (during outer-loop vectorization).  */
5075 
5076   /* No update in loop is required.  */
5077   if (only_init && (!loop_vinfo || at_loop == loop))
5078     aptr = aggr_ptr_init;
5079   else
5080     {
5081       /* Accesses to invariant addresses should be handled specially
5082 	 by the caller.  */
5083       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5084       gcc_assert (!integer_zerop (step));
5085 
5086       if (iv_step == NULL_TREE)
5087 	{
5088 	  /* The step of the aggregate pointer is the type size,
5089 	     negated for downward accesses.  */
5090 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
5091 	  if (tree_int_cst_sgn (step) == -1)
5092 	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5093 	}
5094 
5095       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5096 
5097       create_iv (aggr_ptr_init,
5098 		 fold_convert (aggr_ptr_type, iv_step),
5099 		 aggr_ptr, loop, &incr_gsi, insert_after,
5100 		 &indx_before_incr, &indx_after_incr);
5101       incr = gsi_stmt (incr_gsi);
5102 
5103       /* Copy the points-to information if it exists. */
5104       if (DR_PTR_INFO (dr))
5105 	{
5106 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5107 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5108 	}
5109       if (ptr_incr)
5110 	*ptr_incr = incr;
5111 
5112       aptr = indx_before_incr;
5113     }
5114 
5115   if (!nested_in_vect_loop || only_init)
5116     return aptr;
5117 
5118 
5119   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5120      nested in LOOP, if it exists.  */
5121 
5122   gcc_assert (nested_in_vect_loop);
5123   if (!only_init)
5124     {
5125       standard_iv_increment_position (containing_loop, &incr_gsi,
5126 				      &insert_after);
5127       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5128 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5129 		 &indx_after_incr);
5130       incr = gsi_stmt (incr_gsi);
5131 
5132       /* Copy the points-to information if it exists. */
5133       if (DR_PTR_INFO (dr))
5134 	{
5135 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5136 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5137 	}
5138       if (ptr_incr)
5139 	*ptr_incr = incr;
5140 
5141       return indx_before_incr;
5142     }
5143   else
5144     gcc_unreachable ();
5145 }
5146 
5147 
5148 /* Function bump_vector_ptr
5149 
5150    Increment a pointer (to a vector type) by vector-size. If requested,
5151    i.e. if PTR_INCR is given, then also connect the new increment stmt
5152    to the existing def-use update-chain of the pointer, by modifying
5153    the PTR_INCR as illustrated below:
5154 
5155    The pointer def-use update-chain before this function:
5156                         DATAREF_PTR = phi (p_0, p_2)
5157                         ....
5158         PTR_INCR:       p_2 = DATAREF_PTR + step
5159 
5160    The pointer def-use update-chain after this function:
5161                         DATAREF_PTR = phi (p_0, p_2)
5162                         ....
5163                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5164                         ....
5165         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5166 
5167    Input:
5168    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5169                  in the loop.
5170    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5171 	      the loop.  The increment amount across iterations is expected
5172 	      to be vector_size.
5173    BSI - location where the new update stmt is to be placed.
5174    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5175    BUMP - optional. The offset by which to bump the pointer. If not given,
5176 	  the offset is assumed to be vector_size.
5177 
5178    Output: Return NEW_DATAREF_PTR as illustrated above.
5179 
5180 */
5181 
5182 tree
5183 bump_vector_ptr (vec_info *vinfo,
5184 		 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5185 		 stmt_vec_info stmt_info, tree bump)
5186 {
5187   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5188   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5189   tree update = TYPE_SIZE_UNIT (vectype);
5190   gimple *incr_stmt;
5191   ssa_op_iter iter;
5192   use_operand_p use_p;
5193   tree new_dataref_ptr;
5194 
5195   if (bump)
5196     update = bump;
5197 
5198   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5199     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5200   else
5201     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5202   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5203 				   dataref_ptr, update);
5204   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5205   /* Fold the increment, avoiding excessive use-def chains that would
5206      otherwise cause compile-time issues for passes until the next
5207      forwprop pass, which would do this as well.  */
5208   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5209   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5210     {
5211       incr_stmt = gsi_stmt (fold_gsi);
5212       update_stmt (incr_stmt);
5213     }
5214 
5215   /* Copy the points-to information if it exists. */
5216   if (DR_PTR_INFO (dr))
5217     {
5218       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5219       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5220     }
5221 
5222   if (!ptr_incr)
5223     return new_dataref_ptr;
5224 
5225   /* Update the vector-pointer's cross-iteration increment.  */
5226   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5227     {
5228       tree use = USE_FROM_PTR (use_p);
5229 
5230       if (use == dataref_ptr)
5231         SET_USE (use_p, new_dataref_ptr);
5232       else
5233         gcc_assert (operand_equal_p (use, update, 0));
5234     }
5235 
5236   return new_dataref_ptr;
5237 }
5238 
5239 
5240 /* Copy memory reference info such as base/clique from the SRC reference
5241    to the DEST MEM_REF.  */
5242 
5243 void
5244 vect_copy_ref_info (tree dest, tree src)
5245 {
5246   if (TREE_CODE (dest) != MEM_REF)
5247     return;
5248 
5249   tree src_base = src;
5250   while (handled_component_p (src_base))
5251     src_base = TREE_OPERAND (src_base, 0);
5252   if (TREE_CODE (src_base) != MEM_REF
5253       && TREE_CODE (src_base) != TARGET_MEM_REF)
5254     return;
5255 
5256   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5257   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5258 }
5259 
5260 
5261 /* Function vect_create_destination_var.
5262 
5263    Create a new temporary of type VECTYPE.  */
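/* For example (illustrative), given a scalar destination x_7 and a V4SF
   VECTYPE this creates a V4SF temporary named "vect_x_7"; with a NULL
   VECTYPE a scalar temporary of the type of SCALAR_DEST named "stmp_x_7"
   is created instead.  */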
5264 
5265 tree
5266 vect_create_destination_var (tree scalar_dest, tree vectype)
5267 {
5268   tree vec_dest;
5269   const char *name;
5270   char *new_name;
5271   tree type;
5272   enum vect_var_kind kind;
5273 
5274   kind = vectype
5275     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5276     ? vect_mask_var
5277     : vect_simple_var
5278     : vect_scalar_var;
5279   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5280 
5281   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5282 
5283   name = get_name (scalar_dest);
5284   if (name)
5285     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5286   else
5287     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5288   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5289   free (new_name);
5290 
5291   return vec_dest;
5292 }
5293 
5294 /* Function vect_grouped_store_supported.
5295 
5296    Returns TRUE if interleave high and interleave low permutations
5297    are supported, and FALSE otherwise.  */
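/* For example (illustrative), for COUNT == 2 and V8HI vectors the two
   VEC_PERM_EXPR selectors whose support is checked are

     { 0, 8, 1, 9, 2, 10, 3, 11 }
     { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the interleaving of the first halves and of the second halves of
   the two input vectors.  */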
5298 
5299 bool
5300 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5301 {
5302   machine_mode mode = TYPE_MODE (vectype);
5303 
5304   /* vect_permute_store_chain requires the group size to be equal to 3 or
5305      be a power of two.  */
5306   if (count != 3 && exact_log2 (count) == -1)
5307     {
5308       if (dump_enabled_p ())
5309 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5310 			 "the size of the group of accesses"
5311 			 " is not a power of 2 or not equal to 3\n");
5312       return false;
5313     }
5314 
5315   /* Check that the permutation is supported.  */
5316   if (VECTOR_MODE_P (mode))
5317     {
5318       unsigned int i;
5319       if (count == 3)
5320 	{
5321 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5322 	  unsigned int i, j;
5323 
5324 	  unsigned int nelt;
5325 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5326 	    {
5327 	      if (dump_enabled_p ())
5328 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5329 				 "cannot handle groups of 3 stores for"
5330 				 " variable-length vectors\n");
5331 	      return false;
5332 	    }
5333 
5334 	  vec_perm_builder sel (nelt, nelt, 1);
5335 	  sel.quick_grow (nelt);
5336 	  vec_perm_indices indices;
5337 	  for (j = 0; j < 3; j++)
5338 	    {
5339 	      int nelt0 = ((3 - j) * nelt) % 3;
5340 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5341 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5342 	      for (i = 0; i < nelt; i++)
5343 		{
5344 		  if (3 * i + nelt0 < nelt)
5345 		    sel[3 * i + nelt0] = j0++;
5346 		  if (3 * i + nelt1 < nelt)
5347 		    sel[3 * i + nelt1] = nelt + j1++;
5348 		  if (3 * i + nelt2 < nelt)
5349 		    sel[3 * i + nelt2] = 0;
5350 		}
5351 	      indices.new_vector (sel, 2, nelt);
5352 	      if (!can_vec_perm_const_p (mode, indices))
5353 		{
5354 		  if (dump_enabled_p ())
5355 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5356 				 "permutation op not supported by target.\n");
5357 		  return false;
5358 		}
5359 
5360 	      for (i = 0; i < nelt; i++)
5361 		{
5362 		  if (3 * i + nelt0 < nelt)
5363 		    sel[3 * i + nelt0] = 3 * i + nelt0;
5364 		  if (3 * i + nelt1 < nelt)
5365 		    sel[3 * i + nelt1] = 3 * i + nelt1;
5366 		  if (3 * i + nelt2 < nelt)
5367 		    sel[3 * i + nelt2] = nelt + j2++;
5368 		}
5369 	      indices.new_vector (sel, 2, nelt);
5370 	      if (!can_vec_perm_const_p (mode, indices))
5371 		{
5372 		  if (dump_enabled_p ())
5373 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5374 				 "permutation op not supported by target.\n");
5375 		  return false;
5376 		}
5377 	    }
5378 	  return true;
5379 	}
5380       else
5381 	{
5382 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
5383 	  gcc_assert (pow2p_hwi (count));
5384 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5385 
5386 	  /* The encoding has 2 interleaved stepped patterns.  */
5387 	  vec_perm_builder sel (nelt, 2, 3);
5388 	  sel.quick_grow (6);
5389 	  for (i = 0; i < 3; i++)
5390 	    {
5391 	      sel[i * 2] = i;
5392 	      sel[i * 2 + 1] = i + nelt;
5393 	    }
5394 	  vec_perm_indices indices (sel, 2, nelt);
5395 	  if (can_vec_perm_const_p (mode, indices))
5396 	    {
5397 	      for (i = 0; i < 6; i++)
5398 		sel[i] += exact_div (nelt, 2);
5399 	      indices.new_vector (sel, 2, nelt);
5400 	      if (can_vec_perm_const_p (mode, indices))
5401 		return true;
5402 	    }
5403 	}
5404     }
5405 
5406   if (dump_enabled_p ())
5407     dump_printf (MSG_MISSED_OPTIMIZATION,
5408 		 "permutation op not supported by target.\n");
5409   return false;
5410 }
5411 
5412 
5413 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5414    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
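/* For example, on AArch64 a supported vec_store_lanes optab for COUNT of
   2, 3 or 4 typically corresponds to the ST2/ST3/ST4 structure store
   instructions (a target-specific illustration, not something this
   function depends on).  */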
5415 
5416 bool
5417 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5418 			    bool masked_p)
5419 {
5420   if (masked_p)
5421     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5422 					 vec_mask_store_lanes_optab,
5423 					 vectype, count);
5424   else
5425     return vect_lanes_optab_supported_p ("vec_store_lanes",
5426 					 vec_store_lanes_optab,
5427 					 vectype, count);
5428 }
5429 
5430 
5431 /* Function vect_permute_store_chain.
5432 
5433    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5434    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5435    the data correctly for the stores.  Return the final references for stores
5436    in RESULT_CHAIN.
5437 
5438    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5439    The input is 4 vectors each containing 8 elements.  We assign a number to
5440    each element, the input sequence is:
5441 
5442    1st vec:   0  1  2  3  4  5  6  7
5443    2nd vec:   8  9 10 11 12 13 14 15
5444    3rd vec:  16 17 18 19 20 21 22 23
5445    4th vec:  24 25 26 27 28 29 30 31
5446 
5447    The output sequence should be:
5448 
5449    1st vec:  0  8 16 24  1  9 17 25
5450    2nd vec:  2 10 18 26  3 11 19 27
5451    3rd vec:  4 12 20 28  5 13 21 29
5452    4th vec:  6 14 22 30  7 15 23 31
5453 
5454    i.e., we interleave the contents of the four vectors in their order.
5455 
5456    We use interleave_high/low instructions to create such output.  The input of
5457    each interleave_high/low operation is two vectors:
5458    1st vec    2nd vec
5459    0 1 2 3    4 5 6 7
5460    the even elements of the result vector are obtained left-to-right from the
5461    high/low elements of the first vector.  The odd elements of the result are
5462    obtained left-to-right from the high/low elements of the second vector.
5463    The output of interleave_high will be:   0 4 1 5
5464    and of interleave_low:                   2 6 3 7
5465 
5466 
5467    The permutation is done in log LENGTH stages.  In each stage interleave_high
5468    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5469    where the first argument is taken from the first half of DR_CHAIN and the
5470    second argument from its second half.
5471    In our example,
5472 
5473    I1: interleave_high (1st vec, 3rd vec)
5474    I2: interleave_low (1st vec, 3rd vec)
5475    I3: interleave_high (2nd vec, 4th vec)
5476    I4: interleave_low (2nd vec, 4th vec)
5477 
5478    The output for the first stage is:
5479 
5480    I1:  0 16  1 17  2 18  3 19
5481    I2:  4 20  5 21  6 22  7 23
5482    I3:  8 24  9 25 10 26 11 27
5483    I4: 12 28 13 29 14 30 15 31
5484 
5485    The output of the second stage, i.e. the final result is:
5486 
5487    I1:  0  8 16 24  1  9 17 25
5488    I2:  2 10 18 26  3 11 19 27
5489    I3:  4 12 20 28  5 13 21 29
5490    I4:  6 14 22 30  7 15 23 31.  */
5491 
5492 void
5493 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5494 			  unsigned int length,
5495 			  stmt_vec_info stmt_info,
5496 			  gimple_stmt_iterator *gsi,
5497 			  vec<tree> *result_chain)
5498 {
5499   tree vect1, vect2, high, low;
5500   gimple *perm_stmt;
5501   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5502   tree perm_mask_low, perm_mask_high;
5503   tree data_ref;
5504   tree perm3_mask_low, perm3_mask_high;
5505   unsigned int i, j, n, log_length = exact_log2 (length);
5506 
5507   result_chain->quick_grow (length);
5508   memcpy (result_chain->address (), dr_chain.address (),
5509 	  length * sizeof (tree));
5510 
5511   if (length == 3)
5512     {
5513       /* vect_grouped_store_supported ensures that this is constant.  */
5514       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5515       unsigned int j0 = 0, j1 = 0, j2 = 0;
5516 
5517       vec_perm_builder sel (nelt, nelt, 1);
5518       sel.quick_grow (nelt);
5519       vec_perm_indices indices;
5520       for (j = 0; j < 3; j++)
5521         {
5522 	  int nelt0 = ((3 - j) * nelt) % 3;
5523 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5524 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5525 
5526 	  for (i = 0; i < nelt; i++)
5527 	    {
5528 	      if (3 * i + nelt0 < nelt)
5529 		sel[3 * i + nelt0] = j0++;
5530 	      if (3 * i + nelt1 < nelt)
5531 		sel[3 * i + nelt1] = nelt + j1++;
5532 	      if (3 * i + nelt2 < nelt)
5533 		sel[3 * i + nelt2] = 0;
5534 	    }
5535 	  indices.new_vector (sel, 2, nelt);
5536 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5537 
5538 	  for (i = 0; i < nelt; i++)
5539 	    {
5540 	      if (3 * i + nelt0 < nelt)
5541 		sel[3 * i + nelt0] = 3 * i + nelt0;
5542 	      if (3 * i + nelt1 < nelt)
5543 		sel[3 * i + nelt1] = 3 * i + nelt1;
5544 	      if (3 * i + nelt2 < nelt)
5545 		sel[3 * i + nelt2] = nelt + j2++;
5546 	    }
5547 	  indices.new_vector (sel, 2, nelt);
5548 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5549 
5550 	  vect1 = dr_chain[0];
5551 	  vect2 = dr_chain[1];
5552 
5553 	  /* Create interleaving stmt:
5554 	     low = VEC_PERM_EXPR <vect1, vect2,
5555 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5556 				   j + 2, nelt + j + 2, *, ...}>  */
5557 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5558 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5559 					   vect2, perm3_mask_low);
5560 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5561 
5562 	  vect1 = data_ref;
5563 	  vect2 = dr_chain[2];
5564 	  /* Create interleaving stmt:
5565 	     low = VEC_PERM_EXPR <vect1, vect2,
5566 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5567 				   6, 7, nelt + j + 2, ...}>  */
5568 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5569 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5570 					   vect2, perm3_mask_high);
5571 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5572 	  (*result_chain)[j] = data_ref;
5573 	}
5574     }
5575   else
5576     {
5577       /* If length is not equal to 3 then only a power of 2 is supported.  */
5578       gcc_assert (pow2p_hwi (length));
5579 
5580       /* The encoding has 2 interleaved stepped patterns.  */
5581       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5582       vec_perm_builder sel (nelt, 2, 3);
5583       sel.quick_grow (6);
5584       for (i = 0; i < 3; i++)
5585 	{
5586 	  sel[i * 2] = i;
5587 	  sel[i * 2 + 1] = i + nelt;
5588 	}
5589 	vec_perm_indices indices (sel, 2, nelt);
5590 	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5591 
5592 	for (i = 0; i < 6; i++)
5593 	  sel[i] += exact_div (nelt, 2);
5594 	indices.new_vector (sel, 2, nelt);
5595 	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5596 
5597 	for (i = 0, n = log_length; i < n; i++)
5598 	  {
5599 	    for (j = 0; j < length/2; j++)
5600 	      {
5601 		vect1 = dr_chain[j];
5602 		vect2 = dr_chain[j+length/2];
5603 
5604 		/* Create interleaving stmt:
5605 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5606 							...}>  */
5607 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5608 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5609 						 vect2, perm_mask_high);
5610 		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5611 		(*result_chain)[2*j] = high;
5612 
5613 		/* Create interleaving stmt:
5614 		   low = VEC_PERM_EXPR <vect1, vect2,
5615 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5616 					 ...}>  */
5617 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5618 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5619 						 vect2, perm_mask_low);
5620 		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5621 		(*result_chain)[2*j+1] = low;
5622 	      }
5623 	    memcpy (dr_chain.address (), result_chain->address (),
5624 		    length * sizeof (tree));
5625 	  }
5626     }
5627 }
5628 
5629 /* Function vect_setup_realignment
5630 
5631    This function is called when vectorizing an unaligned load using
5632    the dr_explicit_realign[_optimized] scheme.
5633    This function generates the following code at the loop prolog:
5634 
5635       p = initial_addr;
5636    x  msq_init = *(floor(p));   # prolog load
5637       realignment_token = call target_builtin;
5638     loop:
5639    x  msq = phi (msq_init, ---)
5640 
5641    The stmts marked with x are generated only for the case of
5642    dr_explicit_realign_optimized.
5643 
5644    The code above sets up a new (vector) pointer, pointing to the first
5645    location accessed by STMT_INFO, and a "floor-aligned" load using that
5646    pointer.  It also generates code to compute the "realignment-token"
5647    (if the relevant target hook was defined), and creates a phi-node at the
5648    loop-header bb whose arguments are the result of the prolog-load (created
5649    by this function) and the result of a load that takes place in the loop
5650    (to be created by the caller to this function).
5651 
5652    For the case of dr_explicit_realign_optimized:
5653    The caller to this function uses the phi-result (msq) to create the
5654    realignment code inside the loop, and sets up the missing phi argument,
5655    as follows:
5656     loop:
5657       msq = phi (msq_init, lsq)
5658       lsq = *(floor(p'));        # load in loop
5659       result = realign_load (msq, lsq, realignment_token);
5660 
5661    For the case of dr_explicit_realign:
5662     loop:
5663       msq = *(floor(p)); 	# load in loop
5664       p' = p + (VS-1);
5665       lsq = *(floor(p'));	# load in loop
5666       result = realign_load (msq, lsq, realignment_token);
5667 
5668    Input:
5669    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5670 	       a memory location that may be unaligned.
5671    BSI - place where new code is to be inserted.
5672    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5673 			      is used.
5674 
5675    Output:
5676    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5677                        target hook, if defined.
5678    Return value - the result of the loop-header phi node.  */
5679 
5680 tree
5681 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5682 			gimple_stmt_iterator *gsi, tree *realignment_token,
5683 			enum dr_alignment_support alignment_support_scheme,
5684 			tree init_addr,
5685 			class loop **at_loop)
5686 {
5687   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5688   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5689   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5690   struct data_reference *dr = dr_info->dr;
5691   class loop *loop = NULL;
5692   edge pe = NULL;
5693   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5694   tree vec_dest;
5695   gimple *inc;
5696   tree ptr;
5697   tree data_ref;
5698   basic_block new_bb;
5699   tree msq_init = NULL_TREE;
5700   tree new_temp;
5701   gphi *phi_stmt;
5702   tree msq = NULL_TREE;
5703   gimple_seq stmts = NULL;
5704   bool compute_in_loop = false;
5705   bool nested_in_vect_loop = false;
5706   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5707   class loop *loop_for_initial_load = NULL;
5708 
5709   if (loop_vinfo)
5710     {
5711       loop = LOOP_VINFO_LOOP (loop_vinfo);
5712       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5713     }
5714 
5715   gcc_assert (alignment_support_scheme == dr_explicit_realign
5716 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5717 
5718   /* We need to generate three things:
5719      1. the misalignment computation
5720      2. the extra vector load (for the optimized realignment scheme).
5721      3. the phi node for the two vectors from which the realignment is
5722       done (for the optimized realignment scheme).  */
5723 
5724   /* 1. Determine where to generate the misalignment computation.
5725 
5726      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5727      calculation will be generated by this function, outside the loop (in the
5728      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5729      caller, inside the loop.
5730 
5731      Background: If the misalignment remains fixed throughout the iterations of
5732      the loop, then both realignment schemes are applicable, and also the
5733      misalignment computation can be done outside LOOP.  This is because we are
5734      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5735      are a multiple of VS (the Vector Size), and therefore the misalignment in
5736      different vectorized LOOP iterations is always the same.
5737      The problem arises only if the memory access is in an inner-loop nested
5738      inside LOOP, which is now being vectorized using outer-loop vectorization.
5739      This is the only case when the misalignment of the memory access may not
5740      remain fixed throughout the iterations of the inner-loop (as explained in
5741      detail in vect_supportable_dr_alignment).  In this case, not only is the
5742      optimized realignment scheme not applicable, but also the misalignment
5743      computation (and generation of the realignment token that is passed to
5744      REALIGN_LOAD) have to be done inside the loop.
5745 
5746      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5747      or not, which in turn determines if the misalignment is computed inside
5748      the inner-loop, or outside LOOP.  */
5749 
5750   if (init_addr != NULL_TREE || !loop_vinfo)
5751     {
5752       compute_in_loop = true;
5753       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5754     }
5755 
5756 
5757   /* 2. Determine where to generate the extra vector load.
5758 
5759      For the optimized realignment scheme, instead of generating two vector
5760      loads in each iteration, we generate a single extra vector load in the
5761      preheader of the loop, and in each iteration reuse the result of the
5762      vector load from the previous iteration.  In case the memory access is in
5763      an inner-loop nested inside LOOP, which is now being vectorized using
5764      outer-loop vectorization, we need to determine whether this initial vector
5765      load should be generated at the preheader of the inner-loop, or can be
5766      generated at the preheader of LOOP.  If the memory access has no evolution
5767      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5768      to be generated inside LOOP (in the preheader of the inner-loop).  */
5769 
5770   if (nested_in_vect_loop)
5771     {
5772       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5773       bool invariant_in_outerloop =
5774             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5775       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5776     }
5777   else
5778     loop_for_initial_load = loop;
5779   if (at_loop)
5780     *at_loop = loop_for_initial_load;
5781 
5782   if (loop_for_initial_load)
5783     pe = loop_preheader_edge (loop_for_initial_load);
5784 
5785   /* 3. For the case of the optimized realignment, create the first vector
5786       load at the loop preheader.  */
5787 
5788   if (alignment_support_scheme == dr_explicit_realign_optimized)
5789     {
5790       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5791       gassign *new_stmt;
5792 
5793       gcc_assert (!compute_in_loop);
5794       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5795       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5796 				      loop_for_initial_load, NULL_TREE,
5797 				      &init_addr, NULL, &inc, true);
5798       if (TREE_CODE (ptr) == SSA_NAME)
5799 	new_temp = copy_ssa_name (ptr);
5800       else
5801 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5802       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5803       tree type = TREE_TYPE (ptr);
5804       new_stmt = gimple_build_assign
5805 		   (new_temp, BIT_AND_EXPR, ptr,
5806 		    fold_build2 (MINUS_EXPR, type,
5807 				 build_int_cst (type, 0),
5808 				 build_int_cst (type, align)));
5809       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5810       gcc_assert (!new_bb);
5811       data_ref
5812 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5813 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5814       vect_copy_ref_info (data_ref, DR_REF (dr));
5815       new_stmt = gimple_build_assign (vec_dest, data_ref);
5816       new_temp = make_ssa_name (vec_dest, new_stmt);
5817       gimple_assign_set_lhs (new_stmt, new_temp);
5818       if (pe)
5819         {
5820           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5821           gcc_assert (!new_bb);
5822         }
5823       else
5824          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5825 
5826       msq_init = gimple_assign_lhs (new_stmt);
5827     }
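
  /* For illustration only (a sketch, not emitted verbatim): with a target
     alignment of 16 bytes, the preheader code built above amounts to

       new_temp_1 = ptr_0 & -16;                   // floor (p1)
       msq_init_2 = MEM[(vectype *) new_temp_1];   // first aligned vector

     where the SSA names are made up for the example; the load reads from
     the target-aligned address at or below the original one.  */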
5828 
5829   /* 4. Create realignment token using a target builtin, if available.
5830       It is done either inside the containing loop, or before LOOP (as
5831       determined above).  */
5832 
5833   if (targetm.vectorize.builtin_mask_for_load)
5834     {
5835       gcall *new_stmt;
5836       tree builtin_decl;
5837 
5838       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5839       if (!init_addr)
5840 	{
5841 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5842 	  init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5843 							    stmt_info, &stmts,
5844 							    NULL_TREE);
5845           if (loop)
5846             {
5847    	      pe = loop_preheader_edge (loop);
5848 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5849 	      gcc_assert (!new_bb);
5850             }
5851           else
5852              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5853 	}
5854 
5855       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5856       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5857       vec_dest =
5858 	vect_create_destination_var (scalar_dest,
5859 				     gimple_call_return_type (new_stmt));
5860       new_temp = make_ssa_name (vec_dest, new_stmt);
5861       gimple_call_set_lhs (new_stmt, new_temp);
5862 
5863       if (compute_in_loop)
5864 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5865       else
5866 	{
5867 	  /* Generate the misalignment computation outside LOOP.  */
5868 	  pe = loop_preheader_edge (loop);
5869 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5870 	  gcc_assert (!new_bb);
5871 	}
5872 
5873       *realignment_token = gimple_call_lhs (new_stmt);
5874 
5875       /* The result of the CALL_EXPR to this builtin is determined from
5876          the value of the parameter and no global variables are touched
5877          which makes the builtin a "const" function.  Requiring the
5878          builtin to have the "const" attribute makes it unnecessary
5879          to call mark_call_clobbered.  */
5880       gcc_assert (TREE_READONLY (builtin_decl));
5881     }
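
  /* For illustration only: on a target that provides the hook (e.g. the
     Altivec lvsl-based mask on powerpc), the code above emits something
     along the lines of

       realignment_token_3 = __builtin_altivec_mask_for_load (init_addr_1);

     with made-up SSA names; the token is later consumed by REALIGN_LOAD.  */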
5882 
5883   if (alignment_support_scheme == dr_explicit_realign)
5884     return msq;
5885 
5886   gcc_assert (!compute_in_loop);
5887   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5888 
5889 
5890   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5891 
5892   pe = loop_preheader_edge (containing_loop);
5893   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5894   msq = make_ssa_name (vec_dest);
5895   phi_stmt = create_phi_node (msq, containing_loop->header);
5896   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5897 
5898   return msq;
5899 }
5900 
5901 
5902 /* Function vect_grouped_load_supported.
5903 
5904    COUNT is the size of the load group (the number of statements plus the
5905    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5906    only one statement, with a gap of COUNT - 1.
5907 
5908    Returns true if a suitable permute exists.  */
5909 
5910 bool
5911 vect_grouped_load_supported (tree vectype, bool single_element_p,
5912 			     unsigned HOST_WIDE_INT count)
5913 {
5914   machine_mode mode = TYPE_MODE (vectype);
5915 
5916   /* If this is single-element interleaving with an element distance
5917      that leaves unused vector loads around, punt - we at least create
5918      very sub-optimal code in that case (and blow up memory,
5919      see PR65518).  */
5920   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5921     {
5922       if (dump_enabled_p ())
5923 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5924 			 "single-element interleaving not supported "
5925 			 "for not adjacent vector loads\n");
5926       return false;
5927     }
5928 
5929   /* vect_permute_load_chain requires the group size to be equal to 3 or
5930      be a power of two.  */
5931   if (count != 3 && exact_log2 (count) == -1)
5932     {
5933       if (dump_enabled_p ())
5934 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5935 			 "the size of the group of accesses"
5936 			 " is not a power of 2 or not equal to 3\n");
5937       return false;
5938     }
5939 
5940   /* Check that the permutation is supported.  */
5941   if (VECTOR_MODE_P (mode))
5942     {
5943       unsigned int i, j;
5944       if (count == 3)
5945 	{
5946 	  unsigned int nelt;
5947 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5948 	    {
5949 	      if (dump_enabled_p ())
5950 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5951 				 "cannot handle groups of 3 loads for"
5952 				 " variable-length vectors\n");
5953 	      return false;
5954 	    }
5955 
5956 	  vec_perm_builder sel (nelt, nelt, 1);
5957 	  sel.quick_grow (nelt);
5958 	  vec_perm_indices indices;
5959 	  unsigned int k;
5960 	  for (k = 0; k < 3; k++)
5961 	    {
5962 	      for (i = 0; i < nelt; i++)
5963 		if (3 * i + k < 2 * nelt)
5964 		  sel[i] = 3 * i + k;
5965 		else
5966 		  sel[i] = 0;
5967 	      indices.new_vector (sel, 2, nelt);
5968 	      if (!can_vec_perm_const_p (mode, indices))
5969 		{
5970 		  if (dump_enabled_p ())
5971 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5972 				     "shuffle of 3 loads is not supported by"
5973 				     " target\n");
5974 		  return false;
5975 		}
5976 	      for (i = 0, j = 0; i < nelt; i++)
5977 		if (3 * i + k < 2 * nelt)
5978 		  sel[i] = i;
5979 		else
5980 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5981 	      indices.new_vector (sel, 2, nelt);
5982 	      if (!can_vec_perm_const_p (mode, indices))
5983 		{
5984 		  if (dump_enabled_p ())
5985 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5986 				     "shuffle of 3 loads is not supported by"
5987 				     " target\n");
5988 		  return false;
5989 		}
5990 	    }
5991 	  return true;
5992 	}
5993       else
5994 	{
5995 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5996 	  gcc_assert (pow2p_hwi (count));
5997 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5998 
5999 	  /* The encoding has a single stepped pattern.  */
6000 	  vec_perm_builder sel (nelt, 1, 3);
6001 	  sel.quick_grow (3);
6002 	  for (i = 0; i < 3; i++)
6003 	    sel[i] = i * 2;
6004 	  vec_perm_indices indices (sel, 2, nelt);
6005 	  if (can_vec_perm_const_p (mode, indices))
6006 	    {
6007 	      for (i = 0; i < 3; i++)
6008 		sel[i] = i * 2 + 1;
6009 	      indices.new_vector (sel, 2, nelt);
6010 	      if (can_vec_perm_const_p (mode, indices))
6011 		return true;
6012 	    }
6013         }
6014     }
6015 
6016   if (dump_enabled_p ())
6017     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6018 		     "extract even/odd not supported by target\n");
6019   return false;
6020 }
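
/* For illustration only, a worked instance of the COUNT == 3 masks tested
   above, for an 8-element vector (nelt == 8) and k == 0: the first
   permutation uses indices { 0, 3, 6, 9, 12, 15, 0, 0 } on the first two
   vectors, and the second uses { 0, 1, 2, 3, 4, 5, 10, 13 } on that result
   and the third vector, which together extract elements 0, 3, 6, ..., 21 of
   the group.  vect_permute_load_chain builds the same masks when the group
   size is 3.  */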
6021 
6022 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6023    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
6024 
6025 bool
6026 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6027 			   bool masked_p)
6028 {
6029   if (masked_p)
6030     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6031 					 vec_mask_load_lanes_optab,
6032 					 vectype, count);
6033   else
6034     return vect_lanes_optab_supported_p ("vec_load_lanes",
6035 					 vec_load_lanes_optab,
6036 					 vectype, count);
6037 }
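
/* Illustrative usage (hypothetical caller): asking whether four vectors of
   type VECTYPE can be loaded as a group, unmasked, would look like

     if (vect_load_lanes_supported (vectype, 4, false))
       ... use the load-lanes scheme ...

   On AArch64, for example, the underlying vec_load_lanes optab for a group
   of four maps to the LD4 instruction.  */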
6038 
6039 /* Function vect_permute_load_chain.
6040 
6041    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6042    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6043    the input data correctly.  Return the final references for loads in
6044    RESULT_CHAIN.
6045 
6046    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6047    The input is 4 vectors each containing 8 elements. We assign a number to each
6048    element, the input sequence is:
6049 
6050    1st vec:   0  1  2  3  4  5  6  7
6051    2nd vec:   8  9 10 11 12 13 14 15
6052    3rd vec:  16 17 18 19 20 21 22 23
6053    4th vec:  24 25 26 27 28 29 30 31
6054 
6055    The output sequence should be:
6056 
6057    1st vec:  0 4  8 12 16 20 24 28
6058    2nd vec:  1 5  9 13 17 21 25 29
6059    3rd vec:  2 6 10 14 18 22 26 30
6060    4th vec:  3 7 11 15 19 23 27 31
6061 
6062    i.e., the first output vector should contain the first elements of each
6063    interleaving group, etc.
6064 
6065    We use extract_even/odd instructions to create such output.  The input of
6066    each extract_even/odd operation is two vectors
6067    1st vec    2nd vec
6068    0 1 2 3    4 5 6 7
6069 
6070    and the output is the vector of extracted even/odd elements.  The output of
6071    extract_even will be:   0 2 4 6
6072    and of extract_odd:     1 3 5 7
6073 
6074 
6075    The permutation is done in log LENGTH stages.  In each stage extract_even
6076    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6077    their order.  In our example,
6078 
6079    E1: extract_even (1st vec, 2nd vec)
6080    E2: extract_odd (1st vec, 2nd vec)
6081    E3: extract_even (3rd vec, 4th vec)
6082    E4: extract_odd (3rd vec, 4th vec)
6083 
6084    The output for the first stage will be:
6085 
6086    E1:  0  2  4  6  8 10 12 14
6087    E2:  1  3  5  7  9 11 13 15
6088    E3: 16 18 20 22 24 26 28 30
6089    E4: 17 19 21 23 25 27 29 31
6090 
6091    In order to proceed and create the correct sequence for the next stage (or
6092    for the correct output, if the second stage is the last one, as in our
6093    example), we first put the output of extract_even operation and then the
6094    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6095    The input for the second stage is:
6096 
6097    1st vec (E1):  0  2  4  6  8 10 12 14
6098    2nd vec (E3): 16 18 20 22 24 26 28 30
6099    3rd vec (E2):  1  3  5  7  9 11 13 15
6100    4th vec (E4): 17 19 21 23 25 27 29 31
6101 
6102    The output of the second stage:
6103 
6104    E1: 0 4  8 12 16 20 24 28
6105    E2: 2 6 10 14 18 22 26 30
6106    E3: 1 5  9 13 17 21 25 29
6107    E4: 3 7 11 15 19 23 27 31
6108 
6109    And RESULT_CHAIN after reordering:
6110 
6111    1st vec (E1):  0 4  8 12 16 20 24 28
6112    2nd vec (E3):  1 5  9 13 17 21 25 29
6113    3rd vec (E2):  2 6 10 14 18 22 26 30
6114    4th vec (E4):  3 7 11 15 19 23 27 31.  */
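
/* For illustration only: each extract_even/extract_odd above is realized as
   a single VEC_PERM_EXPR on two input vectors; e.g. for V8HI the first
   stage emits (with made-up SSA names)

     vect_perm_even_1 = VEC_PERM_EXPR <v0, v1, { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd_2 = VEC_PERM_EXPR <v0, v1, { 1, 3, 5, 7, 9, 11, 13, 15 }>;  */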
6115 
6116 static void
6117 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6118 			 unsigned int length,
6119 			 stmt_vec_info stmt_info,
6120 			 gimple_stmt_iterator *gsi,
6121 			 vec<tree> *result_chain)
6122 {
6123   tree data_ref, first_vect, second_vect;
6124   tree perm_mask_even, perm_mask_odd;
6125   tree perm3_mask_low, perm3_mask_high;
6126   gimple *perm_stmt;
6127   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6128   unsigned int i, j, log_length = exact_log2 (length);
6129 
6130   result_chain->quick_grow (length);
6131   memcpy (result_chain->address (), dr_chain.address (),
6132 	  length * sizeof (tree));
6133 
6134   if (length == 3)
6135     {
6136       /* vect_grouped_load_supported ensures that this is constant.  */
6137       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6138       unsigned int k;
6139 
6140       vec_perm_builder sel (nelt, nelt, 1);
6141       sel.quick_grow (nelt);
6142       vec_perm_indices indices;
6143       for (k = 0; k < 3; k++)
6144 	{
6145 	  for (i = 0; i < nelt; i++)
6146 	    if (3 * i + k < 2 * nelt)
6147 	      sel[i] = 3 * i + k;
6148 	    else
6149 	      sel[i] = 0;
6150 	  indices.new_vector (sel, 2, nelt);
6151 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6152 
6153 	  for (i = 0, j = 0; i < nelt; i++)
6154 	    if (3 * i + k < 2 * nelt)
6155 	      sel[i] = i;
6156 	    else
6157 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6158 	  indices.new_vector (sel, 2, nelt);
6159 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6160 
6161 	  first_vect = dr_chain[0];
6162 	  second_vect = dr_chain[1];
6163 
6164 	  /* Create interleaving stmt (low part of):
6165 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6166 							     ...}>  */
6167 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6168 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6169 					   second_vect, perm3_mask_low);
6170 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6171 
6172 	  /* Create interleaving stmt (high part of):
6173 	     high = VEC_PERM_EXPR <low, dr_chain[2], perm3_mask_high>, filling
6174 	     the still-missing elements of the group from the third vector.  */
6175 	  first_vect = data_ref;
6176 	  second_vect = dr_chain[2];
6177 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6178 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6179 					   second_vect, perm3_mask_high);
6180 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6181 	  (*result_chain)[k] = data_ref;
6182 	}
6183     }
6184   else
6185     {
6186       /* If length is not equal to 3 then only power of 2 is supported.  */
6187       gcc_assert (pow2p_hwi (length));
6188 
6189       /* The encoding has a single stepped pattern.  */
6190       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6191       vec_perm_builder sel (nelt, 1, 3);
6192       sel.quick_grow (3);
6193       for (i = 0; i < 3; ++i)
6194 	sel[i] = i * 2;
6195       vec_perm_indices indices (sel, 2, nelt);
6196       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6197 
6198       for (i = 0; i < 3; ++i)
6199 	sel[i] = i * 2 + 1;
6200       indices.new_vector (sel, 2, nelt);
6201       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6202 
6203       for (i = 0; i < log_length; i++)
6204 	{
6205 	  for (j = 0; j < length; j += 2)
6206 	    {
6207 	      first_vect = dr_chain[j];
6208 	      second_vect = dr_chain[j+1];
6209 
6210 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6211 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6212 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6213 					       first_vect, second_vect,
6214 					       perm_mask_even);
6215 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6216 	      (*result_chain)[j/2] = data_ref;
6217 
6218 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6219 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6220 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6221 					       first_vect, second_vect,
6222 					       perm_mask_odd);
6223 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6224 	      (*result_chain)[j/2+length/2] = data_ref;
6225 	    }
6226 	  memcpy (dr_chain.address (), result_chain->address (),
6227 		  length * sizeof (tree));
6228 	}
6229     }
6230 }
6231 
6232 /* Function vect_shift_permute_load_chain.
6233 
6234    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6235    sequence of stmts to reorder the input data accordingly.
6236    Return the final references for loads in RESULT_CHAIN.
6237    Return true on success, false otherwise.
6238 
6239    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6240    The input is 3 vectors each containing 8 elements.  We assign a
6241    number to each element, the input sequence is:
6242 
6243    1st vec:   0  1  2  3  4  5  6  7
6244    2nd vec:   8  9 10 11 12 13 14 15
6245    3rd vec:  16 17 18 19 20 21 22 23
6246 
6247    The output sequence should be:
6248 
6249    1st vec:  0 3 6  9 12 15 18 21
6250    2nd vec:  1 4 7 10 13 16 19 22
6251    3rd vec:  2 5 8 11 14 17 20 23
6252 
6253    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6254 
6255    First we shuffle all 3 vectors to get correct elements order:
6256 
6257    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6258    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6259    3rd vec:  (16 19 22) (17 20 23) (18 21)
6260 
6261    Next we unite and shift the vectors 3 times:
6262 
6263    1st step:
6264      shift right by 6 the concatenation of:
6265      "1st vec" and  "2nd vec"
6266        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6267      "2nd vec" and  "3rd vec"
6268        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6269      "3rd vec" and  "1st vec"
6270        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6271 			     | New vectors                   |
6272 
6273      So that now new vectors are:
6274 
6275      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6276      2nd vec:  (10 13) (16 19 22) (17 20 23)
6277      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6278 
6279    2nd step:
6280      shift right by 5 the concatenation of:
6281      "1st vec" and  "3rd vec"
6282        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6283      "2nd vec" and  "1st vec"
6284        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6285      "3rd vec" and  "2nd vec"
6286        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6287 			  | New vectors                   |
6288 
6289      So that now new vectors are:
6290 
6291      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6292      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6293      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6294 
6295    3rd step:
6296      shift right by 5 the concatenation of:
6297      "1st vec" and  "1st vec"
6298        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6299      shift right by 3 the concatenation of:
6300      "2nd vec" and  "2nd vec"
6301                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6302 			  | New vectors                   |
6303 
6304      So that now all vectors are READY:
6305      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6306      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6307      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6308 
6309    This algorithm is faster than the one in vect_permute_load_chain if:
6310      1.  "shift of a concatenation" is faster than general permutation.
6311 	 This is usually so.
6312      2.  The TARGET machine can't execute vector instructions in parallel.
6313 	 This is because each step of the algorithm depends on the previous one.
6314 	 The algorithm in vect_permute_load_chain is much more parallel.
6315 
6316    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6317 */
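
/* For illustration only: each "shift right by N of a concatenation" above is
   a single VEC_PERM_EXPR whose selector picks a contiguous window of the two
   concatenated inputs; e.g. for V8HI, "shift right by 6 the concatenation of
   A and B" becomes (with a made-up SSA name)

     vect_shift_1 = VEC_PERM_EXPR <A, B, { 6, 7, 8, 9, 10, 11, 12, 13 }>;  */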
6318 
6319 static bool
6320 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6321 			       unsigned int length,
6322 			       stmt_vec_info stmt_info,
6323 			       gimple_stmt_iterator *gsi,
6324 			       vec<tree> *result_chain)
6325 {
6326   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6327   tree perm2_mask1, perm2_mask2, perm3_mask;
6328   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6329   gimple *perm_stmt;
6330 
6331   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6332   unsigned int i;
6333   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6334 
6335   unsigned HOST_WIDE_INT nelt, vf;
6336   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6337       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6338     /* Not supported for variable-length vectors.  */
6339     return false;
6340 
6341   vec_perm_builder sel (nelt, nelt, 1);
6342   sel.quick_grow (nelt);
6343 
6344   result_chain->quick_grow (length);
6345   memcpy (result_chain->address (), dr_chain.address (),
6346 	  length * sizeof (tree));
6347 
6348   if (pow2p_hwi (length) && vf > 4)
6349     {
6350       unsigned int j, log_length = exact_log2 (length);
6351       for (i = 0; i < nelt / 2; ++i)
6352 	sel[i] = i * 2;
6353       for (i = 0; i < nelt / 2; ++i)
6354 	sel[nelt / 2 + i] = i * 2 + 1;
6355       vec_perm_indices indices (sel, 2, nelt);
6356       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6357 	{
6358 	  if (dump_enabled_p ())
6359 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6360 			     "shuffle of 2 fields structure is not \
6361 			      supported by target\n");
6362 	  return false;
6363 	}
6364       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6365 
6366       for (i = 0; i < nelt / 2; ++i)
6367 	sel[i] = i * 2 + 1;
6368       for (i = 0; i < nelt / 2; ++i)
6369 	sel[nelt / 2 + i] = i * 2;
6370       indices.new_vector (sel, 2, nelt);
6371       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6372 	{
6373 	  if (dump_enabled_p ())
6374 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6375 			     "shuffle of 2 fields structure is not \
6376 			      supported by target\n");
6377 	  return false;
6378 	}
6379       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6380 
6381       /* Generating permutation constant to shift all elements.
6382 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6383       for (i = 0; i < nelt; i++)
6384 	sel[i] = nelt / 2 + i;
6385       indices.new_vector (sel, 2, nelt);
6386       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6387 	{
6388 	  if (dump_enabled_p ())
6389 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6390 			     "shift permutation is not supported by target\n");
6391 	  return false;
6392 	}
6393       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6394 
6395       /* Generating permutation constant to select vector from 2.
6396 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6397       for (i = 0; i < nelt / 2; i++)
6398 	sel[i] = i;
6399       for (i = nelt / 2; i < nelt; i++)
6400 	sel[i] = nelt + i;
6401       indices.new_vector (sel, 2, nelt);
6402       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6403 	{
6404 	  if (dump_enabled_p ())
6405 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6406 			     "select is not supported by target\n");
6407 	  return false;
6408 	}
6409       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6410 
6411       for (i = 0; i < log_length; i++)
6412 	{
6413 	  for (j = 0; j < length; j += 2)
6414 	    {
6415 	      first_vect = dr_chain[j];
6416 	      second_vect = dr_chain[j + 1];
6417 
6418 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6419 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6420 					       first_vect, first_vect,
6421 					       perm2_mask1);
6422 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6423 	      vect[0] = data_ref;
6424 
6425 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6426 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6427 					       second_vect, second_vect,
6428 					       perm2_mask2);
6429 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6430 	      vect[1] = data_ref;
6431 
6432 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6433 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6434 					       vect[0], vect[1], shift1_mask);
6435 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6436 	      (*result_chain)[j/2 + length/2] = data_ref;
6437 
6438 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6439 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6440 					       vect[0], vect[1], select_mask);
6441 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6442 	      (*result_chain)[j/2] = data_ref;
6443 	    }
6444 	  memcpy (dr_chain.address (), result_chain->address (),
6445 		  length * sizeof (tree));
6446 	}
6447       return true;
6448     }
6449   if (length == 3 && vf > 2)
6450     {
6451       unsigned int k = 0, l = 0;
6452 
6453       /* Generating permutation constant to get all elements in the right order.
6454 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6455       for (i = 0; i < nelt; i++)
6456 	{
6457 	  if (3 * k + (l % 3) >= nelt)
6458 	    {
6459 	      k = 0;
6460 	      l += (3 - (nelt % 3));
6461 	    }
6462 	  sel[i] = 3 * k + (l % 3);
6463 	  k++;
6464 	}
6465       vec_perm_indices indices (sel, 2, nelt);
6466       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6467 	{
6468 	  if (dump_enabled_p ())
6469 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 			     "shuffle of 3 fields structure is not \
6471 			      supported by target\n");
6472 	  return false;
6473 	}
6474       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6475 
6476       /* Generating permutation constant to shift all elements.
6477 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6478       for (i = 0; i < nelt; i++)
6479 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6480       indices.new_vector (sel, 2, nelt);
6481       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6482 	{
6483 	  if (dump_enabled_p ())
6484 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6485 			     "shift permutation is not supported by target\n");
6486 	  return false;
6487 	}
6488       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6489 
6490       /* Generating permutation constant to shift all elements.
6491 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6492       for (i = 0; i < nelt; i++)
6493 	sel[i] = 2 * (nelt / 3) + 1 + i;
6494       indices.new_vector (sel, 2, nelt);
6495       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6496 	{
6497 	  if (dump_enabled_p ())
6498 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6499 			     "shift permutation is not supported by target\n");
6500 	  return false;
6501 	}
6502       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6503 
6504       /* Generating permutation constant to shift all elements.
6505 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6506       for (i = 0; i < nelt; i++)
6507 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6508       indices.new_vector (sel, 2, nelt);
6509       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6510 	{
6511 	  if (dump_enabled_p ())
6512 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 			     "shift permutation is not supported by target\n");
6514 	  return false;
6515 	}
6516       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6517 
6518       /* Generating permutation constant to shift all elements.
6519 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6520       for (i = 0; i < nelt; i++)
6521 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6522       indices.new_vector (sel, 2, nelt);
6523       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6524 	{
6525 	  if (dump_enabled_p ())
6526 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527 			     "shift permutation is not supported by target\n");
6528 	  return false;
6529 	}
6530       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6531 
6532       for (k = 0; k < 3; k++)
6533 	{
6534 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6535 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6536 					   dr_chain[k], dr_chain[k],
6537 					   perm3_mask);
6538 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6539 	  vect[k] = data_ref;
6540 	}
6541 
6542       for (k = 0; k < 3; k++)
6543 	{
6544 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6545 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6546 					   vect[k % 3], vect[(k + 1) % 3],
6547 					   shift1_mask);
6548 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6549 	  vect_shift[k] = data_ref;
6550 	}
6551 
6552       for (k = 0; k < 3; k++)
6553 	{
6554 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6555 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6556 					   vect_shift[(4 - k) % 3],
6557 					   vect_shift[(3 - k) % 3],
6558 					   shift2_mask);
6559 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6560 	  vect[k] = data_ref;
6561 	}
6562 
6563       (*result_chain)[3 - (nelt % 3)] = vect[2];
6564 
6565       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6566       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6567 				       vect[0], shift3_mask);
6568       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6569       (*result_chain)[nelt % 3] = data_ref;
6570 
6571       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6572       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6573 				       vect[1], shift4_mask);
6574       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6575       (*result_chain)[0] = data_ref;
6576       return true;
6577     }
6578   return false;
6579 }
6580 
6581 /* Function vect_transform_grouped_load.
6582 
6583    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6584    to perform their permutation and ascribe the result vectorized statements to
6585    the scalar statements.
6586 */
6587 
6588 void
6589 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6590 			     vec<tree> dr_chain,
6591 			     int size, gimple_stmt_iterator *gsi)
6592 {
6593   machine_mode mode;
6594   vec<tree> result_chain = vNULL;
6595 
6596   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6597      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6598      vectors that are ready for vector computation.  */
6599   result_chain.create (size);
6600 
6601   /* If the reassociation width for the vector type is 2 or greater, the target
6602      machine can execute 2 or more vector instructions in parallel.  Otherwise
6603      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6604   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6605   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6606       || pow2p_hwi (size)
6607       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6608 					 gsi, &result_chain))
6609     vect_permute_load_chain (vinfo, dr_chain,
6610 			     size, stmt_info, gsi, &result_chain);
6611   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6612   result_chain.release ();
6613 }
6614 
6615 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6616    generated as part of the vectorization of STMT_INFO.  Assign the statement
6617    for each vector to the associated scalar statement.  */
6618 
6619 void
6620 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6621 				  vec<tree> result_chain)
6622 {
6623   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6624   unsigned int i, gap_count;
6625   tree tmp_data_ref;
6626 
6627   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6628      Since we scan the chain starting from its first node, their order
6629      corresponds to the order of data-refs in RESULT_CHAIN.  */
6630   stmt_vec_info next_stmt_info = first_stmt_info;
6631   gap_count = 1;
6632   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6633     {
6634       if (!next_stmt_info)
6635 	break;
6636 
6637       /* Skip the gaps.  Loads created for the gaps will be removed by the
6638        dead code elimination pass later.  No need to check for the first stmt
6639        in the group, since it always exists.
6640        DR_GROUP_GAP is the number of steps in elements from the previous
6641        access (if there is no gap, DR_GROUP_GAP is 1).  We skip loads that
6642        correspond to the gaps.  */
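      /* For instance (a hypothetical group): if the group accesses a[3*i]
	 and a[3*i+2], the second statement has DR_GROUP_GAP == 2, so the
	 one permuted vector corresponding to the unused a[3*i+1] slot is
	 skipped by the test below.  */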
6643       if (next_stmt_info != first_stmt_info
6644 	  && gap_count < DR_GROUP_GAP (next_stmt_info))
6645 	{
6646 	  gap_count++;
6647 	  continue;
6648 	}
6649 
6650       /* ???  The following needs cleanup after the removal of
6651          DR_GROUP_SAME_DR_STMT.  */
6652       if (next_stmt_info)
6653         {
6654 	  gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6655 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6656 	     copies, and we put the new vector statement last.  */
6657 	  STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6658 
6659 	  next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6660 	  gap_count = 1;
6661         }
6662     }
6663 }
6664 
6665 /* Function vect_can_force_dr_alignment_p.
6666 
6667    Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
6668    boundary.  */
6669 
6670 bool
6671 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6672 {
6673   if (!VAR_P (decl))
6674     return false;
6675 
6676   if (decl_in_symtab_p (decl)
6677       && !symtab_node::get (decl)->can_increase_alignment_p ())
6678     return false;
6679 
6680   if (TREE_STATIC (decl))
6681     return (known_le (alignment,
6682 		      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6683   else
6684     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6685 }
6686 
6687 /* Return how the data reference DR_INFO, accessed as VECTYPE with the given
6688    MISALIGNMENT (DR_MISALIGNMENT_UNKNOWN if the misalignment is not known at
6689    compile time), is supported with respect to its alignment: aligned,
6690    supported though misaligned, handled via explicit realignment, or not
6691    supported at all.  */
6692 
6693 enum dr_alignment_support
6694 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6695 			       tree vectype, int misalignment)
6696 {
6697   data_reference *dr = dr_info->dr;
6698   stmt_vec_info stmt_info = dr_info->stmt;
6699   machine_mode mode = TYPE_MODE (vectype);
6700   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6701   class loop *vect_loop = NULL;
6702   bool nested_in_vect_loop = false;
6703 
6704   if (misalignment == 0)
6705     return dr_aligned;
6706 
6707   /* For now assume all conditional loads/stores support unaligned
6708      access without any special code.  */
6709   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6710     if (gimple_call_internal_p (stmt)
6711 	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6712 	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6713       return dr_unaligned_supported;
6714 
6715   if (loop_vinfo)
6716     {
6717       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6718       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6719     }
6720 
6721   /* Possibly unaligned access.  */
6722 
6723   /* We can choose between using the implicit realignment scheme (generating
6724      a misaligned_move stmt) and the explicit realignment scheme (generating
6725      aligned loads with a REALIGN_LOAD).  There are two variants to the
6726      explicit realignment scheme: optimized, and unoptimized.
6727      We can optimize the realignment only if the step between consecutive
6728      vector loads is equal to the vector size.  Since the vector memory
6729      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6730      is guaranteed that the misalignment amount remains the same throughout the
6731      execution of the vectorized loop.  Therefore, we can create the
6732      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6733      at the loop preheader.
6734 
6735      However, in the case of outer-loop vectorization, when vectorizing a
6736      memory access in the inner-loop nested within the LOOP that is now being
6737      vectorized, while it is guaranteed that the misalignment of the
6738      vectorized memory access will remain the same in different outer-loop
6739      iterations, it is *not* guaranteed that it will remain the same throughout
6740      the execution of the inner-loop.  This is because the inner-loop advances
6741      with the original scalar step (and not in steps of VS).  If the inner-loop
6742      step happens to be a multiple of VS, then the misalignment remains fixed
6743      and we can use the optimized realignment scheme.  For example:
6744 
6745       for (i=0; i<N; i++)
6746         for (j=0; j<M; j++)
6747           s += a[i+j];
6748 
6749      When vectorizing the i-loop in the above example, the step between
6750      consecutive vector loads is 1, and so the misalignment does not remain
6751      fixed across the execution of the inner-loop, and the realignment cannot
6752      be optimized (as illustrated in the following pseudo vectorized loop):
6753 
6754       for (i=0; i<N; i+=4)
6755         for (j=0; j<M; j++){
6756           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6757                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6758                          // (assuming that we start from an aligned address).
6759           }
6760 
6761      We therefore have to use the unoptimized realignment scheme:
6762 
6763       for (i=0; i<N; i+=4)
6764           for (j=k; j<M; j+=4)
6765           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6766                            // that the misalignment of the initial address is
6767                            // 0).
6768 
6769      The loop can then be vectorized as follows:
6770 
6771       for (k=0; k<4; k++){
6772         rt = get_realignment_token (&vp[k]);
6773         for (i=0; i<N; i+=4){
6774           v1 = vp[i+k];
6775           for (j=k; j<M; j+=4){
6776             v2 = vp[i+j+VS-1];
6777             va = REALIGN_LOAD <v1,v2,rt>;
6778             vs += va;
6779             v1 = v2;
6780           }
6781         }
6782     } */
6783 
6784   if (DR_IS_READ (dr))
6785     {
6786       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6787 	  && (!targetm.vectorize.builtin_mask_for_load
6788 	      || targetm.vectorize.builtin_mask_for_load ()))
6789 	{
6790 	  /* If we are doing SLP then the accesses need not have the
6791 	     same alignment; instead it depends on the SLP group size.  */
6792 	  if (loop_vinfo
6793 	      && STMT_SLP_TYPE (stmt_info)
6794 	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6795 			      * (DR_GROUP_SIZE
6796 				 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6797 			      TYPE_VECTOR_SUBPARTS (vectype)))
6798 	    ;
6799 	  else if (!loop_vinfo
6800 		   || (nested_in_vect_loop
6801 		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6802 				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6803 	    return dr_explicit_realign;
6804 	  else
6805 	    return dr_explicit_realign_optimized;
6806 	}
6807     }
6808 
6809   bool is_packed = false;
6810   tree type = TREE_TYPE (DR_REF (dr));
6811   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6812     is_packed = not_size_aligned (DR_REF (dr));
6813   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6814 						     is_packed))
6815     return dr_unaligned_supported;
6816 
6817   /* Unsupported.  */
6818   return dr_unaligned_unsupported;
6819 }
6820