1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
60 
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 			      tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 	{
75 	  if (dump_enabled_p ())
76 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 			     "no array mode for %s["
78 			     HOST_WIDE_INT_PRINT_DEC "]\n",
79 			     GET_MODE_NAME (mode), count);
80 	  return false;
81 	}
82     }
83 
84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85     {
86       if (dump_enabled_p ())
87 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88                          "cannot use %s<%s><%s>\n", name,
89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90       return false;
91     }
92 
93   if (dump_enabled_p ())
94     dump_printf_loc (MSG_NOTE, vect_location,
95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96                      GET_MODE_NAME (mode));
97 
98   return true;
99 }
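
/* Illustrative note (an addition, not from the original sources): on targets
   with interleaved vector load/store instructions -- AArch64's LD2/ST2,
   LD3/ST3 and LD4/ST4, for example -- the optab check above is expected to
   succeed for COUNT values of 2, 3 and 4, whereas on targets without such
   instructions it fails and the vectorizer falls back to separate loads or
   stores combined with permutations.  */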
100 
101 
102 /* Return the smallest scalar type used in STMT.
103    This is used to determine the vectype of the stmt.  We generally set the
104    vectype according to the type of the result (lhs).  For stmts whose
105    result-type is different than the type of the arguments (e.g., demotion,
106    promotion), vectype will be reset appropriately (later).  Note that we have
107    to visit the smallest datatype in this function, because that determines the
108    VF.  If the smallest datatype in the loop is present only as the rhs of a
109    promotion operation - we'd miss it.
110    Such a case, where a variable of this datatype does not appear in the lhs
111    anywhere in the loop, can only occur if it's an invariant: e.g.:
112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
113    invariant motion.  However, we cannot rely on invariant motion to always
114    take invariants out of the loop, and so in the case of promotion we also
115    have to check the rhs.
116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117    types.  */
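
/* A small hypothetical example of the rhs check described above (added for
   illustration only):

     short *b;
     int sum = 0;
     for (i = 0; i < n; i++)
       sum += (int) b[i];

   For the widening-cast statement the lhs type is 'int' (4 bytes) while the
   rhs type is 'short' (2 bytes), so the smaller 'short' is returned and ends
   up determining the vectorization factor.  */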
118 
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121                                HOST_WIDE_INT *rhs_size_unit)
122 {
123   tree scalar_type = gimple_expr_type (stmt);
124   HOST_WIDE_INT lhs, rhs;
125 
126   /* During the analysis phase, this function is called on arbitrary
127      statements that might not have scalar results.  */
128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129     return scalar_type;
130 
131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 
133   if (is_gimple_assign (stmt)
134       && (gimple_assign_cast_p (stmt)
135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
140     {
141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
142 
143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144       if (rhs < lhs)
145         scalar_type = rhs_type;
146     }
147 
148   *lhs_size_unit = lhs;
149   *rhs_size_unit = rhs;
150   return scalar_type;
151 }
152 
153 
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155    tested at run-time.  Return TRUE if DDR was successfully inserted.
156    Return false if versioning is not supported.  */
157 
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
160 {
161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
162 
163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164     return false;
165 
166   if (!runtime_alias_check_p (ddr, loop,
167 			      optimize_loop_nest_for_speed_p (loop)))
168     return false;
169 
170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171   return true;
172 }
173 
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
175 
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
178 {
179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180   for (unsigned int i = 0; i < checks.length(); ++i)
181     if (checks[i] == value)
182       return;
183 
184   if (dump_enabled_p ())
185     {
186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188       dump_printf (MSG_NOTE, " is nonzero\n");
189     }
190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
191 }
192 
193 /* Return true if we know that the order of vectorized STMT_A and
194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195    At least one of the statements is a write.  */
196 
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
199 {
200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
202 
203   /* Single statements are always kept in their original order.  */
204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206     return true;
207 
208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
209      group are emitted at the position of the last scalar load and all
210      stores in a group are emitted at the position of the last scalar store.
211      Compute that position and check whether the resulting order matches
212      the current one.  */
213   gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
214   if (last_a)
215     for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
216 	 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
217       last_a = get_later_stmt (last_a, s);
218   else
219     last_a = stmt_a;
220   gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
221   if (last_b)
222     for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
223 	 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
224       last_b = get_later_stmt (last_b, s);
225   else
226     last_b = stmt_b;
227   return ((get_later_stmt (last_a, last_b) == last_a)
228 	  == (get_later_stmt (stmt_a, stmt_b) == stmt_a));
229 }
230 
231 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
232    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
233    distances.  These distances are conservatively correct but they don't
234    reflect a guaranteed dependence.
235 
236    Return true if this function does all the work necessary to avoid
237    an alias or false if the caller should use the dependence distances
238    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
239    the depth of the loop described by LOOP_VINFO and the other arguments
240    are as for vect_analyze_data_ref_dependence.  */
241 
242 static bool
243 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
244 				       loop_vec_info loop_vinfo,
245 				       int loop_depth, unsigned int *max_vf)
246 {
247   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
248   lambda_vector dist_v;
249   unsigned int i;
250   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
251     {
252       int dist = dist_v[loop_depth];
253       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
254 	{
255 	  /* If the user asserted safelen >= DIST consecutive iterations
256 	     can be executed concurrently, assume independence.
257 
258 	     ??? An alternative would be to add the alias check even
259 	     in this case, and vectorize the fallback loop with the
260 	     maximum VF set to safelen.  However, if the user has
261 	     explicitly given a length, it's less likely that that
262 	     would be a win.  */
263 	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
264 	    {
265 	      if ((unsigned int) loop->safelen < *max_vf)
266 		*max_vf = loop->safelen;
267 	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
268 	      continue;
269 	    }
270 
271 	  /* For dependence distances of 2 or more, we have the option
272 	     of limiting VF or checking for an alias at runtime.
273 	     Prefer to check at runtime if we can, to avoid limiting
274 	     the VF unnecessarily when the bases are in fact independent.
275 
276 	     Note that the alias checks will be removed if the VF ends up
277 	     being small enough.  */
278 	  return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
279 	}
280     }
281   return true;
282 }
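
/* Hedged example of the safelen case handled above: if the loop carries
   '#pragma omp simd safelen(8)' and the conservatively computed dependence
   distance is, say, 4, the dependence is assumed not to exist, but *MAX_VF
   is clamped to 8 so that no more than 8 iterations execute concurrently.  */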
283 
284 
285 /* Function vect_analyze_data_ref_dependence.
286 
287    Return TRUE if there (might) exist a dependence between a memory-reference
288    DRA and a memory-reference DRB.  When versioning for alias may check a
289    DRA and a memory-reference DRB.  Return FALSE if the dependence can be
290    checked at run-time instead by versioning for alias.  Adjust *MAX_VF according to
291 
292 static bool
293 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
294 				  loop_vec_info loop_vinfo,
295 				  unsigned int *max_vf)
296 {
297   unsigned int i;
298   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
299   struct data_reference *dra = DDR_A (ddr);
300   struct data_reference *drb = DDR_B (ddr);
301   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
302   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
303   lambda_vector dist_v;
304   unsigned int loop_depth;
305 
306   /* In loop analysis all data references should be vectorizable.  */
307   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
308       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
309     gcc_unreachable ();
310 
311   /* Independent data accesses.  */
312   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
313     return false;
314 
315   if (dra == drb
316       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
317     return false;
318 
319   /* We do not have to consider dependences between accesses that belong
320      to the same group, unless the stride could be smaller than the
321      group size.  */
322   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
323       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
324       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
325     return false;
326 
327   /* Even if we have an anti-dependence then, as the vectorized loop covers at
328      least two scalar iterations, there is always also a true dependence.
329      As the vectorizer does not re-order loads and stores we can ignore
330      the anti-dependence if TBAA can disambiguate both DRs similar to the
331      case with known negative distance anti-dependences (positive
332      distance anti-dependences would violate TBAA constraints).  */
333   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
334        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
335       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
336 				 get_alias_set (DR_REF (drb))))
337     return false;
338 
339   /* Unknown data dependence.  */
340   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
341     {
342       /* If user asserted safelen consecutive iterations can be
343 	 executed concurrently, assume independence.  */
344       if (loop->safelen >= 2)
345 	{
346 	  if ((unsigned int) loop->safelen < *max_vf)
347 	    *max_vf = loop->safelen;
348 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
349 	  return false;
350 	}
351 
352       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
353 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
354 	{
355 	  if (dump_enabled_p ())
356 	    {
357 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 			       "versioning for alias not supported for: "
359 			       "can't determine dependence between ");
360 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 				 DR_REF (dra));
362 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
363 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
364 				 DR_REF (drb));
365 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
366 	    }
367 	  return true;
368 	}
369 
370       if (dump_enabled_p ())
371 	{
372 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
373 			   "versioning for alias required: "
374 			   "can't determine dependence between ");
375 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 			     DR_REF (dra));
377 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
378 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
379 			     DR_REF (drb));
380 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
381 	}
382 
383       /* Add to list of ddrs that need to be tested at run-time.  */
384       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
385     }
386 
387   /* Known data dependence.  */
388   if (DDR_NUM_DIST_VECTS (ddr) == 0)
389     {
390       /* If user asserted safelen consecutive iterations can be
391 	 executed concurrently, assume independence.  */
392       if (loop->safelen >= 2)
393 	{
394 	  if ((unsigned int) loop->safelen < *max_vf)
395 	    *max_vf = loop->safelen;
396 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
397 	  return false;
398 	}
399 
400       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
401 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
402 	{
403 	  if (dump_enabled_p ())
404 	    {
405 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 			       "versioning for alias not supported for: "
407 			       "bad dist vector for ");
408 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 				 DR_REF (dra));
410 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
411 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
412 				 DR_REF (drb));
413 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 	    }
415 	  return true;
416 	}
417 
418       if (dump_enabled_p ())
419         {
420           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421                            "versioning for alias required: "
422                            "bad dist vector for ");
423           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
424           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
425           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
426           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
427         }
428       /* Add to list of ddrs that need to be tested at run-time.  */
429       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
430     }
431 
432   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
433 
434   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
435       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
436 						loop_depth, max_vf))
437     return false;
438 
439   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
440     {
441       int dist = dist_v[loop_depth];
442 
443       if (dump_enabled_p ())
444 	dump_printf_loc (MSG_NOTE, vect_location,
445                          "dependence distance  = %d.\n", dist);
446 
447       if (dist == 0)
448 	{
449 	  if (dump_enabled_p ())
450 	    {
451 	      dump_printf_loc (MSG_NOTE, vect_location,
452 	                       "dependence distance == 0 between ");
453 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
454 	      dump_printf (MSG_NOTE, " and ");
455 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
456 	      dump_printf (MSG_NOTE, "\n");
457 	    }
458 
459 	  /* When we perform grouped accesses and perform implicit CSE
460 	     by detecting equal accesses and doing disambiguation with
461 	     runtime alias tests like for
462 	        .. = a[i];
463 		.. = a[i+1];
464 		a[i] = ..;
465 		a[i+1] = ..;
466 		*p = ..;
467 		.. = a[i];
468 		.. = a[i+1];
469 	     where we will end up loading { a[i], a[i+1] } once, make
470 	     sure that inserting group loads before the first load and
471 	     stores after the last store will do the right thing.
472 	     Similar for groups like
473 	        a[i] = ...;
474 		... = a[i];
475 		a[i+1] = ...;
476 	     where loads from the group interleave with the store.  */
477 	  if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
478 	    {
479 	      if (dump_enabled_p ())
480 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
481 				 "READ_WRITE dependence in interleaving.\n");
482 	      return true;
483 	    }
484 
485 	  if (loop->safelen < 2)
486 	    {
487 	      tree indicator = dr_zero_step_indicator (dra);
488 	      if (TREE_CODE (indicator) != INTEGER_CST)
489 		vect_check_nonzero_value (loop_vinfo, indicator);
490 	      else if (integer_zerop (indicator))
491 		{
492 		  if (dump_enabled_p ())
493 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 				 "access also has a zero step\n");
495 		  return true;
496 		}
497 	    }
498 	  continue;
499 	}
500 
501       if (dist > 0 && DDR_REVERSED_P (ddr))
502 	{
503 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
504 	     reversed (to make distance vector positive), and the actual
505 	     distance is negative.  */
506 	  if (dump_enabled_p ())
507 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
508 	                     "dependence distance negative.\n");
509 	  /* Record a negative dependence distance to later limit the
510 	     amount of stmt copying / unrolling we can perform.
511 	     Only need to handle read-after-write dependence.  */
512 	  if (DR_IS_READ (drb)
513 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
514 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
515 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
516 	  continue;
517 	}
518 
519       unsigned int abs_dist = abs (dist);
520       if (abs_dist >= 2 && abs_dist < *max_vf)
521 	{
522 	  /* The dependence distance requires reduction of the maximal
523 	     vectorization factor.  */
524 	  *max_vf = abs (dist);
525 	  if (dump_enabled_p ())
526 	    dump_printf_loc (MSG_NOTE, vect_location,
527 	                     "adjusting maximal vectorization factor to %i\n",
528 	                     *max_vf);
529 	}
530 
531       if (abs_dist >= *max_vf)
532 	{
533 	  /* Dependence distance does not create dependence, as far as
534 	     vectorization is concerned, in this case.  */
535 	  if (dump_enabled_p ())
536 	    dump_printf_loc (MSG_NOTE, vect_location,
537 	                     "dependence distance >= VF.\n");
538 	  continue;
539 	}
540 
541       if (dump_enabled_p ())
542 	{
543 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
544 	               "not vectorized, possible dependence "
545 	               "between data-refs ");
546 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
547 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
548 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
549 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
550 	}
551 
552       return true;
553     }
554 
555   return false;
556 }
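
/* Worked illustration of the distance handling above (hypothetical loop):
   for 'a[i + 3] = a[i] + 1' the dependence distance is 3, so *MAX_VF is
   reduced to 3; a distance of 0 is handled specially, and a distance that
   is already >= *MAX_VF imposes no further restriction.  */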
557 
558 /* Function vect_analyze_data_ref_dependences.
559 
560    Examine all the data references in the loop, and make sure there do not
561    exist any data dependences between them.  Set *MAX_VF according to
562    the maximum vectorization factor the data dependences allow.  */
563 
564 bool
565 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
566 				   unsigned int *max_vf)
567 {
568   unsigned int i;
569   struct data_dependence_relation *ddr;
570 
571   if (dump_enabled_p ())
572     dump_printf_loc (MSG_NOTE, vect_location,
573                      "=== vect_analyze_data_ref_dependences ===\n");
574 
575   LOOP_VINFO_DDRS (loop_vinfo)
576     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
577 	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
578   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
579   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
580   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
581 				&LOOP_VINFO_DDRS (loop_vinfo),
582 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
583     return false;
584 
585   /* For epilogues we either have no aliases or alias versioning
586      was applied to the original loop.  Therefore we may just take max_vf
587      from the VF of the original loop.  */
588   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
589     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
590   else
591     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
592       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
593 	return false;
594 
595   return true;
596 }
597 
598 
599 /* Function vect_slp_analyze_data_ref_dependence.
600 
601    Return TRUE if there (might) exist a dependence between a memory-reference
602    DRA and a memory-reference DRB that would prevent vectorizing the SLP
603    instance.  Return FALSE if the accesses are known to be independent; no
604    run-time alias versioning or *MAX_VF adjustment is performed here.  */
605 
606 static bool
607 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
608 {
609   struct data_reference *dra = DDR_A (ddr);
610   struct data_reference *drb = DDR_B (ddr);
611 
612   /* We need to check dependences of statements marked as unvectorizable
613      as well, they still can prohibit vectorization.  */
614 
615   /* Independent data accesses.  */
616   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
617     return false;
618 
619   if (dra == drb)
620     return false;
621 
622   /* Read-read is OK.  */
623   if (DR_IS_READ (dra) && DR_IS_READ (drb))
624     return false;
625 
626   /* If dra and drb are part of the same interleaving chain consider
627      them independent.  */
628   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
629       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
630 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
631     return false;
632 
633   /* Unknown data dependence.  */
634   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
635     {
636       if  (dump_enabled_p ())
637 	{
638 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
639 			   "can't determine dependence between ");
640 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
641 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
642 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
643 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
644 	}
645     }
646   else if (dump_enabled_p ())
647     {
648       dump_printf_loc (MSG_NOTE, vect_location,
649 		       "determined dependence between ");
650       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
651       dump_printf (MSG_NOTE, " and ");
652       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
653       dump_printf (MSG_NOTE,  "\n");
654     }
655 
656   return true;
657 }
658 
659 
660 /* Analyze dependences involved in the transform of SLP NODE.  STORES
661    contain the vector of scalar stores of this instance if we are
662    disambiguating the loads.  */
663 
664 static bool
665 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
666 				   vec<gimple *> stores, gimple *last_store)
667 {
668   /* This walks over all stmts involved in the SLP load/store done
669      in NODE verifying we can sink them up to the last stmt in the
670      group.  */
671   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
672   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
673     {
674       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
675       if (access == last_access)
676 	continue;
677       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
678       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
679 	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
680 	{
681 	  gimple *stmt = gsi_stmt (gsi);
682 	  if (! gimple_vuse (stmt)
683 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
684 	    continue;
685 
686 	  /* If we couldn't record a (single) data reference for this
687 	     stmt we have to give up.  */
688 	  /* ???  Here and below if dependence analysis fails we can resort
689 	     to the alias oracle which can handle more kinds of stmts.  */
690 	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
691 	  if (!dr_b)
692 	    return false;
693 
694 	  bool dependent = false;
695 	  /* If we run into a store of this same instance (we've just
696 	     marked those) then delay dependence checking until we run
697 	     into the last store because this is where it will have
698 	     been sunk to (and we verify if we can do that as well).  */
699 	  if (gimple_visited_p (stmt))
700 	    {
701 	      if (stmt != last_store)
702 		continue;
703 	      unsigned i;
704 	      gimple *store;
705 	      FOR_EACH_VEC_ELT (stores, i, store)
706 		{
707 		  data_reference *store_dr
708 		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
709 		  ddr_p ddr = initialize_data_dependence_relation
710 				(dr_a, store_dr, vNULL);
711 		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
712 		  free_dependence_relation (ddr);
713 		  if (dependent)
714 		    break;
715 		}
716 	    }
717 	  else
718 	    {
719 	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
720 							       dr_b, vNULL);
721 	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
722 	      free_dependence_relation (ddr);
723 	    }
724 	  if (dependent)
725 	    return false;
726 	}
727     }
728   return true;
729 }
730 
731 
732 /* Function vect_slp_analyze_instance_dependence.
733 
734    Examine all the data references in the basic-block SLP instance, and make
735    sure there do not exist any data dependences between them that would
736    prevent vectorizing the instance.  */
737 
738 bool
739 vect_slp_analyze_instance_dependence (slp_instance instance)
740 {
741   if (dump_enabled_p ())
742     dump_printf_loc (MSG_NOTE, vect_location,
743                      "=== vect_slp_analyze_instance_dependence ===\n");
744 
745   /* The stores of this instance are at the root of the SLP tree.  */
746   slp_tree store = SLP_INSTANCE_TREE (instance);
747   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
748     store = NULL;
749 
750   /* Verify we can sink stores to the vectorized stmt insert location.  */
751   gimple *last_store = NULL;
752   if (store)
753     {
754       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
755 	return false;
756 
757       /* Mark stores in this instance and remember the last one.  */
758       last_store = vect_find_last_scalar_stmt_in_slp (store);
759       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
760 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
761     }
762 
763   bool res = true;
764 
765   /* Verify we can sink loads to the vectorized stmt insert location,
766      special-casing stores of this instance.  */
767   slp_tree load;
768   unsigned int i;
769   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
770     if (! vect_slp_analyze_node_dependences (instance, load,
771 					     store
772 					     ? SLP_TREE_SCALAR_STMTS (store)
773 					     : vNULL, last_store))
774       {
775 	res = false;
776 	break;
777       }
778 
779   /* Unset the visited flag.  */
780   if (store)
781     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
782       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
783 
784   return res;
785 }
786 
787 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
788    the statement that contains DRB, which is useful for recording in the
789    dump file.  */
790 
791 static void
792 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
793 			    innermost_loop_behavior *drb)
794 {
795   bool existed;
796   innermost_loop_behavior *&entry
797     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
798   if (!existed || entry->base_alignment < drb->base_alignment)
799     {
800       entry = drb;
801       if (dump_enabled_p ())
802 	{
803 	  dump_printf_loc (MSG_NOTE, vect_location,
804 			   "recording new base alignment for ");
805 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
806 	  dump_printf (MSG_NOTE, "\n");
807 	  dump_printf_loc (MSG_NOTE, vect_location,
808 			   "  alignment:    %d\n", drb->base_alignment);
809 	  dump_printf_loc (MSG_NOTE, vect_location,
810 			   "  misalignment: %d\n", drb->base_misalignment);
811 	  dump_printf_loc (MSG_NOTE, vect_location,
812 			   "  based on:     ");
813 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
814 	}
815     }
816 }
817 
818 /* If the region we're going to vectorize is reached, all unconditional
819    data references occur at least once.  We can therefore pool the base
820    alignment guarantees from each unconditional reference.  Do this by
821    going through all the data references in VINFO and checking whether
822    the containing statement makes the reference unconditionally.  If so,
823    record the alignment of the base address in VINFO so that it can be
824    used for all other references with the same base.  */
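
/* Hypothetical illustration of the pooling described above: if '*p' is read
   unconditionally and its base address is known to be 32-byte aligned, that
   guarantee is recorded once and can later be reused when computing the
   misalignment of another, possibly conditional, reference that shares the
   same base address.  */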
825 
826 void
827 vect_record_base_alignments (vec_info *vinfo)
828 {
829   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
830   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
831   data_reference *dr;
832   unsigned int i;
833   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
834     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
835       {
836 	gimple *stmt = DR_STMT (dr);
837 	vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
838 
839 	/* If DR is nested in the loop that is being vectorized, we can also
840 	   record the alignment of the base wrt the outer loop.  */
841 	if (loop && nested_in_vect_loop_p (loop, stmt))
842 	  {
843 	    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
844 	    vect_record_base_alignment
845 	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
846 	  }
847       }
848 }
849 
850 /* Return the target alignment for the vectorized form of DR.  */
851 
852 static unsigned int
853 vect_calculate_target_alignment (struct data_reference *dr)
854 {
855   gimple *stmt = DR_STMT (dr);
856   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
857   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
858   return targetm.vectorize.preferred_vector_alignment (vectype);
859 }
860 
861 /* Function vect_compute_data_ref_alignment
862 
863    Compute the misalignment of the data reference DR.
864 
865    Output:
866    1. If during the misalignment computation it is found that the data reference
867       cannot be vectorized then false is returned.
868    2. DR_MISALIGNMENT (DR) is defined.
869 
870    FOR NOW: No analysis is actually performed. Misalignment is calculated
871    only for trivial cases. TODO.  */
872 
873 bool
874 vect_compute_data_ref_alignment (struct data_reference *dr)
875 {
876   gimple *stmt = DR_STMT (dr);
877   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
878   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
879   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
880   struct loop *loop = NULL;
881   tree ref = DR_REF (dr);
882   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
883 
884   if (dump_enabled_p ())
885     dump_printf_loc (MSG_NOTE, vect_location,
886                      "vect_compute_data_ref_alignment:\n");
887 
888   if (loop_vinfo)
889     loop = LOOP_VINFO_LOOP (loop_vinfo);
890 
891   /* Initialize misalignment to unknown.  */
892   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
893 
894   innermost_loop_behavior *drb = vect_dr_behavior (dr);
895   bool step_preserves_misalignment_p;
896 
897   unsigned HOST_WIDE_INT vector_alignment
898     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
899   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
900 
901   /* No step for BB vectorization.  */
902   if (!loop)
903     {
904       gcc_assert (integer_zerop (drb->step));
905       step_preserves_misalignment_p = true;
906     }
907 
908   /* In case the dataref is in an inner-loop of the loop that is being
909      vectorized (LOOP), we use the base and misalignment information
910      relative to the outer-loop (LOOP).  This is ok only if the misalignment
911      stays the same throughout the execution of the inner-loop, which is why
912      we have to check that the stride of the dataref in the inner-loop evenly
913      divides by the vector alignment.  */
914   else if (nested_in_vect_loop_p (loop, stmt))
915     {
916       step_preserves_misalignment_p
917 	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
918 
919       if (dump_enabled_p ())
920 	{
921 	  if (step_preserves_misalignment_p)
922 	    dump_printf_loc (MSG_NOTE, vect_location,
923 			     "inner step divides the vector alignment.\n");
924 	  else
925 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
926 			     "inner step doesn't divide the vector"
927 			     " alignment.\n");
928 	}
929     }
930 
931   /* Similarly we can only use base and misalignment information relative to
932      an innermost loop if the misalignment stays the same throughout the
933      execution of the loop.  As above, this is the case if the stride of
934      the dataref evenly divides by the alignment.  */
935   else
936     {
937       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
938       step_preserves_misalignment_p
939 	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
940 
941       if (!step_preserves_misalignment_p && dump_enabled_p ())
942 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
943 			 "step doesn't divide the vector alignment.\n");
944     }
945 
946   unsigned int base_alignment = drb->base_alignment;
947   unsigned int base_misalignment = drb->base_misalignment;
948 
949   /* Calculate the maximum of the pooled base address alignment and the
950      alignment that we can compute for DR itself.  */
951   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
952   if (entry && base_alignment < (*entry)->base_alignment)
953     {
954       base_alignment = (*entry)->base_alignment;
955       base_misalignment = (*entry)->base_misalignment;
956     }
957 
958   if (drb->offset_alignment < vector_alignment
959       || !step_preserves_misalignment_p
960       /* We need to know whether the step wrt the vectorized loop is
961 	 negative when computing the starting misalignment below.  */
962       || TREE_CODE (drb->step) != INTEGER_CST)
963     {
964       if (dump_enabled_p ())
965 	{
966 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
967 	                   "Unknown alignment for access: ");
968 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
969 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
970 	}
971       return true;
972     }
973 
974   if (base_alignment < vector_alignment)
975     {
976       unsigned int max_alignment;
977       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
978       if (max_alignment < vector_alignment
979 	  || !vect_can_force_dr_alignment_p (base,
980 					     vector_alignment * BITS_PER_UNIT))
981 	{
982 	  if (dump_enabled_p ())
983 	    {
984 	      dump_printf_loc (MSG_NOTE, vect_location,
985 	                       "can't force alignment of ref: ");
986 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
987 	      dump_printf (MSG_NOTE, "\n");
988 	    }
989 	  return true;
990 	}
991 
992       /* Force the alignment of the decl.
993 	 NOTE: This is the only change to the code we make during
994 	 the analysis phase, before deciding to vectorize the loop.  */
995       if (dump_enabled_p ())
996         {
997           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
998           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
999           dump_printf (MSG_NOTE, "\n");
1000         }
1001 
1002       DR_VECT_AUX (dr)->base_decl = base;
1003       DR_VECT_AUX (dr)->base_misaligned = true;
1004       base_misalignment = 0;
1005     }
1006   poly_int64 misalignment
1007     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1008 
1009   /* If this is a backward running DR then first access in the larger
1010      vectype actually is N-1 elements before the address in the DR.
1011      Adjust misalign accordingly.  */
1012   if (tree_int_cst_sgn (drb->step) < 0)
1013     /* PLUS because STEP is negative.  */
1014     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1015 		     * TREE_INT_CST_LOW (drb->step));
1016 
1017   unsigned int const_misalignment;
1018   if (!known_misalignment (misalignment, vector_alignment,
1019 			   &const_misalignment))
1020     {
1021       if (dump_enabled_p ())
1022 	{
1023 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024 			   "Non-constant misalignment for access: ");
1025 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1026 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1027 	}
1028       return true;
1029     }
1030 
1031   SET_DR_MISALIGNMENT (dr, const_misalignment);
1032 
1033   if (dump_enabled_p ())
1034     {
1035       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1037       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1038       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1039     }
1040 
1041   return true;
1042 }
1043 
1044 /* Function vect_update_misalignment_for_peel.
1045    Sets DR's misalignment
1046    - to 0 if it has the same alignment as DR_PEEL,
1047    - to the misalignment computed using NPEEL if DR's misalignment is known,
1048    - to -1 (unknown) otherwise.
1049 
1050    DR - the data reference whose misalignment is to be adjusted.
1051    DR_PEEL - the data reference whose misalignment is being made
1052              zero in the vector loop by the peel.
1053    NPEEL - the number of iterations in the peel loop if the misalignment
1054            of DR_PEEL is known at compile time.  */
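
/* Worked example with made-up numbers: for a forward access with 4-byte
   elements, a target alignment of 16 bytes, a known misalignment of 4 and
   NPEEL == 3, the known-alignment path below computes 4 + 3 * 4 = 16,
   masks with 16 - 1 and records a new misalignment of 0.  */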
1055 
1056 static void
1057 vect_update_misalignment_for_peel (struct data_reference *dr,
1058                                    struct data_reference *dr_peel, int npeel)
1059 {
1060   unsigned int i;
1061   vec<dr_p> same_aligned_drs;
1062   struct data_reference *current_dr;
1063   int dr_size = vect_get_scalar_dr_size (dr);
1064   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1065   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1066   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1067 
1068  /* For interleaved data accesses the step in the loop must be multiplied by
1069      the size of the interleaving group.  */
1070   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1071     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1072   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1073     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1074 
1075   /* It can be assumed that the data refs with the same alignment as dr_peel
1076      are aligned in the vector loop.  */
1077   same_aligned_drs
1078     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1079   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1080     {
1081       if (current_dr != dr)
1082         continue;
1083       gcc_assert (!known_alignment_for_access_p (dr)
1084 		  || !known_alignment_for_access_p (dr_peel)
1085 		  || (DR_MISALIGNMENT (dr) / dr_size
1086 		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1087       SET_DR_MISALIGNMENT (dr, 0);
1088       return;
1089     }
1090 
1091   if (known_alignment_for_access_p (dr)
1092       && known_alignment_for_access_p (dr_peel))
1093     {
1094       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1095       int misal = DR_MISALIGNMENT (dr);
1096       misal += negative ? -npeel * dr_size : npeel * dr_size;
1097       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1098       SET_DR_MISALIGNMENT (dr, misal);
1099       return;
1100     }
1101 
1102   if (dump_enabled_p ())
1103     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1104 		     "to unknown (-1).\n");
1105   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1106 }
1107 
1108 
1109 /* Function verify_data_ref_alignment
1110 
1111    Return TRUE if DR can be handled with respect to alignment.  */
1112 
1113 static bool
1114 verify_data_ref_alignment (data_reference_p dr)
1115 {
1116   enum dr_alignment_support supportable_dr_alignment
1117     = vect_supportable_dr_alignment (dr, false);
1118   if (!supportable_dr_alignment)
1119     {
1120       if (dump_enabled_p ())
1121 	{
1122 	  if (DR_IS_READ (dr))
1123 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 			     "not vectorized: unsupported unaligned load.");
1125 	  else
1126 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 			     "not vectorized: unsupported unaligned "
1128 			     "store.");
1129 
1130 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1131 			     DR_REF (dr));
1132 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1133 	}
1134       return false;
1135     }
1136 
1137   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1138     dump_printf_loc (MSG_NOTE, vect_location,
1139 		     "Vectorizing an unaligned access.\n");
1140 
1141   return true;
1142 }
1143 
1144 /* Function vect_verify_datarefs_alignment
1145 
1146    Return TRUE if all data references in the loop can be
1147    handled with respect to alignment.  */
1148 
1149 bool
1150 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1151 {
1152   vec<data_reference_p> datarefs = vinfo->datarefs;
1153   struct data_reference *dr;
1154   unsigned int i;
1155 
1156   FOR_EACH_VEC_ELT (datarefs, i, dr)
1157     {
1158       gimple *stmt = DR_STMT (dr);
1159       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1160 
1161       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1162 	continue;
1163 
1164       /* For interleaving, only the alignment of the first access matters.   */
1165       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1166 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1167 	continue;
1168 
1169       /* Strided accesses perform only component accesses, alignment is
1170 	 irrelevant for them.  */
1171       if (STMT_VINFO_STRIDED_P (stmt_info)
1172 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1173 	continue;
1174 
1175       if (! verify_data_ref_alignment (dr))
1176 	return false;
1177     }
1178 
1179   return true;
1180 }
1181 
1182 /* Given a memory reference EXP, return whether its alignment is less
1183    than its size.  */
1184 
1185 static bool
1186 not_size_aligned (tree exp)
1187 {
1188   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1189     return true;
1190 
1191   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1192 	  > get_object_alignment (exp));
1193 }
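
/* For instance (illustrative only): an 'int' member of a struct declared
   with __attribute__ ((packed)) can have an object alignment of just one
   byte while its size is four bytes, so not_size_aligned returns true.  */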
1194 
1195 /* Function vector_alignment_reachable_p
1196 
1197    Return true if vector alignment for DR is reachable by peeling
1198    a few loop iterations.  Return false otherwise.  */
1199 
1200 static bool
1201 vector_alignment_reachable_p (struct data_reference *dr)
1202 {
1203   gimple *stmt = DR_STMT (dr);
1204   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1205   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1206 
1207   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1208     {
1209       /* For interleaved accesses we peel only if the number of iterations
1210 	 in the prologue loop (VF - misalignment, counted in elements) is a
1211 	 multiple of the size of the interleaving group.  */
1212       int elem_size, mis_in_elements;
1213 
1214       /* FORNOW: handle only known alignment.  */
1215       if (!known_alignment_for_access_p (dr))
1216 	return false;
1217 
1218       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1219       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1220       elem_size = vector_element_size (vector_size, nelements);
1221       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1222 
1223       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1224 	return false;
1225     }
1226 
1227   /* If misalignment is known at the compile time then allow peeling
1228      only if natural alignment is reachable through peeling.  */
1229   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1230     {
1231       HOST_WIDE_INT elmsize =
1232 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1233       if (dump_enabled_p ())
1234 	{
1235 	  dump_printf_loc (MSG_NOTE, vect_location,
1236 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1237 	  dump_printf (MSG_NOTE,
1238 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1239 	}
1240       if (DR_MISALIGNMENT (dr) % elmsize)
1241 	{
1242 	  if (dump_enabled_p ())
1243 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244 	                     "data size does not divide the misalignment.\n");
1245 	  return false;
1246 	}
1247     }
1248 
1249   if (!known_alignment_for_access_p (dr))
1250     {
1251       tree type = TREE_TYPE (DR_REF (dr));
1252       bool is_packed = not_size_aligned (DR_REF (dr));
1253       if (dump_enabled_p ())
1254 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1255 	                 "Unknown misalignment, %snaturally aligned\n",
1256 			 is_packed ? "not " : "");
1257       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1258     }
1259 
1260   return true;
1261 }
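
/* Illustrative numbers for the interleaved case above: with four elements
   per vector, 4-byte elements and a DR_MISALIGNMENT of 8 bytes,
   mis_in_elements is 2, so prologue peeling covers 4 - 2 = 2 scalar
   iterations; that is acceptable only if 2 is a multiple of the
   interleaving group size.  */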
1262 
1263 
1264 /* Calculate the cost of the memory access represented by DR.  */
1265 
1266 static void
1267 vect_get_data_access_cost (struct data_reference *dr,
1268                            unsigned int *inside_cost,
1269                            unsigned int *outside_cost,
1270 			   stmt_vector_for_cost *body_cost_vec)
1271 {
1272   gimple *stmt = DR_STMT (dr);
1273   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1274   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1275   int ncopies;
1276 
1277   if (PURE_SLP_STMT (stmt_info))
1278     ncopies = 1;
1279   else
1280     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1281 
1282   if (DR_IS_READ (dr))
1283     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1284 			NULL, body_cost_vec, false);
1285   else
1286     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1287 
1288   if (dump_enabled_p ())
1289     dump_printf_loc (MSG_NOTE, vect_location,
1290                      "vect_get_data_access_cost: inside_cost = %d, "
1291                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1292 }
1293 
1294 
1295 typedef struct _vect_peel_info
1296 {
1297   struct data_reference *dr;
1298   int npeel;
1299   unsigned int count;
1300 } *vect_peel_info;
1301 
1302 typedef struct _vect_peel_extended_info
1303 {
1304   struct _vect_peel_info peel_info;
1305   unsigned int inside_cost;
1306   unsigned int outside_cost;
1307 } *vect_peel_extended_info;
1308 
1309 
1310 /* Peeling hashtable helpers.  */
1311 
1312 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1313 {
1314   static inline hashval_t hash (const _vect_peel_info *);
1315   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1316 };
1317 
1318 inline hashval_t
1319 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1320 {
1321   return (hashval_t) peel_info->npeel;
1322 }
1323 
1324 inline bool
1325 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1326 {
1327   return (a->npeel == b->npeel);
1328 }
1329 
1330 
1331 /* Insert DR into peeling hash table with NPEEL as key.  */
1332 
1333 static void
1334 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1335 			  loop_vec_info loop_vinfo, struct data_reference *dr,
1336                           int npeel)
1337 {
1338   struct _vect_peel_info elem, *slot;
1339   _vect_peel_info **new_slot;
1340   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1341 
1342   elem.npeel = npeel;
1343   slot = peeling_htab->find (&elem);
1344   if (slot)
1345     slot->count++;
1346   else
1347     {
1348       slot = XNEW (struct _vect_peel_info);
1349       slot->npeel = npeel;
1350       slot->dr = dr;
1351       slot->count = 1;
1352       new_slot = peeling_htab->find_slot (slot, INSERT);
1353       *new_slot = slot;
1354     }
1355 
1356   if (!supportable_dr_alignment
1357       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1358     slot->count += VECT_MAX_COST;
1359 }
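
/* Descriptive note on the table above: entries are keyed purely by the peel
   count, so every data reference that would become aligned by peeling the
   same number of iterations shares one slot and increments its COUNT.  Under
   the unlimited cost model, accesses whose misalignment is not supportable
   at all receive a VECT_MAX_COST boost so that the "most frequent" heuristic
   below strongly prefers a peel count that aligns them.  */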
1360 
1361 
1362 /* Traverse peeling hash table to find peeling option that aligns maximum
1363    number of data accesses.  */
1364 
1365 int
1366 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1367 				     _vect_peel_extended_info *max)
1368 {
1369   vect_peel_info elem = *slot;
1370 
1371   if (elem->count > max->peel_info.count
1372       || (elem->count == max->peel_info.count
1373           && max->peel_info.npeel > elem->npeel))
1374     {
1375       max->peel_info.npeel = elem->npeel;
1376       max->peel_info.count = elem->count;
1377       max->peel_info.dr = elem->dr;
1378     }
1379 
1380   return 1;
1381 }
1382 
1383 /* Get the costs of peeling NPEEL iterations checking data access costs
1384    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1385    misalignment will be zero after peeling.  */
1386 
1387 static void
1388 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1389 				struct data_reference *dr0,
1390 				unsigned int *inside_cost,
1391 				unsigned int *outside_cost,
1392 				stmt_vector_for_cost *body_cost_vec,
1393 				unsigned int npeel,
1394 				bool unknown_misalignment)
1395 {
1396   unsigned i;
1397   data_reference *dr;
1398 
1399   FOR_EACH_VEC_ELT (datarefs, i, dr)
1400     {
1401       gimple *stmt = DR_STMT (dr);
1402       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1403       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1404 	continue;
1405 
1406       /* For interleaving, only the alignment of the first access
1407          matters.  */
1408       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1409           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1410         continue;
1411 
1412       /* Strided accesses perform only component accesses, alignment is
1413          irrelevant for them.  */
1414       if (STMT_VINFO_STRIDED_P (stmt_info)
1415 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1416 	continue;
1417 
1418       int save_misalignment;
1419       save_misalignment = DR_MISALIGNMENT (dr);
1420       if (npeel == 0)
1421 	;
1422       else if (unknown_misalignment && dr == dr0)
1423 	SET_DR_MISALIGNMENT (dr, 0);
1424       else
1425 	vect_update_misalignment_for_peel (dr, dr0, npeel);
1426       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1427 				 body_cost_vec);
1428       SET_DR_MISALIGNMENT (dr, save_misalignment);
1429     }
1430 }
1431 
1432 /* Traverse peeling hash table and calculate cost for each peeling option.
1433    Find the one with the lowest cost.  */
1434 
1435 int
1436 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1437 				   _vect_peel_extended_info *min)
1438 {
1439   vect_peel_info elem = *slot;
1440   int dummy;
1441   unsigned int inside_cost = 0, outside_cost = 0;
1442   gimple *stmt = DR_STMT (elem->dr);
1443   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1444   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1445   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1446 		       epilogue_cost_vec;
1447 
1448   prologue_cost_vec.create (2);
1449   body_cost_vec.create (2);
1450   epilogue_cost_vec.create (2);
1451 
1452   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1453 				  elem->dr, &inside_cost, &outside_cost,
1454 				  &body_cost_vec, elem->npeel, false);
1455 
1456   body_cost_vec.release ();
1457 
1458   outside_cost += vect_get_known_peeling_cost
1459     (loop_vinfo, elem->npeel, &dummy,
1460      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1461      &prologue_cost_vec, &epilogue_cost_vec);
1462 
1463   /* Prologue and epilogue costs are added to the target model later.
1464      These costs depend only on the scalar iteration cost, the
1465      number of peeling iterations finally chosen, and the number of
1466      misaligned statements.  So discard the information found here.  */
1467   prologue_cost_vec.release ();
1468   epilogue_cost_vec.release ();
1469 
1470   if (inside_cost < min->inside_cost
1471       || (inside_cost == min->inside_cost
1472 	  && outside_cost < min->outside_cost))
1473     {
1474       min->inside_cost = inside_cost;
1475       min->outside_cost = outside_cost;
1476       min->peel_info.dr = elem->dr;
1477       min->peel_info.npeel = elem->npeel;
1478       min->peel_info.count = elem->count;
1479     }
1480 
1481   return 1;
1482 }
1483 
1484 
1485 /* Choose best peeling option by traversing peeling hash table and either
1486    choosing an option with the lowest cost (if cost model is enabled) or the
1487    option that aligns as many accesses as possible.  */
1488 
1489 static struct _vect_peel_extended_info
1490 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1491 				       loop_vec_info loop_vinfo)
1492 {
1493    struct _vect_peel_extended_info res;
1494 
1495    res.peel_info.dr = NULL;
1496 
1497    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1498      {
1499        res.inside_cost = INT_MAX;
1500        res.outside_cost = INT_MAX;
1501        peeling_htab->traverse <_vect_peel_extended_info *,
1502 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1503      }
1504    else
1505      {
1506        res.peel_info.count = 0;
1507        peeling_htab->traverse <_vect_peel_extended_info *,
1508 	   		       vect_peeling_hash_get_most_frequent> (&res);
1509        res.inside_cost = 0;
1510        res.outside_cost = 0;
1511      }
1512 
1513    return res;
1514 }
1515 
1516 /* Return true if the new peeling NPEEL is supported.  */
1517 
1518 static bool
1519 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1520 			  unsigned npeel)
1521 {
1522   unsigned i;
1523   struct data_reference *dr = NULL;
1524   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1525   gimple *stmt;
1526   stmt_vec_info stmt_info;
1527   enum dr_alignment_support supportable_dr_alignment;
1528 
1529   /* Ensure that all data refs can be vectorized after the peel.  */
1530   FOR_EACH_VEC_ELT (datarefs, i, dr)
1531     {
1532       int save_misalignment;
1533 
1534       if (dr == dr0)
1535 	continue;
1536 
1537       stmt = DR_STMT (dr);
1538       stmt_info = vinfo_for_stmt (stmt);
1539       /* For interleaving, only the alignment of the first access
1540 	 matters.  */
1541       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1542 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1543 	continue;
1544 
1545       /* Strided accesses perform only component accesses, alignment is
1546 	 irrelevant for them.  */
1547       if (STMT_VINFO_STRIDED_P (stmt_info)
1548 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1549 	continue;
1550 
1551       save_misalignment = DR_MISALIGNMENT (dr);
1552       vect_update_misalignment_for_peel (dr, dr0, npeel);
1553       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1554       SET_DR_MISALIGNMENT (dr, save_misalignment);
1555 
1556       if (!supportable_dr_alignment)
1557 	return false;
1558     }
1559 
1560   return true;
1561 }
1562 
1563 /* Function vect_enhance_data_refs_alignment
1564 
1565    This pass will use loop versioning and loop peeling in order to enhance
1566    the alignment of data references in the loop.
1567 
1568    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1569    original loop is to be vectorized.  Any other loops that are created by
1570    the transformations performed in this pass are not supposed to be
1571    vectorized.  This restriction will be relaxed.
1572 
1573    This pass will require a cost model to guide it whether to apply peeling
1574    or versioning or a combination of the two.  For example, the scheme that
1575    Intel uses when given a loop with several memory accesses is as follows:
1576    choose one memory access ('p') whose alignment you want to force by doing
1577    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1578    other accesses are not necessarily aligned, or (2) use loop versioning to
1579    generate one loop in which all accesses are aligned, and another loop in
1580    which only 'p' is necessarily aligned.
1581 
1582    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1583    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1584    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1585 
1586    Devising a cost model is the most critical aspect of this work.  It will
1587    guide us on which access to peel for, whether to use loop versioning, how
1588    many versions to create, etc.  The cost model will probably consist of
1589    generic considerations as well as target specific considerations (on
1590    powerpc for example, misaligned stores are more painful than misaligned
1591    loads).
1592 
1593    Here are the general steps involved in alignment enhancements:
1594 
1595      -- original loop, before alignment analysis:
1596 	for (i=0; i<N; i++){
1597 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1598 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1599 	}
1600 
1601      -- After vect_compute_data_refs_alignment:
1602 	for (i=0; i<N; i++){
1603 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1604 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1605 	}
1606 
1607      -- Possibility 1: we do loop versioning:
1608      if (p is aligned) {
1609 	for (i=0; i<N; i++){	# loop 1A
1610 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1611 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1612 	}
1613      }
1614      else {
1615 	for (i=0; i<N; i++){	# loop 1B
1616 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1617 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1618 	}
1619      }
1620 
1621      -- Possibility 2: we do loop peeling:
1622      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1623 	x = q[i];
1624 	p[i] = y;
1625      }
1626      for (i = 3; i < N; i++){	# loop 2A
1627 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1628 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1629      }
1630 
1631      -- Possibility 3: combination of loop peeling and versioning:
1632      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1633 	x = q[i];
1634 	p[i] = y;
1635      }
1636      if (p is aligned) {
1637 	for (i = 3; i<N; i++){	# loop 3A
1638 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1639 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1640 	}
1641      }
1642      else {
1643 	for (i = 3; i<N; i++){	# loop 3B
1644 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1645 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1646 	}
1647      }
1648 
1649      These loops are later passed to loop_transform to be vectorized.  The
1650      vectorizer will use the alignment information to guide the transformation
1651      (whether to generate regular loads/stores, or with special handling for
1652      misalignment).  */
1653 
1654 bool
1655 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1656 {
1657   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1658   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1659   enum dr_alignment_support supportable_dr_alignment;
1660   struct data_reference *dr0 = NULL, *first_store = NULL;
1661   struct data_reference *dr;
1662   unsigned int i, j;
1663   bool do_peeling = false;
1664   bool do_versioning = false;
1665   bool stat;
1666   gimple *stmt;
1667   stmt_vec_info stmt_info;
1668   unsigned int npeel = 0;
1669   bool one_misalignment_known = false;
1670   bool one_misalignment_unknown = false;
1671   bool one_dr_unsupportable = false;
1672   struct data_reference *unsupportable_dr = NULL;
1673   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1674   unsigned possible_npeel_number = 1;
1675   tree vectype;
1676   unsigned int mis, same_align_drs_max = 0;
1677   hash_table<peel_info_hasher> peeling_htab (1);
1678 
1679   if (dump_enabled_p ())
1680     dump_printf_loc (MSG_NOTE, vect_location,
1681                      "=== vect_enhance_data_refs_alignment ===\n");
1682 
1683   /* Reset data so we can safely be called multiple times.  */
1684   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686 
1687   /* While cost model enhancements are expected in the future, the high level
1688      view of the code at this time is as follows:
1689 
1690      A) If there is a misaligned access then see if peeling to align
1691         this access can make all data references satisfy
1692         vect_supportable_dr_alignment.  If so, update data structures
1693         as needed and return true.
1694 
1695      B) If peeling wasn't possible and there is a data reference with an
1696         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697         then see if loop versioning checks can be used to make all data
1698         references satisfy vect_supportable_dr_alignment.  If so, update
1699         data structures as needed and return true.
1700 
1701      C) If neither peeling nor versioning were successful then return false if
1702         any data reference does not satisfy vect_supportable_dr_alignment.
1703 
1704      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705 
1706      Note, Possibility 3 above (which is peeling and versioning together) is not
1707      being done at this time.  */
1708 
1709   /* (1) Peeling to force alignment.  */
1710 
1711   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712      Considerations:
1713      + How many accesses will become aligned due to the peeling
1714      - How many accesses will become unaligned due to the peeling,
1715        and the cost of misaligned accesses.
1716      - The cost of peeling (the extra runtime checks, the increase
1717        in code size).  */
1718 
1719   FOR_EACH_VEC_ELT (datarefs, i, dr)
1720     {
1721       stmt = DR_STMT (dr);
1722       stmt_info = vinfo_for_stmt (stmt);
1723 
1724       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 	continue;
1726 
1727       /* For interleaving, only the alignment of the first access
1728          matters.  */
1729       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1731         continue;
1732 
1733       /* For invariant accesses there is nothing to enhance.  */
1734       if (integer_zerop (DR_STEP (dr)))
1735 	continue;
1736 
1737       /* Strided accesses perform only component accesses, alignment is
1738 	 irrelevant for them.  */
1739       if (STMT_VINFO_STRIDED_P (stmt_info)
1740 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1741 	continue;
1742 
1743       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1744       do_peeling = vector_alignment_reachable_p (dr);
1745       if (do_peeling)
1746         {
1747           if (known_alignment_for_access_p (dr))
1748             {
1749 	      unsigned int npeel_tmp = 0;
1750 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1751 						    size_zero_node) < 0;
1752 
1753 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
1754 	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1755 	      unsigned int dr_size = vect_get_scalar_dr_size (dr);
1756 	      mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
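	      /* Illustrative example (numbers assumed, not from the
		 source): with a 16-byte target alignment, a 4-byte
		 scalar and DR_MISALIGNMENT of 4 on a positive-step
		 access, MIS is -4, so (-4 & 15) / 4 = 3 scalar
		 iterations reach the next alignment boundary.  */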
1757 	      if (DR_MISALIGNMENT (dr) != 0)
1758 		npeel_tmp = (mis & (target_align - 1)) / dr_size;
1759 
1760               /* For multiple types, it is possible that the bigger type access
1761                  will have more than one peeling option.  E.g., a loop with two
1762                  types: one of size (vector size / 4), and the other one of
1763                  size (vector size / 8).  Vectorization factor will be 8.  If both
1764                  accesses are misaligned by 3, the first one needs one scalar
1765                  iteration to be aligned, and the second one needs 5.  But the
1766 		 first one will be aligned also by peeling 5 scalar
1767                  iterations, and in that case both accesses will be aligned.
1768                  Hence, except for the immediate peeling amount, we also want
1769                  to try to add full vector size, while we don't exceed
1770                  vectorization factor.
1771                  We do this automatically for cost model, since we calculate
1772 		 cost for every peeling option.  */
1773               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1774 		{
1775 		  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1776 					  ? vf * GROUP_SIZE (stmt_info) : vf);
1777 		  possible_npeel_number
1778 		    = vect_get_num_vectors (nscalars, vectype);
1779 
1780 		  /* NPEEL_TMP is 0 when there is no misalignment; in that
1781 		     case also allow peeling NELEMENTS as an extra option.  */
1782 		  if (DR_MISALIGNMENT (dr) == 0)
1783 		    possible_npeel_number++;
1784 		}
1785 
1786 	      /* Save info about DR in the hash table.  Also include peeling
1787 	         amounts according to the explanation above.  */
1788               for (j = 0; j < possible_npeel_number; j++)
1789                 {
1790                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1791 					    dr, npeel_tmp);
1792 		  npeel_tmp += target_align / dr_size;
1793                 }
1794 
1795 	      one_misalignment_known = true;
1796             }
1797           else
1798             {
1799               /* If we don't know any misalignment values, we prefer
1800                 peeling for the data-ref that has the maximum number of data-refs
1801                 with the same alignment, unless the target prefers to align
1802                 stores over loads.  */
1803 	      unsigned same_align_drs
1804 		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1805 	      if (!dr0
1806 		  || same_align_drs_max < same_align_drs)
1807 		{
1808 		  same_align_drs_max = same_align_drs;
1809 		  dr0 = dr;
1810 		}
1811 	      /* For data-refs with the same number of related
1812 		 accesses prefer the one where the misalign
1813 		 computation will be invariant in the outermost loop.  */
1814 	      else if (same_align_drs_max == same_align_drs)
1815 		{
1816 		  struct loop *ivloop0, *ivloop;
1817 		  ivloop0 = outermost_invariant_loop_for_expr
1818 		    (loop, DR_BASE_ADDRESS (dr0));
1819 		  ivloop = outermost_invariant_loop_for_expr
1820 		    (loop, DR_BASE_ADDRESS (dr));
1821 		  if ((ivloop && !ivloop0)
1822 		      || (ivloop && ivloop0
1823 			  && flow_loop_nested_p (ivloop, ivloop0)))
1824 		    dr0 = dr;
1825 		}
1826 
1827 	      one_misalignment_unknown = true;
1828 
1829 	      /* Check for data refs with unsupportable alignment that
1830 	         can be peeled.  */
1831 	      if (!supportable_dr_alignment)
1832 	      {
1833 		one_dr_unsupportable = true;
1834 		unsupportable_dr = dr;
1835 	      }
1836 
1837 	      if (!first_store && DR_IS_WRITE (dr))
1838 		first_store = dr;
1839             }
1840         }
1841       else
1842         {
1843           if (!aligned_access_p (dr))
1844             {
1845               if (dump_enabled_p ())
1846                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1847                                  "vector alignment may not be reachable\n");
1848               break;
1849             }
1850         }
1851     }
1852 
1853   /* Check if we can possibly peel the loop.  */
1854   if (!vect_can_advance_ivs_p (loop_vinfo)
1855       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1856       || loop->inner)
1857     do_peeling = false;
1858 
1859   struct _vect_peel_extended_info peel_for_known_alignment;
1860   struct _vect_peel_extended_info peel_for_unknown_alignment;
1861   struct _vect_peel_extended_info best_peel;
1862 
1863   peel_for_unknown_alignment.inside_cost = INT_MAX;
1864   peel_for_unknown_alignment.outside_cost = INT_MAX;
1865   peel_for_unknown_alignment.peel_info.count = 0;
1866 
1867   if (do_peeling
1868       && one_misalignment_unknown)
1869     {
1870       /* Check whether the target prefers to align stores over loads, i.e., if
1871          misaligned stores are more expensive than misaligned loads (taking
1872          drs with same alignment into account).  */
1873       unsigned int load_inside_cost = 0;
1874       unsigned int load_outside_cost = 0;
1875       unsigned int store_inside_cost = 0;
1876       unsigned int store_outside_cost = 0;
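      /* With unknown misalignment the prologue length is not known at
	 compile time; assume on average half a vector's worth of
	 scalar iterations will be peeled when estimating the cost.  */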
1877       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1878 
1879       stmt_vector_for_cost dummy;
1880       dummy.create (2);
1881       vect_get_peeling_costs_all_drs (datarefs, dr0,
1882 				      &load_inside_cost,
1883 				      &load_outside_cost,
1884 				      &dummy, estimated_npeels, true);
1885       dummy.release ();
1886 
1887       if (first_store)
1888 	{
1889 	  dummy.create (2);
1890 	  vect_get_peeling_costs_all_drs (datarefs, first_store,
1891 					  &store_inside_cost,
1892 					  &store_outside_cost,
1893 					  &dummy, estimated_npeels, true);
1894 	  dummy.release ();
1895 	}
1896       else
1897 	{
1898 	  store_inside_cost = INT_MAX;
1899 	  store_outside_cost = INT_MAX;
1900 	}
1901 
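      /* Prefer peeling for the first store if doing so gives a lower
	 overall cost, i.e. if leaving the store misaligned would be
	 the more expensive choice.  */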
1902       if (load_inside_cost > store_inside_cost
1903 	  || (load_inside_cost == store_inside_cost
1904 	      && load_outside_cost > store_outside_cost))
1905 	{
1906 	  dr0 = first_store;
1907 	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
1908 	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
1909 	}
1910       else
1911 	{
1912 	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
1913 	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
1914 	}
1915 
1916       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1917       prologue_cost_vec.create (2);
1918       epilogue_cost_vec.create (2);
1919 
1920       int dummy2;
1921       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1922 	(loop_vinfo, estimated_npeels, &dummy2,
1923 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1924 	 &prologue_cost_vec, &epilogue_cost_vec);
1925 
1926       prologue_cost_vec.release ();
1927       epilogue_cost_vec.release ();
1928 
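      /* Peeling for DR0 aligns DR0 itself plus every data-ref recorded
	 as having the same alignment.  */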
1929       peel_for_unknown_alignment.peel_info.count = 1
1930 	+ STMT_VINFO_SAME_ALIGN_REFS
1931 	(vinfo_for_stmt (DR_STMT (dr0))).length ();
1932     }
1933 
1934   peel_for_unknown_alignment.peel_info.npeel = 0;
1935   peel_for_unknown_alignment.peel_info.dr = dr0;
1936 
1937   best_peel = peel_for_unknown_alignment;
1938 
1939   peel_for_known_alignment.inside_cost = INT_MAX;
1940   peel_for_known_alignment.outside_cost = INT_MAX;
1941   peel_for_known_alignment.peel_info.count = 0;
1942   peel_for_known_alignment.peel_info.dr = NULL;
1943 
1944   if (do_peeling && one_misalignment_known)
1945     {
1946       /* Peeling is possible, but no data access strictly requires alignment
1947          in order to be supported.  So we try to choose the best possible
1948 	 peeling from the hash table.  */
1949       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1950 	(&peeling_htab, loop_vinfo);
1951     }
1952 
1953   /* Compare costs of peeling for known and unknown alignment. */
1954   if (peel_for_known_alignment.peel_info.dr != NULL
1955       && peel_for_unknown_alignment.inside_cost
1956       >= peel_for_known_alignment.inside_cost)
1957     {
1958       best_peel = peel_for_known_alignment;
1959 
1960       /* If the best peeling for known alignment has NPEEL == 0, perform no
1961          peeling at all except if there is an unsupportable dr that we can
1962          align.  */
1963       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1964 	do_peeling = false;
1965     }
1966 
1967   /* If there is an unsupportable data ref, prefer this over all choices so far
1968      since we'd have to discard a chosen peeling except when it accidentally
1969      aligned the unsupportable data ref.  */
1970   if (one_dr_unsupportable)
1971     dr0 = unsupportable_dr;
1972   else if (do_peeling)
1973     {
1974       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1975 	 TODO: Use nopeel_outside_cost or get rid of it?  */
1976       unsigned nopeel_inside_cost = 0;
1977       unsigned nopeel_outside_cost = 0;
1978 
1979       stmt_vector_for_cost dummy;
1980       dummy.create (2);
1981       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1982 				      &nopeel_outside_cost, &dummy, 0, false);
1983       dummy.release ();
1984 
1985       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1986 	 costs will be recorded.  */
1987       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1988       prologue_cost_vec.create (2);
1989       epilogue_cost_vec.create (2);
1990 
1991       int dummy2;
1992       nopeel_outside_cost += vect_get_known_peeling_cost
1993 	(loop_vinfo, 0, &dummy2,
1994 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1995 	 &prologue_cost_vec, &epilogue_cost_vec);
1996 
1997       prologue_cost_vec.release ();
1998       epilogue_cost_vec.release ();
1999 
2000       npeel = best_peel.peel_info.npeel;
2001       dr0 = best_peel.peel_info.dr;
2002 
2003       /* If not peeling at all is no more expensive than the best peeling
2004 	 we have found so far, don't perform any peeling.  */
2005       if (nopeel_inside_cost <= best_peel.inside_cost)
2006 	do_peeling = false;
2007     }
2008 
2009   if (do_peeling)
2010     {
2011       stmt = DR_STMT (dr0);
2012       stmt_info = vinfo_for_stmt (stmt);
2013       vectype = STMT_VINFO_VECTYPE (stmt_info);
2014 
2015       if (known_alignment_for_access_p (dr0))
2016         {
2017 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
2018 						size_zero_node) < 0;
2019           if (!npeel)
2020             {
2021               /* Since it's known at compile time, compute the number of
2022                  iterations in the peeled loop (the peeling factor) for use in
2023                  updating DR_MISALIGNMENT values.  The peeling factor is the
2024                  vectorization factor minus the misalignment as an element
2025                  count.  */
2026 	      mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2027 	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2028 	      npeel = ((mis & (target_align - 1))
2029 		       / vect_get_scalar_dr_size (dr0));
2030             }
2031 
2032 	  /* For interleaved data access every iteration accesses all the
2033 	     members of the group, therefore we divide the number of iterations
2034 	     by the group size.  */
2035 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2036 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037 	    npeel /= GROUP_SIZE (stmt_info);
2038 
2039           if (dump_enabled_p ())
2040             dump_printf_loc (MSG_NOTE, vect_location,
2041                              "Try peeling by %d\n", npeel);
2042         }
2043 
2044       /* Ensure that all datarefs can be vectorized after the peel.  */
2045       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2046 	do_peeling = false;
2047 
2048       /* Check if all datarefs are supportable and log.  */
2049       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2050         {
2051           stat = vect_verify_datarefs_alignment (loop_vinfo);
2052           if (!stat)
2053             do_peeling = false;
2054           else
2055 	    return stat;
2056         }
2057 
2058       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2059       if (do_peeling)
2060         {
2061           unsigned max_allowed_peel
2062             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2063           if (max_allowed_peel != (unsigned)-1)
2064             {
2065               unsigned max_peel = npeel;
2066               if (max_peel == 0)
2067                 {
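		  /* NPEEL of zero means the peel count is only known at
		     run time; assume the worst case of one element less
		     than a full target-aligned block.  */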
2068 		  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2069 		  max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2070                 }
2071               if (max_peel > max_allowed_peel)
2072                 {
2073                   do_peeling = false;
2074                   if (dump_enabled_p ())
2075                     dump_printf_loc (MSG_NOTE, vect_location,
2076                         "Disable peeling, max peels reached: %d\n", max_peel);
2077                 }
2078             }
2079         }
2080 
2081       /* Cost model #2 - if peeling may result in a remaining loop not
2082 	 iterating enough to be vectorized then do not peel.  Since this
2083 	 is a cost heuristic rather than a correctness decision, use the
2084 	 most likely runtime value for variable vectorization factors.  */
2085       if (do_peeling
2086 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2087 	{
2088 	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2089 	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
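	  /* Require at least one full vector iteration to remain after
	     peeling, using the worst-case peel count when NPEEL is only
	     known at run time.  */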
2090 	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2091 	      < assumed_vf + max_peel)
2092 	    do_peeling = false;
2093 	}
2094 
2095       if (do_peeling)
2096         {
2097           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2098              If the misalignment of DR_i is identical to that of dr0 then set
2099              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2100              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2101              by the peeling factor times the element size of DR_i (MOD the
2102              vectorization factor times the size).  Otherwise, the
2103              misalignment of DR_i must be set to unknown.  */
2104 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2105 	    if (dr != dr0)
2106 	      {
2107 		/* Strided accesses perform only component accesses, alignment
2108 		   is irrelevant for them.  */
2109 		stmt_info = vinfo_for_stmt (DR_STMT (dr));
2110 		if (STMT_VINFO_STRIDED_P (stmt_info)
2111 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2112 		  continue;
2113 
2114 		vect_update_misalignment_for_peel (dr, dr0, npeel);
2115 	      }
2116 
2117           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2118           if (npeel)
2119             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2120           else
2121             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2122 	      = DR_MISALIGNMENT (dr0);
2123 	  SET_DR_MISALIGNMENT (dr0, 0);
2124 	  if (dump_enabled_p ())
2125             {
2126               dump_printf_loc (MSG_NOTE, vect_location,
2127                                "Alignment of access forced using peeling.\n");
2128               dump_printf_loc (MSG_NOTE, vect_location,
2129                                "Peeling for alignment will be applied.\n");
2130             }
2131 
2132 	  /* The inside-loop cost will be accounted for in vectorizable_load
2133 	     and vectorizable_store correctly with adjusted alignments.
2134 	     Drop the body_cost_vec on the floor here.  */
2135 	  stat = vect_verify_datarefs_alignment (loop_vinfo);
2136 	  gcc_assert (stat);
2137           return stat;
2138         }
2139     }
2140 
2141   /* (2) Versioning to force alignment.  */
2142 
2143   /* Try versioning if:
2144      1) optimize loop for speed
2145      2) there is at least one unsupported misaligned data ref with an unknown
2146         misalignment, and
2147      3) all misaligned data refs with a known misalignment are supported, and
2148      4) the number of runtime alignment checks is within reason.  */
2149 
2150   do_versioning =
2151 	optimize_loop_nest_for_speed_p (loop)
2152 	&& (!loop->inner); /* FORNOW */
2153 
2154   if (do_versioning)
2155     {
2156       FOR_EACH_VEC_ELT (datarefs, i, dr)
2157         {
2158 	  stmt = DR_STMT (dr);
2159 	  stmt_info = vinfo_for_stmt (stmt);
2160 
2161 	  /* For interleaving, only the alignment of the first access
2162 	     matters.  */
2163 	  if (aligned_access_p (dr)
2164 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2165 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2166 	    continue;
2167 
2168 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2169 	    {
2170 	      /* Strided loads perform only component accesses, alignment is
2171 		 irrelevant for them.  */
2172 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2173 		continue;
2174 	      do_versioning = false;
2175 	      break;
2176 	    }
2177 
2178 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2179 
2180           if (!supportable_dr_alignment)
2181             {
2182 	      gimple *stmt;
2183               int mask;
2184               tree vectype;
2185 
2186               if (known_alignment_for_access_p (dr)
2187                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2188                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2189                 {
2190                   do_versioning = false;
2191                   break;
2192                 }
2193 
2194               stmt = DR_STMT (dr);
2195               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2196               gcc_assert (vectype);
2197 
2198 	      /* At present we don't support versioning for alignment
2199 		 with variable VF, since there's no guarantee that the
2200 		 VF is a power of two.  We could relax this if we added
2201 		 a way of enforcing a power-of-two size.  */
2202 	      unsigned HOST_WIDE_INT size;
2203 	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2204 		{
2205 		  do_versioning = false;
2206 		  break;
2207 		}
2208 
2209               /* The rightmost bits of an aligned address must be zeros.
2210                  Construct the mask needed for this test.  For example,
2211                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2212                  mask must be 15 = 0xf. */
2213 	      mask = size - 1;
2214 
2215               /* FORNOW: use the same mask to test all potentially unaligned
2216                  references in the loop.  The vectorizer currently supports
2217                  a single vector size, see the reference to
2218                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2219                  vectorization factor is computed.  */
2220               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2221                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2222               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2223               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2224 		      DR_STMT (dr));
2225             }
2226         }
2227 
2228       /* Versioning requires at least one misaligned data reference.  */
2229       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2230         do_versioning = false;
2231       else if (!do_versioning)
2232         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2233     }
2234 
2235   if (do_versioning)
2236     {
2237       vec<gimple *> may_misalign_stmts
2238         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2239       gimple *stmt;
2240 
2241       /* It can now be assumed that the data references in the statements
2242          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2243          of the loop being vectorized.  */
2244       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2245         {
2246           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2247           dr = STMT_VINFO_DATA_REF (stmt_info);
2248 	  SET_DR_MISALIGNMENT (dr, 0);
2249 	  if (dump_enabled_p ())
2250             dump_printf_loc (MSG_NOTE, vect_location,
2251                              "Alignment of access forced using versioning.\n");
2252         }
2253 
2254       if (dump_enabled_p ())
2255         dump_printf_loc (MSG_NOTE, vect_location,
2256                          "Versioning for alignment will be applied.\n");
2257 
2258       /* Peeling and versioning can't be done together at this time.  */
2259       gcc_assert (! (do_peeling && do_versioning));
2260 
2261       stat = vect_verify_datarefs_alignment (loop_vinfo);
2262       gcc_assert (stat);
2263       return stat;
2264     }
2265 
2266   /* This point is reached if neither peeling nor versioning is being done.  */
2267   gcc_assert (! (do_peeling || do_versioning));
2268 
2269   stat = vect_verify_datarefs_alignment (loop_vinfo);
2270   return stat;
2271 }
2272 
2273 
2274 /* Function vect_find_same_alignment_drs.
2275 
2276    Update group and alignment relations according to the chosen
2277    vectorization factor.  */
2278 
2279 static void
2280 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2281 {
2282   struct data_reference *dra = DDR_A (ddr);
2283   struct data_reference *drb = DDR_B (ddr);
2284   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2285   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2286 
2287   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2288     return;
2289 
2290   if (dra == drb)
2291     return;
2292 
2293   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2294       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2295       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2296     return;
2297 
2298   /* Two references with distance zero have the same alignment.  */
2299   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2300 			  - wi::to_poly_offset (DR_INIT (drb)));
2301   if (maybe_ne (diff, 0))
2302     {
2303       /* Get the wider of the two alignments.  */
2304       unsigned int align_a = (vect_calculate_target_alignment (dra)
2305 			      / BITS_PER_UNIT);
2306       unsigned int align_b = (vect_calculate_target_alignment (drb)
2307 			      / BITS_PER_UNIT);
2308       unsigned int max_align = MAX (align_a, align_b);
2309 
2310       /* Require the gap to be a multiple of the larger vector alignment.  */
2311       if (!multiple_p (diff, max_align))
2312 	return;
2313     }
2314 
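  /* Illustrative example (numbers assumed): with a 16-byte vector
     alignment, two accesses whose DR_INITs differ by 32 bytes
     necessarily share their misalignment and are recorded below,
     whereas accesses 8 bytes apart are not.  */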
2315   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2316   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2317   if (dump_enabled_p ())
2318     {
2319       dump_printf_loc (MSG_NOTE, vect_location,
2320 		       "accesses have the same alignment: ");
2321       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2322       dump_printf (MSG_NOTE,  " and ");
2323       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2324       dump_printf (MSG_NOTE, "\n");
2325     }
2326 }
2327 
2328 
2329 /* Function vect_analyze_data_refs_alignment
2330 
2331    Analyze the alignment of the data-references in the loop.
2332    Return FALSE if a data reference is found that cannot be vectorized.  */
2333 
2334 bool
2335 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2336 {
2337   if (dump_enabled_p ())
2338     dump_printf_loc (MSG_NOTE, vect_location,
2339                      "=== vect_analyze_data_refs_alignment ===\n");
2340 
2341   /* Mark groups of data references with same alignment using
2342      data dependence information.  */
2343   vec<ddr_p> ddrs = vinfo->ddrs;
2344   struct data_dependence_relation *ddr;
2345   unsigned int i;
2346 
2347   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2348     vect_find_same_alignment_drs (ddr);
2349 
2350   vec<data_reference_p> datarefs = vinfo->datarefs;
2351   struct data_reference *dr;
2352 
2353   vect_record_base_alignments (vinfo);
2354   FOR_EACH_VEC_ELT (datarefs, i, dr)
2355     {
2356       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2357       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2358 	  && !vect_compute_data_ref_alignment (dr))
2359 	{
2360 	  /* Strided accesses perform only component accesses, misalignment
2361 	     information is irrelevant for them.  */
2362 	  if (STMT_VINFO_STRIDED_P (stmt_info)
2363 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2364 	    continue;
2365 
2366 	  if (dump_enabled_p ())
2367 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 			     "not vectorized: can't calculate alignment "
2369 			     "for data ref.\n");
2370 
2371 	  return false;
2372 	}
2373     }
2374 
2375   return true;
2376 }
2377 
2378 
2379 /* Analyze alignment of DRs of stmts in NODE.  */
2380 
2381 static bool
2382 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2383 {
2384   /* We vectorize from the first scalar stmt in the node unless
2385      the node is permuted in which case we start from the first
2386      element in the group.  */
2387   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2388   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2389   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2390     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2391 
2392   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2393   if (! vect_compute_data_ref_alignment (dr)
2394       /* For creating the data-ref pointer we need alignment of the
2395 	 first element anyway.  */
2396       || (dr != first_dr
2397 	  && ! vect_compute_data_ref_alignment (first_dr))
2398       || ! verify_data_ref_alignment (dr))
2399     {
2400       if (dump_enabled_p ())
2401 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2402 			 "not vectorized: bad data alignment in basic "
2403 			 "block.\n");
2404       return false;
2405     }
2406 
2407   return true;
2408 }
2409 
2410 /* Function vect_slp_analyze_instance_alignment
2411 
2412    Analyze the alignment of the data-references in the SLP instance.
2413    Return FALSE if a data reference is found that cannot be vectorized.  */
2414 
2415 bool
2416 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2417 {
2418   if (dump_enabled_p ())
2419     dump_printf_loc (MSG_NOTE, vect_location,
2420                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2421 
2422   slp_tree node;
2423   unsigned i;
2424   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2425     if (! vect_slp_analyze_and_verify_node_alignment (node))
2426       return false;
2427 
2428   node = SLP_INSTANCE_TREE (instance);
2429   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2430       && ! vect_slp_analyze_and_verify_node_alignment
2431 	     (SLP_INSTANCE_TREE (instance)))
2432     return false;
2433 
2434   return true;
2435 }
2436 
2437 
2438 /* Analyze groups of accesses: check that DR belongs to a group of
2439    accesses of legal size, step, etc.  Detect gaps, single element
2440    interleaving, and other special cases. Set grouped access info.
2441    Collect groups of strided stores for further use in SLP analysis.
2442    Worker for vect_analyze_group_access.  */
2443 
2444 static bool
2445 vect_analyze_group_access_1 (struct data_reference *dr)
2446 {
2447   tree step = DR_STEP (dr);
2448   tree scalar_type = TREE_TYPE (DR_REF (dr));
2449   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2450   gimple *stmt = DR_STMT (dr);
2451   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2452   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2453   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2454   HOST_WIDE_INT dr_step = -1;
2455   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2456   bool slp_impossible = false;
2457 
2458   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2459      size of the interleaving group (including gaps).  */
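  /* For example (illustrative): a 4-byte element accessed with a
     constant DR_STEP of 32 bytes gives GROUPSIZE 8, i.e. 8 element
     slots per iteration, counting gaps.  */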
2460   if (tree_fits_shwi_p (step))
2461     {
2462       dr_step = tree_to_shwi (step);
2463       /* Check that STEP is a multiple of type size.  Otherwise there is
2464          a non-element-sized gap at the end of the group which we
2465 	 cannot represent in GROUP_GAP or GROUP_SIZE.
2466 	 ???  As we can handle non-constant step fine here we should
2467 	 simply remove uses of GROUP_GAP between the last and first
2468 	 element and instead rely on DR_STEP.  GROUP_SIZE then would
2469 	 simply not include that gap.  */
2470       if ((dr_step % type_size) != 0)
2471 	{
2472 	  if (dump_enabled_p ())
2473 	    {
2474 	      dump_printf_loc (MSG_NOTE, vect_location,
2475 	                       "Step ");
2476 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2477 	      dump_printf (MSG_NOTE,
2478 			   " is not a multiple of the element size for ");
2479 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2480 	      dump_printf (MSG_NOTE, "\n");
2481 	    }
2482 	  return false;
2483 	}
2484       groupsize = absu_hwi (dr_step) / type_size;
2485     }
2486   else
2487     groupsize = 0;
2488 
2489   /* A non-consecutive access is possible only if it is part of interleaving.  */
2490   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2491     {
2492       /* Check if this DR is a part of interleaving, and is a single
2493 	 element of the group that is accessed in the loop.  */
2494 
2495       /* Gaps are supported only for loads. STEP must be a multiple of the type
2496 	 size.  */
2497       if (DR_IS_READ (dr)
2498 	  && (dr_step % type_size) == 0
2499 	  && groupsize > 0)
2500 	{
2501 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2502 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2503 	  GROUP_GAP (stmt_info) = groupsize - 1;
2504 	  if (dump_enabled_p ())
2505 	    {
2506 	      dump_printf_loc (MSG_NOTE, vect_location,
2507 	                       "Detected single element interleaving ");
2508 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2509 	      dump_printf (MSG_NOTE, " step ");
2510 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2511 	      dump_printf (MSG_NOTE, "\n");
2512 	    }
2513 
2514 	  return true;
2515 	}
2516 
2517       if (dump_enabled_p ())
2518         {
2519  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2520 	                   "not consecutive access ");
2521 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2522         }
2523 
2524       if (bb_vinfo)
2525         {
2526           /* Mark the statement as unvectorizable.  */
2527           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2528           return true;
2529         }
2530 
2531       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2532       STMT_VINFO_STRIDED_P (stmt_info) = true;
2533       return true;
2534     }
2535 
2536   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2537     {
2538       /* First stmt in the interleaving chain. Check the chain.  */
2539       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2540       struct data_reference *data_ref = dr;
2541       unsigned int count = 1;
2542       tree prev_init = DR_INIT (data_ref);
2543       gimple *prev = stmt;
2544       HOST_WIDE_INT diff, gaps = 0;
2545 
2546       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2547       while (next)
2548         {
2549           /* Skip same data-refs.  If two or more stmts share a
2550              data-ref (supported only for loads), we vectorize only the first
2551              stmt, and the rest get their vectorized loads from the first
2552              one.  */
2553           if (!tree_int_cst_compare (DR_INIT (data_ref),
2554                                      DR_INIT (STMT_VINFO_DATA_REF (
2555 						   vinfo_for_stmt (next)))))
2556             {
2557               if (DR_IS_WRITE (data_ref))
2558                 {
2559                   if (dump_enabled_p ())
2560                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2561                                      "Two store stmts share the same dr.\n");
2562                   return false;
2563                 }
2564 
2565 	      if (dump_enabled_p ())
2566 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 				 "Two or more load stmts share the same dr.\n");
2568 
2569               /* For load use the same data-ref load.  */
2570               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2571 
2572               prev = next;
2573               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2574               continue;
2575             }
2576 
2577           prev = next;
2578           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2579 
2580 	  /* All group members have the same STEP by construction.  */
2581 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2582 
2583           /* Check that the distance between two accesses is equal to the type
2584              size. Otherwise, we have gaps.  */
2585           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2586                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2587 	  if (diff != 1)
2588 	    {
2589 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2590 	      slp_impossible = true;
2591 	      if (DR_IS_WRITE (data_ref))
2592 		{
2593                   if (dump_enabled_p ())
2594                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2595                                      "interleaved store with gaps\n");
2596 		  return false;
2597 		}
2598 
2599               gaps += diff - 1;
2600 	    }
2601 
2602 	  last_accessed_element += diff;
2603 
2604           /* Store the gap from the previous member of the group. If there is no
2605              gap in the access, GROUP_GAP is always 1.  */
2606           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2607 
2608           prev_init = DR_INIT (data_ref);
2609           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2610           /* Count the number of data-refs in the chain.  */
2611           count++;
2612         }
2613 
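      /* For a non-constant step GROUPSIZE could not be derived from
	 DR_STEP above; fall back to the number of group members plus
	 the detected gaps.  */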
2614       if (groupsize == 0)
2615         groupsize = count + gaps;
2616 
2617       /* This could be UINT_MAX but as we are generating code in a very
2618          inefficient way we have to cap earlier.  See PR78699 for example.  */
2619       if (groupsize > 4096)
2620 	{
2621 	  if (dump_enabled_p ())
2622 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623 			     "group is too large\n");
2624 	  return false;
2625 	}
2626 
2627       /* Check that the size of the interleaving is equal to count for stores,
2628          i.e., that there are no gaps.  */
2629       if (groupsize != count
2630 	  && !DR_IS_READ (dr))
2631         {
2632 	  if (dump_enabled_p ())
2633 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2634 			     "interleaved store with gaps\n");
2635 	  return false;
2636 	}
2637 
2638       /* If there is a gap after the last load in the group it is the
2639 	 difference between the groupsize and the last accessed
2640 	 element.
2641 	 When there is no gap, this difference should be 0.  */
2642       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2643 
2644       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2645       if (dump_enabled_p ())
2646 	{
2647 	  dump_printf_loc (MSG_NOTE, vect_location,
2648 			   "Detected interleaving ");
2649 	  if (DR_IS_READ (dr))
2650 	    dump_printf (MSG_NOTE, "load ");
2651 	  else
2652 	    dump_printf (MSG_NOTE, "store ");
2653 	  dump_printf (MSG_NOTE, "of size %u starting with ",
2654 		       (unsigned)groupsize);
2655 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2656 	  if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2657 	    dump_printf_loc (MSG_NOTE, vect_location,
2658 			     "There is a gap of %u elements after the group\n",
2659 			     GROUP_GAP (vinfo_for_stmt (stmt)));
2660 	}
2661 
2662       /* SLP: create an SLP data structure for every interleaving group of
2663 	 stores for further analysis in vect_analyze_slp.  */
2664       if (DR_IS_WRITE (dr) && !slp_impossible)
2665         {
2666           if (loop_vinfo)
2667             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2668           if (bb_vinfo)
2669             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2670         }
2671     }
2672 
2673   return true;
2674 }
2675 
2676 /* Analyze groups of accesses: check that DR belongs to a group of
2677    accesses of legal size, step, etc.  Detect gaps, single element
2678    interleaving, and other special cases. Set grouped access info.
2679    Collect groups of strided stores for further use in SLP analysis.  */
2680 
2681 static bool
2682 vect_analyze_group_access (struct data_reference *dr)
2683 {
2684   if (!vect_analyze_group_access_1 (dr))
2685     {
2686       /* Dissolve the group if present.  */
2687       gimple *next;
2688       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2689       while (stmt)
2690 	{
2691 	  stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2692 	  next = GROUP_NEXT_ELEMENT (vinfo);
2693 	  GROUP_FIRST_ELEMENT (vinfo) = NULL;
2694 	  GROUP_NEXT_ELEMENT (vinfo) = NULL;
2695 	  stmt = next;
2696 	}
2697       return false;
2698     }
2699   return true;
2700 }
2701 
2702 /* Analyze the access pattern of the data-reference DR.
2703    In case of non-consecutive accesses call vect_analyze_group_access() to
2704    analyze groups of accesses.  */
2705 
2706 static bool
2707 vect_analyze_data_ref_access (struct data_reference *dr)
2708 {
2709   tree step = DR_STEP (dr);
2710   tree scalar_type = TREE_TYPE (DR_REF (dr));
2711   gimple *stmt = DR_STMT (dr);
2712   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2713   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2714   struct loop *loop = NULL;
2715 
2716   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2717     return true;
2718 
2719   if (loop_vinfo)
2720     loop = LOOP_VINFO_LOOP (loop_vinfo);
2721 
2722   if (loop_vinfo && !step)
2723     {
2724       if (dump_enabled_p ())
2725 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2726 	                 "bad data-ref access in loop\n");
2727       return false;
2728     }
2729 
2730   /* Allow loads with zero step in inner-loop vectorization.  */
2731   if (loop_vinfo && integer_zerop (step))
2732     {
2733       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2734       if (!nested_in_vect_loop_p (loop, stmt))
2735 	return DR_IS_READ (dr);
2736       /* Allow references with zero step for outer loops marked
2737 	 with pragma omp simd only, since that guarantees absence of
2738 	 loop-carried dependencies between inner loop iterations.  */
2739       if (loop->safelen < 2)
2740 	{
2741 	  if (dump_enabled_p ())
2742 	    dump_printf_loc (MSG_NOTE, vect_location,
2743 			     "zero step in inner loop of nest\n");
2744 	  return false;
2745 	}
2746     }
2747 
2748   if (loop && nested_in_vect_loop_p (loop, stmt))
2749     {
2750       /* Interleaved accesses are not yet supported within outer-loop
2751         vectorization for references in the inner-loop.  */
2752       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2753 
2754       /* For the rest of the analysis we use the outer-loop step.  */
2755       step = STMT_VINFO_DR_STEP (stmt_info);
2756       if (integer_zerop (step))
2757 	{
2758 	  if (dump_enabled_p ())
2759 	    dump_printf_loc (MSG_NOTE, vect_location,
2760 	                     "zero step in outer loop.\n");
2761 	  return DR_IS_READ (dr);
2762 	}
2763     }
2764 
2765   /* Consecutive?  */
2766   if (TREE_CODE (step) == INTEGER_CST)
2767     {
2768       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2769       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2770 	  || (dr_step < 0
2771 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2772 	{
2773 	  /* Mark that it is not interleaving.  */
2774 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2775 	  return true;
2776 	}
2777     }
2778 
2779   if (loop && nested_in_vect_loop_p (loop, stmt))
2780     {
2781       if (dump_enabled_p ())
2782 	dump_printf_loc (MSG_NOTE, vect_location,
2783 	                 "grouped access in outer loop.\n");
2784       return false;
2785     }
2786 
2787 
2788   /* Assume this is a DR handled by non-constant strided load case.  */
2789   if (TREE_CODE (step) != INTEGER_CST)
2790     return (STMT_VINFO_STRIDED_P (stmt_info)
2791 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2792 		|| vect_analyze_group_access (dr)));
2793 
2794   /* Not consecutive access - check if it's a part of interleaving group.  */
2795   return vect_analyze_group_access (dr);
2796 }
2797 
2798 /* Compare two data-references DRA and DRB so that they can be sorted
2799    into chunks suitable for grouping.  */
2800 
2801 static int
2802 dr_group_sort_cmp (const void *dra_, const void *drb_)
2803 {
2804   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2805   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2806   int cmp;
2807 
2808   /* Stabilize sort.  */
2809   if (dra == drb)
2810     return 0;
2811 
2812   /* DRs in different loops never belong to the same group.  */
2813   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2814   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2815   if (loopa != loopb)
2816     return loopa->num < loopb->num ? -1 : 1;
2817 
2818   /* Ordering of DRs according to base.  */
2819   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2820 			       DR_BASE_ADDRESS (drb));
2821   if (cmp != 0)
2822     return cmp;
2823 
2824   /* And according to DR_OFFSET.  */
2825   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2826   if (cmp != 0)
2827     return cmp;
2828 
2829   /* Put reads before writes.  */
2830   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2831     return DR_IS_READ (dra) ? -1 : 1;
2832 
2833   /* Then sort after access size.  */
2834   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2835 			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2836   if (cmp != 0)
2837     return cmp;
2838 
2839   /* And after step.  */
2840   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2841   if (cmp != 0)
2842     return cmp;
2843 
2844   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2845   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2846   if (cmp == 0)
2847     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2848   return cmp;
2849 }
2850 
2851 /* If OP is the result of a conversion, return the unconverted value,
2852    otherwise return null.  */
2853 
2854 static tree
2855 strip_conversion (tree op)
2856 {
2857   if (TREE_CODE (op) != SSA_NAME)
2858     return NULL_TREE;
2859   gimple *stmt = SSA_NAME_DEF_STMT (op);
2860   if (!is_gimple_assign (stmt)
2861       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2862     return NULL_TREE;
2863   return gimple_assign_rhs1 (stmt);
2864 }
2865 
2866 /* Return true if vectorizable_* routines can handle statements STMT1
2867    and STMT2 being in a single group.  */
2868 
2869 static bool
2870 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2871 {
2872   if (gimple_assign_single_p (stmt1))
2873     return gimple_assign_single_p (stmt2);
2874 
2875   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2876     {
2877       /* Check for two masked loads or two masked stores.  */
2878       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2879 	return false;
2880       internal_fn ifn = gimple_call_internal_fn (stmt1);
2881       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2882 	return false;
2883       if (ifn != gimple_call_internal_fn (stmt2))
2884 	return false;
2885 
2886       /* Check that the masks are the same.  Cope with casts of masks,
2887 	 like those created by build_mask_conversion.  */
2888       tree mask1 = gimple_call_arg (stmt1, 2);
2889       tree mask2 = gimple_call_arg (stmt2, 2);
2890       if (!operand_equal_p (mask1, mask2, 0))
2891 	{
2892 	  mask1 = strip_conversion (mask1);
2893 	  if (!mask1)
2894 	    return false;
2895 	  mask2 = strip_conversion (mask2);
2896 	  if (!mask2)
2897 	    return false;
2898 	  if (!operand_equal_p (mask1, mask2, 0))
2899 	    return false;
2900 	}
2901       return true;
2902     }
2903 
2904   return false;
2905 }
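
/* An illustrative sketch (the source-level names c, p, q, x and the exact
   argument lists are made up): after if-conversion, two conditional
   accesses such as

     if (c[i]) x = p[i];    becomes roughly  x = .MASK_LOAD (addr_p, align, mask);
     if (c[i]) q[i] = x;    becomes roughly  .MASK_STORE (addr_q, align, mask, x);

   cannot be grouped with each other (the internal functions differ), but
   two .MASK_LOAD calls whose mask arguments are the same SSA name, or
   differ only by a conversion such as those added by mask conversion, are
   accepted by can_group_stmts_p above.  */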
2906 
2907 /* Function vect_analyze_data_ref_accesses.
2908 
2909    Analyze the access pattern of all the data references in the loop.
2910 
2911    FORNOW: the only access pattern that is considered vectorizable is a
2912 	   simple step 1 (consecutive) access.
2913 
2914    FORNOW: handle only arrays and pointer accesses.  */
2915 
2916 bool
2917 vect_analyze_data_ref_accesses (vec_info *vinfo)
2918 {
2919   unsigned int i;
2920   vec<data_reference_p> datarefs = vinfo->datarefs;
2921   struct data_reference *dr;
2922 
2923   if (dump_enabled_p ())
2924     dump_printf_loc (MSG_NOTE, vect_location,
2925                      "=== vect_analyze_data_ref_accesses ===\n");
2926 
2927   if (datarefs.is_empty ())
2928     return true;
2929 
2930   /* Sort the array of datarefs to make building the interleaving chains
2931      linear.  Don't modify the original vector's order; it is needed for
2932      determining what dependencies are reversed.  */
2933   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2934   datarefs_copy.qsort (dr_group_sort_cmp);
2935 
2936   /* Build the interleaving chains.  */
2937   for (i = 0; i < datarefs_copy.length () - 1;)
2938     {
2939       data_reference_p dra = datarefs_copy[i];
2940       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2941       stmt_vec_info lastinfo = NULL;
2942       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2943 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2944 	{
2945 	  ++i;
2946 	  continue;
2947 	}
2948       for (i = i + 1; i < datarefs_copy.length (); ++i)
2949 	{
2950 	  data_reference_p drb = datarefs_copy[i];
2951 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2952 	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2953 	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2954 	    break;
2955 
2956 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2957 	     accesses, same accesses) can lead to a group being artificially
2958 	     split here as we don't just skip over those.  If it really
2959 	     matters we can push those to a worklist and re-iterate
2960 	     over them.  Then we can just skip ahead to the next DR here.  */
2961 
2962 	  /* DRs in a different loop should not be put into the same
2963 	     interleaving group.  */
2964 	  if (gimple_bb (DR_STMT (dra))->loop_father
2965 	      != gimple_bb (DR_STMT (drb))->loop_father)
2966 	    break;
2967 
2968 	  /* Check that the data-refs have the same first location (except init)
2969 	     and that they are both either loads or stores (not one load and one
2970 	     store), and that they can be grouped (see can_group_stmts_p).  */
2971 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2972 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2973 					DR_BASE_ADDRESS (drb)) != 0
2974 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2975 	      || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2976 	    break;
2977 
2978 	  /* Check that the data-refs have the same constant size.  */
2979 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2980 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2981 	  if (!tree_fits_uhwi_p (sza)
2982 	      || !tree_fits_uhwi_p (szb)
2983 	      || !tree_int_cst_equal (sza, szb))
2984 	    break;
2985 
2986 	  /* Check that the data-refs have the same step.  */
2987 	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2988 	    break;
2989 
2990 	  /* Check the types are compatible.
2991 	     ???  We don't distinguish this during sorting.  */
2992 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2993 				   TREE_TYPE (DR_REF (drb))))
2994 	    break;
2995 
2996 	  /* Check that the DR_INITs are compile-time constants.  */
2997 	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2998 	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2999 	    break;
3000 
3001 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3002 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3003 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3004 	  HOST_WIDE_INT init_prev
3005 	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3006 	  gcc_assert (init_a <= init_b
3007 		      && init_a <= init_prev
3008 		      && init_prev <= init_b);
3009 
3010 	  /* Do not place the same access in the interleaving chain twice.  */
3011 	  if (init_b == init_prev)
3012 	    {
3013 	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3014 			  < gimple_uid (DR_STMT (drb)));
3015 	      /* ???  For now we simply "drop" the later reference which is
3016 	         otherwise the same rather than finishing off this group.
3017 		 In the end we'd want to re-process duplicates forming
3018 		 multiple groups from the refs, likely by just collecting
3019 		 all candidates (including duplicates and split points
3020 		 below) in a vector and then processing them together.  */
3021 	      continue;
3022 	    }
3023 
3024 	  /* If init_b == init_a + the size of the type * k, we have an
3025 	     interleaving, and DRA is accessed before DRB.  */
3026 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3027 	  if (type_size_a == 0
3028 	      || (init_b - init_a) % type_size_a != 0)
3029 	    break;
3030 
3031 	  /* If we have a store, the accesses must be adjacent.  This splits
3032 	     groups into chunks we support (we don't support vectorization
3033 	     of stores with gaps).  */
3034 	  if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3035 	    break;
3036 
3037 	  /* If the step is constant and nonzero, and greater than the
3038 	     difference between the data-refs' inits, this splits groups into
3039 	     suitable sizes.  */
3040 	  if (tree_fits_shwi_p (DR_STEP (dra)))
3041 	    {
3042 	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3043 	      if (step != 0 && step <= (init_b - init_a))
3044 		break;
3045 	    }
3046 
3047 	  if (dump_enabled_p ())
3048 	    {
3049 	      dump_printf_loc (MSG_NOTE, vect_location,
3050 			       "Detected interleaving ");
3051 	      if (DR_IS_READ (dra))
3052 		dump_printf (MSG_NOTE, "load ");
3053 	      else
3054 		dump_printf (MSG_NOTE, "store ");
3055 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3056 	      dump_printf (MSG_NOTE,  " and ");
3057 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3058 	      dump_printf (MSG_NOTE, "\n");
3059 	    }
3060 
3061 	  /* Link the found element into the group list.  */
3062 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3063 	    {
3064 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3065 	      lastinfo = stmtinfo_a;
3066 	    }
3067 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3068 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3069 	  lastinfo = stmtinfo_b;
3070 	}
3071     }
3072 
3073   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3074     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3075         && !vect_analyze_data_ref_access (dr))
3076       {
3077 	if (dump_enabled_p ())
3078 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3079 	                   "not vectorized: complicated access pattern.\n");
3080 
3081         if (is_a <bb_vec_info> (vinfo))
3082           {
3083             /* Mark the statement as not vectorizable.  */
3084             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3085             continue;
3086           }
3087         else
3088 	  {
3089 	    datarefs_copy.release ();
3090 	    return false;
3091 	  }
3092       }
3093 
3094   datarefs_copy.release ();
3095   return true;
3096 }
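
/* For example (illustrative only), in

     for (i = 0; i < n; i++)
       {
	 a[2*i] = b[2*i + 1];
	 a[2*i + 1] = b[2*i];
       }

   the two loads from B form one interleaving group and the two stores to A
   form another: within each pair the base address, offset, step and element
   type match, the DR_INITs differ by exactly the element size, and the step
   (twice the element size) is larger than that difference, so neither pair
   is split.  */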
3097 
3098 /* Function vect_vfa_segment_size.
3099 
3100    Input:
3101      DR: The data reference.
3102      LENGTH_FACTOR: segment length to consider.
3103 
3104    Return a value suitable for the dr_with_seg_len::seg_len field.
3105    This is the "distance travelled" by the pointer from the first
3106    iteration in the segment to the last.  Note that it does not include
3107    the size of the access; in effect it only describes the first byte.  */
3108 
3109 static tree
3110 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3111 {
3112   length_factor = size_binop (MINUS_EXPR,
3113 			      fold_convert (sizetype, length_factor),
3114 			      size_one_node);
3115   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3116 		     length_factor);
3117 }
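
/* A worked example (illustrative): for a data-ref with a DR_STEP of 4 bytes
   and a LENGTH_FACTOR equal to a vectorization factor of 8, the segment
   length is 4 * (8 - 1) = 28 bytes: the distance from the first byte
   accessed in the first iteration of the segment to the first byte accessed
   in the last iteration, excluding the size of the access itself.  */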
3118 
3119 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3120    gives the worst-case number of bytes covered by the segment.  */
3121 
3122 static unsigned HOST_WIDE_INT
3123 vect_vfa_access_size (data_reference *dr)
3124 {
3125   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3126   tree ref_type = TREE_TYPE (DR_REF (dr));
3127   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3128   unsigned HOST_WIDE_INT access_size = ref_size;
3129   if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3130     {
3131       gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3132       access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3133     }
3134   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3135       && (vect_supportable_dr_alignment (dr, false)
3136 	  == dr_explicit_realign_optimized))
3137     {
3138       /* We might access a full vector's worth.  */
3139       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3140       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3141     }
3142   return access_size;
3143 }
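
/* Continuing the example above (illustrative): if the data-ref is the first
   element of a group of four 4-byte accesses with no gap, the access size
   is 4 * 4 = 16 bytes, so together with a 28-byte segment length the
   segment covers at worst 28 + 16 = 44 bytes (more when a
   dr_explicit_realign_optimized access may touch a full extra vector).  */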
3144 
3145 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3146 
3147 static unsigned int
3148 vect_vfa_align (const data_reference *dr)
3149 {
3150   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3151 }
3152 
3153 /* Function vect_compile_time_alias.
3154 
3155    Given data references A and B with equal base and offset, see whether
3156    the alias relation can be decided at compilation time.  Return 1 if
3157    it can and the references alias, 0 if it can and the references do
3158    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3159    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3160    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3161 
3162 static int
3163 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3164 			 tree segment_length_a, tree segment_length_b,
3165 			 unsigned HOST_WIDE_INT access_size_a,
3166 			 unsigned HOST_WIDE_INT access_size_b)
3167 {
3168   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3169   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3170   poly_uint64 const_length_a;
3171   poly_uint64 const_length_b;
3172 
3173   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3174      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3175      [a, a+12) */
3176   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3177     {
3178       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3179       offset_a = (offset_a + access_size_a) - const_length_a;
3180     }
3181   else
3182     const_length_a = tree_to_poly_uint64 (segment_length_a);
3183   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3184     {
3185       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3186       offset_b = (offset_b + access_size_b) - const_length_b;
3187     }
3188   else
3189     const_length_b = tree_to_poly_uint64 (segment_length_b);
3190 
3191   const_length_a += access_size_a;
3192   const_length_b += access_size_b;
3193 
3194   if (ranges_known_overlap_p (offset_a, const_length_a,
3195 			      offset_b, const_length_b))
3196     return 1;
3197 
3198   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3199 			       offset_b, const_length_b))
3200     return 0;
3201 
3202   return -1;
3203 }
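
/* A worked example (illustrative): assume equal base and offset, positive
   steps, segment lengths of 28 bytes and access sizes of 4 bytes for both
   references.  If DR_INIT (A) is 0 and DR_INIT (B) is 32, the byte ranges
   are [0, 32) and [32, 64), which cannot overlap, so the function returns 0
   and no runtime alias check is needed for the pair.  If instead DR_INIT (B)
   is 16, the ranges [0, 32) and [16, 48) certainly overlap and the function
   returns 1.  A result of -1 (undecidable at compile time) can occur when
   the lengths involve non-constant poly_ints, e.g. for variable-length
   vectors.  */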
3204 
3205 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3206    in DDR is >= VF.  */
3207 
3208 static bool
3209 dependence_distance_ge_vf (data_dependence_relation *ddr,
3210 			   unsigned int loop_depth, poly_uint64 vf)
3211 {
3212   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3213       || DDR_NUM_DIST_VECTS (ddr) == 0)
3214     return false;
3215 
3216   /* If the dependence is exact, we should have limited the VF instead.  */
3217   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3218 
3219   unsigned int i;
3220   lambda_vector dist_v;
3221   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3222     {
3223       HOST_WIDE_INT dist = dist_v[loop_depth];
3224       if (dist != 0
3225 	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3226 	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3227 	return false;
3228     }
3229 
3230   if (dump_enabled_p ())
3231     {
3232       dump_printf_loc (MSG_NOTE, vect_location,
3233 		       "dependence distance between ");
3234       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3235       dump_printf (MSG_NOTE,  " and ");
3236       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3237       dump_printf (MSG_NOTE,  " is >= VF\n");
3238     }
3239 
3240   return true;
3241 }
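
/* For example (illustrative): if the only dependence between two accesses
   has a constant distance of 8 iterations and the chosen vectorization
   factor is 4, every distance-vector entry is >= VF, so vectorization
   cannot violate the dependence and the corresponding runtime alias check
   can be dropped.  A distance of 2 with a VF of 4 would keep the check.  */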
3242 
3243 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3244 
3245 static void
3246 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3247 {
3248   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3249   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3250   dump_printf (dump_kind, ") >= ");
3251   dump_dec (dump_kind, lower_bound.min_value);
3252 }
3253 
3254 /* Record that the vectorized loop requires the vec_lower_bound described
3255    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3256 
3257 static void
3258 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3259 			poly_uint64 min_value)
3260 {
3261   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3262   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3263     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3264       {
3265 	unsigned_p &= lower_bounds[i].unsigned_p;
3266 	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3267 	if (lower_bounds[i].unsigned_p != unsigned_p
3268 	    || maybe_lt (lower_bounds[i].min_value, min_value))
3269 	  {
3270 	    lower_bounds[i].unsigned_p = unsigned_p;
3271 	    lower_bounds[i].min_value = min_value;
3272 	    if (dump_enabled_p ())
3273 	      {
3274 		dump_printf_loc (MSG_NOTE, vect_location,
3275 				 "updating run-time check to ");
3276 		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3277 		dump_printf (MSG_NOTE, "\n");
3278 	      }
3279 	  }
3280 	return;
3281       }
3282 
3283   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3284   if (dump_enabled_p ())
3285     {
3286       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3287       dump_lower_bound (MSG_NOTE, lower_bound);
3288       dump_printf (MSG_NOTE, "\n");
3289     }
3290   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3291 }
3292 
3293 /* Return true if it's unlikely that the step of the vectorized form of DR
3294    will span fewer than GAP bytes.  */
3295 
3296 static bool
3297 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3298 {
3299   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3300   HOST_WIDE_INT count
3301     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3302   if (GROUP_FIRST_ELEMENT (stmt_info))
3303     count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3304   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3305 }
3306 
3307 /* Return true if we know that there is no alias between DR_A and DR_B
3308    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3309    *LOWER_BOUND_OUT to this N.  */
3310 
3311 static bool
3312 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3313 				poly_uint64 *lower_bound_out)
3314 {
3315   /* Check that there is a constant gap of known sign between DR_A
3316      and DR_B.  */
3317   poly_int64 init_a, init_b;
3318   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3319       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3320       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3321       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3322       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3323       || !ordered_p (init_a, init_b))
3324     return false;
3325 
3326   /* Sort DR_A and DR_B by the address they access.  */
3327   if (maybe_lt (init_b, init_a))
3328     {
3329       std::swap (init_a, init_b);
3330       std::swap (dr_a, dr_b);
3331     }
3332 
3333   /* If the two accesses could be dependent within a scalar iteration,
3334      make sure that we'd retain their order.  */
3335   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3336       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3337     return false;
3338 
3339   /* There is no alias if abs (DR_STEP) is greater than or equal to
3340      the bytes spanned by the combination of the two accesses.  */
3341   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3342   return true;
3343 }
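
/* A worked example (illustrative): for two 4-byte accesses with the same
   base, offset and step whose DR_INITs are 0 and 12, the accesses cannot
   overlap in any iteration once abs (DR_STEP) >= 12 + 4 - 0 = 16 bytes,
   so *LOWER_BOUND_OUT is set to 16 and the caller can replace a full
   segment-overlap test by a cheaper runtime check on the step.  */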
3344 
3345 /* Function vect_prune_runtime_alias_test_list.
3346 
3347    Prune a list of ddrs to be tested at run-time by versioning for alias.
3348    Merge several alias checks into one if possible.
3349    Return FALSE if the resulting list of ddrs is longer than allowed by
3350    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3351 
3352 bool
3353 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3354 {
3355   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3356   hash_set <tree_pair_hash> compared_objects;
3357 
3358   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3359   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3360     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3361   vec<vec_object_pair> &check_unequal_addrs
3362     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3363   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3364   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3365 
3366   ddr_p ddr;
3367   unsigned int i;
3368   tree length_factor;
3369 
3370   if (dump_enabled_p ())
3371     dump_printf_loc (MSG_NOTE, vect_location,
3372                      "=== vect_prune_runtime_alias_test_list ===\n");
3373 
3374   /* Step values are irrelevant for aliasing if the number of vector
3375      iterations is equal to the number of scalar iterations (which can
3376      happen for fully-SLP loops).  */
3377   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3378 
3379   if (!ignore_step_p)
3380     {
3381       /* Convert the checks for nonzero steps into bound tests.  */
3382       tree value;
3383       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3384 	vect_check_lower_bound (loop_vinfo, value, true, 1);
3385     }
3386 
3387   if (may_alias_ddrs.is_empty ())
3388     return true;
3389 
3390   comp_alias_ddrs.create (may_alias_ddrs.length ());
3391 
3392   unsigned int loop_depth
3393     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3394 			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3395 
3396   /* First, we collect all data ref pairs for aliasing checks.  */
3397   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3398     {
3399       int comp_res;
3400       poly_uint64 lower_bound;
3401       struct data_reference *dr_a, *dr_b;
3402       gimple *dr_group_first_a, *dr_group_first_b;
3403       tree segment_length_a, segment_length_b;
3404       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3405       unsigned int align_a, align_b;
3406       gimple *stmt_a, *stmt_b;
3407 
3408       /* Ignore the alias if the VF we chose ended up being no greater
3409 	 than the dependence distance.  */
3410       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3411 	continue;
3412 
3413       if (DDR_OBJECT_A (ddr))
3414 	{
3415 	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3416 	  if (!compared_objects.add (new_pair))
3417 	    {
3418 	      if (dump_enabled_p ())
3419 		{
3420 		  dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3421 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3422 		  dump_printf (MSG_NOTE, " and ");
3423 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3424 		  dump_printf (MSG_NOTE, " have different addresses\n");
3425 		}
3426 	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3427 	    }
3428 	  continue;
3429 	}
3430 
3431       dr_a = DDR_A (ddr);
3432       stmt_a = DR_STMT (DDR_A (ddr));
3433 
3434       dr_b = DDR_B (ddr);
3435       stmt_b = DR_STMT (DDR_B (ddr));
3436 
3437       /* Skip the pair if inter-iteration dependencies are irrelevant
3438 	 and intra-iteration dependencies are guaranteed to be honored.  */
3439       if (ignore_step_p
3440 	  && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3441 	      || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3442 	{
3443 	  if (dump_enabled_p ())
3444 	    {
3445 	      dump_printf_loc (MSG_NOTE, vect_location,
3446 			       "no need for alias check between ");
3447 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3448 	      dump_printf (MSG_NOTE, " and ");
3449 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3450 	      dump_printf (MSG_NOTE, " when VF is 1\n");
3451 	    }
3452 	  continue;
3453 	}
3454 
3455       /* See whether we can handle the alias using a bounds check on
3456 	 the step, and whether that's likely to be the best approach.
3457 	 (It might not be, for example, if the minimum step is much larger
3458 	 than the number of bytes handled by one vector iteration.)  */
3459       if (!ignore_step_p
3460 	  && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3461 	  && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3462 	  && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3463 	      || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3464 	{
3465 	  bool unsigned_p = dr_known_forward_stride_p (dr_a);
3466 	  if (dump_enabled_p ())
3467 	    {
3468 	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3469 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3470 	      dump_printf (MSG_NOTE, " and ");
3471 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3472 	      dump_printf (MSG_NOTE, " when the step ");
3473 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3474 	      dump_printf (MSG_NOTE, " is outside ");
3475 	      if (unsigned_p)
3476 		dump_printf (MSG_NOTE, "[0");
3477 	      else
3478 		{
3479 		  dump_printf (MSG_NOTE, "(");
3480 		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3481 		}
3482 	      dump_printf (MSG_NOTE, ", ");
3483 	      dump_dec (MSG_NOTE, lower_bound);
3484 	      dump_printf (MSG_NOTE, ")\n");
3485 	    }
3486 	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3487 				  lower_bound);
3488 	  continue;
3489 	}
3490 
3491       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3492       if (dr_group_first_a)
3493 	{
3494 	  stmt_a = dr_group_first_a;
3495 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3496 	}
3497 
3498       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3499       if (dr_group_first_b)
3500 	{
3501 	  stmt_b = dr_group_first_b;
3502 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3503 	}
3504 
3505       if (ignore_step_p)
3506 	{
3507 	  segment_length_a = size_zero_node;
3508 	  segment_length_b = size_zero_node;
3509 	}
3510       else
3511 	{
3512 	  if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3513 	    length_factor = scalar_loop_iters;
3514 	  else
3515 	    length_factor = size_int (vect_factor);
3516 	  segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3517 	  segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3518 	}
3519       access_size_a = vect_vfa_access_size (dr_a);
3520       access_size_b = vect_vfa_access_size (dr_b);
3521       align_a = vect_vfa_align (dr_a);
3522       align_b = vect_vfa_align (dr_b);
3523 
3524       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3525 					DR_BASE_ADDRESS (dr_b));
3526       if (comp_res == 0)
3527 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3528 					  DR_OFFSET (dr_b));
3529 
3530       /* See whether the alias is known at compilation time.  */
3531       if (comp_res == 0
3532 	  && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3533 	  && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3534 	  && poly_int_tree_p (segment_length_a)
3535 	  && poly_int_tree_p (segment_length_b))
3536 	{
3537 	  int res = vect_compile_time_alias (dr_a, dr_b,
3538 					     segment_length_a,
3539 					     segment_length_b,
3540 					     access_size_a,
3541 					     access_size_b);
3542 	  if (res >= 0 && dump_enabled_p ())
3543 	    {
3544 	      dump_printf_loc (MSG_NOTE, vect_location,
3545 			       "can tell at compile time that ");
3546 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3547 	      dump_printf (MSG_NOTE, " and ");
3548 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3549 	      if (res == 0)
3550 		dump_printf (MSG_NOTE, " do not alias\n");
3551 	      else
3552 		dump_printf (MSG_NOTE, " alias\n");
3553 	    }
3554 
3555 	  if (res == 0)
3556 	    continue;
3557 
3558 	  if (res == 1)
3559 	    {
3560 	      if (dump_enabled_p ())
3561 		dump_printf_loc (MSG_NOTE, vect_location,
3562 				 "not vectorized: compilation time alias.\n");
3563 	      return false;
3564 	    }
3565 	}
3566 
3567       dr_with_seg_len_pair_t dr_with_seg_len_pair
3568 	(dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3569 	 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3570 
3571       /* Canonicalize pairs by sorting the two DR members.  */
3572       if (comp_res > 0)
3573 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3574 
3575       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3576     }
3577 
3578   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3579 
3580   unsigned int count = (comp_alias_ddrs.length ()
3581 			+ check_unequal_addrs.length ());
3582 
3583   dump_printf_loc (MSG_NOTE, vect_location,
3584 		   "improved number of alias checks from %d to %d\n",
3585 		   may_alias_ddrs.length (), count);
3586   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3587     {
3588       if (dump_enabled_p ())
3589 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 			 "number of versioning for alias "
3591 			 "run-time tests exceeds %d "
3592 			 "(--param vect-max-version-for-alias-checks)\n",
3593 			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3594       return false;
3595     }
3596 
3597   return true;
3598 }
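
/* For example (an illustrative sketch; P and Q are made-up pointers): when
   accesses through P and Q in the loop may alias, the function above keeps
   a single segment-overlap check for the pair, which loop versioning later
   emits as, roughly,

     if (P + segment_len_p <= Q || Q + segment_len_q <= P)
       ... vectorized loop ...
     else
       ... scalar loop ...

   whereas pairs that can be decided at compile time, that are covered by a
   sufficiently large dependence distance, or that can be expressed as a
   lower bound on the step are removed before the remaining count is
   compared against --param vect-max-version-for-alias-checks.  */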
3599 
3600 /* Check whether we can use an internal function for a gather load
3601    or scatter store.  READ_P is true for loads and false for stores.
3602    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3603    the type of the memory elements being loaded or stored.  OFFSET_BITS
3604    is the number of bits in each scalar offset and OFFSET_SIGN is the
3605    sign of the offset.  SCALE is the amount by which the offset should
3606    be multiplied *after* it has been converted to address width.
3607 
3608    Return true if the function is supported, storing the function
3609    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3610 
3611 bool
3612 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3613 			  tree memory_type, unsigned int offset_bits,
3614 			  signop offset_sign, int scale,
3615 			  internal_fn *ifn_out, tree *element_type_out)
3616 {
3617   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3618   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3619   if (offset_bits > element_bits)
3620     /* Internal functions require the offset to be the same width as
3621        the vector elements.  We can extend narrower offsets, but it isn't
3622        safe to truncate wider offsets.  */
3623     return false;
3624 
3625   if (element_bits != memory_bits)
3626     /* For now the vector elements must be the same width as the
3627        memory elements.  */
3628     return false;
3629 
3630   /* Work out which function we need.  */
3631   internal_fn ifn;
3632   if (read_p)
3633     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3634   else
3635     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3636 
3637   /* Test whether the target supports this combination.  */
3638   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3639 					       offset_sign, scale))
3640     return false;
3641 
3642   *ifn_out = ifn;
3643   *element_type_out = TREE_TYPE (vectype);
3644   return true;
3645 }
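
/* For example (illustrative): for a masked gather of 32-bit elements whose
   scalar offsets are also 32 bits wide, this checks whether the target
   implements IFN_MASK_GATHER_LOAD for the given vector type, memory type,
   offset sign and scale.  A 64-bit offset with 32-bit elements is rejected
   up front, since truncating the offset would not be safe.  */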
3646 
3647 /* CALL is a call to an internal gather load or scatter store function.
3648    Describe the operation in INFO.  */
3649 
3650 static void
3651 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3652 {
3653   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3654   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3655   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3656 
3657   info->ifn = gimple_call_internal_fn (call);
3658   info->decl = NULL_TREE;
3659   info->base = gimple_call_arg (call, 0);
3660   info->offset = gimple_call_arg (call, 1);
3661   info->offset_dt = vect_unknown_def_type;
3662   info->offset_vectype = NULL_TREE;
3663   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3664   info->element_type = TREE_TYPE (vectype);
3665   info->memory_type = TREE_TYPE (DR_REF (dr));
3666 }
3667 
3668 /* Return true if a non-affine read or write in STMT is suitable for a
3669    gather load or scatter store.  Describe the operation in *INFO if so.  */
3670 
3671 bool
3672 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3673 			   gather_scatter_info *info)
3674 {
3675   HOST_WIDE_INT scale = 1;
3676   poly_int64 pbitpos, pbitsize;
3677   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3678   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3679   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3680   tree offtype = NULL_TREE;
3681   tree decl = NULL_TREE, base, off;
3682   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3683   tree memory_type = TREE_TYPE (DR_REF (dr));
3684   machine_mode pmode;
3685   int punsignedp, reversep, pvolatilep = 0;
3686   internal_fn ifn;
3687   tree element_type;
3688   bool masked_p = false;
3689 
3690   /* See whether this is already a call to a gather/scatter internal function.
3691      If not, see whether it's a masked load or store.  */
3692   gcall *call = dyn_cast <gcall *> (stmt);
3693   if (call && gimple_call_internal_p (call))
3694     {
3695       ifn = gimple_call_internal_fn (stmt);
3696       if (internal_gather_scatter_fn_p (ifn))
3697 	{
3698 	  vect_describe_gather_scatter_call (call, info);
3699 	  return true;
3700 	}
3701       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3702     }
3703 
3704   /* True if we should aim to use internal functions rather than
3705      built-in functions.  */
3706   bool use_ifn_p = (DR_IS_READ (dr)
3707 		    ? supports_vec_gather_load_p ()
3708 		    : supports_vec_scatter_store_p ());
3709 
3710   base = DR_REF (dr);
3711   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3712      see if we can use the def stmt of the address.  */
3713   if (masked_p
3714       && TREE_CODE (base) == MEM_REF
3715       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3716       && integer_zerop (TREE_OPERAND (base, 1))
3717       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3718     {
3719       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3720       if (is_gimple_assign (def_stmt)
3721 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3722 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3723     }
3724 
3725   /* The gather and scatter builtins need address of the form
3726      loop_invariant + vector * {1, 2, 4, 8}
3727      or
3728      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3729      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3730      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3731      multiplications and additions in it.  To get a vector, we need
3732      a single SSA_NAME that will be defined in the loop and will
3733      contain everything that is not loop invariant and that can be
3734      vectorized.  The following code attempts to find such a preexisting
3735      SSA_NAME OFF and put the loop invariants into a tree BASE
3736      that can be gimplified before the loop.  */
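  /* For instance (an illustrative sketch; the array names are made up):
     for a gather like ... = a[b[i]] with 4-byte elements, the analysis
     below typically ends up with BASE being &a (loop invariant), OFF being
     the SSA_NAME that holds b[i] (defined in the loop) and SCALE being 4,
     which matches the required form.  */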
3737   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3738 			      &punsignedp, &reversep, &pvolatilep);
3739   gcc_assert (base && !reversep);
3740   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3741 
3742   if (TREE_CODE (base) == MEM_REF)
3743     {
3744       if (!integer_zerop (TREE_OPERAND (base, 1)))
3745 	{
3746 	  if (off == NULL_TREE)
3747 	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3748 	  else
3749 	    off = size_binop (PLUS_EXPR, off,
3750 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3751 	}
3752       base = TREE_OPERAND (base, 0);
3753     }
3754   else
3755     base = build_fold_addr_expr (base);
3756 
3757   if (off == NULL_TREE)
3758     off = size_zero_node;
3759 
3760   /* If BASE is not loop invariant, we can only handle the case where OFF
3761      is 0: start with just the constant offset in the loop invariant BASE
3762      and continue with BASE as OFF; otherwise give up.
3763      We could handle the nonzero-OFF case by gimplifying the addition of
3764      BASE + OFF into some SSA_NAME and using that as OFF, but for now punt.  */
3765   if (!expr_invariant_in_loop_p (loop, base))
3766     {
3767       if (!integer_zerop (off))
3768 	return false;
3769       off = base;
3770       base = size_int (pbytepos);
3771     }
3772   /* Otherwise put base + constant offset into the loop invariant BASE
3773      and continue with OFF.  */
3774   else
3775     {
3776       base = fold_convert (sizetype, base);
3777       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3778     }
3779 
3780   /* OFF at this point may be either a SSA_NAME or some tree expression
3781      from get_inner_reference.  Try to peel off loop invariants from it
3782      into BASE as long as possible.  */
3783   STRIP_NOPS (off);
3784   while (offtype == NULL_TREE)
3785     {
3786       enum tree_code code;
3787       tree op0, op1, add = NULL_TREE;
3788 
3789       if (TREE_CODE (off) == SSA_NAME)
3790 	{
3791 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3792 
3793 	  if (expr_invariant_in_loop_p (loop, off))
3794 	    return false;
3795 
3796 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3797 	    break;
3798 
3799 	  op0 = gimple_assign_rhs1 (def_stmt);
3800 	  code = gimple_assign_rhs_code (def_stmt);
3801 	  op1 = gimple_assign_rhs2 (def_stmt);
3802 	}
3803       else
3804 	{
3805 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3806 	    return false;
3807 	  code = TREE_CODE (off);
3808 	  extract_ops_from_tree (off, &code, &op0, &op1);
3809 	}
3810       switch (code)
3811 	{
3812 	case POINTER_PLUS_EXPR:
3813 	case PLUS_EXPR:
3814 	  if (expr_invariant_in_loop_p (loop, op0))
3815 	    {
3816 	      add = op0;
3817 	      off = op1;
3818 	    do_add:
3819 	      add = fold_convert (sizetype, add);
3820 	      if (scale != 1)
3821 		add = size_binop (MULT_EXPR, add, size_int (scale));
3822 	      base = size_binop (PLUS_EXPR, base, add);
3823 	      continue;
3824 	    }
3825 	  if (expr_invariant_in_loop_p (loop, op1))
3826 	    {
3827 	      add = op1;
3828 	      off = op0;
3829 	      goto do_add;
3830 	    }
3831 	  break;
3832 	case MINUS_EXPR:
3833 	  if (expr_invariant_in_loop_p (loop, op1))
3834 	    {
3835 	      add = fold_convert (sizetype, op1);
3836 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3837 	      off = op0;
3838 	      goto do_add;
3839 	    }
3840 	  break;
3841 	case MULT_EXPR:
3842 	  if (scale == 1 && tree_fits_shwi_p (op1))
3843 	    {
3844 	      int new_scale = tree_to_shwi (op1);
3845 	      /* Only treat this as a scaling operation if the target
3846 		 supports it.  */
3847 	      if (use_ifn_p
3848 		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3849 						vectype, memory_type, 1,
3850 						TYPE_SIGN (TREE_TYPE (op0)),
3851 						new_scale, &ifn,
3852 						&element_type))
3853 		break;
3854 	      scale = new_scale;
3855 	      off = op0;
3856 	      continue;
3857 	    }
3858 	  break;
3859 	case SSA_NAME:
3860 	  off = op0;
3861 	  continue;
3862 	CASE_CONVERT:
3863 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3864 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3865 	    break;
3866 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3867 	      == TYPE_PRECISION (TREE_TYPE (off)))
3868 	    {
3869 	      off = op0;
3870 	      continue;
3871 	    }
3872 
3873 	  /* The internal functions need the offset to be the same width
3874 	     as the elements of VECTYPE.  Don't include operations that
3875 	     cast the offset from that width to a different width.  */
3876 	  if (use_ifn_p
3877 	      && (int_size_in_bytes (TREE_TYPE (vectype))
3878 		  == int_size_in_bytes (TREE_TYPE (off))))
3879 	    break;
3880 
3881 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3882 	      < TYPE_PRECISION (TREE_TYPE (off)))
3883 	    {
3884 	      off = op0;
3885 	      offtype = TREE_TYPE (off);
3886 	      STRIP_NOPS (off);
3887 	      continue;
3888 	    }
3889 	  break;
3890 	default:
3891 	  break;
3892 	}
3893       break;
3894     }
3895 
3896   /* If at the end OFF still isn't a SSA_NAME or isn't
3897      defined in the loop, punt.  */
3898   if (TREE_CODE (off) != SSA_NAME
3899       || expr_invariant_in_loop_p (loop, off))
3900     return false;
3901 
3902   if (offtype == NULL_TREE)
3903     offtype = TREE_TYPE (off);
3904 
3905   if (use_ifn_p)
3906     {
3907       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3908 				     memory_type, TYPE_PRECISION (offtype),
3909 				     TYPE_SIGN (offtype), scale, &ifn,
3910 				     &element_type))
3911 	return false;
3912     }
3913   else
3914     {
3915       if (DR_IS_READ (dr))
3916 	{
3917 	  if (targetm.vectorize.builtin_gather)
3918 	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3919 	}
3920       else
3921 	{
3922 	  if (targetm.vectorize.builtin_scatter)
3923 	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3924 	}
3925 
3926       if (!decl)
3927 	return false;
3928 
3929       ifn = IFN_LAST;
3930       element_type = TREE_TYPE (vectype);
3931     }
3932 
3933   info->ifn = ifn;
3934   info->decl = decl;
3935   info->base = base;
3936   info->offset = off;
3937   info->offset_dt = vect_unknown_def_type;
3938   info->offset_vectype = NULL_TREE;
3939   info->scale = scale;
3940   info->element_type = element_type;
3941   info->memory_type = memory_type;
3942   return true;
3943 }
3944 
3945 /* Function vect_analyze_data_refs.
3946 
3947   Find all the data references in the loop or basic block.
3948 
3949    The general structure of the analysis of data refs in the vectorizer is as
3950    follows:
3951    1- vect_analyze_data_refs(loop/bb): call
3952       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3953       in the loop/bb and their dependences.
3954    2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
3955    3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
3956    4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
3957 
3958 */
3959 
3960 bool
3961 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3962 {
3963   struct loop *loop = NULL;
3964   unsigned int i;
3965   struct data_reference *dr;
3966   tree scalar_type;
3967 
3968   if (dump_enabled_p ())
3969     dump_printf_loc (MSG_NOTE, vect_location,
3970 		     "=== vect_analyze_data_refs ===\n");
3971 
3972   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3973     loop = LOOP_VINFO_LOOP (loop_vinfo);
3974 
3975   /* Go through the data-refs, check that the analysis succeeded.  Update
3976      pointer from stmt_vec_info struct to DR and vectype.  */
3977 
3978   vec<data_reference_p> datarefs = vinfo->datarefs;
3979   FOR_EACH_VEC_ELT (datarefs, i, dr)
3980     {
3981       gimple *stmt;
3982       stmt_vec_info stmt_info;
3983       tree base, offset, init;
3984       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3985       bool simd_lane_access = false;
3986       poly_uint64 vf;
3987 
3988 again:
3989       if (!dr || !DR_REF (dr))
3990         {
3991           if (dump_enabled_p ())
3992 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3993 	                     "not vectorized: unhandled data-ref\n");
3994           return false;
3995         }
3996 
3997       stmt = DR_STMT (dr);
3998       stmt_info = vinfo_for_stmt (stmt);
3999 
4000       /* Discard clobbers from the dataref vector.  We will remove
4001          clobber stmts during vectorization.  */
4002       if (gimple_clobber_p (stmt))
4003 	{
4004 	  free_data_ref (dr);
4005 	  if (i == datarefs.length () - 1)
4006 	    {
4007 	      datarefs.pop ();
4008 	      break;
4009 	    }
4010 	  datarefs.ordered_remove (i);
4011 	  dr = datarefs[i];
4012 	  goto again;
4013 	}
4014 
4015       /* Check that analysis of the data-ref succeeded.  */
4016       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4017 	  || !DR_STEP (dr))
4018         {
4019 	  bool maybe_gather
4020 	    = DR_IS_READ (dr)
4021 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4022 	      && (targetm.vectorize.builtin_gather != NULL
4023 		  || supports_vec_gather_load_p ());
4024 	  bool maybe_scatter
4025 	    = DR_IS_WRITE (dr)
4026 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4027 	      && (targetm.vectorize.builtin_scatter != NULL
4028 		  || supports_vec_scatter_store_p ());
4029 	  bool maybe_simd_lane_access
4030 	    = is_a <loop_vec_info> (vinfo) && loop->simduid;
4031 
4032 	  /* If the target supports vector gather loads or scatter stores, or if
4033 	     this might be a SIMD lane access, see whether they can be used.  */
4034 	  if (is_a <loop_vec_info> (vinfo)
4035 	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4036 	      && !nested_in_vect_loop_p (loop, stmt))
4037 	    {
4038 	      struct data_reference *newdr
4039 		= create_data_ref (NULL, loop_containing_stmt (stmt),
4040 				   DR_REF (dr), stmt, !maybe_scatter,
4041 				   DR_IS_CONDITIONAL_IN_STMT (dr));
4042 	      gcc_assert (newdr != NULL && DR_REF (newdr));
4043 	      if (DR_BASE_ADDRESS (newdr)
4044 		  && DR_OFFSET (newdr)
4045 		  && DR_INIT (newdr)
4046 		  && DR_STEP (newdr)
4047 		  && integer_zerop (DR_STEP (newdr)))
4048 		{
4049 		  if (maybe_simd_lane_access)
4050 		    {
4051 		      tree off = DR_OFFSET (newdr);
4052 		      STRIP_NOPS (off);
4053 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4054 			  && TREE_CODE (off) == MULT_EXPR
4055 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4056 			{
4057 			  tree step = TREE_OPERAND (off, 1);
4058 			  off = TREE_OPERAND (off, 0);
4059 			  STRIP_NOPS (off);
4060 			  if (CONVERT_EXPR_P (off)
4061 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4062 									  0)))
4063 				 < TYPE_PRECISION (TREE_TYPE (off)))
4064 			    off = TREE_OPERAND (off, 0);
4065 			  if (TREE_CODE (off) == SSA_NAME)
4066 			    {
4067 			      gimple *def = SSA_NAME_DEF_STMT (off);
4068 			      tree reft = TREE_TYPE (DR_REF (newdr));
4069 			      if (is_gimple_call (def)
4070 				  && gimple_call_internal_p (def)
4071 				  && (gimple_call_internal_fn (def)
4072 				      == IFN_GOMP_SIMD_LANE))
4073 				{
4074 				  tree arg = gimple_call_arg (def, 0);
4075 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
4076 				  arg = SSA_NAME_VAR (arg);
4077 				  if (arg == loop->simduid
4078 				      /* For now.  */
4079 				      && tree_int_cst_equal
4080 					   (TYPE_SIZE_UNIT (reft),
4081 					    step))
4082 				    {
4083 				      DR_OFFSET (newdr) = ssize_int (0);
4084 				      DR_STEP (newdr) = step;
4085 				      DR_OFFSET_ALIGNMENT (newdr)
4086 					= BIGGEST_ALIGNMENT;
4087 				      DR_STEP_ALIGNMENT (newdr)
4088 					= highest_pow2_factor (step);
4089 				      dr = newdr;
4090 				      simd_lane_access = true;
4091 				    }
4092 				}
4093 			    }
4094 			}
4095 		    }
4096 		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
4097 		    {
4098 		      dr = newdr;
4099 		      if (maybe_gather)
4100 			gatherscatter = GATHER;
4101 		      else
4102 			gatherscatter = SCATTER;
4103 		    }
4104 		}
4105 	      if (gatherscatter == SG_NONE && !simd_lane_access)
4106 		free_data_ref (newdr);
4107 	    }
4108 
4109 	  if (gatherscatter == SG_NONE && !simd_lane_access)
4110 	    {
4111 	      if (dump_enabled_p ())
4112 		{
4113 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4114                                    "not vectorized: data ref analysis "
4115                                    "failed ");
4116 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4117 		}
4118 
4119 	      if (is_a <bb_vec_info> (vinfo))
4120 		break;
4121 
4122 	      return false;
4123 	    }
4124         }
4125 
4126       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4127         {
4128           if (dump_enabled_p ())
4129             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4130                              "not vectorized: base addr of dr is a "
4131                              "constant\n");
4132 
4133           if (is_a <bb_vec_info> (vinfo))
4134 	    break;
4135 
4136 	  if (gatherscatter != SG_NONE || simd_lane_access)
4137 	    free_data_ref (dr);
4138 	  return false;
4139         }
4140 
4141       if (TREE_THIS_VOLATILE (DR_REF (dr)))
4142         {
4143           if (dump_enabled_p ())
4144             {
4145               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4146                                "not vectorized: volatile type ");
4147               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4148             }
4149 
4150           if (is_a <bb_vec_info> (vinfo))
4151 	    break;
4152 
4153           return false;
4154         }
4155 
4156       if (stmt_can_throw_internal (stmt))
4157         {
4158           if (dump_enabled_p ())
4159             {
4160               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4161                                "not vectorized: statement can throw an "
4162                                "exception ");
4163               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4164             }
4165 
4166           if (is_a <bb_vec_info> (vinfo))
4167 	    break;
4168 
4169 	  if (gatherscatter != SG_NONE || simd_lane_access)
4170 	    free_data_ref (dr);
4171           return false;
4172         }
4173 
4174       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4175 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4176 	{
4177           if (dump_enabled_p ())
4178             {
4179               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4180                                "not vectorized: statement is bitfield "
4181                                "access ");
4182               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4183             }
4184 
4185           if (is_a <bb_vec_info> (vinfo))
4186 	    break;
4187 
4188 	  if (gatherscatter != SG_NONE || simd_lane_access)
4189 	    free_data_ref (dr);
4190           return false;
4191 	}
4192 
4193       base = unshare_expr (DR_BASE_ADDRESS (dr));
4194       offset = unshare_expr (DR_OFFSET (dr));
4195       init = unshare_expr (DR_INIT (dr));
4196 
4197       if (is_gimple_call (stmt)
4198 	  && (!gimple_call_internal_p (stmt)
4199 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4200 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4201 	{
4202 	  if (dump_enabled_p ())
4203 	    {
4204 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4205 	                       "not vectorized: dr in a call ");
4206 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4207 	    }
4208 
4209 	  if (is_a <bb_vec_info> (vinfo))
4210 	    break;
4211 
4212 	  if (gatherscatter != SG_NONE || simd_lane_access)
4213 	    free_data_ref (dr);
4214 	  return false;
4215 	}
4216 
4217       /* Update DR field in stmt_vec_info struct.  */
4218 
4219       /* If the dataref is in an inner-loop of the loop that is considered
4220 	 for vectorization, we also want to analyze the access relative to
4221 	 the outer-loop (DR contains information only relative to the
4222 	 inner-most enclosing loop).  We do that by building a reference to the
4223 	 first location accessed by the inner-loop, and analyze it relative to
4224 	 the outer-loop.  */
4225       if (loop && nested_in_vect_loop_p (loop, stmt))
4226 	{
4227 	  /* Build a reference to the first location accessed by the
4228 	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4229 	     this address must be invariant in the inner loop, so we
4230 	     can consider it as being used in the outer loop.  */
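	  /* For instance (illustrative): for an inner-loop access a[i][j],
	     with I the outer-loop IV and J the inner-loop IV, the reference
	     built here is effectively *(&a[i][0]), which is invariant in the
	     inner loop and has a well-defined evolution in the outer loop.  */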
4231 	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4232 					  init, offset);
4233 	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4234 	  tree init_ref = build_fold_indirect_ref (init_addr);
4235 
4236 	  if (dump_enabled_p ())
4237 	    {
4238 	      dump_printf_loc (MSG_NOTE, vect_location,
4239                                "analyze in outer loop: ");
4240 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4241 	      dump_printf (MSG_NOTE, "\n");
4242 	    }
4243 
4244 	  if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4245 				     init_ref, loop))
4246 	    /* dr_analyze_innermost already explained the failure.  */
4247 	    return false;
4248 
4249           if (dump_enabled_p ())
4250 	    {
4251 	      dump_printf_loc (MSG_NOTE, vect_location,
4252                                "\touter base_address: ");
4253 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4254                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4255 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4256 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4257                                  STMT_VINFO_DR_OFFSET (stmt_info));
4258 	      dump_printf (MSG_NOTE,
4259                            "\n\touter constant offset from base address: ");
4260 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4261                                  STMT_VINFO_DR_INIT (stmt_info));
4262 	      dump_printf (MSG_NOTE, "\n\touter step: ");
4263 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4264                                  STMT_VINFO_DR_STEP (stmt_info));
4265 	      dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4266 			   STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4267 	      dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4268 			   STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4269 	      dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4270 			   STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4271 	      dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4272 			   STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4273 	    }
4274 	}
4275 
4276       if (STMT_VINFO_DATA_REF (stmt_info))
4277         {
4278           if (dump_enabled_p ())
4279             {
4280               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4281                                "not vectorized: more than one data ref "
4282                                "in stmt: ");
4283               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4284             }
4285 
4286           if (is_a <bb_vec_info> (vinfo))
4287 	    break;
4288 
4289 	  if (gatherscatter != SG_NONE || simd_lane_access)
4290 	    free_data_ref (dr);
4291           return false;
4292         }
4293 
4294       STMT_VINFO_DATA_REF (stmt_info) = dr;
4295       if (simd_lane_access)
4296 	{
4297 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4298 	  free_data_ref (datarefs[i]);
4299 	  datarefs[i] = dr;
4300 	}
4301 
4302       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4303 	  && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4304 	  && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4305 	{
4306           if (dump_enabled_p ())
4307             {
4308               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4309                                "not vectorized: base object not addressable "
4310 			       "for stmt: ");
4311               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4312             }
4313           if (is_a <bb_vec_info> (vinfo))
4314 	    {
4315 	      /* In BB vectorization the ref can still participate
4316 	         in dependence analysis, we just can't vectorize it.  */
4317 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4318 	      continue;
4319 	    }
4320 	  return false;
4321 	}
4322 
4323       /* Set vectype for STMT.  */
4324       scalar_type = TREE_TYPE (DR_REF (dr));
4325       STMT_VINFO_VECTYPE (stmt_info)
4326 	= get_vectype_for_scalar_type (scalar_type);
4327       if (!STMT_VINFO_VECTYPE (stmt_info))
4328         {
4329           if (dump_enabled_p ())
4330             {
4331               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4332                                "not vectorized: no vectype for stmt: ");
4333               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4334               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4335               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4336                                  scalar_type);
4337               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4338             }
4339 
4340           if (is_a <bb_vec_info> (vinfo))
4341 	    {
4342 	      /* No vector type is fine, the ref can still participate
4343 	         in dependence analysis, we just can't vectorize it.  */
4344 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4345 	      continue;
4346 	    }
4347 
4348 	  if (gatherscatter != SG_NONE || simd_lane_access)
4349 	    {
4350 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4351 	      if (gatherscatter != SG_NONE)
4352 		free_data_ref (dr);
4353 	    }
4354 	  return false;
4355         }
4356       else
4357 	{
4358 	  if (dump_enabled_p ())
4359 	    {
4360 	      dump_printf_loc (MSG_NOTE, vect_location,
4361 			       "got vectype for stmt: ");
4362 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4363 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4364 				 STMT_VINFO_VECTYPE (stmt_info));
4365 	      dump_printf (MSG_NOTE, "\n");
4366 	    }
4367 	}
4368 
4369       /* Adjust the minimal vectorization factor according to the
4370 	 vector type.  */
4371       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4372       *min_vf = upper_bound (*min_vf, vf);
4373 
4374       if (gatherscatter != SG_NONE)
4375 	{
4376 	  gather_scatter_info gs_info;
4377 	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4378 					  &gs_info)
4379 	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4380 	    {
4381 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4382 	      free_data_ref (dr);
4383 	      if (dump_enabled_p ())
4384 		{
4385 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4386 				   (gatherscatter == GATHER) ?
4387 				   "not vectorized: not suitable for gather "
4388 				   "load " :
4389 				   "not vectorized: not suitable for scatter "
4390 				   "store ");
4391 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4392 		}
4393 	      return false;
4394 	    }
4395 
4396 	  free_data_ref (datarefs[i]);
4397 	  datarefs[i] = dr;
4398 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4399 	}
4400 
4401       else if (is_a <loop_vec_info> (vinfo)
4402 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4403 	{
4404 	  if (nested_in_vect_loop_p (loop, stmt))
4405 	    {
4406 	      if (dump_enabled_p ())
4407 		{
4408 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4409                                    "not vectorized: not suitable for strided "
4410                                    "load ");
4411 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4412 		}
4413 	      return false;
4414 	    }
4415 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4416 	}
4417     }
4418 
4419   /* If, when trying to vectorize a basic block, we stopped the analysis
4420      at the first dataref we could not analyze, mark the remaining datarefs
4421      as not vectorizable and truncate the vector of datarefs.  That avoids
4422      wasting time analyzing their dependences.  */
4423   if (i != datarefs.length ())
4424     {
4425       gcc_assert (is_a <bb_vec_info> (vinfo));
4426       for (unsigned j = i; j < datarefs.length (); ++j)
4427 	{
4428 	  data_reference_p dr = datarefs[j];
4429           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4430 	  free_data_ref (dr);
4431 	}
4432       datarefs.truncate (i);
4433     }
4434 
4435   return true;
4436 }
4437 
4438 
4439 /* Function vect_get_new_vect_var.
4440 
4441    Returns a name for a new variable.  The current naming scheme uses a
4442    prefix determined by VAR_KIND ("vect", "stmp", "mask" or "vectp") for
4443    vectorizer-generated variables, and appends NAME to it (separated by
4444    an underscore) if NAME is provided.  */
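/* Illustrative use (a sketch, not taken from a specific caller; the
   variable names are hypothetical):

     tree vptr_type = build_pointer_type (vectype);
     tree vptr = vect_get_new_vect_var (vptr_type, vect_pointer_var, "a");

   is expected to yield a temporary named along the lines of "vectp_a",
   while passing NULL for NAME names the temporary after the prefix alone.  */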
4445 
4446 tree
4447 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4448 {
4449   const char *prefix;
4450   tree new_vect_var;
4451 
4452   switch (var_kind)
4453   {
4454   case vect_simple_var:
4455     prefix = "vect";
4456     break;
4457   case vect_scalar_var:
4458     prefix = "stmp";
4459     break;
4460   case vect_mask_var:
4461     prefix = "mask";
4462     break;
4463   case vect_pointer_var:
4464     prefix = "vectp";
4465     break;
4466   default:
4467     gcc_unreachable ();
4468   }
4469 
4470   if (name)
4471     {
4472       char* tmp = concat (prefix, "_", name, NULL);
4473       new_vect_var = create_tmp_reg (type, tmp);
4474       free (tmp);
4475     }
4476   else
4477     new_vect_var = create_tmp_reg (type, prefix);
4478 
4479   return new_vect_var;
4480 }
4481 
4482 /* Like vect_get_new_vect_var but return an SSA name.  */
4483 
4484 tree
4485 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4486 {
4487   const char *prefix;
4488   tree new_vect_var;
4489 
4490   switch (var_kind)
4491   {
4492   case vect_simple_var:
4493     prefix = "vect";
4494     break;
4495   case vect_scalar_var:
4496     prefix = "stmp";
4497     break;
4498   case vect_pointer_var:
4499     prefix = "vectp";
4500     break;
4501   default:
4502     gcc_unreachable ();
4503   }
4504 
4505   if (name)
4506     {
4507       char* tmp = concat (prefix, "_", name, NULL);
4508       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4509       free (tmp);
4510     }
4511   else
4512     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4513 
4514   return new_vect_var;
4515 }
4516 
4517 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4518 
4519 static void
4520 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4521 {
4522   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4523   int misalign = DR_MISALIGNMENT (dr);
4524   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4525     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4526   else
4527     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4528 			    DR_TARGET_ALIGNMENT (dr), misalign);
4529 }
4530 
4531 /* Function vect_create_addr_base_for_vector_ref.
4532 
4533    Create an expression that computes the address of the first memory location
4534    that will be accessed for a data reference.
4535 
4536    Input:
4537    STMT: The statement containing the data reference.
4538    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4539    OFFSET: Optional.  If supplied, it is added to the initial address.
4540    LOOP:    Specifies relative to which loop-nest the address should be computed.
4541             For example, when the dataref is in an inner-loop nested in an
4542 	    outer-loop that is now being vectorized, LOOP can be either the
4543 	    outer-loop, or the inner-loop.  The first memory location accessed
4544 	    by the following dataref ('in' points to short):
4545 
4546 		for (i=0; i<N; i++)
4547 		   for (j=0; j<M; j++)
4548 		     s += in[i+j]
4549 
4550 	    is as follows:
4551 	    if LOOP=i_loop:	&in		(relative to i_loop)
4552 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4553    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4554 	    initial address.  Unlike OFFSET, which is a number of elements to
4555 	    be added, BYTE_OFFSET is measured in bytes.
4556 
4557    Output:
4558    1. Return an SSA_NAME whose value is the address of the memory location of
4559       the first vector of the data reference.
4560    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4561       these statement(s) which define the returned SSA_NAME.
4562 
4563    FORNOW: We are only handling array accesses with step 1.  */
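/* Rough sketch of the address arithmetic this function emits, written in
   plain C (the variable names are illustrative, not GCC internals):

     size_t base_offset = (size_t) drb_offset + (size_t) drb_init;
     if (offset)			/* OFFSET counts elements.  */
       base_offset += (size_t) offset * elem_size;
     if (byte_offset)			/* BYTE_OFFSET counts bytes.  */
       base_offset += (size_t) byte_offset;
     char *addr_base = (char *) drb_base_address + base_offset;

   For basic-block vectorization the address is simply taken as &DR_REF.
   The expression is gimplified into NEW_STMT_LIST and returned as an SSA
   name of vector-pointer type.  */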
4564 
4565 tree
4566 vect_create_addr_base_for_vector_ref (gimple *stmt,
4567 				      gimple_seq *new_stmt_list,
4568 				      tree offset,
4569 				      tree byte_offset)
4570 {
4571   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4572   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4573   const char *base_name;
4574   tree addr_base;
4575   tree dest;
4576   gimple_seq seq = NULL;
4577   tree vect_ptr_type;
4578   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4579   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4580   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4581 
4582   tree data_ref_base = unshare_expr (drb->base_address);
4583   tree base_offset = unshare_expr (drb->offset);
4584   tree init = unshare_expr (drb->init);
4585 
4586   if (loop_vinfo)
4587     base_name = get_name (data_ref_base);
4588   else
4589     {
4590       base_offset = ssize_int (0);
4591       init = ssize_int (0);
4592       base_name = get_name (DR_REF (dr));
4593     }
4594 
4595   /* Create base_offset */
4596   base_offset = size_binop (PLUS_EXPR,
4597 			    fold_convert (sizetype, base_offset),
4598 			    fold_convert (sizetype, init));
4599 
4600   if (offset)
4601     {
4602       offset = fold_build2 (MULT_EXPR, sizetype,
4603 			    fold_convert (sizetype, offset), step);
4604       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4605 				 base_offset, offset);
4606     }
4607   if (byte_offset)
4608     {
4609       byte_offset = fold_convert (sizetype, byte_offset);
4610       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4611 				 base_offset, byte_offset);
4612     }
4613 
4614   /* base + base_offset */
4615   if (loop_vinfo)
4616     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4617   else
4618     {
4619       addr_base = build1 (ADDR_EXPR,
4620 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4621 			  unshare_expr (DR_REF (dr)));
4622     }
4623 
4624   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4625   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4626   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4627   gimple_seq_add_seq (new_stmt_list, seq);
4628 
4629   if (DR_PTR_INFO (dr)
4630       && TREE_CODE (addr_base) == SSA_NAME
4631       && !SSA_NAME_PTR_INFO (addr_base))
4632     {
4633       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4634       if (offset || byte_offset)
4635 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4636     }
4637 
4638   if (dump_enabled_p ())
4639     {
4640       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4641       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4642       dump_printf (MSG_NOTE, "\n");
4643     }
4644 
4645   return addr_base;
4646 }
4647 
4648 
4649 /* Function vect_create_data_ref_ptr.
4650 
4651    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4652    location accessed in the loop by STMT, along with the def-use update
4653    chain to appropriately advance the pointer through the loop iterations.
4654    Also set aliasing information for the pointer.  This pointer is used by
4655    the callers to this function to create a memory reference expression for
4656    vector load/store access.
4657 
4658    Input:
4659    1. STMT: a stmt that references memory. Expected to be of the form
4660          GIMPLE_ASSIGN <name, data-ref> or
4661 	 GIMPLE_ASSIGN <data-ref, name>.
4662    2. AGGR_TYPE: the type of the reference, which should be either a vector
4663         or an array.
4664    3. AT_LOOP: the loop where the vector memref is to be created.
4665    4. OFFSET (optional): an offset to be added to the initial address accessed
4666         by the data-ref in STMT.
4667    5. BSI: location where the new stmts are to be placed if there is no loop
4668    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4669         pointing to the initial address.
4670    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4671 	to the initial address accessed by the data-ref in STMT.  This is
4672 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4673 	in bytes.
4674    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4675 	to the IV during each iteration of the loop.  NULL says to move
4676 	by one copy of AGGR_TYPE up or down, depending on the step of the
4677 	data reference.
4678 
4679    Output:
4680    1. Declare a new ptr to vector_type, and have it point to the base of the
4681       data reference (the initial address accessed by the data reference).
4682       For example, for vector of type V8HI, the following code is generated:
4683 
4684       v8hi *ap;
4685       ap = (v8hi *)initial_address;
4686 
4687       if OFFSET is not supplied:
4688          initial_address = &a[init];
4689       if OFFSET is supplied:
4690          initial_address = &a[init + OFFSET];
4691       if BYTE_OFFSET is supplied:
4692 	 initial_address = &a[init] + BYTE_OFFSET;
4693 
4694       Return the initial_address in INITIAL_ADDRESS.
4695 
4696    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4697       update the pointer in each iteration of the loop.
4698 
4699       Return the increment stmt that updates the pointer in PTR_INCR.
4700 
4701    3. Set INV_P to true if the access pattern of the data reference in the
4702       vectorized loop is invariant.  Set it to false otherwise.
4703 
4704    4. Return the pointer.  */
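/* Rough sketch of the pointer evolution set up when ONLY_INIT is false,
   written in plain C (names are illustrative, not GCC internals):

     char *ap = initial_address;	/* emitted in the loop preheader  */
     for (;;)				/* the vectorized loop  */
       {
	 ... use ap ...			/* callers build the memref  */
	 ap += iv_step;			/* default: sizeof (AGGR_TYPE);
					   negated for negative DR steps,
					   zero for invariant accesses  */
       }

   create_iv materializes this as a loop-header PHI plus an increment
   statement, which is what PTR_INCR ends up pointing at.  */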
4705 
4706 tree
4707 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4708 			  tree offset, tree *initial_address,
4709 			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
4710 			  bool only_init, bool *inv_p, tree byte_offset,
4711 			  tree iv_step)
4712 {
4713   const char *base_name;
4714   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4715   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4716   struct loop *loop = NULL;
4717   bool nested_in_vect_loop = false;
4718   struct loop *containing_loop = NULL;
4719   tree aggr_ptr_type;
4720   tree aggr_ptr;
4721   tree new_temp;
4722   gimple_seq new_stmt_list = NULL;
4723   edge pe = NULL;
4724   basic_block new_bb;
4725   tree aggr_ptr_init;
4726   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4727   tree aptr;
4728   gimple_stmt_iterator incr_gsi;
4729   bool insert_after;
4730   tree indx_before_incr, indx_after_incr;
4731   gimple *incr;
4732   tree step;
4733   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4734 
4735   gcc_assert (iv_step != NULL_TREE
4736 	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4737 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4738 
4739   if (loop_vinfo)
4740     {
4741       loop = LOOP_VINFO_LOOP (loop_vinfo);
4742       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4743       containing_loop = (gimple_bb (stmt))->loop_father;
4744       pe = loop_preheader_edge (loop);
4745     }
4746   else
4747     {
4748       gcc_assert (bb_vinfo);
4749       only_init = true;
4750       *ptr_incr = NULL;
4751     }
4752 
4753   /* Check the step (evolution) of the load in LOOP, and record
4754      whether it's invariant.  */
4755   step = vect_dr_behavior (dr)->step;
4756   if (integer_zerop (step))
4757     *inv_p = true;
4758   else
4759     *inv_p = false;
4760 
4761   /* Create an expression for the first address accessed by this load
4762      in LOOP.  */
4763   base_name = get_name (DR_BASE_ADDRESS (dr));
4764 
4765   if (dump_enabled_p ())
4766     {
4767       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4768       dump_printf_loc (MSG_NOTE, vect_location,
4769                        "create %s-pointer variable to type: ",
4770 		       get_tree_code_name (TREE_CODE (aggr_type)));
4771       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4772       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4773         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4774       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4775         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4776       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4777         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4778       else
4779         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4780       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4781       dump_printf (MSG_NOTE, "\n");
4782     }
4783 
4784   /* (1) Create the new aggregate-pointer variable.
4785      Vector and array types inherit the alias set of their component
4786      type by default so we need to use a ref-all pointer if the data
4787      reference does not conflict with the created aggregated data
4788      reference because it is not addressable.  */
4789   bool need_ref_all = false;
4790   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4791 			      get_alias_set (DR_REF (dr))))
4792     need_ref_all = true;
4793   /* Likewise for any of the data references in the stmt group.  */
4794   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4795     {
4796       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4797       do
4798 	{
4799 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4800 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4801 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4802 				      get_alias_set (DR_REF (sdr))))
4803 	    {
4804 	      need_ref_all = true;
4805 	      break;
4806 	    }
4807 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4808 	}
4809       while (orig_stmt);
4810     }
4811   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4812 					       need_ref_all);
4813   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4814 
4815 
4816   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4817      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4818      def-use update cycles for the pointer: one relative to the outer-loop
4819      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4820      to the inner-loop (which is the inner-most loop containing the dataref),
4821      and this is done by step (5) below.
4822 
4823      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4824      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4825      redundant.  Steps (3),(4) create the following:
4826 
4827 	vp0 = &base_addr;
4828 	LOOP:	vp1 = phi(vp0,vp2)
4829 		...
4830 		...
4831 		vp2 = vp1 + step
4832 		goto LOOP
4833 
4834      If there is an inner-loop nested in loop, then step (5) will also be
4835      applied, and an additional update in the inner-loop will be created:
4836 
4837 	vp0 = &base_addr;
4838 	LOOP:   vp1 = phi(vp0,vp2)
4839 		...
4840         inner:     vp3 = phi(vp1,vp4)
4841 	           vp4 = vp3 + inner_step
4842 	           if () goto inner
4843 		...
4844 		vp2 = vp1 + step
4845 		if () goto LOOP   */
4846 
4847   /* (2) Calculate the initial address of the aggregate-pointer, and set
4848      the aggregate-pointer to point to it before the loop.  */
4849 
4850   /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader.  */
4851 
4852   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4853 						   offset, byte_offset);
4854   if (new_stmt_list)
4855     {
4856       if (pe)
4857         {
4858           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4859           gcc_assert (!new_bb);
4860         }
4861       else
4862         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4863     }
4864 
4865   *initial_address = new_temp;
4866   aggr_ptr_init = new_temp;
4867 
4868   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4869      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4870      inner-loop nested in LOOP (during outer-loop vectorization).  */
4871 
4872   /* No update in loop is required.  */
4873   if (only_init && (!loop_vinfo || at_loop == loop))
4874     aptr = aggr_ptr_init;
4875   else
4876     {
4877       if (iv_step == NULL_TREE)
4878 	{
4879 	  /* The step of the aggregate pointer is the type size.  */
4880 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
4881 	  /* One exception to the above is when the scalar step of the load in
4882 	     LOOP is zero. In this case the step here is also zero.  */
4883 	  if (*inv_p)
4884 	    iv_step = size_zero_node;
4885 	  else if (tree_int_cst_sgn (step) == -1)
4886 	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4887 	}
4888 
4889       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4890 
4891       create_iv (aggr_ptr_init,
4892 		 fold_convert (aggr_ptr_type, iv_step),
4893 		 aggr_ptr, loop, &incr_gsi, insert_after,
4894 		 &indx_before_incr, &indx_after_incr);
4895       incr = gsi_stmt (incr_gsi);
4896       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4897 
4898       /* Copy the points-to information if it exists. */
4899       if (DR_PTR_INFO (dr))
4900 	{
4901 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4902 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4903 	}
4904       if (ptr_incr)
4905 	*ptr_incr = incr;
4906 
4907       aptr = indx_before_incr;
4908     }
4909 
4910   if (!nested_in_vect_loop || only_init)
4911     return aptr;
4912 
4913 
4914   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4915      nested in LOOP, if it exists.  */
4916 
4917   gcc_assert (nested_in_vect_loop);
4918   if (!only_init)
4919     {
4920       standard_iv_increment_position (containing_loop, &incr_gsi,
4921 				      &insert_after);
4922       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4923 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4924 		 &indx_after_incr);
4925       incr = gsi_stmt (incr_gsi);
4926       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4927 
4928       /* Copy the points-to information if it exists. */
4929       if (DR_PTR_INFO (dr))
4930 	{
4931 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4932 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4933 	}
4934       if (ptr_incr)
4935 	*ptr_incr = incr;
4936 
4937       return indx_before_incr;
4938     }
4939   else
4940     gcc_unreachable ();
4941 }
4942 
4943 
4944 /* Function bump_vector_ptr
4945 
4946    Increment a pointer (to a vector type) by vector-size. If requested,
4947    i.e. if PTR_INCR is given, then also connect the new increment stmt
4948    to the existing def-use update-chain of the pointer, by modifying
4949    the PTR_INCR as illustrated below:
4950 
4951    The pointer def-use update-chain before this function:
4952                         DATAREF_PTR = phi (p_0, p_2)
4953                         ....
4954         PTR_INCR:       p_2 = DATAREF_PTR + step
4955 
4956    The pointer def-use update-chain after this function:
4957                         DATAREF_PTR = phi (p_0, p_2)
4958                         ....
4959                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4960                         ....
4961         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4962 
4963    Input:
4964    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4965                  in the loop.
4966    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4967 	      the loop.  The increment amount across iterations is expected
4968 	      to be vector_size.
4969    BSI - location where the new update stmt is to be placed.
4970    STMT - the original scalar memory-access stmt that is being vectorized.
4971    BUMP - optional. The offset by which to bump the pointer. If not given,
4972 	  the offset is assumed to be vector_size.
4973 
4974    Output: Return NEW_DATAREF_PTR as illustrated above.
4975 
4976 */
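/* Illustrative use (a sketch, not taken from a specific caller): when a
   group of vectors is stored back to back, each access after the first
   can advance the pointer by one vector:

     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
				    NULL_TREE);	/* bump by vector size  */

   Passing a BUMP tree instead of NULL_TREE moves the pointer by an
   arbitrary byte amount.  */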
4977 
4978 tree
4979 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4980 		 gimple *stmt, tree bump)
4981 {
4982   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4983   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4984   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4985   tree update = TYPE_SIZE_UNIT (vectype);
4986   gassign *incr_stmt;
4987   ssa_op_iter iter;
4988   use_operand_p use_p;
4989   tree new_dataref_ptr;
4990 
4991   if (bump)
4992     update = bump;
4993 
4994   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4995     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4996   else
4997     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4998   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4999 				   dataref_ptr, update);
5000   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
5001 
5002   /* Copy the points-to information if it exists. */
5003   if (DR_PTR_INFO (dr))
5004     {
5005       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5006       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5007     }
5008 
5009   if (!ptr_incr)
5010     return new_dataref_ptr;
5011 
5012   /* Update the vector-pointer's cross-iteration increment.  */
5013   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5014     {
5015       tree use = USE_FROM_PTR (use_p);
5016 
5017       if (use == dataref_ptr)
5018         SET_USE (use_p, new_dataref_ptr);
5019       else
5020         gcc_assert (operand_equal_p (use, update, 0));
5021     }
5022 
5023   return new_dataref_ptr;
5024 }
5025 
5026 
5027 /* Copy memory reference info such as base/clique from the SRC reference
5028    to the DEST MEM_REF.  */
5029 
5030 void
5031 vect_copy_ref_info (tree dest, tree src)
5032 {
5033   if (TREE_CODE (dest) != MEM_REF)
5034     return;
5035 
5036   tree src_base = src;
5037   while (handled_component_p (src_base))
5038     src_base = TREE_OPERAND (src_base, 0);
5039   if (TREE_CODE (src_base) != MEM_REF
5040       && TREE_CODE (src_base) != TARGET_MEM_REF)
5041     return;
5042 
5043   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5044   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5045 }
5046 
5047 
5048 /* Function vect_create_destination_var.
5049 
5050    Create a new temporary of type VECTYPE.  */
5051 
5052 tree
5053 vect_create_destination_var (tree scalar_dest, tree vectype)
5054 {
5055   tree vec_dest;
5056   const char *name;
5057   char *new_name;
5058   tree type;
5059   enum vect_var_kind kind;
5060 
5061   kind = vectype
5062     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5063     ? vect_mask_var
5064     : vect_simple_var
5065     : vect_scalar_var;
5066   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5067 
5068   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5069 
5070   name = get_name (scalar_dest);
5071   if (name)
5072     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5073   else
5074     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5075   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5076   free (new_name);
5077 
5078   return vec_dest;
5079 }
5080 
5081 /* Function vect_grouped_store_supported.
5082 
5083    Returns TRUE if interleave high and interleave low permutations
5084    are supported, and FALSE otherwise.  */
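/* For the power-of-two case the two selectors that must be supported
   are, for two NELT-element inputs (a sketch, indices only):

     { 0, nelt, 1, nelt + 1, ... }			(first halves interleaved)
     { nelt/2, nelt + nelt/2, nelt/2 + 1, ... }		(second halves interleaved)

   i.e. the interleave_high and interleave_low operations described in
   vect_permute_store_chain; the code below asks the target about both
   via can_vec_perm_const_p.  */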
5085 
5086 bool
5087 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5088 {
5089   machine_mode mode = TYPE_MODE (vectype);
5090 
5091   /* vect_permute_store_chain requires the group size to be equal to 3 or
5092      be a power of two.  */
5093   if (count != 3 && exact_log2 (count) == -1)
5094     {
5095       if (dump_enabled_p ())
5096 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097 			 "the size of the group of accesses"
5098 			 " is not a power of 2 or not equal to 3\n");
5099       return false;
5100     }
5101 
5102   /* Check that the permutation is supported.  */
5103   if (VECTOR_MODE_P (mode))
5104     {
5105       unsigned int i;
5106       if (count == 3)
5107 	{
5108 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5109 	  unsigned int i, j;
5110 
5111 	  unsigned int nelt;
5112 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5113 	    {
5114 	      if (dump_enabled_p ())
5115 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5116 				 "cannot handle groups of 3 stores for"
5117 				 " variable-length vectors\n");
5118 	      return false;
5119 	    }
5120 
5121 	  vec_perm_builder sel (nelt, nelt, 1);
5122 	  sel.quick_grow (nelt);
5123 	  vec_perm_indices indices;
5124 	  for (j = 0; j < 3; j++)
5125 	    {
5126 	      int nelt0 = ((3 - j) * nelt) % 3;
5127 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5128 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5129 	      for (i = 0; i < nelt; i++)
5130 		{
5131 		  if (3 * i + nelt0 < nelt)
5132 		    sel[3 * i + nelt0] = j0++;
5133 		  if (3 * i + nelt1 < nelt)
5134 		    sel[3 * i + nelt1] = nelt + j1++;
5135 		  if (3 * i + nelt2 < nelt)
5136 		    sel[3 * i + nelt2] = 0;
5137 		}
5138 	      indices.new_vector (sel, 2, nelt);
5139 	      if (!can_vec_perm_const_p (mode, indices))
5140 		{
5141 		  if (dump_enabled_p ())
5142 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5143 				 "permutation op not supported by target.\n");
5144 		  return false;
5145 		}
5146 
5147 	      for (i = 0; i < nelt; i++)
5148 		{
5149 		  if (3 * i + nelt0 < nelt)
5150 		    sel[3 * i + nelt0] = 3 * i + nelt0;
5151 		  if (3 * i + nelt1 < nelt)
5152 		    sel[3 * i + nelt1] = 3 * i + nelt1;
5153 		  if (3 * i + nelt2 < nelt)
5154 		    sel[3 * i + nelt2] = nelt + j2++;
5155 		}
5156 	      indices.new_vector (sel, 2, nelt);
5157 	      if (!can_vec_perm_const_p (mode, indices))
5158 		{
5159 		  if (dump_enabled_p ())
5160 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5161 				 "permutation op not supported by target.\n");
5162 		  return false;
5163 		}
5164 	    }
5165 	  return true;
5166 	}
5167       else
5168 	{
5169 	  /* If the length is not equal to 3, then only powers of 2 are supported.  */
5170 	  gcc_assert (pow2p_hwi (count));
5171 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5172 
5173 	  /* The encoding has 2 interleaved stepped patterns.  */
5174 	  vec_perm_builder sel (nelt, 2, 3);
5175 	  sel.quick_grow (6);
5176 	  for (i = 0; i < 3; i++)
5177 	    {
5178 	      sel[i * 2] = i;
5179 	      sel[i * 2 + 1] = i + nelt;
5180 	    }
5181 	  vec_perm_indices indices (sel, 2, nelt);
5182 	  if (can_vec_perm_const_p (mode, indices))
5183 	    {
5184 	      for (i = 0; i < 6; i++)
5185 		sel[i] += exact_div (nelt, 2);
5186 	      indices.new_vector (sel, 2, nelt);
5187 	      if (can_vec_perm_const_p (mode, indices))
5188 		return true;
5189 	    }
5190 	}
5191     }
5192 
5193   if (dump_enabled_p ())
5194     dump_printf (MSG_MISSED_OPTIMIZATION,
5195 		 "permutation op not supported by target.\n");
5196   return false;
5197 }
5198 
5199 
5200 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5201    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5202 
5203 bool
5204 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5205 			    bool masked_p)
5206 {
5207   if (masked_p)
5208     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5209 					 vec_mask_store_lanes_optab,
5210 					 vectype, count);
5211   else
5212     return vect_lanes_optab_supported_p ("vec_store_lanes",
5213 					 vec_store_lanes_optab,
5214 					 vectype, count);
5215 }
5216 
5217 
5218 /* Function vect_permute_store_chain.
5219 
5220    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5221    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5222    the data correctly for the stores.  Return the final references for stores
5223    in RESULT_CHAIN.
5224 
5225    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5226    The input is 4 vectors each containing 8 elements.  We assign a number to
5227    each element, the input sequence is:
5228 
5229    1st vec:   0  1  2  3  4  5  6  7
5230    2nd vec:   8  9 10 11 12 13 14 15
5231    3rd vec:  16 17 18 19 20 21 22 23
5232    4th vec:  24 25 26 27 28 29 30 31
5233 
5234    The output sequence should be:
5235 
5236    1st vec:  0  8 16 24  1  9 17 25
5237    2nd vec:  2 10 18 26  3 11 19 27
5238    3rd vec:  4 12 20 28  5 13 21 29
5239    4th vec:  6 14 22 30  7 15 23 31
5240 
5241    i.e., we interleave the contents of the four vectors in their order.
5242 
5243    We use interleave_high/low instructions to create such output.  The input of
5244    each interleave_high/low operation is two vectors:
5245    1st vec    2nd vec
5246    0 1 2 3    4 5 6 7
5247    the even elements of the result vector are obtained left-to-right from the
5248    high/low elements of the first vector.  The odd elements of the result are
5249    obtained left-to-right from the high/low elements of the second vector.
5250    The output of interleave_high will be:   0 4 1 5
5251    and of interleave_low:                   2 6 3 7
5252 
5253 
5254    The permutation is done in log LENGTH stages.  In each stage interleave_high
5255    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5256    where the first argument is taken from the first half of DR_CHAIN and the
5257    second argument from its second half.
5258    In our example,
5259 
5260    I1: interleave_high (1st vec, 3rd vec)
5261    I2: interleave_low (1st vec, 3rd vec)
5262    I3: interleave_high (2nd vec, 4th vec)
5263    I4: interleave_low (2nd vec, 4th vec)
5264 
5265    The output for the first stage is:
5266 
5267    I1:  0 16  1 17  2 18  3 19
5268    I2:  4 20  5 21  6 22  7 23
5269    I3:  8 24  9 25 10 26 11 27
5270    I4: 12 28 13 29 14 30 15 31
5271 
5272    The output of the second stage, i.e. the final result is:
5273 
5274    I1:  0  8 16 24  1  9 17 25
5275    I2:  2 10 18 26  3 11 19 27
5276    I3:  4 12 20 28  5 13 21 29
5277    I4:  6 14 22 30  7 15 23 31.  */
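/* A standalone scalar model of one interleaving operation (a plain C
   sketch, not GCC code; interleave () is a hypothetical helper):

     static void
     interleave (int *dst, const int *v1, const int *v2,
		 unsigned int nelt, int high)
     {
       unsigned int base = high ? 0 : nelt / 2;
       for (unsigned int i = 0; i < nelt / 2; i++)
	 {
	   dst[2 * i] = v1[base + i];		/* element of 1st input  */
	   dst[2 * i + 1] = v2[base + i];	/* element of 2nd input  */
	 }
     }

   With v1 = {0, 1, 2, 3} and v2 = {4, 5, 6, 7}, the high form produces
   {0, 4, 1, 5} and the low form {2, 6, 3, 7}, matching the example above.
   Applying both forms to every pair for log2 (LENGTH) stages, as the loop
   at the end of this function does with VEC_PERM_EXPRs, yields the fully
   interleaved sequence documented above.  */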
5278 
5279 void
5280 vect_permute_store_chain (vec<tree> dr_chain,
5281 			  unsigned int length,
5282 			  gimple *stmt,
5283 			  gimple_stmt_iterator *gsi,
5284 			  vec<tree> *result_chain)
5285 {
5286   tree vect1, vect2, high, low;
5287   gimple *perm_stmt;
5288   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5289   tree perm_mask_low, perm_mask_high;
5290   tree data_ref;
5291   tree perm3_mask_low, perm3_mask_high;
5292   unsigned int i, j, n, log_length = exact_log2 (length);
5293 
5294   result_chain->quick_grow (length);
5295   memcpy (result_chain->address (), dr_chain.address (),
5296 	  length * sizeof (tree));
5297 
5298   if (length == 3)
5299     {
5300       /* vect_grouped_store_supported ensures that this is constant.  */
5301       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5302       unsigned int j0 = 0, j1 = 0, j2 = 0;
5303 
5304       vec_perm_builder sel (nelt, nelt, 1);
5305       sel.quick_grow (nelt);
5306       vec_perm_indices indices;
5307       for (j = 0; j < 3; j++)
5308         {
5309 	  int nelt0 = ((3 - j) * nelt) % 3;
5310 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5311 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5312 
5313 	  for (i = 0; i < nelt; i++)
5314 	    {
5315 	      if (3 * i + nelt0 < nelt)
5316 		sel[3 * i + nelt0] = j0++;
5317 	      if (3 * i + nelt1 < nelt)
5318 		sel[3 * i + nelt1] = nelt + j1++;
5319 	      if (3 * i + nelt2 < nelt)
5320 		sel[3 * i + nelt2] = 0;
5321 	    }
5322 	  indices.new_vector (sel, 2, nelt);
5323 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5324 
5325 	  for (i = 0; i < nelt; i++)
5326 	    {
5327 	      if (3 * i + nelt0 < nelt)
5328 		sel[3 * i + nelt0] = 3 * i + nelt0;
5329 	      if (3 * i + nelt1 < nelt)
5330 		sel[3 * i + nelt1] = 3 * i + nelt1;
5331 	      if (3 * i + nelt2 < nelt)
5332 		sel[3 * i + nelt2] = nelt + j2++;
5333 	    }
5334 	  indices.new_vector (sel, 2, nelt);
5335 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5336 
5337 	  vect1 = dr_chain[0];
5338 	  vect2 = dr_chain[1];
5339 
5340 	  /* Create interleaving stmt:
5341 	     low = VEC_PERM_EXPR <vect1, vect2,
5342 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5343 				   j + 2, nelt + j + 2, *, ...}>  */
5344 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5345 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5346 					   vect2, perm3_mask_low);
5347 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5348 
5349 	  vect1 = data_ref;
5350 	  vect2 = dr_chain[2];
5351 	  /* Create interleaving stmt:
5352 	     low = VEC_PERM_EXPR <vect1, vect2,
5353 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5354 				   6, 7, nelt + j + 2, ...}>  */
5355 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5356 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5357 					   vect2, perm3_mask_high);
5358 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5359 	  (*result_chain)[j] = data_ref;
5360 	}
5361     }
5362   else
5363     {
5364       /* If the length is not equal to 3, then only powers of 2 are supported.  */
5365       gcc_assert (pow2p_hwi (length));
5366 
5367       /* The encoding has 2 interleaved stepped patterns.  */
5368       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5369       vec_perm_builder sel (nelt, 2, 3);
5370       sel.quick_grow (6);
5371       for (i = 0; i < 3; i++)
5372 	{
5373 	  sel[i * 2] = i;
5374 	  sel[i * 2 + 1] = i + nelt;
5375 	}
5376       vec_perm_indices indices (sel, 2, nelt);
5377       perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5378 
5379       for (i = 0; i < 6; i++)
5380 	sel[i] += exact_div (nelt, 2);
5381       indices.new_vector (sel, 2, nelt);
5382       perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5383 
5384       for (i = 0, n = log_length; i < n; i++)
5385 	{
5386 	  for (j = 0; j < length/2; j++)
5387 	    {
5388 	      vect1 = dr_chain[j];
5389 	      vect2 = dr_chain[j+length/2];
5390 
5391 	      /* Create interleaving stmt:
5392 		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5393 						      ...}>  */
5394 	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5395 	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5396 					       vect2, perm_mask_high);
5397 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5398 	      (*result_chain)[2*j] = high;
5399 
5400 	      /* Create interleaving stmt:
5401 		 low = VEC_PERM_EXPR <vect1, vect2,
5402 				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5403 				       ...}>  */
5404 	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5405 	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5406 					       vect2, perm_mask_low);
5407 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5408 	      (*result_chain)[2*j+1] = low;
5409 	    }
5410 	  memcpy (dr_chain.address (), result_chain->address (),
5411 		  length * sizeof (tree));
5412 	}
5413     }
5414 }
5415 
5416 /* Function vect_setup_realignment
5417 
5418    This function is called when vectorizing an unaligned load using
5419    the dr_explicit_realign[_optimized] scheme.
5420    This function generates the following code at the loop prolog:
5421 
5422       p = initial_addr;
5423    x  msq_init = *(floor(p));   # prolog load
5424       realignment_token = call target_builtin;
5425     loop:
5426    x  msq = phi (msq_init, ---)
5427 
5428    The stmts marked with x are generated only for the case of
5429    dr_explicit_realign_optimized.
5430 
5431    The code above sets up a new (vector) pointer, pointing to the first
5432    location accessed by STMT, and a "floor-aligned" load using that pointer.
5433    It also generates code to compute the "realignment-token" (if the relevant
5434    target hook was defined), and creates a phi-node at the loop-header bb
5435    whose arguments are the result of the prolog-load (created by this
5436    function) and the result of a load that takes place in the loop (to be
5437    created by the caller to this function).
5438 
5439    For the case of dr_explicit_realign_optimized:
5440    The caller to this function uses the phi-result (msq) to create the
5441    realignment code inside the loop, and sets up the missing phi argument,
5442    as follows:
5443     loop:
5444       msq = phi (msq_init, lsq)
5445       lsq = *(floor(p'));        # load in loop
5446       result = realign_load (msq, lsq, realignment_token);
5447 
5448    For the case of dr_explicit_realign:
5449     loop:
5450       msq = *(floor(p)); 	# load in loop
5451       p' = p + (VS-1);
5452       lsq = *(floor(p'));	# load in loop
5453       result = realign_load (msq, lsq, realignment_token);
5454 
5455    Input:
5456    STMT - (scalar) load stmt to be vectorized. This load accesses
5457           a memory location that may be unaligned.
5458    BSI - place where new code is to be inserted.
5459    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5460 			      is used.
5461 
5462    Output:
5463    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5464                        target hook, if defined.
5465    Return value - the result of the loop-header phi node.  */
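/* A sketch of what "floor (p)" denotes here (illustrative only): for the
   optimized scheme, the prolog load masks off the low bits of the pointer
   so that the access is aligned to the target alignment of the data
   reference, e.g.

     char *aligned_p = (char *) ((uintptr_t) p & -(uintptr_t) align);
     msq_init = *(vector_type *) aligned_p;

   which mirrors the BIT_AND_EXPR with -align that step 3 below builds and
   inserts on the loop-preheader edge.  */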
5466 
5467 tree
5468 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5469                         tree *realignment_token,
5470 			enum dr_alignment_support alignment_support_scheme,
5471 			tree init_addr,
5472 			struct loop **at_loop)
5473 {
5474   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5475   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5476   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5477   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5478   struct loop *loop = NULL;
5479   edge pe = NULL;
5480   tree scalar_dest = gimple_assign_lhs (stmt);
5481   tree vec_dest;
5482   gimple *inc;
5483   tree ptr;
5484   tree data_ref;
5485   basic_block new_bb;
5486   tree msq_init = NULL_TREE;
5487   tree new_temp;
5488   gphi *phi_stmt;
5489   tree msq = NULL_TREE;
5490   gimple_seq stmts = NULL;
5491   bool inv_p;
5492   bool compute_in_loop = false;
5493   bool nested_in_vect_loop = false;
5494   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5495   struct loop *loop_for_initial_load = NULL;
5496 
5497   if (loop_vinfo)
5498     {
5499       loop = LOOP_VINFO_LOOP (loop_vinfo);
5500       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5501     }
5502 
5503   gcc_assert (alignment_support_scheme == dr_explicit_realign
5504 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5505 
5506   /* We need to generate three things:
5507      1. the misalignment computation
5508      2. the extra vector load (for the optimized realignment scheme).
5509      3. the phi node for the two vectors from which the realignment is
5510       done (for the optimized realignment scheme).  */
5511 
5512   /* 1. Determine where to generate the misalignment computation.
5513 
5514      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5515      calculation will be generated by this function, outside the loop (in the
5516      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5517      caller, inside the loop.
5518 
5519      Background: If the misalignment remains fixed throughout the iterations of
5520      the loop, then both realignment schemes are applicable, and also the
5521      misalignment computation can be done outside LOOP.  This is because we are
5522      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5523      are a multiple of VS (the Vector Size), and therefore the misalignment in
5524      different vectorized LOOP iterations is always the same.
5525      The problem arises only if the memory access is in an inner-loop nested
5526      inside LOOP, which is now being vectorized using outer-loop vectorization.
5527      This is the only case when the misalignment of the memory access may not
5528      remain fixed throughout the iterations of the inner-loop (as explained in
5529      detail in vect_supportable_dr_alignment).  In this case, not only is the
5530      optimized realignment scheme not applicable, but also the misalignment
5531      computation (and generation of the realignment token that is passed to
5532      REALIGN_LOAD) have to be done inside the loop.
5533 
5534      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5535      or not, which in turn determines if the misalignment is computed inside
5536      the inner-loop, or outside LOOP.  */
5537 
5538   if (init_addr != NULL_TREE || !loop_vinfo)
5539     {
5540       compute_in_loop = true;
5541       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5542     }
5543 
5544 
5545   /* 2. Determine where to generate the extra vector load.
5546 
5547      For the optimized realignment scheme, instead of generating two vector
5548      loads in each iteration, we generate a single extra vector load in the
5549      preheader of the loop, and in each iteration reuse the result of the
5550      vector load from the previous iteration.  In case the memory access is in
5551      an inner-loop nested inside LOOP, which is now being vectorized using
5552      outer-loop vectorization, we need to determine whether this initial vector
5553      load should be generated at the preheader of the inner-loop, or can be
5554      generated at the preheader of LOOP.  If the memory access has no evolution
5555      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5556      to be generated inside LOOP (in the preheader of the inner-loop).  */
5557 
5558   if (nested_in_vect_loop)
5559     {
5560       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5561       bool invariant_in_outerloop =
5562             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5563       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5564     }
5565   else
5566     loop_for_initial_load = loop;
5567   if (at_loop)
5568     *at_loop = loop_for_initial_load;
5569 
5570   if (loop_for_initial_load)
5571     pe = loop_preheader_edge (loop_for_initial_load);
5572 
5573   /* 3. For the case of the optimized realignment, create the first vector
5574       load at the loop preheader.  */
5575 
5576   if (alignment_support_scheme == dr_explicit_realign_optimized)
5577     {
5578       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5579       gassign *new_stmt;
5580 
5581       gcc_assert (!compute_in_loop);
5582       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5583       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5584 				      NULL_TREE, &init_addr, NULL, &inc,
5585 				      true, &inv_p);
5586       if (TREE_CODE (ptr) == SSA_NAME)
5587 	new_temp = copy_ssa_name (ptr);
5588       else
5589 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5590       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5591       new_stmt = gimple_build_assign
5592 		   (new_temp, BIT_AND_EXPR, ptr,
5593 		    build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5594       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5595       gcc_assert (!new_bb);
5596       data_ref
5597 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5598 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5599       vect_copy_ref_info (data_ref, DR_REF (dr));
5600       new_stmt = gimple_build_assign (vec_dest, data_ref);
5601       new_temp = make_ssa_name (vec_dest, new_stmt);
5602       gimple_assign_set_lhs (new_stmt, new_temp);
5603       if (pe)
5604         {
5605           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5606           gcc_assert (!new_bb);
5607         }
5608       else
5609          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5610 
5611       msq_init = gimple_assign_lhs (new_stmt);
5612     }
5613 
5614   /* 4. Create realignment token using a target builtin, if available.
5615       It is done either inside the containing loop, or before LOOP (as
5616       determined above).  */
5617 
5618   if (targetm.vectorize.builtin_mask_for_load)
5619     {
5620       gcall *new_stmt;
5621       tree builtin_decl;
5622 
5623       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5624       if (!init_addr)
5625 	{
5626 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5627 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5628 							    NULL_TREE);
5629           if (loop)
5630             {
5631    	      pe = loop_preheader_edge (loop);
5632 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5633 	      gcc_assert (!new_bb);
5634             }
5635           else
5636              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5637 	}
5638 
5639       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5640       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5641       vec_dest =
5642 	vect_create_destination_var (scalar_dest,
5643 				     gimple_call_return_type (new_stmt));
5644       new_temp = make_ssa_name (vec_dest, new_stmt);
5645       gimple_call_set_lhs (new_stmt, new_temp);
5646 
5647       if (compute_in_loop)
5648 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5649       else
5650 	{
5651 	  /* Generate the misalignment computation outside LOOP.  */
5652 	  pe = loop_preheader_edge (loop);
5653 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5654 	  gcc_assert (!new_bb);
5655 	}
5656 
5657       *realignment_token = gimple_call_lhs (new_stmt);
5658 
5659       /* The result of the CALL_EXPR to this builtin is determined from
5660          the value of the parameter and no global variables are touched
5661          which makes the builtin a "const" function.  Requiring the
5662          builtin to have the "const" attribute makes it unnecessary
5663          to call mark_call_clobbered.  */
5664       gcc_assert (TREE_READONLY (builtin_decl));
5665     }
5666 
5667   if (alignment_support_scheme == dr_explicit_realign)
5668     return msq;
5669 
5670   gcc_assert (!compute_in_loop);
5671   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5672 
5673 
5674   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5675 
5676   pe = loop_preheader_edge (containing_loop);
5677   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5678   msq = make_ssa_name (vec_dest);
5679   phi_stmt = create_phi_node (msq, containing_loop->header);
5680   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5681 
5682   return msq;
5683 }
5684 
5685 
5686 /* Function vect_grouped_load_supported.
5687 
5688    COUNT is the size of the load group (the number of statements plus the
5689    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5690    only one statement, with a gap of COUNT - 1.
5691 
5692    Returns true if a suitable permute exists.  */
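/* For the power-of-two case the selectors checked below are the
   extract-even and extract-odd patterns, e.g. for two 4-element inputs
   (a sketch; the indices select from the concatenation of both inputs):

     { 0, 2, 4, 6 }	(even elements)
     { 1, 3, 5, 7 }	(odd elements)

   encoded as a single stepped pattern so that the check also works for
   variable-length vectors.  */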
5693 
5694 bool
5695 vect_grouped_load_supported (tree vectype, bool single_element_p,
5696 			     unsigned HOST_WIDE_INT count)
5697 {
5698   machine_mode mode = TYPE_MODE (vectype);
5699 
5700   /* If this is single-element interleaving with an element distance
5701      that leaves unused vector loads around, punt - we at least create
5702      very sub-optimal code in that case (and blow up memory,
5703      see PR65518).  */
5704   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5705     {
5706       if (dump_enabled_p ())
5707 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5708 			 "single-element interleaving not supported "
5709 			 "for not adjacent vector loads\n");
5710       return false;
5711     }
5712 
5713   /* vect_permute_load_chain requires the group size to be equal to 3 or
5714      be a power of two.  */
5715   if (count != 3 && exact_log2 (count) == -1)
5716     {
5717       if (dump_enabled_p ())
5718 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 			 "the size of the group of accesses"
5720 			 " is not a power of 2 or not equal to 3\n");
5721       return false;
5722     }
5723 
5724   /* Check that the permutation is supported.  */
5725   if (VECTOR_MODE_P (mode))
5726     {
5727       unsigned int i, j;
5728       if (count == 3)
5729 	{
5730 	  unsigned int nelt;
5731 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5732 	    {
5733 	      if (dump_enabled_p ())
5734 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5735 				 "cannot handle groups of 3 loads for"
5736 				 " variable-length vectors\n");
5737 	      return false;
5738 	    }
5739 
5740 	  vec_perm_builder sel (nelt, nelt, 1);
5741 	  sel.quick_grow (nelt);
5742 	  vec_perm_indices indices;
5743 	  unsigned int k;
5744 	  for (k = 0; k < 3; k++)
5745 	    {
5746 	      for (i = 0; i < nelt; i++)
5747 		if (3 * i + k < 2 * nelt)
5748 		  sel[i] = 3 * i + k;
5749 		else
5750 		  sel[i] = 0;
5751 	      indices.new_vector (sel, 2, nelt);
5752 	      if (!can_vec_perm_const_p (mode, indices))
5753 		{
5754 		  if (dump_enabled_p ())
5755 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5756 				     "shuffle of 3 loads is not supported by"
5757 				     " target\n");
5758 		  return false;
5759 		}
5760 	      for (i = 0, j = 0; i < nelt; i++)
5761 		if (3 * i + k < 2 * nelt)
5762 		  sel[i] = i;
5763 		else
5764 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5765 	      indices.new_vector (sel, 2, nelt);
5766 	      if (!can_vec_perm_const_p (mode, indices))
5767 		{
5768 		  if (dump_enabled_p ())
5769 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5770 				     "shuffle of 3 loads is not supported by"
5771 				     " target\n");
5772 		  return false;
5773 		}
5774 	    }
5775 	  return true;
5776 	}
5777       else
5778 	{
5779 	  /* If the length is not equal to 3, then only powers of 2 are supported.  */
5780 	  gcc_assert (pow2p_hwi (count));
5781 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5782 
5783 	  /* The encoding has a single stepped pattern.  */
5784 	  vec_perm_builder sel (nelt, 1, 3);
5785 	  sel.quick_grow (3);
5786 	  for (i = 0; i < 3; i++)
5787 	    sel[i] = i * 2;
5788 	  vec_perm_indices indices (sel, 2, nelt);
5789 	  if (can_vec_perm_const_p (mode, indices))
5790 	    {
5791 	      for (i = 0; i < 3; i++)
5792 		sel[i] = i * 2 + 1;
5793 	      indices.new_vector (sel, 2, nelt);
5794 	      if (can_vec_perm_const_p (mode, indices))
5795 		return true;
5796 	    }
5797         }
5798     }
5799 
5800   if (dump_enabled_p ())
5801     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5802 		     "extract even/odd not supported by target\n");
5803   return false;
5804 }
5805 
5806 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5807    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5808 
5809 bool
5810 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5811 			   bool masked_p)
5812 {
5813   if (masked_p)
5814     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5815 					 vec_mask_load_lanes_optab,
5816 					 vectype, count);
5817   else
5818     return vect_lanes_optab_supported_p ("vec_load_lanes",
5819 					 vec_load_lanes_optab,
5820 					 vectype, count);
5821 }
5822 
5823 /* Function vect_permute_load_chain.
5824 
5825    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5826    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5827    the input data correctly.  Return the final references for loads in
5828    RESULT_CHAIN.
5829 
5830    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5831    The input is 4 vectors each containing 8 elements. We assign a number to each
5832    element, the input sequence is:
5833 
5834    1st vec:   0  1  2  3  4  5  6  7
5835    2nd vec:   8  9 10 11 12 13 14 15
5836    3rd vec:  16 17 18 19 20 21 22 23
5837    4th vec:  24 25 26 27 28 29 30 31
5838 
5839    The output sequence should be:
5840 
5841    1st vec:  0 4  8 12 16 20 24 28
5842    2nd vec:  1 5  9 13 17 21 25 29
5843    3rd vec:  2 6 10 14 18 22 26 30
5844    4th vec:  3 7 11 15 19 23 27 31
5845 
5846    i.e., the first output vector should contain the first elements of each
5847    interleaving group, etc.
5848 
5849    We use extract_even/odd instructions to create such output.  The input of
5850    each extract_even/odd operation is two vectors
5851    1st vec    2nd vec
5852    0 1 2 3    4 5 6 7
5853 
5854    and the output is the vector of extracted even/odd elements.  The output of
5855    extract_even will be:   0 2 4 6
5856    and of extract_odd:     1 3 5 7
5857 
5858 
5859    The permutation is done in log2 (LENGTH) stages.  In each stage extract_even
5860    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5861    their order.  In our example,
5862 
5863    E1: extract_even (1st vec, 2nd vec)
5864    E2: extract_odd (1st vec, 2nd vec)
5865    E3: extract_even (3rd vec, 4th vec)
5866    E4: extract_odd (3rd vec, 4th vec)
5867 
5868    The output for the first stage will be:
5869 
5870    E1:  0  2  4  6  8 10 12 14
5871    E2:  1  3  5  7  9 11 13 15
5872    E3: 16 18 20 22 24 26 28 30
5873    E4: 17 19 21 23 25 27 29 31
5874 
5875    In order to proceed and create the correct sequence for the next stage (or
5876    for the correct output, if the second stage is the last one, as in our
5877    example), we first put the output of extract_even operation and then the
5878    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5879    The input for the second stage is:
5880 
5881    1st vec (E1):  0  2  4  6  8 10 12 14
5882    2nd vec (E3): 16 18 20 22 24 26 28 30
5883    3rd vec (E2):  1  3  5  7  9 11 13 15
5884    4th vec (E4): 17 19 21 23 25 27 29 31
5885 
5886    The output of the second stage:
5887 
5888    E1: 0 4  8 12 16 20 24 28
5889    E2: 2 6 10 14 18 22 26 30
5890    E3: 1 5  9 13 17 21 25 29
5891    E4: 3 7 11 15 19 23 27 31
5892 
5893    And RESULT_CHAIN after reordering:
5894 
5895    1st vec (E1):  0 4  8 12 16 20 24 28
5896    2nd vec (E3):  1 5  9 13 17 21 25 29
5897    3rd vec (E2):  2 6 10 14 18 22 26 30
5898    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5899 
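/* Editorial sketch: a minimal scalar model of the power-of-2 path of the
   permutation described above, using arrays of ints in place of vectors.
   The function name, the int element type and the separate SCRATCH buffer
   are illustrative assumptions, not part of GCC; only the data movement
   mirrors the extract_even/extract_odd stages.

     #include <string.h>

     static void
     model_permute_load_chain (int **chain, int **scratch,
			       unsigned int length, unsigned int nelt)
     {
       for (unsigned int l = length; l > 1; l >>= 1)   // log2 (LENGTH) stages
	 {
	   for (unsigned int j = 0; j < length; j += 2)
	     for (unsigned int i = 0; i < nelt; i++)
	       {
		 // extract_even of (chain[j], chain[j + 1]).
		 scratch[j / 2][i]
		   = (i < nelt / 2
		      ? chain[j][2 * i]
		      : chain[j + 1][2 * (i - nelt / 2)]);
		 // extract_odd of (chain[j], chain[j + 1]).
		 scratch[j / 2 + length / 2][i]
		   = (i < nelt / 2
		      ? chain[j][2 * i + 1]
		      : chain[j + 1][2 * (i - nelt / 2) + 1]);
	       }
	   for (unsigned int j = 0; j < length; j++)
	     memcpy (chain[j], scratch[j], nelt * sizeof (int));
	 }
     }

   Running this on the 4 x 8 example above leaves CHAIN holding the four
   strided sequences {0,4,...,28}, {1,5,...,29}, {2,6,...,30} and
   {3,7,...,31}.  */
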
5900 static void
5901 vect_permute_load_chain (vec<tree> dr_chain,
5902 			 unsigned int length,
5903 			 gimple *stmt,
5904 			 gimple_stmt_iterator *gsi,
5905 			 vec<tree> *result_chain)
5906 {
5907   tree data_ref, first_vect, second_vect;
5908   tree perm_mask_even, perm_mask_odd;
5909   tree perm3_mask_low, perm3_mask_high;
5910   gimple *perm_stmt;
5911   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5912   unsigned int i, j, log_length = exact_log2 (length);
5913 
5914   result_chain->quick_grow (length);
5915   memcpy (result_chain->address (), dr_chain.address (),
5916 	  length * sizeof (tree));
5917 
5918   if (length == 3)
5919     {
5920       /* vect_grouped_load_supported ensures that this is constant.  */
5921       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5922       unsigned int k;
5923 
5924       vec_perm_builder sel (nelt, nelt, 1);
5925       sel.quick_grow (nelt);
5926       vec_perm_indices indices;
5927       for (k = 0; k < 3; k++)
5928 	{
5929 	  for (i = 0; i < nelt; i++)
5930 	    if (3 * i + k < 2 * nelt)
5931 	      sel[i] = 3 * i + k;
5932 	    else
5933 	      sel[i] = 0;
5934 	  indices.new_vector (sel, 2, nelt);
5935 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5936 
5937 	  for (i = 0, j = 0; i < nelt; i++)
5938 	    if (3 * i + k < 2 * nelt)
5939 	      sel[i] = i;
5940 	    else
5941 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5942 	  indices.new_vector (sel, 2, nelt);
5943 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5944 
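	  /* Worked example (assuming NELT == 8 and K == 0): PERM3_MASK_LOW is
	     {0, 3, 6, 9, 12, 15, 0, 0} and PERM3_MASK_HIGH is
	     {0, 1, 2, 3, 4, 5, 10, 13}, so the two permutes below produce
	     {0, 3, 6, 9, 12, 15, 18, 21}, i.e. every third element of the
	     group starting at element K.  */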
5945 	  first_vect = dr_chain[0];
5946 	  second_vect = dr_chain[1];
5947 
5948 	  /* Create interleaving stmt (low part of):
5949 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5950 							     ...}>  */
5951 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5952 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5953 					   second_vect, perm3_mask_low);
5954 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5955 
5956 	  /* Create interleaving stmt (high part of):
5957 	     high = VEC_PERM_EXPR <low, dr_chain[2], PERM3_MASK_HIGH>, keeping
5958 	     the leading elements of LOW and filling the rest from DR_CHAIN[2].  */
5959 	  first_vect = data_ref;
5960 	  second_vect = dr_chain[2];
5961 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5962 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5963 					   second_vect, perm3_mask_high);
5964 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5965 	  (*result_chain)[k] = data_ref;
5966 	}
5967     }
5968   else
5969     {
5970       /* If LENGTH is not equal to 3 then only a power of 2 is supported.  */
5971       gcc_assert (pow2p_hwi (length));
5972 
5973       /* The encoding has a single stepped pattern.  */
5974       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5975       vec_perm_builder sel (nelt, 1, 3);
5976       sel.quick_grow (3);
5977       for (i = 0; i < 3; ++i)
5978 	sel[i] = i * 2;
5979       vec_perm_indices indices (sel, 2, nelt);
5980       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5981 
5982       for (i = 0; i < 3; ++i)
5983 	sel[i] = i * 2 + 1;
5984       indices.new_vector (sel, 2, nelt);
5985       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5986 
5987       for (i = 0; i < log_length; i++)
5988 	{
5989 	  for (j = 0; j < length; j += 2)
5990 	    {
5991 	      first_vect = dr_chain[j];
5992 	      second_vect = dr_chain[j+1];
5993 
5994 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5995 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5996 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5997 					       first_vect, second_vect,
5998 					       perm_mask_even);
5999 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6000 	      (*result_chain)[j/2] = data_ref;
6001 
6002 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6003 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6004 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6005 					       first_vect, second_vect,
6006 					       perm_mask_odd);
6007 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6008 	      (*result_chain)[j/2+length/2] = data_ref;
6009 	    }
6010 	  memcpy (dr_chain.address (), result_chain->address (),
6011 		  length * sizeof (tree));
6012 	}
6013     }
6014 }
6015 
6016 /* Function vect_shift_permute_load_chain.
6017 
6018    Given a chain of loads in DR_CHAIN of LENGTH that must be a power of 2
6019    or equal to 3, generate a sequence of stmts to reorder the input data
6020    accordingly.  Return the final references for loads in RESULT_CHAIN.
6021    Return true if successful, false otherwise.
6022 
6023    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6024    The input is 3 vectors each containing 8 elements.  We assign a
6025    number to each element; the input sequence is:
6026 
6027    1st vec:   0  1  2  3  4  5  6  7
6028    2nd vec:   8  9 10 11 12 13 14 15
6029    3rd vec:  16 17 18 19 20 21 22 23
6030 
6031    The output sequence should be:
6032 
6033    1st vec:  0 3 6  9 12 15 18 21
6034    2nd vec:  1 4 7 10 13 16 19 22
6035    3rd vec:  2 5 8 11 14 17 20 23
6036 
6037    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6038 
6039    First we shuffle all 3 vectors to get correct elements order:
6040 
6041    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6042    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6043    3rd vec:  (16 19 22) (17 20 23) (18 21)
6044 
6045    Next we combine and shift the vectors 3 times:
6046 
6047    1st step:
6048      shift right by 6 the concatenation of:
6049      "1st vec" and  "2nd vec"
6050        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6051      "2nd vec" and  "3rd vec"
6052        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6053      "3rd vec" and  "1st vec"
6054        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6055 			     | New vectors                   |
6056 
6057      So that now new vectors are:
6058 
6059      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6060      2nd vec:  (10 13) (16 19 22) (17 20 23)
6061      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6062 
6063    2nd step:
6064      shift right by 5 the concatenation of:
6065      "1st vec" and  "3rd vec"
6066        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6067      "2nd vec" and  "1st vec"
6068        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6069      "3rd vec" and  "2nd vec"
6070        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6071 			  | New vectors                   |
6072 
6073      So that now new vectors are:
6074 
6075      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6076      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6077      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6078 
6079    3rd step:
6080      shift right by 5 the concatenation of:
6081      "1st vec" and  "1st vec"
6082        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6083      shift right by 3 the concatenation of:
6084      "2nd vec" and  "2nd vec"
6085                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6086 			  | New vectors                   |
6087 
6088      So that now all vectors are READY:
6089      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6090      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6091      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6092 
6093    This algorithm is faster than the one in vect_permute_load_chain if:
6094      1.  "shift of a concatenation" is faster than general permutation.
6095 	 This is usually so.
6096      2.  The TARGET machine can't execute vector instructions in parallel.
6097 	 This is because each step of the algorithm depends on the previous one.
6098 	 The algorithm in vect_permute_load_chain is much more parallel.
6099 
6100    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6101 */
6102 
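/* Editorial sketch: a minimal scalar model of the power-of-2 branch below,
   shown for one pair of input vectors A and B with an even number NELT of
   lanes.  The names and the int element type are illustrative assumptions;
   the point is how one in-vector shuffle per input plus one cross-vector
   "shift" and one "select" reproduce the even/odd extraction that
   vect_permute_load_chain performs with two cross-vector permutes.

     static void
     model_shift_permute_pair (const int *a, const int *b, unsigned int nelt,
			       int *even_out, int *odd_out)
     {
       int t0[nelt], t1[nelt];			// C99 VLAs, for brevity.

       for (unsigned int i = 0; i < nelt / 2; i++)
	 {
	   t0[i] = a[2 * i];			// perm2_mask1: evens of A ...
	   t0[nelt / 2 + i] = a[2 * i + 1];	// ... then odds of A.
	   t1[i] = b[2 * i + 1];		// perm2_mask2: odds of B ...
	   t1[nelt / 2 + i] = b[2 * i];		// ... then evens of B.
	 }
       for (unsigned int i = 0; i < nelt; i++)
	 {
	   // shift1_mask: lanes nelt/2 .. 3*nelt/2 - 1 of {t0, t1}.
	   odd_out[i] = (i < nelt / 2 ? t0[nelt / 2 + i] : t1[i - nelt / 2]);
	   // select_mask: low half of t0, high half of t1.
	   even_out[i] = (i < nelt / 2 ? t0[i] : t1[i]);
	 }
     }  */
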
6103 static bool
6104 vect_shift_permute_load_chain (vec<tree> dr_chain,
6105 			       unsigned int length,
6106 			       gimple *stmt,
6107 			       gimple_stmt_iterator *gsi,
6108 			       vec<tree> *result_chain)
6109 {
6110   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6111   tree perm2_mask1, perm2_mask2, perm3_mask;
6112   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6113   gimple *perm_stmt;
6114 
6115   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6116   unsigned int i;
6117   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6118   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6119 
6120   unsigned HOST_WIDE_INT nelt, vf;
6121   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6122       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6123     /* Not supported for variable-length vectors.  */
6124     return false;
6125 
6126   vec_perm_builder sel (nelt, nelt, 1);
6127   sel.quick_grow (nelt);
6128 
6129   result_chain->quick_grow (length);
6130   memcpy (result_chain->address (), dr_chain.address (),
6131 	  length * sizeof (tree));
6132 
6133   if (pow2p_hwi (length) && vf > 4)
6134     {
6135       unsigned int j, log_length = exact_log2 (length);
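      /* Generating permutation constant to split one vector into its even
	 elements followed by its odd elements.  For vector length 8 it is
	 {0 2 4 6 1 3 5 7}.  */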
6136       for (i = 0; i < nelt / 2; ++i)
6137 	sel[i] = i * 2;
6138       for (i = 0; i < nelt / 2; ++i)
6139 	sel[nelt / 2 + i] = i * 2 + 1;
6140       vec_perm_indices indices (sel, 2, nelt);
6141       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6142 	{
6143 	  if (dump_enabled_p ())
6144 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6145 			     "shuffle of 2 fields structure is not "
6146 			     "supported by target\n");
6147 	  return false;
6148 	}
6149       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6150 
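      /* The same for the second vector of each pair, but with the odd
	 elements first.  For vector length 8 it is {1 3 5 7 0 2 4 6}.  */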
6151       for (i = 0; i < nelt / 2; ++i)
6152 	sel[i] = i * 2 + 1;
6153       for (i = 0; i < nelt / 2; ++i)
6154 	sel[nelt / 2 + i] = i * 2;
6155       indices.new_vector (sel, 2, nelt);
6156       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6157 	{
6158 	  if (dump_enabled_p ())
6159 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6160 			     "shuffle of 2 fields structure is not "
6161 			     "supported by target\n");
6162 	  return false;
6163 	}
6164       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6165 
6166       /* Generating permutation constant to shift all elements.
6167 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6168       for (i = 0; i < nelt; i++)
6169 	sel[i] = nelt / 2 + i;
6170       indices.new_vector (sel, 2, nelt);
6171       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6172 	{
6173 	  if (dump_enabled_p ())
6174 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6175 			     "shift permutation is not supported by target\n");
6176 	  return false;
6177 	}
6178       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6179 
6180       /* Generating permutation constant to select vector from 2.
6181 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6182       for (i = 0; i < nelt / 2; i++)
6183 	sel[i] = i;
6184       for (i = nelt / 2; i < nelt; i++)
6185 	sel[i] = nelt + i;
6186       indices.new_vector (sel, 2, nelt);
6187       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6188 	{
6189 	  if (dump_enabled_p ())
6190 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6191 			     "select is not supported by target\n");
6192 	  return false;
6193 	}
6194       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6195 
6196       for (i = 0; i < log_length; i++)
6197 	{
6198 	  for (j = 0; j < length; j += 2)
6199 	    {
6200 	      first_vect = dr_chain[j];
6201 	      second_vect = dr_chain[j + 1];
6202 
6203 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6204 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6205 					       first_vect, first_vect,
6206 					       perm2_mask1);
6207 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6208 	      vect[0] = data_ref;
6209 
6210 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6211 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6212 					       second_vect, second_vect,
6213 					       perm2_mask2);
6214 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6215 	      vect[1] = data_ref;
6216 
6217 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6218 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6219 					       vect[0], vect[1], shift1_mask);
6220 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6221 	      (*result_chain)[j/2 + length/2] = data_ref;
6222 
6223 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6224 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6225 					       vect[0], vect[1], select_mask);
6226 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6227 	      (*result_chain)[j/2] = data_ref;
6228 	    }
6229 	  memcpy (dr_chain.address (), result_chain->address (),
6230 		  length * sizeof (tree));
6231 	}
6232       return true;
6233     }
6234   if (length == 3 && vf > 2)
6235     {
6236       unsigned int k = 0, l = 0;
6237 
6238       /* Generating permutation constant to get all elements in the right order.
6239 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6240       for (i = 0; i < nelt; i++)
6241 	{
6242 	  if (3 * k + (l % 3) >= nelt)
6243 	    {
6244 	      k = 0;
6245 	      l += (3 - (nelt % 3));
6246 	    }
6247 	  sel[i] = 3 * k + (l % 3);
6248 	  k++;
6249 	}
6250       vec_perm_indices indices (sel, 2, nelt);
6251       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6252 	{
6253 	  if (dump_enabled_p ())
6254 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6255 			     "shuffle of 3 fields structure is not "
6256 			     "supported by target\n");
6257 	  return false;
6258 	}
6259       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6260 
6261       /* Generating permutation constant to shift all elements.
6262 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6263       for (i = 0; i < nelt; i++)
6264 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6265       indices.new_vector (sel, 2, nelt);
6266       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6267 	{
6268 	  if (dump_enabled_p ())
6269 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6270 			     "shift permutation is not supported by target\n");
6271 	  return false;
6272 	}
6273       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6274 
6275       /* Generating permutation constant to shift all elements.
6276 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6277       for (i = 0; i < nelt; i++)
6278 	sel[i] = 2 * (nelt / 3) + 1 + i;
6279       indices.new_vector (sel, 2, nelt);
6280       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6281 	{
6282 	  if (dump_enabled_p ())
6283 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6284 			     "shift permutation is not supported by target\n");
6285 	  return false;
6286 	}
6287       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6288 
6289       /* Generating permutation constant to shift all elements.
6290 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6291       for (i = 0; i < nelt; i++)
6292 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6293       indices.new_vector (sel, 2, nelt);
6294       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6295 	{
6296 	  if (dump_enabled_p ())
6297 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6298 			     "shift permutation is not supported by target\n");
6299 	  return false;
6300 	}
6301       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6302 
6303       /* Generating permutation constant to shift all elements.
6304 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6305       for (i = 0; i < nelt; i++)
6306 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6307       indices.new_vector (sel, 2, nelt);
6308       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6309 	{
6310 	  if (dump_enabled_p ())
6311 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6312 			     "shift permutation is not supported by target\n");
6313 	  return false;
6314 	}
6315       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6316 
6317       for (k = 0; k < 3; k++)
6318 	{
6319 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6320 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6321 					   dr_chain[k], dr_chain[k],
6322 					   perm3_mask);
6323 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6324 	  vect[k] = data_ref;
6325 	}
6326 
6327       for (k = 0; k < 3; k++)
6328 	{
6329 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6330 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6331 					   vect[k % 3], vect[(k + 1) % 3],
6332 					   shift1_mask);
6333 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6334 	  vect_shift[k] = data_ref;
6335 	}
6336 
6337       for (k = 0; k < 3; k++)
6338 	{
6339 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6340 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6341 					   vect_shift[(4 - k) % 3],
6342 					   vect_shift[(3 - k) % 3],
6343 					   shift2_mask);
6344 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6345 	  vect[k] = data_ref;
6346 	}
6347 
6348       (*result_chain)[3 - (nelt % 3)] = vect[2];
6349 
6350       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6351       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6352 				       vect[0], shift3_mask);
6353       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6354       (*result_chain)[nelt % 3] = data_ref;
6355 
6356       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6357       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6358 				       vect[1], shift4_mask);
6359       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6360       (*result_chain)[0] = data_ref;
6361       return true;
6362     }
6363   return false;
6364 }
6365 
6366 /* Function vect_transform_grouped_load.
6367 
6368    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6369    to perform their permutation and record the resulting vectorized statements
6370    against the scalar statements.
6371 */
6372 
6373 void
6374 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6375 			     gimple_stmt_iterator *gsi)
6376 {
6377   machine_mode mode;
6378   vec<tree> result_chain = vNULL;
6379 
6380   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6381      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6382      vectors that are ready for vector computation.  */
6383   result_chain.create (size);
6384 
6385   /* If the reassociation width for the vector type is 2 or greater, the target
6386      machine can execute 2 or more vector instructions in parallel.  Otherwise try
6387      to get the chain for the loads group using vect_shift_permute_load_chain.  */
6388   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6389   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6390       || pow2p_hwi (size)
6391       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6392 					 gsi, &result_chain))
6393     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6394   vect_record_grouped_load_vectors (stmt, result_chain);
6395   result_chain.release ();
6396 }
6397 
6398 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6399    generated as part of the vectorization of STMT.  Assign the statement
6400    for each vector to the associated scalar statement.  */
6401 
6402 void
6403 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6404 {
6405   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6406   gimple *next_stmt, *new_stmt;
6407   unsigned int i, gap_count;
6408   tree tmp_data_ref;
6409 
6410   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6411      Since we scan the chain starting from its first node, their order
6412      corresponds to the order of data-refs in RESULT_CHAIN.  */
6413   next_stmt = first_stmt;
6414   gap_count = 1;
6415   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6416     {
6417       if (!next_stmt)
6418 	break;
6419 
6420       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6421        code elimination pass later.  No need to check for the first stmt in
6422        the group, since it always exists.
6423        GROUP_GAP is the number of steps in elements from the previous
6424        access (if there is no gap GROUP_GAP is 1).  We skip loads that
6425        correspond to the gaps.  */
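      /* For example, if the scalar code reads only elements 0 and 2 of a
	 3-element structure, the second group member has GROUP_GAP 2, so the
	 vector in RESULT_CHAIN corresponding to offset 1 is skipped here and
	 later removed as dead code.  */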
6426       if (next_stmt != first_stmt
6427           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6428       {
6429         gap_count++;
6430         continue;
6431       }
6432 
6433       while (next_stmt)
6434         {
6435 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6436 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6437 	     copies, and we put the new vector statement in the first available
6438 	     RELATED_STMT.  */
6439 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6440 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6441 	  else
6442             {
6443               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6444                 {
6445 		  gimple *prev_stmt =
6446 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6447 		  gimple *rel_stmt =
6448 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6449 	          while (rel_stmt)
6450 		    {
6451 		      prev_stmt = rel_stmt;
6452 		      rel_stmt =
6453                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6454 		    }
6455 
6456   	          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6457                     new_stmt;
6458                 }
6459             }
6460 
6461 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6462 	  gap_count = 1;
6463 	  /* If NEXT_STMT accesses the same DR as the previous statement,
6464 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
6465 	     get the next data-ref from RESULT_CHAIN.  */
6466 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6467 	    break;
6468         }
6469     }
6470 }
6471 
6472 /* Function vect_can_force_dr_alignment_p.
6473 
6474    Returns whether the alignment of a DECL can be forced to be aligned
6475    on an ALIGNMENT-bit boundary.  */
6476 
6477 bool
6478 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6479 {
6480   if (!VAR_P (decl))
6481     return false;
6482 
6483   if (decl_in_symtab_p (decl)
6484       && !symtab_node::get (decl)->can_increase_alignment_p ())
6485     return false;
6486 
6487   if (TREE_STATIC (decl))
6488     return (alignment <= MAX_OFILE_ALIGNMENT);
6489   else
6490     return (alignment <= MAX_STACK_ALIGNMENT);
6491 }
6492 
6493 
6494 /* Return whether the data reference DR is supported with respect to its
6495    alignment.
6496    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6497    if it is aligned, i.e., check whether it is possible to vectorize it with a
6498    different alignment.  */
6499 
6500 enum dr_alignment_support
6501 vect_supportable_dr_alignment (struct data_reference *dr,
6502                                bool check_aligned_accesses)
6503 {
6504   gimple *stmt = DR_STMT (dr);
6505   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6506   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6507   machine_mode mode = TYPE_MODE (vectype);
6508   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6509   struct loop *vect_loop = NULL;
6510   bool nested_in_vect_loop = false;
6511 
6512   if (aligned_access_p (dr) && !check_aligned_accesses)
6513     return dr_aligned;
6514 
6515   /* For now assume all conditional loads/stores support unaligned
6516      access without any special code.  */
6517   if (is_gimple_call (stmt)
6518       && gimple_call_internal_p (stmt)
6519       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6520 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6521     return dr_unaligned_supported;
6522 
6523   if (loop_vinfo)
6524     {
6525       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6526       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6527     }
6528 
6529   /* Possibly unaligned access.  */
6530 
6531   /* We can choose between using the implicit realignment scheme (generating
6532      a misaligned_move stmt) and the explicit realignment scheme (generating
6533      aligned loads with a REALIGN_LOAD).  There are two variants to the
6534      explicit realignment scheme: optimized, and unoptimized.
6535      We can optimize the realignment only if the step between consecutive
6536      vector loads is equal to the vector size.  Since the vector memory
6537      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6538      is guaranteed that the misalignment amount remains the same throughout the
6539      execution of the vectorized loop.  Therefore, we can create the
6540      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6541      at the loop preheader.
6542 
6543      However, in the case of outer-loop vectorization, when vectorizing a
6544      memory access in the inner-loop nested within the LOOP that is now being
6545      vectorized, while it is guaranteed that the misalignment of the
6546      vectorized memory access will remain the same in different outer-loop
6547      iterations, it is *not* guaranteed that it will remain the same throughout
6548      the execution of the inner-loop.  This is because the inner-loop advances
6549      with the original scalar step (and not in steps of VS).  If the inner-loop
6550      step happens to be a multiple of VS, then the misalignment remains fixed
6551      and we can use the optimized realignment scheme.  For example:
6552 
6553       for (i=0; i<N; i++)
6554         for (j=0; j<M; j++)
6555           s += a[i+j];
6556 
6557      When vectorizing the i-loop in the above example, the step between
6558      consecutive vector loads is 1, and so the misalignment does not remain
6559      fixed across the execution of the inner-loop, and the realignment cannot
6560      be optimized (as illustrated in the following pseudo vectorized loop):
6561 
6562       for (i=0; i<N; i+=4)
6563         for (j=0; j<M; j++){
6564           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6565                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6566                          // (assuming that we start from an aligned address).
6567           }
6568 
6569      We therefore have to use the unoptimized realignment scheme:
6570 
6571       for (i=0; i<N; i+=4)
6572           for (j=k; j<M; j+=4)
6573           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6574                            // that the misalignment of the initial address is
6575                            // 0).
6576 
6577      The loop can then be vectorized as follows:
6578 
6579       for (k=0; k<4; k++){
6580         rt = get_realignment_token (&vp[k]);
6581         for (i=0; i<N; i+=4){
6582           v1 = vp[i+k];
6583           for (j=k; j<M; j+=4){
6584             v2 = vp[i+j+VS-1];
6585             va = REALIGN_LOAD <v1,v2,rt>;
6586             vs += va;
6587             v1 = v2;
6588           }
6589         }
6590     } */
6591 
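/* Editorial sketch: a scalar model of the REALIGN_LOAD used in the pseudo
   code above, under the simplifying assumption that the realignment token
   RT is simply the element offset of the first needed element (real targets
   encode it differently, e.g. as a permute mask).

     static void
     model_realign_load (const int *v1, const int *v2, unsigned int nelt,
			 unsigned int rt, int *va)
     {
       // Concatenate the two aligned loads V1 and V2 and take NELT
       // consecutive elements starting at offset RT.
       for (unsigned int i = 0; i < nelt; i++)
	 va[i] = (rt + i < nelt ? v1[rt + i] : v2[rt + i - nelt]);
     }  */
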
6592   if (DR_IS_READ (dr))
6593     {
6594       bool is_packed = false;
6595       tree type = (TREE_TYPE (DR_REF (dr)));
6596 
6597       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6598 	  && (!targetm.vectorize.builtin_mask_for_load
6599 	      || targetm.vectorize.builtin_mask_for_load ()))
6600 	{
6601 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6602 
6603 	  /* If we are doing SLP then the accesses need not have the
6604 	     same alignment, instead it depends on the SLP group size.  */
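	  /* For example, with V4SI (4 lanes), an SLP group of size 3 and a
	     vectorization factor of 8, 8 * 3 = 24 is a multiple of 4, so the
	     realignment schemes below remain usable; with a vectorization
	     factor of 2 the product 6 is not a multiple of 4 and we fall
	     through to the misalignment support check instead.  */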
6605 	  if (loop_vinfo
6606 	      && STMT_SLP_TYPE (stmt_info)
6607 	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6608 			      * GROUP_SIZE (vinfo_for_stmt
6609 					    (GROUP_FIRST_ELEMENT (stmt_info))),
6610 			      TYPE_VECTOR_SUBPARTS (vectype)))
6611 	    ;
6612 	  else if (!loop_vinfo
6613 		   || (nested_in_vect_loop
6614 		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6615 				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6616 	    return dr_explicit_realign;
6617 	  else
6618 	    return dr_explicit_realign_optimized;
6619 	}
6620       if (!known_alignment_for_access_p (dr))
6621 	is_packed = not_size_aligned (DR_REF (dr));
6622 
6623       if (targetm.vectorize.support_vector_misalignment
6624 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6625 	/* Can't software pipeline the loads, but can at least do them.  */
6626 	return dr_unaligned_supported;
6627     }
6628   else
6629     {
6630       bool is_packed = false;
6631       tree type = (TREE_TYPE (DR_REF (dr)));
6632 
6633       if (!known_alignment_for_access_p (dr))
6634 	is_packed = not_size_aligned (DR_REF (dr));
6635 
6636      if (targetm.vectorize.support_vector_misalignment
6637 	   (mode, type, DR_MISALIGNMENT (dr), is_packed))
6638        return dr_unaligned_supported;
6639     }
6640 
6641   /* Unsupported.  */
6642   return dr_unaligned_unsupported;
6643 }
6644