1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
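/* Illustration (not specific to any target): with VECTYPE V4SI and COUNT 2,
   we first ask the target for an array mode that holds two V4SI vectors;
   failing that, we fall back to a plain 2 * 128 == 256-bit integer mode.
   We then require that OPTAB (e.g. the load-lanes optab) has a handler for
   the <array mode, V4SImode> pair.  */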
60 
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 			      tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 	{
75 	  if (dump_enabled_p ())
76 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 			     "no array mode for %s["
78 			     HOST_WIDE_INT_PRINT_DEC "]\n",
79 			     GET_MODE_NAME (mode), count);
80 	  return false;
81 	}
82     }
83 
84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85     {
86       if (dump_enabled_p ())
87 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88                          "cannot use %s<%s><%s>\n", name,
89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90       return false;
91     }
92 
93   if (dump_enabled_p ())
94     dump_printf_loc (MSG_NOTE, vect_location,
95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96                      GET_MODE_NAME (mode));
97 
98   return true;
99 }
100 
101 
102 /* Return the smallest scalar type used in STMT.
103    This is used to determine the vectype of the stmt.  We generally set the
104    vectype according to the type of the result (lhs).  For stmts whose
105    result type is different from the type of the arguments (e.g., demotion,
106    promotion), vectype will be reset appropriately (later).  Note that we have
107    to visit the smallest datatype in this function, because that determines the
108    VF.  If the smallest datatype in the loop is present only as the rhs of a
109    promotion operation, we would miss it.
110    Such a case, where a variable of this datatype does not appear in the lhs
111    anywhere in the loop, can only occur if it's an invariant: e.g.:
112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
113    invariant motion.  However, we cannot rely on invariant motion to always
114    take invariants out of the loop, and so in the case of promotion we also
115    have to check the rhs.
116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117    types.  */
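/* E.g. (illustration): for a widening statement such as
     int_x = (int) short_y;
   the lhs size is 4 and the rhs size is 2, so the function returns the
   'short' type and the VF is based on the narrower element.  */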
118 
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121                                HOST_WIDE_INT *rhs_size_unit)
122 {
123   tree scalar_type = gimple_expr_type (stmt);
124   HOST_WIDE_INT lhs, rhs;
125 
126   /* During the analysis phase, this function is called on arbitrary
127      statements that might not have scalar results.  */
128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129     return scalar_type;
130 
131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 
133   if (is_gimple_assign (stmt)
134       && (gimple_assign_cast_p (stmt)
135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
140     {
141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
142 
143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144       if (rhs < lhs)
145         scalar_type = rhs_type;
146     }
147 
148   *lhs_size_unit = lhs;
149   *rhs_size_unit = rhs;
150   return scalar_type;
151 }
152 
153 
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155    tested at run-time.  Return TRUE if DDR was successfully inserted.
156    Return false if versioning is not supported.  */
157 
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
160 {
161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
162 
163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164     return false;
165 
166   if (!runtime_alias_check_p (ddr, loop,
167 			      optimize_loop_nest_for_speed_p (loop)))
168     return false;
169 
170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171   return true;
172 }
173 
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
175 
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
178 {
179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180   for (unsigned int i = 0; i < checks.length(); ++i)
181     if (checks[i] == value)
182       return;
183 
184   if (dump_enabled_p ())
185     {
186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188       dump_printf (MSG_NOTE, " is nonzero\n");
189     }
190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
191 }
192 
193 /* Return true if we know that the order of vectorized STMT_A and
194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195    At least one of the statements is a write.  */
196 
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
199 {
200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
202 
203   /* Single statements are always kept in their original order.  */
204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206     return true;
207 
208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
209      group are emitted at the position of the first scalar load and all
210      stores in a group are emitted at the position of the last scalar store.
211      Thus writes will happen no earlier than their current position
212      (but could happen later) while reads will happen no later than their
213      current position (but could happen earlier).  Reordering is therefore
214      only possible if the first access is a write.  */
215   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
217 }
218 
219 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
220    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221    distances.  These distances are conservatively correct but they don't
222    reflect a guaranteed dependence.
223 
224    Return true if this function does all the work necessary to avoid
225    an alias or false if the caller should use the dependence distances
226    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
227    the depth of the loop described by LOOP_VINFO and the other arguments
228    are as for vect_analyze_data_ref_dependence.  */
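/* Illustration (hypothetical scenario): for
     a[i] = a[i] + b[i];
   where A and B are array arguments that might be bound to the same object
   at run time, the distance vectors describe the dependence that would
   exist if they were the same object; if they are distinct, the accesses
   are in fact independent.  */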
229 
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 				       loop_vec_info loop_vinfo,
233 				       int loop_depth, unsigned int *max_vf)
234 {
235   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236   lambda_vector dist_v;
237   unsigned int i;
238   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
239     {
240       int dist = dist_v[loop_depth];
241       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
242 	{
243 	  /* If the user asserted safelen >= DIST consecutive iterations
244 	     can be executed concurrently, assume independence.
245 
246 	     ??? An alternative would be to add the alias check even
247 	     in this case, and vectorize the fallback loop with the
248 	     maximum VF set to safelen.  However, if the user has
249 	     explicitly given a length, it's less likely that that
250 	     would be a win.  */
251 	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
252 	    {
253 	      if ((unsigned int) loop->safelen < *max_vf)
254 		*max_vf = loop->safelen;
255 	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 	      continue;
257 	    }
258 
259 	  /* For dependence distances of 2 or more, we have the option
260 	     of limiting VF or checking for an alias at runtime.
261 	     Prefer to check at runtime if we can, to avoid limiting
262 	     the VF unnecessarily when the bases are in fact independent.
263 
264 	     Note that the alias checks will be removed if the VF ends up
265 	     being small enough.  */
266 	  return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
267 	}
268     }
269   return true;
270 }
271 
272 
273 /* Function vect_analyze_data_ref_dependence.
274 
275    Return TRUE if there (might) exist a dependence between a memory-reference
276    DRA and a memory-reference DRB.  Return FALSE if the dependence can
277    instead be checked at run time by versioning for alias.  Adjust *MAX_VF
278    according to the data dependence.  */
279 
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 				  loop_vec_info loop_vinfo,
283 				  unsigned int *max_vf)
284 {
285   unsigned int i;
286   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287   struct data_reference *dra = DDR_A (ddr);
288   struct data_reference *drb = DDR_B (ddr);
289   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291   lambda_vector dist_v;
292   unsigned int loop_depth;
293 
294   /* In loop analysis all data references should be vectorizable.  */
295   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297     gcc_unreachable ();
298 
299   /* Independent data accesses.  */
300   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301     return false;
302 
303   if (dra == drb
304       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305     return false;
306 
307   /* We do not have to consider dependences between accesses that belong
308      to the same group, unless the stride could be smaller than the
309      group size.  */
310   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
311       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
312       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
313     return false;
314 
315   /* Even if we have an anti-dependence then, as the vectorized loop covers at
316      least two scalar iterations, there is always also a true dependence.
317      As the vectorizer does not re-order loads and stores we can ignore
318      the anti-dependence if TBAA can disambiguate both DRs similar to the
319      case with known negative distance anti-dependences (positive
320      distance anti-dependences would violate TBAA constraints).  */
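  /* For example (illustration): for a read through an 'int *' and a write
     through a 'float *', the 'int' and 'float' alias sets do not conflict,
     so the anti-dependence between the two accesses can be ignored.  */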
321   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
322        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
323       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
324 				 get_alias_set (DR_REF (drb))))
325     return false;
326 
327   /* Unknown data dependence.  */
328   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
329     {
330       /* If user asserted safelen consecutive iterations can be
331 	 executed concurrently, assume independence.  */
332       if (loop->safelen >= 2)
333 	{
334 	  if ((unsigned int) loop->safelen < *max_vf)
335 	    *max_vf = loop->safelen;
336 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
337 	  return false;
338 	}
339 
340       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
341 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
342 	{
343 	  if (dump_enabled_p ())
344 	    {
345 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
346 			       "versioning for alias not supported for: "
347 			       "can't determine dependence between ");
348 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
349 				 DR_REF (dra));
350 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
351 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
352 				 DR_REF (drb));
353 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
354 	    }
355 	  return true;
356 	}
357 
358       if (dump_enabled_p ())
359 	{
360 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
361 			   "versioning for alias required: "
362 			   "can't determine dependence between ");
363 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
364 			     DR_REF (dra));
365 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
366 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
367 			     DR_REF (drb));
368 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
369 	}
370 
371       /* Add to list of ddrs that need to be tested at run-time.  */
372       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
373     }
374 
375   /* Known data dependence.  */
376   if (DDR_NUM_DIST_VECTS (ddr) == 0)
377     {
378       /* If user asserted safelen consecutive iterations can be
379 	 executed concurrently, assume independence.  */
380       if (loop->safelen >= 2)
381 	{
382 	  if ((unsigned int) loop->safelen < *max_vf)
383 	    *max_vf = loop->safelen;
384 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
385 	  return false;
386 	}
387 
388       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
389 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
390 	{
391 	  if (dump_enabled_p ())
392 	    {
393 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 			       "versioning for alias not supported for: "
395 			       "bad dist vector for ");
396 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
397 				 DR_REF (dra));
398 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
399 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
400 				 DR_REF (drb));
401 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
402 	    }
403 	  return true;
404 	}
405 
406       if (dump_enabled_p ())
407         {
408           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409                            "versioning for alias required: "
410                            "bad dist vector for ");
411           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
412           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
413           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
414           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
415         }
416       /* Add to list of ddrs that need to be tested at run-time.  */
417       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
418     }
419 
420   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
421 
422   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
423       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
424 						loop_depth, max_vf))
425     return false;
426 
427   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
428     {
429       int dist = dist_v[loop_depth];
430 
431       if (dump_enabled_p ())
432 	dump_printf_loc (MSG_NOTE, vect_location,
433                          "dependence distance  = %d.\n", dist);
434 
435       if (dist == 0)
436 	{
437 	  if (dump_enabled_p ())
438 	    {
439 	      dump_printf_loc (MSG_NOTE, vect_location,
440 	                       "dependence distance == 0 between ");
441 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
442 	      dump_printf (MSG_NOTE, " and ");
443 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
444 	      dump_printf (MSG_NOTE, "\n");
445 	    }
446 
447 	  /* When we perform grouped accesses and perform implicit CSE
448 	     by detecting equal accesses and doing disambiguation with
449 	     runtime alias tests like for
450 	        .. = a[i];
451 		.. = a[i+1];
452 		a[i] = ..;
453 		a[i+1] = ..;
454 		*p = ..;
455 		.. = a[i];
456 		.. = a[i+1];
457 	     where we will end up loading { a[i], a[i+1] } once, make
458 	     sure that inserting group loads before the first load and
459 	     stores after the last store will do the right thing.
460 	     Similar for groups like
461 	        a[i] = ...;
462 		... = a[i];
463 		a[i+1] = ...;
464 	     where loads from the group interleave with the store.  */
465 	  if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
466 	    {
467 	      if (dump_enabled_p ())
468 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
469 				 "READ_WRITE dependence in interleaving.\n");
470 	      return true;
471 	    }
472 
473 	  if (loop->safelen < 2)
474 	    {
475 	      tree indicator = dr_zero_step_indicator (dra);
476 	      if (TREE_CODE (indicator) != INTEGER_CST)
477 		vect_check_nonzero_value (loop_vinfo, indicator);
478 	      else if (integer_zerop (indicator))
479 		{
480 		  if (dump_enabled_p ())
481 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
482 				 "access also has a zero step\n");
483 		  return true;
484 		}
485 	    }
486 	  continue;
487 	}
488 
489       if (dist > 0 && DDR_REVERSED_P (ddr))
490 	{
491 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
492 	     reversed (to make distance vector positive), and the actual
493 	     distance is negative.  */
494 	  if (dump_enabled_p ())
495 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
496 	                     "dependence distance negative.\n");
497 	  /* Record a negative dependence distance to later limit the
498 	     amount of stmt copying / unrolling we can perform.
499 	     Only need to handle read-after-write dependence.  */
500 	  if (DR_IS_READ (drb)
501 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
502 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
503 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
504 	  continue;
505 	}
506 
507       unsigned int abs_dist = abs (dist);
508       if (abs_dist >= 2 && abs_dist < *max_vf)
509 	{
510 	  /* The dependence distance requires reduction of the maximal
511 	     vectorization factor.  */
512 	  *max_vf = abs_dist;
513 	  if (dump_enabled_p ())
514 	    dump_printf_loc (MSG_NOTE, vect_location,
515 	                     "adjusting maximal vectorization factor to %i\n",
516 	                     *max_vf);
517 	}
518 
519       if (abs_dist >= *max_vf)
520 	{
521 	  /* Dependence distance does not create dependence, as far as
522 	     vectorization is concerned, in this case.  */
523 	  if (dump_enabled_p ())
524 	    dump_printf_loc (MSG_NOTE, vect_location,
525 	                     "dependence distance >= VF.\n");
526 	  continue;
527 	}
528 
529       if (dump_enabled_p ())
530 	{
531 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
532 	               "not vectorized, possible dependence "
533 	               "between data-refs ");
534 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
535 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
536 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
537 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
538 	}
539 
540       return true;
541     }
542 
543   return false;
544 }
545 
546 /* Function vect_analyze_data_ref_dependences.
547 
548    Examine all the data references in the loop, and make sure there do not
549    exist any data dependences between them.  Set *MAX_VF according to
550    the maximum vectorization factor the data dependences allow.  */
551 
552 bool
553 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
554 				   unsigned int *max_vf)
555 {
556   unsigned int i;
557   struct data_dependence_relation *ddr;
558 
559   if (dump_enabled_p ())
560     dump_printf_loc (MSG_NOTE, vect_location,
561                      "=== vect_analyze_data_ref_dependences ===\n");
562 
563   LOOP_VINFO_DDRS (loop_vinfo)
564     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
565 	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
566   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
567   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
568   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
569 				&LOOP_VINFO_DDRS (loop_vinfo),
570 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
571     return false;
572 
573   /* For epilogues we either have no aliases or alias versioning
574      was applied to the original loop.  Therefore we may just get max_vf
575      using the VF of the original loop.  */
576   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
577     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
578   else
579     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
580       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
581 	return false;
582 
583   return true;
584 }
585 
586 
587 /* Function vect_slp_analyze_data_ref_dependence.
588 
589    Return TRUE if there (might) exist a dependence between a memory-reference
590    DRA and a memory-reference DRB that prohibits SLP vectorization.  Return
591    FALSE if the accesses are independent or can be treated as such, e.g.
592    read-read accesses or accesses within the same interleaving chain.  */
593 
594 static bool
595 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
596 {
597   struct data_reference *dra = DDR_A (ddr);
598   struct data_reference *drb = DDR_B (ddr);
599 
600   /* We need to check dependences of statements marked as unvectorizable
601      as well; they can still prohibit vectorization.  */
602 
603   /* Independent data accesses.  */
604   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
605     return false;
606 
607   if (dra == drb)
608     return false;
609 
610   /* Read-read is OK.  */
611   if (DR_IS_READ (dra) && DR_IS_READ (drb))
612     return false;
613 
614   /* If dra and drb are part of the same interleaving chain consider
615      them independent.  */
616   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
617       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
618 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
619     return false;
620 
621   /* Unknown data dependence.  */
622   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
623     {
624       if  (dump_enabled_p ())
625 	{
626 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 			   "can't determine dependence between ");
628 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
629 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
630 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
631 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
632 	}
633     }
634   else if (dump_enabled_p ())
635     {
636       dump_printf_loc (MSG_NOTE, vect_location,
637 		       "determined dependence between ");
638       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
639       dump_printf (MSG_NOTE, " and ");
640       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
641       dump_printf (MSG_NOTE,  "\n");
642     }
643 
644   return true;
645 }
646 
647 
648 /* Analyze dependences involved in the transform of SLP NODE.  STORES
649    contains the vector of scalar stores of this instance if we are
650    disambiguating the loads, and LAST_STORE is the last of those stores.  */
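/* Illustration (hypothetical scalar code): for an SLP load group
     x0 = a[0];  *p = t;  x1 = a[1];
   the vectorized load is emitted at the position of the last group member
   (the load of a[1]), so the intervening store *p must be independent of
   a[0]; that is what the walk over intermediate statements below checks.  */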
651 
652 static bool
653 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
654 				   vec<gimple *> stores, gimple *last_store)
655 {
656   /* This walks over all stmts involved in the SLP load/store done
657      in NODE verifying we can sink them up to the last stmt in the
658      group.  */
659   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
660   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
661     {
662       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
663       if (access == last_access)
664 	continue;
665       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
666       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
667 	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
668 	{
669 	  gimple *stmt = gsi_stmt (gsi);
670 	  if (! gimple_vuse (stmt)
671 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
672 	    continue;
673 
674 	  /* If we couldn't record a (single) data reference for this
675 	     stmt we have to give up.  */
676 	  /* ???  Here and below if dependence analysis fails we can resort
677 	     to the alias oracle which can handle more kinds of stmts.  */
678 	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
679 	  if (!dr_b)
680 	    return false;
681 
682 	  bool dependent = false;
683 	  /* If we run into a store of this same instance (we've just
684 	     marked those) then delay dependence checking until we run
685 	     into the last store because this is where it will have
686 	     been sunk to (and we verify if we can do that as well).  */
687 	  if (gimple_visited_p (stmt))
688 	    {
689 	      if (stmt != last_store)
690 		continue;
691 	      unsigned i;
692 	      gimple *store;
693 	      FOR_EACH_VEC_ELT (stores, i, store)
694 		{
695 		  data_reference *store_dr
696 		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
697 		  ddr_p ddr = initialize_data_dependence_relation
698 				(dr_a, store_dr, vNULL);
699 		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
700 		  free_dependence_relation (ddr);
701 		  if (dependent)
702 		    break;
703 		}
704 	    }
705 	  else
706 	    {
707 	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
708 							       dr_b, vNULL);
709 	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
710 	      free_dependence_relation (ddr);
711 	    }
712 	  if (dependent)
713 	    return false;
714 	}
715     }
716   return true;
717 }
718 
719 
720 /* Function vect_slp_analyze_instance_dependence.
721 
722    Examine all the data references in the SLP instance, and make sure there
723    do not exist any data dependences between them that would prevent
724    vectorization of the instance.  */
725 
726 bool
727 vect_slp_analyze_instance_dependence (slp_instance instance)
728 {
729   if (dump_enabled_p ())
730     dump_printf_loc (MSG_NOTE, vect_location,
731                      "=== vect_slp_analyze_instance_dependence ===\n");
732 
733   /* The stores of this instance are at the root of the SLP tree.  */
734   slp_tree store = SLP_INSTANCE_TREE (instance);
735   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
736     store = NULL;
737 
738   /* Verify we can sink stores to the vectorized stmt insert location.  */
739   gimple *last_store = NULL;
740   if (store)
741     {
742       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
743 	return false;
744 
745       /* Mark stores in this instance and remember the last one.  */
746       last_store = vect_find_last_scalar_stmt_in_slp (store);
747       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
748 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
749     }
750 
751   bool res = true;
752 
753   /* Verify we can sink loads to the vectorized stmt insert location,
754      special-casing stores of this instance.  */
755   slp_tree load;
756   unsigned int i;
757   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
758     if (! vect_slp_analyze_node_dependences (instance, load,
759 					     store
760 					     ? SLP_TREE_SCALAR_STMTS (store)
761 					     : vNULL, last_store))
762       {
763 	res = false;
764 	break;
765       }
766 
767   /* Unset the visited flag.  */
768   if (store)
769     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
770       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
771 
772   return res;
773 }
774 
775 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
776    the statement that contains DRB, which is useful for recording in the
777    dump file.  */
778 
779 static void
780 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
781 			    innermost_loop_behavior *drb)
782 {
783   bool existed;
784   innermost_loop_behavior *&entry
785     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
786   if (!existed || entry->base_alignment < drb->base_alignment)
787     {
788       entry = drb;
789       if (dump_enabled_p ())
790 	{
791 	  dump_printf_loc (MSG_NOTE, vect_location,
792 			   "recording new base alignment for ");
793 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
794 	  dump_printf (MSG_NOTE, "\n");
795 	  dump_printf_loc (MSG_NOTE, vect_location,
796 			   "  alignment:    %d\n", drb->base_alignment);
797 	  dump_printf_loc (MSG_NOTE, vect_location,
798 			   "  misalignment: %d\n", drb->base_misalignment);
799 	  dump_printf_loc (MSG_NOTE, vect_location,
800 			   "  based on:     ");
801 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
802 	}
803     }
804 }
805 
806 /* If the region we're going to vectorize is reached, all unconditional
807    data references occur at least once.  We can therefore pool the base
808    alignment guarantees from each unconditional reference.  Do this by
809    going through all the data references in VINFO and checking whether
810    the containing statement makes the reference unconditionally.  If so,
811    record the alignment of the base address in VINFO so that it can be
812    used for all other references with the same base.  */
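/* Illustration: in
     for (...)
       {
         ... = *p;          // unconditional: executed whenever the region is reached
         if (c)
           ... = *q;        // conditional: might never be executed
       }
   the base alignment of *p can be pooled and reused by other references
   with the same base, whereas *q is DR_IS_CONDITIONAL_IN_STMT and cannot
   contribute a guarantee.  */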
813 
814 void
815 vect_record_base_alignments (vec_info *vinfo)
816 {
817   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
818   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
819   data_reference *dr;
820   unsigned int i;
821   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
822     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
823       {
824 	gimple *stmt = DR_STMT (dr);
825 	vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
826 
827 	/* If DR is nested in the loop that is being vectorized, we can also
828 	   record the alignment of the base wrt the outer loop.  */
829 	if (loop && nested_in_vect_loop_p (loop, stmt))
830 	  {
831 	    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
832 	    vect_record_base_alignment
833 	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
834 	  }
835       }
836 }
837 
838 /* Return the target alignment for the vectorized form of DR.  */
839 
840 static unsigned int
841 vect_calculate_target_alignment (struct data_reference *dr)
842 {
843   gimple *stmt = DR_STMT (dr);
844   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
845   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
846   return targetm.vectorize.preferred_vector_alignment (vectype);
847 }
848 
849 /* Function vect_compute_data_ref_alignment
850 
851    Compute the misalignment of the data reference DR.
852 
853    Output:
854    1. If during the misalignment computation it is found that the data reference
855       cannot be vectorized then false is returned.
856    2. DR_MISALIGNMENT (DR) is defined.
857 
858    FOR NOW: No analysis is actually performed. Misalignment is calculated
859    only for trivial cases. TODO.  */
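/* Worked example (illustrative numbers): with a target vector alignment of
   16 bytes, a base misalignment of 8 and a DR_INIT of 12 bytes, and assuming
   the offset and step checks below succeed, the computed misalignment is
   (8 + 12) mod 16 == 4.  */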
860 
861 bool
862 vect_compute_data_ref_alignment (struct data_reference *dr)
863 {
864   gimple *stmt = DR_STMT (dr);
865   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
866   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
867   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
868   struct loop *loop = NULL;
869   tree ref = DR_REF (dr);
870   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
871 
872   if (dump_enabled_p ())
873     dump_printf_loc (MSG_NOTE, vect_location,
874                      "vect_compute_data_ref_alignment:\n");
875 
876   if (loop_vinfo)
877     loop = LOOP_VINFO_LOOP (loop_vinfo);
878 
879   /* Initialize misalignment to unknown.  */
880   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
881 
882   innermost_loop_behavior *drb = vect_dr_behavior (dr);
883   bool step_preserves_misalignment_p;
884 
885   unsigned HOST_WIDE_INT vector_alignment
886     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
887   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
888 
889   /* No step for BB vectorization.  */
890   if (!loop)
891     {
892       gcc_assert (integer_zerop (drb->step));
893       step_preserves_misalignment_p = true;
894     }
895 
896   /* In case the dataref is in an inner-loop of the loop that is being
897      vectorized (LOOP), we use the base and misalignment information
898      relative to the outer-loop (LOOP).  This is ok only if the misalignment
899      stays the same throughout the execution of the inner-loop, which is why
900      we have to check that the stride of the dataref in the inner-loop is a
901      multiple of the vector alignment.  */
902   else if (nested_in_vect_loop_p (loop, stmt))
903     {
904       step_preserves_misalignment_p
905 	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
906 
907       if (dump_enabled_p ())
908 	{
909 	  if (step_preserves_misalignment_p)
910 	    dump_printf_loc (MSG_NOTE, vect_location,
911 			     "inner step divides the vector alignment.\n");
912 	  else
913 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 			     "inner step doesn't divide the vector"
915 			     " alignment.\n");
916 	}
917     }
918 
919   /* Similarly we can only use base and misalignment information relative to
920      an innermost loop if the misalignment stays the same throughout the
921      execution of the loop.  As above, this is the case if the stride of
922      the dataref times the VF is a multiple of the vector alignment.  */
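  /* E.g. (illustrative numbers): with a vector alignment of 16 bytes, a
     step alignment of 8 bytes and VF == 4, each vector iteration advances
     the address by a multiple of 8 * 4 == 32 bytes, so the misalignment
     wrt the 16-byte boundary is the same in every vector iteration.  */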
923   else
924     {
925       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
926       step_preserves_misalignment_p
927 	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
928 
929       if (!step_preserves_misalignment_p && dump_enabled_p ())
930 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
931 			 "step doesn't divide the vector alignment.\n");
932     }
933 
934   unsigned int base_alignment = drb->base_alignment;
935   unsigned int base_misalignment = drb->base_misalignment;
936 
937   /* Calculate the maximum of the pooled base address alignment and the
938      alignment that we can compute for DR itself.  */
939   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
940   if (entry && base_alignment < (*entry)->base_alignment)
941     {
942       base_alignment = (*entry)->base_alignment;
943       base_misalignment = (*entry)->base_misalignment;
944     }
945 
946   if (drb->offset_alignment < vector_alignment
947       || !step_preserves_misalignment_p
948       /* We need to know whether the step wrt the vectorized loop is
949 	 negative when computing the starting misalignment below.  */
950       || TREE_CODE (drb->step) != INTEGER_CST)
951     {
952       if (dump_enabled_p ())
953 	{
954 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 	                   "Unknown alignment for access: ");
956 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
957 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
958 	}
959       return true;
960     }
961 
962   if (base_alignment < vector_alignment)
963     {
964       unsigned int max_alignment;
965       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
966       if (max_alignment < vector_alignment
967 	  || !vect_can_force_dr_alignment_p (base,
968 					     vector_alignment * BITS_PER_UNIT))
969 	{
970 	  if (dump_enabled_p ())
971 	    {
972 	      dump_printf_loc (MSG_NOTE, vect_location,
973 	                       "can't force alignment of ref: ");
974 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
975 	      dump_printf (MSG_NOTE, "\n");
976 	    }
977 	  return true;
978 	}
979 
980       /* Force the alignment of the decl.
981 	 NOTE: This is the only change to the code we make during
982 	 the analysis phase, before deciding to vectorize the loop.  */
983       if (dump_enabled_p ())
984         {
985           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
986           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
987           dump_printf (MSG_NOTE, "\n");
988         }
989 
990       DR_VECT_AUX (dr)->base_decl = base;
991       DR_VECT_AUX (dr)->base_misaligned = true;
992       base_misalignment = 0;
993     }
994   poly_int64 misalignment
995     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
996 
997   /* If this is a backward running DR then the first access in the larger
998      vectype actually is N-1 elements before the address in the DR.
999      Adjust the misalignment accordingly.  */
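  /* E.g. (illustration): for a V4SI access with step -4, the vector access
     covers the bytes at offsets -12 .. +3 from the DR address, so
     (4 - 1) * -4 == -12 is added to the misalignment.  */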
1000   if (tree_int_cst_sgn (drb->step) < 0)
1001     /* PLUS because STEP is negative.  */
1002     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1003 		     * TREE_INT_CST_LOW (drb->step));
1004 
1005   unsigned int const_misalignment;
1006   if (!known_misalignment (misalignment, vector_alignment,
1007 			   &const_misalignment))
1008     {
1009       if (dump_enabled_p ())
1010 	{
1011 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 			   "Non-constant misalignment for access: ");
1013 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1014 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1015 	}
1016       return true;
1017     }
1018 
1019   SET_DR_MISALIGNMENT (dr, const_misalignment);
1020 
1021   if (dump_enabled_p ())
1022     {
1023       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1025       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1026       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1027     }
1028 
1029   return true;
1030 }
1031 
1032 /* Function vect_update_misalignment_for_peel.
1033    Sets DR's misalignment
1034    - to 0 if it has the same alignment as DR_PEEL,
1035    - to the misalignment computed using NPEEL if DR's alignment is known,
1036    - to -1 (unknown) otherwise.
1037 
1038    DR - the data reference whose misalignment is to be adjusted.
1039    DR_PEEL - the data reference whose misalignment is being made
1040              zero in the vector loop by the peel.
1041    NPEEL - the number of iterations in the peel loop if the misalignment
1042            of DR_PEEL is known at compile time.  */
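/* Worked example (illustrative numbers): with a target alignment of 16
   bytes, an element size of 4 and NPEEL == 3, a DR with known misalignment
   8 and a positive step ends up with misalignment (8 + 3 * 4) & 15 == 4
   after the peel.  */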
1043 
1044 static void
1045 vect_update_misalignment_for_peel (struct data_reference *dr,
1046                                    struct data_reference *dr_peel, int npeel)
1047 {
1048   unsigned int i;
1049   vec<dr_p> same_aligned_drs;
1050   struct data_reference *current_dr;
1051   int dr_size = vect_get_scalar_dr_size (dr);
1052   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1053   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1054   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1055 
1056   /* For interleaved data accesses the step in the loop must be multiplied by
1057      the size of the interleaving group.  */
1058   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1059     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1060   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1061     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1062 
1063   /* It can be assumed that the data refs with the same alignment as dr_peel
1064      are aligned in the vector loop.  */
1065   same_aligned_drs
1066     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1067   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1068     {
1069       if (current_dr != dr)
1070         continue;
1071       gcc_assert (!known_alignment_for_access_p (dr)
1072 		  || !known_alignment_for_access_p (dr_peel)
1073 		  || (DR_MISALIGNMENT (dr) / dr_size
1074 		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1075       SET_DR_MISALIGNMENT (dr, 0);
1076       return;
1077     }
1078 
1079   if (known_alignment_for_access_p (dr)
1080       && known_alignment_for_access_p (dr_peel))
1081     {
1082       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1083       int misal = DR_MISALIGNMENT (dr);
1084       misal += negative ? -npeel * dr_size : npeel * dr_size;
1085       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1086       SET_DR_MISALIGNMENT (dr, misal);
1087       return;
1088     }
1089 
1090   if (dump_enabled_p ())
1091     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
1092 		     "to unknown (-1).\n");
1093   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1094 }
1095 
1096 
1097 /* Function verify_data_ref_alignment
1098 
1099    Return TRUE if DR can be handled with respect to alignment.  */
1100 
1101 static bool
1102 verify_data_ref_alignment (data_reference_p dr)
1103 {
1104   enum dr_alignment_support supportable_dr_alignment
1105     = vect_supportable_dr_alignment (dr, false);
1106   if (!supportable_dr_alignment)
1107     {
1108       if (dump_enabled_p ())
1109 	{
1110 	  if (DR_IS_READ (dr))
1111 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112 			     "not vectorized: unsupported unaligned load.");
1113 	  else
1114 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1115 			     "not vectorized: unsupported unaligned "
1116 			     "store.");
1117 
1118 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1119 			     DR_REF (dr));
1120 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1121 	}
1122       return false;
1123     }
1124 
1125   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1126     dump_printf_loc (MSG_NOTE, vect_location,
1127 		     "Vectorizing an unaligned access.\n");
1128 
1129   return true;
1130 }
1131 
1132 /* Function vect_verify_datarefs_alignment
1133 
1134    Return TRUE if all data references in the loop can be
1135    handled with respect to alignment.  */
1136 
1137 bool
1138 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1139 {
1140   vec<data_reference_p> datarefs = vinfo->datarefs;
1141   struct data_reference *dr;
1142   unsigned int i;
1143 
1144   FOR_EACH_VEC_ELT (datarefs, i, dr)
1145     {
1146       gimple *stmt = DR_STMT (dr);
1147       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1148 
1149       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1150 	continue;
1151 
1152       /* For interleaving, only the alignment of the first access matters.   */
1153       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1154 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1155 	continue;
1156 
1157       /* Strided accesses perform only component accesses, alignment is
1158 	 irrelevant for them.  */
1159       if (STMT_VINFO_STRIDED_P (stmt_info)
1160 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1161 	continue;
1162 
1163       if (! verify_data_ref_alignment (dr))
1164 	return false;
1165     }
1166 
1167   return true;
1168 }
1169 
1170 /* Given a memory reference EXP return whether its alignment is less
1171    than its size.  */
1172 
1173 static bool
1174 not_size_aligned (tree exp)
1175 {
1176   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1177     return true;
1178 
1179   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1180 	  > get_object_alignment (exp));
1181 }
1182 
1183 /* Function vector_alignment_reachable_p
1184 
1185    Return true if vector alignment for DR is reachable by peeling
1186    a few loop iterations.  Return false otherwise.  */
1187 
1188 static bool
1189 vector_alignment_reachable_p (struct data_reference *dr)
1190 {
1191   gimple *stmt = DR_STMT (dr);
1192   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1193   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1194 
1195   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1196     {
1197       /* For interleaved access we peel only if the number of iterations in
1198 	 the prolog loop (VF - misalignment, counted in elements) is a
1199 	 multiple of the number of interleaved accesses (the group size).  */
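      /* E.g. (illustration): with 8 elements per vector, a misalignment of
	 2 elements and a group size of 2, the prolog peels 8 - 2 == 6
	 scalar iterations, a multiple of 2, so peeling is allowed; a group
	 size of 4 would not divide 6 and peeling would be rejected.  */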
1200       int elem_size, mis_in_elements;
1201 
1202       /* FORNOW: handle only known alignment.  */
1203       if (!known_alignment_for_access_p (dr))
1204 	return false;
1205 
1206       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1207       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1208       elem_size = vector_element_size (vector_size, nelements);
1209       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1210 
1211       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1212 	return false;
1213     }
1214 
1215   /* If misalignment is known at compile time then allow peeling
1216      only if natural alignment is reachable through peeling.  */
1217   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1218     {
1219       HOST_WIDE_INT elmsize =
1220 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1221       if (dump_enabled_p ())
1222 	{
1223 	  dump_printf_loc (MSG_NOTE, vect_location,
1224 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1225 	  dump_printf (MSG_NOTE,
1226 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1227 	}
1228       if (DR_MISALIGNMENT (dr) % elmsize)
1229 	{
1230 	  if (dump_enabled_p ())
1231 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232 	                     "data size does not divide the misalignment.\n");
1233 	  return false;
1234 	}
1235     }
1236 
1237   if (!known_alignment_for_access_p (dr))
1238     {
1239       tree type = TREE_TYPE (DR_REF (dr));
1240       bool is_packed = not_size_aligned (DR_REF (dr));
1241       if (dump_enabled_p ())
1242 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243 	                 "Unknown misalignment, %snaturally aligned\n",
1244 			 is_packed ? "not " : "");
1245       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1246     }
1247 
1248   return true;
1249 }
1250 
1251 
1252 /* Calculate the cost of the memory access represented by DR.  */
1253 
1254 static void
1255 vect_get_data_access_cost (struct data_reference *dr,
1256                            unsigned int *inside_cost,
1257                            unsigned int *outside_cost,
1258 			   stmt_vector_for_cost *body_cost_vec)
1259 {
1260   gimple *stmt = DR_STMT (dr);
1261   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1262   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1263   int ncopies;
1264 
1265   if (PURE_SLP_STMT (stmt_info))
1266     ncopies = 1;
1267   else
1268     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1269 
1270   if (DR_IS_READ (dr))
1271     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1272 			NULL, body_cost_vec, false);
1273   else
1274     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1275 
1276   if (dump_enabled_p ())
1277     dump_printf_loc (MSG_NOTE, vect_location,
1278                      "vect_get_data_access_cost: inside_cost = %d, "
1279                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1280 }
1281 
1282 
1283 typedef struct _vect_peel_info
1284 {
1285   struct data_reference *dr;
1286   int npeel;
1287   unsigned int count;
1288 } *vect_peel_info;
1289 
1290 typedef struct _vect_peel_extended_info
1291 {
1292   struct _vect_peel_info peel_info;
1293   unsigned int inside_cost;
1294   unsigned int outside_cost;
1295 } *vect_peel_extended_info;
1296 
1297 
1298 /* Peeling hashtable helpers.  */
1299 
1300 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1301 {
1302   static inline hashval_t hash (const _vect_peel_info *);
1303   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1304 };
1305 
1306 inline hashval_t
1307 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1308 {
1309   return (hashval_t) peel_info->npeel;
1310 }
1311 
1312 inline bool
1313 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1314 {
1315   return (a->npeel == b->npeel);
1316 }
1317 
1318 
1319 /* Insert DR into peeling hash table with NPEEL as key.  */
1320 
1321 static void
1322 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1323 			  loop_vec_info loop_vinfo, struct data_reference *dr,
1324                           int npeel)
1325 {
1326   struct _vect_peel_info elem, *slot;
1327   _vect_peel_info **new_slot;
1328   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1329 
1330   elem.npeel = npeel;
1331   slot = peeling_htab->find (&elem);
1332   if (slot)
1333     slot->count++;
1334   else
1335     {
1336       slot = XNEW (struct _vect_peel_info);
1337       slot->npeel = npeel;
1338       slot->dr = dr;
1339       slot->count = 1;
1340       new_slot = peeling_htab->find_slot (slot, INSERT);
1341       *new_slot = slot;
1342     }
1343 
1344   if (!supportable_dr_alignment
1345       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1346     slot->count += VECT_MAX_COST;
1347 }
1348 
1349 
1350 /* Traverse the peeling hash table to find the peeling option that aligns
1351    the maximum number of data accesses.  */
1352 
1353 int
1354 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1355 				     _vect_peel_extended_info *max)
1356 {
1357   vect_peel_info elem = *slot;
1358 
1359   if (elem->count > max->peel_info.count
1360       || (elem->count == max->peel_info.count
1361           && max->peel_info.npeel > elem->npeel))
1362     {
1363       max->peel_info.npeel = elem->npeel;
1364       max->peel_info.count = elem->count;
1365       max->peel_info.dr = elem->dr;
1366     }
1367 
1368   return 1;
1369 }
1370 
1371 /* Get the costs of peeling NPEEL iterations checking data access costs
1372    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1373    misalignment will be zero after peeling.  */
1374 
1375 static void
1376 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1377 				struct data_reference *dr0,
1378 				unsigned int *inside_cost,
1379 				unsigned int *outside_cost,
1380 				stmt_vector_for_cost *body_cost_vec,
1381 				unsigned int npeel,
1382 				bool unknown_misalignment)
1383 {
1384   unsigned i;
1385   data_reference *dr;
1386 
1387   FOR_EACH_VEC_ELT (datarefs, i, dr)
1388     {
1389       gimple *stmt = DR_STMT (dr);
1390       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1391       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1392 	continue;
1393 
1394       /* For interleaving, only the alignment of the first access
1395          matters.  */
1396       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1397           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1398         continue;
1399 
1400       /* Strided accesses perform only component accesses, alignment is
1401          irrelevant for them.  */
1402       if (STMT_VINFO_STRIDED_P (stmt_info)
1403 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1404 	continue;
1405 
1406       int save_misalignment;
1407       save_misalignment = DR_MISALIGNMENT (dr);
1408       if (npeel == 0)
1409 	;
1410       else if (unknown_misalignment && dr == dr0)
1411 	SET_DR_MISALIGNMENT (dr, 0);
1412       else
1413 	vect_update_misalignment_for_peel (dr, dr0, npeel);
1414       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1415 				 body_cost_vec);
1416       SET_DR_MISALIGNMENT (dr, save_misalignment);
1417     }
1418 }
1419 
1420 /* Traverse peeling hash table and calculate cost for each peeling option.
1421    Find the one with the lowest cost.  */
1422 
1423 int
1424 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1425 				   _vect_peel_extended_info *min)
1426 {
1427   vect_peel_info elem = *slot;
1428   int dummy;
1429   unsigned int inside_cost = 0, outside_cost = 0;
1430   gimple *stmt = DR_STMT (elem->dr);
1431   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1432   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1433   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1434 		       epilogue_cost_vec;
1435 
1436   prologue_cost_vec.create (2);
1437   body_cost_vec.create (2);
1438   epilogue_cost_vec.create (2);
1439 
1440   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1441 				  elem->dr, &inside_cost, &outside_cost,
1442 				  &body_cost_vec, elem->npeel, false);
1443 
1444   body_cost_vec.release ();
1445 
1446   outside_cost += vect_get_known_peeling_cost
1447     (loop_vinfo, elem->npeel, &dummy,
1448      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1449      &prologue_cost_vec, &epilogue_cost_vec);
1450 
1451   /* Prologue and epilogue costs are added to the target model later.
1452      These costs depend only on the scalar iteration cost, the
1453      number of peeling iterations finally chosen, and the number of
1454      misaligned statements.  So discard the information found here.  */
1455   prologue_cost_vec.release ();
1456   epilogue_cost_vec.release ();
1457 
1458   if (inside_cost < min->inside_cost
1459       || (inside_cost == min->inside_cost
1460 	  && outside_cost < min->outside_cost))
1461     {
1462       min->inside_cost = inside_cost;
1463       min->outside_cost = outside_cost;
1464       min->peel_info.dr = elem->dr;
1465       min->peel_info.npeel = elem->npeel;
1466       min->peel_info.count = elem->count;
1467     }
1468 
1469   return 1;
1470 }
1471 
1472 
1473 /* Choose best peeling option by traversing peeling hash table and either
1474    choosing an option with the lowest cost (if cost model is enabled) or the
1475    option that aligns as many accesses as possible.  */
1476 
1477 static struct _vect_peel_extended_info
1478 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1479 				       loop_vec_info loop_vinfo)
1480 {
1481    struct _vect_peel_extended_info res;
1482 
1483    res.peel_info.dr = NULL;
1484 
1485    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1486      {
1487        res.inside_cost = INT_MAX;
1488        res.outside_cost = INT_MAX;
1489        peeling_htab->traverse <_vect_peel_extended_info *,
1490 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1491      }
1492    else
1493      {
1494        res.peel_info.count = 0;
1495        peeling_htab->traverse <_vect_peel_extended_info *,
1496 	   		       vect_peeling_hash_get_most_frequent> (&res);
1497        res.inside_cost = 0;
1498        res.outside_cost = 0;
1499      }
1500 
1501    return res;
1502 }
1503 
1504 /* Return true if the new peeling NPEEL is supported.  */
1505 
1506 static bool
1507 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1508 			  unsigned npeel)
1509 {
1510   unsigned i;
1511   struct data_reference *dr = NULL;
1512   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1513   gimple *stmt;
1514   stmt_vec_info stmt_info;
1515   enum dr_alignment_support supportable_dr_alignment;
1516 
1517   /* Ensure that all data refs can be vectorized after the peel.  */
1518   FOR_EACH_VEC_ELT (datarefs, i, dr)
1519     {
1520       int save_misalignment;
1521 
1522       if (dr == dr0)
1523 	continue;
1524 
1525       stmt = DR_STMT (dr);
1526       stmt_info = vinfo_for_stmt (stmt);
1527       /* For interleaving, only the alignment of the first access
1528 	 matters.  */
1529       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1530 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1531 	continue;
1532 
1533       /* Strided accesses perform only component accesses, alignment is
1534 	 irrelevant for them.  */
1535       if (STMT_VINFO_STRIDED_P (stmt_info)
1536 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1537 	continue;
1538 
1539       save_misalignment = DR_MISALIGNMENT (dr);
1540       vect_update_misalignment_for_peel (dr, dr0, npeel);
1541       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1542       SET_DR_MISALIGNMENT (dr, save_misalignment);
1543 
1544       if (!supportable_dr_alignment)
1545 	return false;
1546     }
1547 
1548   return true;
1549 }
1550 
1551 /* Function vect_enhance_data_refs_alignment
1552 
1553    This pass will use loop versioning and loop peeling in order to enhance
1554    the alignment of data references in the loop.
1555 
1556    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1557    original loop is to be vectorized.  Any other loops that are created by
1558    the transformations performed in this pass - are not supposed to be
1559    vectorized.  This restriction will be relaxed.
1560 
1561    This pass will require a cost model to guide it whether to apply peeling
1562    or versioning or a combination of the two.  For example, the scheme that
1563    Intel uses when given a loop with several memory accesses is as follows:
1564    choose one memory access ('p') whose alignment you want to force by doing
1565    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1566    other accesses are not necessarily aligned, or (2) use loop versioning to
1567    generate one loop in which all accesses are aligned, and another loop in
1568    which only 'p' is necessarily aligned.
1569 
1570    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1571    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1572    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1573 
1574    Devising a cost model is the most critical aspect of this work.  It will
1575    guide us on which access to peel for, whether to use loop versioning, how
1576    many versions to create, etc.  The cost model will probably consist of
1577    generic considerations as well as target specific considerations (on
1578    powerpc for example, misaligned stores are more painful than misaligned
1579    loads).
1580 
1581    Here are the general steps involved in alignment enhancements:
1582 
1583      -- original loop, before alignment analysis:
1584 	for (i=0; i<N; i++){
1585 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1586 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1587 	}
1588 
1589      -- After vect_compute_data_refs_alignment:
1590 	for (i=0; i<N; i++){
1591 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1592 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1593 	}
1594 
1595      -- Possibility 1: we do loop versioning:
1596      if (p is aligned) {
1597 	for (i=0; i<N; i++){	# loop 1A
1598 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1599 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1600 	}
1601      }
1602      else {
1603 	for (i=0; i<N; i++){	# loop 1B
1604 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1605 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1606 	}
1607      }
1608 
1609      -- Possibility 2: we do loop peeling:
1610      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1611 	x = q[i];
1612 	p[i] = y;
1613      }
1614      for (i = 3; i < N; i++){	# loop 2A
1615 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1616 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1617      }
1618 
1619      -- Possibility 3: combination of loop peeling and versioning:
1620      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1621 	x = q[i];
1622 	p[i] = y;
1623      }
1624      if (p is aligned) {
1625 	for (i = 3; i<N; i++){	# loop 3A
1626 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1627 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1628 	}
1629      }
1630      else {
1631 	for (i = 3; i<N; i++){	# loop 3B
1632 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1633 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1634 	}
1635      }
1636 
1637      These loops are later passed to loop_transform to be vectorized.  The
1638      vectorizer will use the alignment information to guide the transformation
1639      (whether to generate regular loads/stores, or with special handling for
1640      misalignment).  */
1641 
1642 bool
1643 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1644 {
1645   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1646   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1647   enum dr_alignment_support supportable_dr_alignment;
1648   struct data_reference *dr0 = NULL, *first_store = NULL;
1649   struct data_reference *dr;
1650   unsigned int i, j;
1651   bool do_peeling = false;
1652   bool do_versioning = false;
1653   bool stat;
1654   gimple *stmt;
1655   stmt_vec_info stmt_info;
1656   unsigned int npeel = 0;
1657   bool one_misalignment_known = false;
1658   bool one_misalignment_unknown = false;
1659   bool one_dr_unsupportable = false;
1660   struct data_reference *unsupportable_dr = NULL;
1661   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1662   unsigned possible_npeel_number = 1;
1663   tree vectype;
1664   unsigned int mis, same_align_drs_max = 0;
1665   hash_table<peel_info_hasher> peeling_htab (1);
1666 
1667   if (dump_enabled_p ())
1668     dump_printf_loc (MSG_NOTE, vect_location,
1669                      "=== vect_enhance_data_refs_alignment ===\n");
1670 
1671   /* Reset data so we can safely be called multiple times.  */
1672   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1673   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1674 
1675   /* While cost model enhancements are expected in the future, the high level
1676      view of the code at this time is as follows:
1677 
1678      A) If there is a misaligned access then see if peeling to align
1679         this access can make all data references satisfy
1680         vect_supportable_dr_alignment.  If so, update data structures
1681         as needed and return true.
1682 
1683      B) If peeling wasn't possible and there is a data reference with an
1684         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1685         then see if loop versioning checks can be used to make all data
1686         references satisfy vect_supportable_dr_alignment.  If so, update
1687         data structures as needed and return true.
1688 
1689      C) If neither peeling nor versioning were successful then return false if
1690         any data reference does not satisfy vect_supportable_dr_alignment.
1691 
1692      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1693 
1694      Note, Possibility 3 above (which is peeling and versioning together) is not
1695      being done at this time.  */
1696 
1697   /* (1) Peeling to force alignment.  */
1698 
1699   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1700      Considerations:
1701      + How many accesses will become aligned due to the peeling
1702      - How many accesses will become unaligned due to the peeling,
1703        and the cost of misaligned accesses.
1704      - The cost of peeling (the extra runtime checks, the increase
1705        in code size).  */
1706 
1707   FOR_EACH_VEC_ELT (datarefs, i, dr)
1708     {
1709       stmt = DR_STMT (dr);
1710       stmt_info = vinfo_for_stmt (stmt);
1711 
1712       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1713 	continue;
1714 
1715       /* For interleaving, only the alignment of the first access
1716          matters.  */
1717       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1718           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1719         continue;
1720 
1721       /* For invariant accesses there is nothing to enhance.  */
1722       if (integer_zerop (DR_STEP (dr)))
1723 	continue;
1724 
1725       /* Strided accesses perform only component accesses, alignment is
1726 	 irrelevant for them.  */
1727       if (STMT_VINFO_STRIDED_P (stmt_info)
1728 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1729 	continue;
1730 
1731       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1732       do_peeling = vector_alignment_reachable_p (dr);
1733       if (do_peeling)
1734         {
1735           if (known_alignment_for_access_p (dr))
1736             {
1737 	      unsigned int npeel_tmp = 0;
1738 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1739 						    size_zero_node) < 0;
1740 
1741 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
1742 	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1743 	      unsigned int dr_size = vect_get_scalar_dr_size (dr);
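                   /* NPEEL_TMP below is the number of scalar iterations needed
                      to make this access target-aligned.  For example, assuming
                      a 16-byte target alignment, 4-byte elements and a forward
                      access misaligned by 4 bytes: mis = -4, so
                      npeel_tmp = (-4 & 15) / 4 = 3.  */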
1744 	      mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1745 	      if (DR_MISALIGNMENT (dr) != 0)
1746 		npeel_tmp = (mis & (target_align - 1)) / dr_size;
1747 
1748               /* For multiple types, it is possible that the bigger type access
1749                  will have more than one peeling option.  E.g., a loop with two
1750                  types: one of size (vector size / 4), and the other one of
1751                  size (vector size / 8).  The vectorization factor will be 8.  If
1752                  both accesses are misaligned by 3, the first one needs one scalar
1753                  iteration to be aligned, and the second one needs 5.  But the
1754                  first one will also be aligned by peeling 5 scalar
1755                  iterations, and in that case both accesses will be aligned.
1756                  Hence, in addition to the immediate peeling amount, we also want
1757                  to try adding full vector sizes, as long as we don't exceed the
1758                  vectorization factor.
1759                  We do this automatically when using the cost model, since we
1760                  calculate the cost for every peeling option.  */
1761               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1762 		{
1763 		  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1764 					  ? vf * GROUP_SIZE (stmt_info) : vf);
1765 		  possible_npeel_number
1766 		    = vect_get_num_vectors (nscalars, vectype);
1767 
1768 		  /* NPEEL_TMP is 0 when there is no misalignment, but also
1769 		     allow peeling NELEMENTS.  */
1770 		  if (DR_MISALIGNMENT (dr) == 0)
1771 		    possible_npeel_number++;
1772 		}
1773 
1774 	      /* Save info about DR in the hash table.  Also include peeling
1775 	         amounts according to the explanation above.  */
1776               for (j = 0; j < possible_npeel_number; j++)
1777                 {
1778                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1779 					    dr, npeel_tmp);
1780 		  npeel_tmp += target_align / dr_size;
1781                 }
1782 
1783 	      one_misalignment_known = true;
1784             }
1785           else
1786             {
1787               /* If we don't know any misalignment values, we prefer
1788                  peeling for the data-ref that has the maximum number of
1789                  data-refs with the same alignment, unless the target
1790                  prefers to align stores over loads.  */
1791 	      unsigned same_align_drs
1792 		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1793 	      if (!dr0
1794 		  || same_align_drs_max < same_align_drs)
1795 		{
1796 		  same_align_drs_max = same_align_drs;
1797 		  dr0 = dr;
1798 		}
1799 	      /* For data-refs with the same number of related
1800 		 accesses prefer the one where the misalign
1801 		 computation will be invariant in the outermost loop.  */
1802 	      else if (same_align_drs_max == same_align_drs)
1803 		{
1804 		  struct loop *ivloop0, *ivloop;
1805 		  ivloop0 = outermost_invariant_loop_for_expr
1806 		    (loop, DR_BASE_ADDRESS (dr0));
1807 		  ivloop = outermost_invariant_loop_for_expr
1808 		    (loop, DR_BASE_ADDRESS (dr));
1809 		  if ((ivloop && !ivloop0)
1810 		      || (ivloop && ivloop0
1811 			  && flow_loop_nested_p (ivloop, ivloop0)))
1812 		    dr0 = dr;
1813 		}
1814 
1815 	      one_misalignment_unknown = true;
1816 
1817 	      /* Check for data refs with unsupportable alignment that
1818 	         can be peeled.  */
1819 	      if (!supportable_dr_alignment)
1820 	      {
1821 		one_dr_unsupportable = true;
1822 		unsupportable_dr = dr;
1823 	      }
1824 
1825 	      if (!first_store && DR_IS_WRITE (dr))
1826 		first_store = dr;
1827             }
1828         }
1829       else
1830         {
1831           if (!aligned_access_p (dr))
1832             {
1833               if (dump_enabled_p ())
1834                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1835                                  "vector alignment may not be reachable\n");
1836               break;
1837             }
1838         }
1839     }
1840 
1841   /* Check if we can possibly peel the loop.  */
1842   if (!vect_can_advance_ivs_p (loop_vinfo)
1843       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1844       || loop->inner)
1845     do_peeling = false;
1846 
1847   struct _vect_peel_extended_info peel_for_known_alignment;
1848   struct _vect_peel_extended_info peel_for_unknown_alignment;
1849   struct _vect_peel_extended_info best_peel;
1850 
1851   peel_for_unknown_alignment.inside_cost = INT_MAX;
1852   peel_for_unknown_alignment.outside_cost = INT_MAX;
1853   peel_for_unknown_alignment.peel_info.count = 0;
1854 
1855   if (do_peeling
1856       && one_misalignment_unknown)
1857     {
1858       /* Check whether the target prefers to align stores over loads, i.e.,
1859          whether misaligned stores are more expensive than misaligned loads
1860          (taking drs with the same alignment into account).  */
1861       unsigned int load_inside_cost = 0;
1862       unsigned int load_outside_cost = 0;
1863       unsigned int store_inside_cost = 0;
1864       unsigned int store_outside_cost = 0;
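           /* With unknown misalignment, assume that on average half a vector's
              worth of scalar iterations will need to be peeled.  */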
1865       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1866 
1867       stmt_vector_for_cost dummy;
1868       dummy.create (2);
1869       vect_get_peeling_costs_all_drs (datarefs, dr0,
1870 				      &load_inside_cost,
1871 				      &load_outside_cost,
1872 				      &dummy, estimated_npeels, true);
1873       dummy.release ();
1874 
1875       if (first_store)
1876 	{
1877 	  dummy.create (2);
1878 	  vect_get_peeling_costs_all_drs (datarefs, first_store,
1879 					  &store_inside_cost,
1880 					  &store_outside_cost,
1881 					  &dummy, estimated_npeels, true);
1882 	  dummy.release ();
1883 	}
1884       else
1885 	{
1886 	  store_inside_cost = INT_MAX;
1887 	  store_outside_cost = INT_MAX;
1888 	}
1889 
1890       if (load_inside_cost > store_inside_cost
1891 	  || (load_inside_cost == store_inside_cost
1892 	      && load_outside_cost > store_outside_cost))
1893 	{
1894 	  dr0 = first_store;
1895 	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
1896 	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
1897 	}
1898       else
1899 	{
1900 	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
1901 	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
1902 	}
1903 
1904       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1905       prologue_cost_vec.create (2);
1906       epilogue_cost_vec.create (2);
1907 
1908       int dummy2;
1909       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1910 	(loop_vinfo, estimated_npeels, &dummy2,
1911 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1912 	 &prologue_cost_vec, &epilogue_cost_vec);
1913 
1914       prologue_cost_vec.release ();
1915       epilogue_cost_vec.release ();
1916 
1917       peel_for_unknown_alignment.peel_info.count = 1
1918 	+ STMT_VINFO_SAME_ALIGN_REFS
1919 	(vinfo_for_stmt (DR_STMT (dr0))).length ();
1920     }
1921 
1922   peel_for_unknown_alignment.peel_info.npeel = 0;
1923   peel_for_unknown_alignment.peel_info.dr = dr0;
1924 
1925   best_peel = peel_for_unknown_alignment;
1926 
1927   peel_for_known_alignment.inside_cost = INT_MAX;
1928   peel_for_known_alignment.outside_cost = INT_MAX;
1929   peel_for_known_alignment.peel_info.count = 0;
1930   peel_for_known_alignment.peel_info.dr = NULL;
1931 
1932   if (do_peeling && one_misalignment_known)
1933     {
1934       /* Peeling is possible, but no data access strictly requires alignment
1935          in order to be supported.  So we try to choose the best possible
1936          peeling from the hash table.  */
1937       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1938 	(&peeling_htab, loop_vinfo);
1939     }
1940 
1941   /* Compare costs of peeling for known and unknown alignment. */
1942   if (peel_for_known_alignment.peel_info.dr != NULL
1943       && peel_for_unknown_alignment.inside_cost
1944       >= peel_for_known_alignment.inside_cost)
1945     {
1946       best_peel = peel_for_known_alignment;
1947 
1948       /* If the best peeling for known alignment has NPEEL == 0, perform no
1949          peeling at all unless there is an unsupportable dr that we can
1950          align.  */
1951       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1952 	do_peeling = false;
1953     }
1954 
1955   /* If there is an unsupportable data ref, prefer it over all choices so far,
1956      since we would have to discard a chosen peeling anyway unless it happened
1957      to align the unsupportable data ref.  */
1958   if (one_dr_unsupportable)
1959     dr0 = unsupportable_dr;
1960   else if (do_peeling)
1961     {
1962       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1963 	 TODO: Use nopeel_outside_cost or get rid of it?  */
1964       unsigned nopeel_inside_cost = 0;
1965       unsigned nopeel_outside_cost = 0;
1966 
1967       stmt_vector_for_cost dummy;
1968       dummy.create (2);
1969       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1970 				      &nopeel_outside_cost, &dummy, 0, false);
1971       dummy.release ();
1972 
1973       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1974 	 costs will be recorded.  */
1975       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1976       prologue_cost_vec.create (2);
1977       epilogue_cost_vec.create (2);
1978 
1979       int dummy2;
1980       nopeel_outside_cost += vect_get_known_peeling_cost
1981 	(loop_vinfo, 0, &dummy2,
1982 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1983 	 &prologue_cost_vec, &epilogue_cost_vec);
1984 
1985       prologue_cost_vec.release ();
1986       epilogue_cost_vec.release ();
1987 
1988       npeel = best_peel.peel_info.npeel;
1989       dr0 = best_peel.peel_info.dr;
1990 
1991       /* If not peeling is no more expensive than the best peeling found
1992 	 so far, don't perform any peeling.  */
1993       if (nopeel_inside_cost <= best_peel.inside_cost)
1994 	do_peeling = false;
1995     }
1996 
1997   if (do_peeling)
1998     {
1999       stmt = DR_STMT (dr0);
2000       stmt_info = vinfo_for_stmt (stmt);
2001       vectype = STMT_VINFO_VECTYPE (stmt_info);
2002 
2003       if (known_alignment_for_access_p (dr0))
2004         {
2005 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
2006 						size_zero_node) < 0;
2007           if (!npeel)
2008             {
2009               /* Since it's known at compile time, compute the number of
2010                  iterations in the peeled loop (the peeling factor) for use in
2011                  updating DR_MISALIGNMENT values.  The peeling factor is the
2012                  number of scalar iterations needed to make the access
2013                  target-aligned.  */
2014 	      mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2015 	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2016 	      npeel = ((mis & (target_align - 1))
2017 		       / vect_get_scalar_dr_size (dr0));
2018             }
2019 
2020 	  /* For interleaved data access every iteration accesses all the
2021 	     members of the group, therefore we divide the number of iterations
2022 	     by the group size.  */
2023 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2024 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2025 	    npeel /= GROUP_SIZE (stmt_info);
2026 
2027           if (dump_enabled_p ())
2028             dump_printf_loc (MSG_NOTE, vect_location,
2029                              "Try peeling by %d\n", npeel);
2030         }
2031 
2032       /* Ensure that all datarefs can be vectorized after the peel.  */
2033       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2034 	do_peeling = false;
2035 
2036       /* Check if all datarefs are supportable and log.  */
2037       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2038         {
2039           stat = vect_verify_datarefs_alignment (loop_vinfo);
2040           if (!stat)
2041             do_peeling = false;
2042           else
2043 	    return stat;
2044         }
2045 
2046       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2047       if (do_peeling)
2048         {
2049           unsigned max_allowed_peel
2050             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2051           if (max_allowed_peel != (unsigned)-1)
2052             {
2053               unsigned max_peel = npeel;
2054               if (max_peel == 0)
2055                 {
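                       /* Without a known NPEEL the worst case is peeling up to
                          one element short of a full target-aligned chunk,
                          e.g. 16 / 4 - 1 = 3 iterations for 4-byte elements
                          and a 16-byte target alignment.  */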
2056 		  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2057 		  max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2058                 }
2059               if (max_peel > max_allowed_peel)
2060                 {
2061                   do_peeling = false;
2062                   if (dump_enabled_p ())
2063                     dump_printf_loc (MSG_NOTE, vect_location,
2064                         "Disable peeling, max peels reached: %d\n", max_peel);
2065                 }
2066             }
2067         }
2068 
2069       /* Cost model #2 - if peeling may result in a remaining loop not
2070 	 iterating enough to be vectorized then do not peel.  Since this
2071 	 is a cost heuristic rather than a correctness decision, use the
2072 	 most likely runtime value for variable vectorization factors.  */
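           /* For example, with an assumed vectorization factor of 4 and NPEEL
              of 0, up to 3 iterations may be peeled, so the loop must have at
              least 4 + 3 = 7 known iterations for peeling to remain
              worthwhile.  */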
2073       if (do_peeling
2074 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2075 	{
2076 	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2077 	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2078 	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2079 	      < assumed_vf + max_peel)
2080 	    do_peeling = false;
2081 	}
2082 
2083       if (do_peeling)
2084         {
2085           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2086              If the misalignment of DR_i is identical to that of dr0 then set
2087              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2088              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2089              by the peeling factor times the element size of DR_i (modulo
2090              the target alignment of DR_i).  Otherwise, the misalignment
2091              of DR_i must be set to unknown.  */
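               /* For example (assuming forward accesses): if dr0 has known
                  misalignment 12 with 4-byte elements and a 16-byte target
                  alignment, the peeling factor is 1, and a DR_i with 4-byte
                  elements and known misalignment 8 becomes (8 + 1 * 4) mod 16,
                  i.e. 12.  */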
2092 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2093 	    if (dr != dr0)
2094 	      {
2095 		/* Strided accesses perform only component accesses, alignment
2096 		   is irrelevant for them.  */
2097 		stmt_info = vinfo_for_stmt (DR_STMT (dr));
2098 		if (STMT_VINFO_STRIDED_P (stmt_info)
2099 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2100 		  continue;
2101 
2102 		vect_update_misalignment_for_peel (dr, dr0, npeel);
2103 	      }
2104 
2105           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2106           if (npeel)
2107             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2108           else
2109             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2110 	      = DR_MISALIGNMENT (dr0);
2111 	  SET_DR_MISALIGNMENT (dr0, 0);
2112 	  if (dump_enabled_p ())
2113             {
2114               dump_printf_loc (MSG_NOTE, vect_location,
2115                                "Alignment of access forced using peeling.\n");
2116               dump_printf_loc (MSG_NOTE, vect_location,
2117                                "Peeling for alignment will be applied.\n");
2118             }
2119 
2120 	  /* The inside-loop cost will be accounted for in vectorizable_load
2121 	     and vectorizable_store correctly with the adjusted alignments, so
2122 	     the body costs computed during this analysis are simply discarded.  */
2123 	  stat = vect_verify_datarefs_alignment (loop_vinfo);
2124 	  gcc_assert (stat);
2125           return stat;
2126         }
2127     }
2128 
2129   /* (2) Versioning to force alignment.  */
2130 
2131   /* Try versioning if:
2132      1) optimize loop for speed
2133      2) there is at least one unsupported misaligned data ref with an unknown
2134         misalignment, and
2135      3) all misaligned data refs with a known misalignment are supported, and
2136      4) the number of runtime alignment checks is within reason.  */
2137 
2138   do_versioning =
2139 	optimize_loop_nest_for_speed_p (loop)
2140 	&& (!loop->inner); /* FORNOW */
2141 
2142   if (do_versioning)
2143     {
2144       FOR_EACH_VEC_ELT (datarefs, i, dr)
2145         {
2146 	  stmt = DR_STMT (dr);
2147 	  stmt_info = vinfo_for_stmt (stmt);
2148 
2149 	  /* For interleaving, only the alignment of the first access
2150 	     matters.  */
2151 	  if (aligned_access_p (dr)
2152 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2153 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2154 	    continue;
2155 
2156 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2157 	    {
2158 	      /* Strided loads perform only component accesses, alignment is
2159 		 irrelevant for them.  */
2160 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2161 		continue;
2162 	      do_versioning = false;
2163 	      break;
2164 	    }
2165 
2166 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2167 
2168           if (!supportable_dr_alignment)
2169             {
2170 	      gimple *stmt;
2171               int mask;
2172               tree vectype;
2173 
2174               if (known_alignment_for_access_p (dr)
2175                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2176                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2177                 {
2178                   do_versioning = false;
2179                   break;
2180                 }
2181 
2182               stmt = DR_STMT (dr);
2183               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2184               gcc_assert (vectype);
2185 
2186 	      /* At present we don't support versioning for alignment
2187 		 with variable VF, since there's no guarantee that the
2188 		 VF is a power of two.  We could relax this if we added
2189 		 a way of enforcing a power-of-two size.  */
2190 	      unsigned HOST_WIDE_INT size;
2191 	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2192 		{
2193 		  do_versioning = false;
2194 		  break;
2195 		}
2196 
2197               /* The rightmost bits of an aligned address must be zeros.
2198                  Construct the mask needed for this test.  For example,
2199                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2200                  mask must be 15 = 0xf. */
2201 	      mask = size - 1;
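                   /* The runtime check built from this mask (see
                      vect_create_cond_for_align_checks) ORs together the start
                      addresses of the statements recorded below and requires
                      the result ANDed with the mask to be zero.  */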
2202 
2203               /* FORNOW: use the same mask to test all potentially unaligned
2204                  references in the loop.  The vectorizer currently supports
2205                  a single vector size, see the reference to
2206                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2207                  vectorization factor is computed.  */
2208               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2209                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2210               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2211               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2212 		      DR_STMT (dr));
2213             }
2214         }
2215 
2216       /* Versioning requires at least one misaligned data reference.  */
2217       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2218         do_versioning = false;
2219       else if (!do_versioning)
2220         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2221     }
2222 
2223   if (do_versioning)
2224     {
2225       vec<gimple *> may_misalign_stmts
2226         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2227       gimple *stmt;
2228 
2229       /* It can now be assumed that the data references in the statements
2230          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2231          of the loop being vectorized.  */
2232       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2233         {
2234           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2235           dr = STMT_VINFO_DATA_REF (stmt_info);
2236 	  SET_DR_MISALIGNMENT (dr, 0);
2237 	  if (dump_enabled_p ())
2238             dump_printf_loc (MSG_NOTE, vect_location,
2239                              "Alignment of access forced using versioning.\n");
2240         }
2241 
2242       if (dump_enabled_p ())
2243         dump_printf_loc (MSG_NOTE, vect_location,
2244                          "Versioning for alignment will be applied.\n");
2245 
2246       /* Peeling and versioning can't be done together at this time.  */
2247       gcc_assert (! (do_peeling && do_versioning));
2248 
2249       stat = vect_verify_datarefs_alignment (loop_vinfo);
2250       gcc_assert (stat);
2251       return stat;
2252     }
2253 
2254   /* This point is reached if neither peeling nor versioning is being done.  */
2255   gcc_assert (! (do_peeling || do_versioning));
2256 
2257   stat = vect_verify_datarefs_alignment (loop_vinfo);
2258   return stat;
2259 }
2260 
2261 
2262 /* Function vect_find_same_alignment_drs.
2263 
2264    Update group and alignment relations according to the chosen
2265    vectorization factor.  */
2266 
2267 static void
2268 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2269 {
2270   struct data_reference *dra = DDR_A (ddr);
2271   struct data_reference *drb = DDR_B (ddr);
2272   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2273   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2274 
2275   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2276     return;
2277 
2278   if (dra == drb)
2279     return;
2280 
2281   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2282       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2283       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2284     return;
2285 
2286   /* Two references with distance zero have the same alignment.  */
2287   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2288 			  - wi::to_poly_offset (DR_INIT (drb)));
2289   if (maybe_ne (diff, 0))
2290     {
2291       /* Get the wider of the two alignments.  */
2292       unsigned int align_a = (vect_calculate_target_alignment (dra)
2293 			      / BITS_PER_UNIT);
2294       unsigned int align_b = (vect_calculate_target_alignment (drb)
2295 			      / BITS_PER_UNIT);
2296       unsigned int max_align = MAX (align_a, align_b);
2297 
2298       /* Require the gap to be a multiple of the larger vector alignment.  */
2299       if (!multiple_p (diff, max_align))
2300 	return;
2301     }
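       /* For example, two 4-byte references whose DR_INITs differ by 32 bytes
          share the same alignment when both target alignments are 16 bytes,
          whereas a difference of 8 bytes is not a multiple of 16 and the pair
          is not recorded.  */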
2302 
2303   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2304   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2305   if (dump_enabled_p ())
2306     {
2307       dump_printf_loc (MSG_NOTE, vect_location,
2308 		       "accesses have the same alignment: ");
2309       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2310       dump_printf (MSG_NOTE,  " and ");
2311       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2312       dump_printf (MSG_NOTE, "\n");
2313     }
2314 }
2315 
2316 
2317 /* Function vect_analyze_data_refs_alignment
2318 
2319    Analyze the alignment of the data-references in the loop.
2320    Return FALSE if a data reference is found that cannot be vectorized.  */
2321 
2322 bool
2323 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2324 {
2325   if (dump_enabled_p ())
2326     dump_printf_loc (MSG_NOTE, vect_location,
2327                      "=== vect_analyze_data_refs_alignment ===\n");
2328 
2329   /* Mark groups of data references with same alignment using
2330      data dependence information.  */
2331   vec<ddr_p> ddrs = vinfo->ddrs;
2332   struct data_dependence_relation *ddr;
2333   unsigned int i;
2334 
2335   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2336     vect_find_same_alignment_drs (ddr);
2337 
2338   vec<data_reference_p> datarefs = vinfo->datarefs;
2339   struct data_reference *dr;
2340 
2341   vect_record_base_alignments (vinfo);
2342   FOR_EACH_VEC_ELT (datarefs, i, dr)
2343     {
2344       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2345       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2346 	  && !vect_compute_data_ref_alignment (dr))
2347 	{
2348 	  /* Strided accesses perform only component accesses, misalignment
2349 	     information is irrelevant for them.  */
2350 	  if (STMT_VINFO_STRIDED_P (stmt_info)
2351 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2352 	    continue;
2353 
2354 	  if (dump_enabled_p ())
2355 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2356 			     "not vectorized: can't calculate alignment "
2357 			     "for data ref.\n");
2358 
2359 	  return false;
2360 	}
2361     }
2362 
2363   return true;
2364 }
2365 
2366 
2367 /* Analyze alignment of DRs of stmts in NODE.  */
2368 
2369 static bool
2370 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2371 {
2372   /* We vectorize from the first scalar stmt in the node unless
2373      the node is permuted in which case we start from the first
2374      element in the group.  */
2375   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2376   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2377   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2378     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2379 
2380   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2381   if (! vect_compute_data_ref_alignment (dr)
2382       /* For creating the data-ref pointer we need alignment of the
2383 	 first element anyway.  */
2384       || (dr != first_dr
2385 	  && ! vect_compute_data_ref_alignment (first_dr))
2386       || ! verify_data_ref_alignment (dr))
2387     {
2388       if (dump_enabled_p ())
2389 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2390 			 "not vectorized: bad data alignment in basic "
2391 			 "block.\n");
2392       return false;
2393     }
2394 
2395   return true;
2396 }
2397 
2398 /* Function vect_slp_analyze_and_verify_instance_alignment
2399 
2400    Analyze the alignment of the data-references in the SLP instance.
2401    Return FALSE if a data reference is found that cannot be vectorized.  */
2402 
2403 bool
2404 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2405 {
2406   if (dump_enabled_p ())
2407     dump_printf_loc (MSG_NOTE, vect_location,
2408                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2409 
2410   slp_tree node;
2411   unsigned i;
2412   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2413     if (! vect_slp_analyze_and_verify_node_alignment (node))
2414       return false;
2415 
2416   node = SLP_INSTANCE_TREE (instance);
2417   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2418       && ! vect_slp_analyze_and_verify_node_alignment
2419 	     (SLP_INSTANCE_TREE (instance)))
2420     return false;
2421 
2422   return true;
2423 }
2424 
2425 
2426 /* Analyze groups of accesses: check that DR belongs to a group of
2427    accesses of legal size, step, etc.  Detect gaps, single element
2428    interleaving, and other special cases. Set grouped access info.
2429    Collect groups of strided stores for further use in SLP analysis.
2430    Worker for vect_analyze_group_access.  */
2431 
2432 static bool
2433 vect_analyze_group_access_1 (struct data_reference *dr)
2434 {
2435   tree step = DR_STEP (dr);
2436   tree scalar_type = TREE_TYPE (DR_REF (dr));
2437   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2438   gimple *stmt = DR_STMT (dr);
2439   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2440   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2441   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2442   HOST_WIDE_INT dr_step = -1;
2443   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2444   bool slp_impossible = false;
2445 
2446   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2447      size of the interleaving group (including gaps).  */
2448   if (tree_fits_shwi_p (step))
2449     {
2450       dr_step = tree_to_shwi (step);
2451       /* Check that STEP is a multiple of type size.  Otherwise there is
2452          a non-element-sized gap at the end of the group which we
2453 	 cannot represent in GROUP_GAP or GROUP_SIZE.
2454 	 ???  As we can handle non-constant step fine here we should
2455 	 simply remove uses of GROUP_GAP between the last and first
2456 	 element and instead rely on DR_STEP.  GROUP_SIZE then would
2457 	 simply not include that gap.  */
2458       if ((dr_step % type_size) != 0)
2459 	{
2460 	  if (dump_enabled_p ())
2461 	    {
2462 	      dump_printf_loc (MSG_NOTE, vect_location,
2463 	                       "Step ");
2464 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2465 	      dump_printf (MSG_NOTE,
2466 			   " is not a multiple of the element size for ");
2467 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2468 	      dump_printf (MSG_NOTE, "\n");
2469 	    }
2470 	  return false;
2471 	}
2472       groupsize = absu_hwi (dr_step) / type_size;
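           /* For example, a 4-byte element accessed with a constant step of
              16 bytes gives a GROUPSIZE of 4: each scalar iteration covers a
              block of four elements, of which this access touches one.  */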
2473     }
2474   else
2475     groupsize = 0;
2476 
2477   /* A non-consecutive access is possible only as part of interleaving.  */
2478   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2479     {
2480       /* Check whether this DR is part of an interleaving group in which it
2481 	 is the single element of the group accessed in the loop.  */
2482 
2483       /* Gaps are supported only for loads. STEP must be a multiple of the type
2484 	 size.  */
2485       if (DR_IS_READ (dr)
2486 	  && (dr_step % type_size) == 0
2487 	  && groupsize > 0)
2488 	{
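               /* For example, a load of a[4*i] with 4-byte elements has a step
                  of 16 bytes and therefore forms a single-element group of
                  size 4 with a gap of 3 elements after the access.  */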
2489 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2490 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2491 	  GROUP_GAP (stmt_info) = groupsize - 1;
2492 	  if (dump_enabled_p ())
2493 	    {
2494 	      dump_printf_loc (MSG_NOTE, vect_location,
2495 	                       "Detected single element interleaving ");
2496 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2497 	      dump_printf (MSG_NOTE, " step ");
2498 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2499 	      dump_printf (MSG_NOTE, "\n");
2500 	    }
2501 
2502 	  return true;
2503 	}
2504 
2505       if (dump_enabled_p ())
2506         {
2507  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2508 	                   "not consecutive access ");
2509 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2510         }
2511 
2512       if (bb_vinfo)
2513         {
2514           /* Mark the statement as unvectorizable.  */
2515           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2516           return true;
2517         }
2518 
2519       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2520       STMT_VINFO_STRIDED_P (stmt_info) = true;
2521       return true;
2522     }
2523 
2524   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2525     {
2526       /* First stmt in the interleaving chain. Check the chain.  */
2527       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2528       struct data_reference *data_ref = dr;
2529       unsigned int count = 1;
2530       tree prev_init = DR_INIT (data_ref);
2531       gimple *prev = stmt;
2532       HOST_WIDE_INT diff, gaps = 0;
2533 
2534       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2535       while (next)
2536         {
2537           /* Skip same data-refs.  If two or more stmts share the same
2538              data-ref (supported only for loads), we vectorize only the first
2539              stmt, and the rest get their vectorized loads from the first
2540              one.  */
2541           if (!tree_int_cst_compare (DR_INIT (data_ref),
2542                                      DR_INIT (STMT_VINFO_DATA_REF (
2543 						   vinfo_for_stmt (next)))))
2544             {
2545               if (DR_IS_WRITE (data_ref))
2546                 {
2547                   if (dump_enabled_p ())
2548                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2549                                      "Two store stmts share the same dr.\n");
2550                   return false;
2551                 }
2552 
2553 	      if (dump_enabled_p ())
2554 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 				 "Two or more load stmts share the same dr.\n");
2556 
2557               /* For load use the same data-ref load.  */
2558               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2559 
2560               prev = next;
2561               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2562               continue;
2563             }
2564 
2565           prev = next;
2566           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2567 
2568 	  /* All group members have the same STEP by construction.  */
2569 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2570 
2571           /* Check that the distance between two accesses is equal to the type
2572              size. Otherwise, we have gaps.  */
2573           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2574                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2575 	  if (diff != 1)
2576 	    {
2577 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2578 	      slp_impossible = true;
2579 	      if (DR_IS_WRITE (data_ref))
2580 		{
2581                   if (dump_enabled_p ())
2582                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2583                                      "interleaved store with gaps\n");
2584 		  return false;
2585 		}
2586 
2587               gaps += diff - 1;
2588 	    }
2589 
2590 	  last_accessed_element += diff;
2591 
2592           /* Store the gap from the previous member of the group. If there is no
2593              gap in the access, GROUP_GAP is always 1.  */
2594           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2595 
2596           prev_init = DR_INIT (data_ref);
2597           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2598           /* Count the number of data-refs in the chain.  */
2599           count++;
2600         }
2601 
2602       if (groupsize == 0)
2603         groupsize = count + gaps;
2604 
2605       /* This could be UINT_MAX but as we are generating code in a very
2606          inefficient way we have to cap earlier.  See PR78699 for example.  */
2607       if (groupsize > 4096)
2608 	{
2609 	  if (dump_enabled_p ())
2610 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2611 			     "group is too large\n");
2612 	  return false;
2613 	}
2614 
2615       /* Check that the size of the interleaving is equal to count for stores,
2616          i.e., that there are no gaps.  */
2617       if (groupsize != count
2618 	  && !DR_IS_READ (dr))
2619         {
2620 	  if (dump_enabled_p ())
2621 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2622 			     "interleaved store with gaps\n");
2623 	  return false;
2624 	}
2625 
2626       /* If there is a gap after the last load in the group it is the
2627 	 difference between the groupsize and the last accessed
2628 	 element.
2629 	 When there is no gap, this difference should be 0.  */
2630       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2631 
2632       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2633       if (dump_enabled_p ())
2634 	{
2635 	  dump_printf_loc (MSG_NOTE, vect_location,
2636 			   "Detected interleaving ");
2637 	  if (DR_IS_READ (dr))
2638 	    dump_printf (MSG_NOTE, "load ");
2639 	  else
2640 	    dump_printf (MSG_NOTE, "store ");
2641 	  dump_printf (MSG_NOTE, "of size %u starting with ",
2642 		       (unsigned)groupsize);
2643 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2644 	  if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2645 	    dump_printf_loc (MSG_NOTE, vect_location,
2646 			     "There is a gap of %u elements after the group\n",
2647 			     GROUP_GAP (vinfo_for_stmt (stmt)));
2648 	}
2649 
2650       /* SLP: create an SLP data structure for every interleaving group of
2651 	 stores for further analysis in vect_analyze_slp.  */
2652       if (DR_IS_WRITE (dr) && !slp_impossible)
2653         {
2654           if (loop_vinfo)
2655             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2656           if (bb_vinfo)
2657             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2658         }
2659     }
2660 
2661   return true;
2662 }
2663 
2664 /* Analyze groups of accesses: check that DR belongs to a group of
2665    accesses of legal size, step, etc.  Detect gaps, single element
2666    interleaving, and other special cases. Set grouped access info.
2667    Collect groups of strided stores for further use in SLP analysis.  */
2668 
2669 static bool
2670 vect_analyze_group_access (struct data_reference *dr)
2671 {
2672   if (!vect_analyze_group_access_1 (dr))
2673     {
2674       /* Dissolve the group if present.  */
2675       gimple *next;
2676       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2677       while (stmt)
2678 	{
2679 	  stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2680 	  next = GROUP_NEXT_ELEMENT (vinfo);
2681 	  GROUP_FIRST_ELEMENT (vinfo) = NULL;
2682 	  GROUP_NEXT_ELEMENT (vinfo) = NULL;
2683 	  stmt = next;
2684 	}
2685       return false;
2686     }
2687   return true;
2688 }
2689 
2690 /* Analyze the access pattern of the data-reference DR.
2691    In case of non-consecutive accesses call vect_analyze_group_access() to
2692    analyze groups of accesses.  */
2693 
2694 static bool
2695 vect_analyze_data_ref_access (struct data_reference *dr)
2696 {
2697   tree step = DR_STEP (dr);
2698   tree scalar_type = TREE_TYPE (DR_REF (dr));
2699   gimple *stmt = DR_STMT (dr);
2700   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2701   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2702   struct loop *loop = NULL;
2703 
2704   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2705     return true;
2706 
2707   if (loop_vinfo)
2708     loop = LOOP_VINFO_LOOP (loop_vinfo);
2709 
2710   if (loop_vinfo && !step)
2711     {
2712       if (dump_enabled_p ())
2713 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2714 	                 "bad data-ref access in loop\n");
2715       return false;
2716     }
2717 
2718   /* Allow loads with zero step in inner-loop vectorization.  */
2719   if (loop_vinfo && integer_zerop (step))
2720     {
2721       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2722       if (!nested_in_vect_loop_p (loop, stmt))
2723 	return DR_IS_READ (dr);
2724       /* Allow references with zero step for outer loops marked
2725 	 with pragma omp simd only - it guarantees absence of
2726 	 loop-carried dependencies between inner loop iterations.  */
2727       if (loop->safelen < 2)
2728 	{
2729 	  if (dump_enabled_p ())
2730 	    dump_printf_loc (MSG_NOTE, vect_location,
2731 			     "zero step in inner loop of nest\n");
2732 	  return false;
2733 	}
2734     }
2735 
2736   if (loop && nested_in_vect_loop_p (loop, stmt))
2737     {
2738       /* Interleaved accesses are not yet supported within outer-loop
2739         vectorization for references in the inner-loop.  */
2740       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2741 
2742       /* For the rest of the analysis we use the outer-loop step.  */
2743       step = STMT_VINFO_DR_STEP (stmt_info);
2744       if (integer_zerop (step))
2745 	{
2746 	  if (dump_enabled_p ())
2747 	    dump_printf_loc (MSG_NOTE, vect_location,
2748 	                     "zero step in outer loop.\n");
2749 	  return DR_IS_READ (dr);
2750 	}
2751     }
2752 
2753   /* Consecutive?  */
2754   if (TREE_CODE (step) == INTEGER_CST)
2755     {
2756       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2757       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2758 	  || (dr_step < 0
2759 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2760 	{
2761 	  /* Mark that it is not interleaving.  */
2762 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2763 	  return true;
2764 	}
2765     }
2766 
2767   if (loop && nested_in_vect_loop_p (loop, stmt))
2768     {
2769       if (dump_enabled_p ())
2770 	dump_printf_loc (MSG_NOTE, vect_location,
2771 	                 "grouped access in outer loop.\n");
2772       return false;
2773     }
2774 
2775 
2776   /* Assume this is a DR handled by the non-constant strided load case.  */
2777   if (TREE_CODE (step) != INTEGER_CST)
2778     return (STMT_VINFO_STRIDED_P (stmt_info)
2779 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2780 		|| vect_analyze_group_access (dr)));
2781 
2782   /* Not consecutive access - check if it's a part of interleaving group.  */
2783   return vect_analyze_group_access (dr);
2784 }
2785 
2786 /* Compare two data-references DRA and DRB so that they can be sorted into
2787    chunks suitable for forming interleaving groups.  */
2788 
2789 static int
2790 dr_group_sort_cmp (const void *dra_, const void *drb_)
2791 {
2792   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2793   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2794   int cmp;
2795 
2796   /* Stabilize sort.  */
2797   if (dra == drb)
2798     return 0;
2799 
2800   /* DRs in different loops never belong to the same group.  */
2801   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2802   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2803   if (loopa != loopb)
2804     return loopa->num < loopb->num ? -1 : 1;
2805 
2806   /* Ordering of DRs according to base.  */
2807   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2808 			       DR_BASE_ADDRESS (drb));
2809   if (cmp != 0)
2810     return cmp;
2811 
2812   /* And according to DR_OFFSET.  */
2813   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2814   if (cmp != 0)
2815     return cmp;
2816 
2817   /* Put reads before writes.  */
2818   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2819     return DR_IS_READ (dra) ? -1 : 1;
2820 
2821   /* Then sort by access size.  */
2822   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2823 			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2824   if (cmp != 0)
2825     return cmp;
2826 
2827   /* And by step.  */
2828   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2829   if (cmp != 0)
2830     return cmp;
2831 
2832   /* Then sort by DR_INIT.  In case of identical DRs, sort by stmt UID.  */
2833   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2834   if (cmp == 0)
2835     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2836   return cmp;
2837 }
2838 
2839 /* If OP is the result of a conversion, return the unconverted value,
2840    otherwise return null.  */
2841 
2842 static tree
2843 strip_conversion (tree op)
2844 {
2845   if (TREE_CODE (op) != SSA_NAME)
2846     return NULL_TREE;
2847   gimple *stmt = SSA_NAME_DEF_STMT (op);
2848   if (!is_gimple_assign (stmt)
2849       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2850     return NULL_TREE;
2851   return gimple_assign_rhs1 (stmt);
2852 }
2853 
2854 /* Return true if vectorizable_* routines can handle statements STMT1
2855    and STMT2 being in a single group.  */
2856 
2857 static bool
2858 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2859 {
2860   if (gimple_assign_single_p (stmt1))
2861     return gimple_assign_single_p (stmt2);
2862 
2863   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2864     {
2865       /* Check for two masked loads or two masked stores.  */
2866       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2867 	return false;
2868       internal_fn ifn = gimple_call_internal_fn (stmt1);
2869       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2870 	return false;
2871       if (ifn != gimple_call_internal_fn (stmt2))
2872 	return false;
2873 
2874       /* Check that the masks are the same.  Cope with casts of masks,
2875 	 like those created by build_mask_conversion.  */
2876       tree mask1 = gimple_call_arg (stmt1, 2);
2877       tree mask2 = gimple_call_arg (stmt2, 2);
2878       if (!operand_equal_p (mask1, mask2, 0))
2879 	{
2880 	  mask1 = strip_conversion (mask1);
2881 	  if (!mask1)
2882 	    return false;
2883 	  mask2 = strip_conversion (mask2);
2884 	  if (!mask2)
2885 	    return false;
2886 	  if (!operand_equal_p (mask1, mask2, 0))
2887 	    return false;
2888 	}
2889       return true;
2890     }
2891 
2892   return false;
2893 }
2894 
2895 /* Function vect_analyze_data_ref_accesses.
2896 
2897    Analyze the access pattern of all the data references in the loop.
2898 
2899    FORNOW: the only access pattern that is considered vectorizable is a
2900 	   simple step 1 (consecutive) access.
2901 
2902    FORNOW: handle only arrays and pointer accesses.  */
2903 
2904 bool
2905 vect_analyze_data_ref_accesses (vec_info *vinfo)
2906 {
2907   unsigned int i;
2908   vec<data_reference_p> datarefs = vinfo->datarefs;
2909   struct data_reference *dr;
2910 
2911   if (dump_enabled_p ())
2912     dump_printf_loc (MSG_NOTE, vect_location,
2913                      "=== vect_analyze_data_ref_accesses ===\n");
2914 
2915   if (datarefs.is_empty ())
2916     return true;
2917 
2918   /* Sort the array of datarefs to make building the interleaving chains
2919      linear.  Don't modify the original vector's order, it is needed for
2920      determining what dependencies are reversed.  */
2921   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2922   datarefs_copy.qsort (dr_group_sort_cmp);
2923 
2924   /* Build the interleaving chains.  */
2925   for (i = 0; i < datarefs_copy.length () - 1;)
2926     {
2927       data_reference_p dra = datarefs_copy[i];
2928       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2929       stmt_vec_info lastinfo = NULL;
2930       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2931 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2932 	{
2933 	  ++i;
2934 	  continue;
2935 	}
2936       for (i = i + 1; i < datarefs_copy.length (); ++i)
2937 	{
2938 	  data_reference_p drb = datarefs_copy[i];
2939 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2940 	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2941 	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2942 	    break;
2943 
2944 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2945 	     accesses, same accesses) can lead to a group being artificially
2946 	     split here, as we don't just skip over those.  If it really
2947 	     matters we can push those to a worklist and re-iterate
2948 	     over them.  Then we could just skip ahead to the next DR here.  */
2949 
2950 	  /* DRs in a different loop should not be put into the same
2951 	     interleaving group.  */
2952 	  if (gimple_bb (DR_STMT (dra))->loop_father
2953 	      != gimple_bb (DR_STMT (drb))->loop_father)
2954 	    break;
2955 
2956 	  /* Check that the data-refs have same first location (except init)
2957 	     and they are both either store or load (not load and store,
2958 	     not masked loads or stores).  */
2959 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2960 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2961 					DR_BASE_ADDRESS (drb)) != 0
2962 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2963 	      || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2964 	    break;
2965 
2966 	  /* Check that the data-refs have the same constant size.  */
2967 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2968 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2969 	  if (!tree_fits_uhwi_p (sza)
2970 	      || !tree_fits_uhwi_p (szb)
2971 	      || !tree_int_cst_equal (sza, szb))
2972 	    break;
2973 
2974 	  /* Check that the data-refs have the same step.  */
2975 	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2976 	    break;
2977 
2978 	  /* Check the types are compatible.
2979 	     ???  We don't distinguish this during sorting.  */
2980 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2981 				   TREE_TYPE (DR_REF (drb))))
2982 	    break;
2983 
2984 	  /* Check that the DR_INITs are compile-time constants.  */
2985 	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2986 	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2987 	    break;
2988 
2989 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2990 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2991 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2992 	  HOST_WIDE_INT init_prev
2993 	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2994 	  gcc_assert (init_a <= init_b
2995 		      && init_a <= init_prev
2996 		      && init_prev <= init_b);
2997 
2998 	  /* Do not place the same access in the interleaving chain twice.  */
2999 	  if (init_b == init_prev)
3000 	    {
3001 	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3002 			  < gimple_uid (DR_STMT (drb)));
	      /* ???  For now we simply "drop" the later reference, which is
	         otherwise the same, rather than finishing off this group.
		 In the end we'd want to re-process duplicates forming
		 multiple groups from the refs, likely by just collecting
		 all candidates (including duplicates and split points
		 below) in a vector and then processing them together.  */
3009 	      continue;
3010 	    }
3011 
3012 	  /* If init_b == init_a + the size of the type * k, we have an
3013 	     interleaving, and DRA is accessed before DRB.  */
3014 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3015 	  if (type_size_a == 0
3016 	      || (init_b - init_a) % type_size_a != 0)
3017 	    break;
3018 
	  /* If we have a store, require that the accesses be adjacent.
	     This splits groups into chunks we support (we don't support
	     vectorization of stores with gaps).  */
3022 	  if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3023 	    break;
3024 
	  /* Require that the step (when constant and nonzero) be greater
	     than the difference between the data-refs' inits; splitting the
	     group when this does not hold keeps group sizes suitable.  */
3028 	  if (tree_fits_shwi_p (DR_STEP (dra)))
3029 	    {
3030 	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3031 	      if (step != 0 && step <= (init_b - init_a))
3032 		break;
3033 	    }
3034 
3035 	  if (dump_enabled_p ())
3036 	    {
3037 	      dump_printf_loc (MSG_NOTE, vect_location,
3038 			       "Detected interleaving ");
3039 	      if (DR_IS_READ (dra))
3040 		dump_printf (MSG_NOTE, "load ");
3041 	      else
3042 		dump_printf (MSG_NOTE, "store ");
3043 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3044 	      dump_printf (MSG_NOTE,  " and ");
3045 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3046 	      dump_printf (MSG_NOTE, "\n");
3047 	    }
3048 
3049 	  /* Link the found element into the group list.  */
3050 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3051 	    {
3052 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3053 	      lastinfo = stmtinfo_a;
3054 	    }
3055 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3056 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3057 	  lastinfo = stmtinfo_b;
3058 	}
3059     }
3060 
3061   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3062     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3063         && !vect_analyze_data_ref_access (dr))
3064       {
3065 	if (dump_enabled_p ())
3066 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3067 	                   "not vectorized: complicated access pattern.\n");
3068 
3069         if (is_a <bb_vec_info> (vinfo))
3070           {
3071             /* Mark the statement as not vectorizable.  */
3072             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3073             continue;
3074           }
3075         else
3076 	  {
3077 	    datarefs_copy.release ();
3078 	    return false;
3079 	  }
3080       }
3081 
3082   datarefs_copy.release ();
3083   return true;
3084 }
3085 
3086 /* Function vect_vfa_segment_size.
3087 
3088    Input:
3089      DR: The data reference.
3090      LENGTH_FACTOR: segment length to consider.
3091 
3092    Return a value suitable for the dr_with_seg_len::seg_len field.
3093    This is the "distance travelled" by the pointer from the first
3094    iteration in the segment to the last.  Note that it does not include
3095    the size of the access; in effect it only describes the first byte.  */
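
/* For instance (an illustrative sketch only): with DR_STEP equal to 4 and
   LENGTH_FACTOR equal to 8, the segment length is 4 * (8 - 1) == 28 bytes;
   the pointer moves 28 bytes between the first and the last of the 8
   iterations, and the bytes actually touched by each access are accounted
   for separately by vect_vfa_access_size.  */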
3096 
3097 static tree
3098 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3099 {
3100   length_factor = size_binop (MINUS_EXPR,
3101 			      fold_convert (sizetype, length_factor),
3102 			      size_one_node);
3103   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3104 		     length_factor);
3105 }
3106 
3107 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3108    gives the worst-case number of bytes covered by the segment.  */
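
/* For example (an illustrative sketch only): a 4-byte scalar access that
   heads a group with GROUP_SIZE 4 and GROUP_GAP 1 gives
   4 * (4 - 1) == 12 bytes; if the access is also realigned via
   dr_explicit_realign_optimized with a 16-byte vector type, a further
   16 - 4 == 12 bytes are added, giving 24.  */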
3109 
3110 static unsigned HOST_WIDE_INT
3111 vect_vfa_access_size (data_reference *dr)
3112 {
3113   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3114   tree ref_type = TREE_TYPE (DR_REF (dr));
3115   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3116   unsigned HOST_WIDE_INT access_size = ref_size;
3117   if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3118     {
3119       gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3120       access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3121     }
3122   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3123       && (vect_supportable_dr_alignment (dr, false)
3124 	  == dr_explicit_realign_optimized))
3125     {
3126       /* We might access a full vector's worth.  */
3127       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3128       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3129     }
3130   return access_size;
3131 }
3132 
3133 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3134 
3135 static unsigned int
3136 vect_vfa_align (const data_reference *dr)
3137 {
3138   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3139 }
3140 
3141 /* Function vect_no_alias_p.
3142 
3143    Given data references A and B with equal base and offset, see whether
3144    the alias relation can be decided at compilation time.  Return 1 if
3145    it can and the references alias, 0 if it can and the references do
3146    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3147    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3148    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
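
/* As an illustration (not taken from the sources): with forward steps,
   DR_INIT (A) == 0, SEGMENT_LENGTH_A == 28 and ACCESS_SIZE_A == 4 give the
   byte range [0, 32), while DR_INIT (B) == 48 with the same lengths gives
   [48, 80); the ranges cannot overlap, so the result is 0.  If instead
   DR_INIT (B) == 16, the ranges are known to overlap and the result
   is 1.  */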
3149 
3150 static int
3151 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3152 			 tree segment_length_a, tree segment_length_b,
3153 			 unsigned HOST_WIDE_INT access_size_a,
3154 			 unsigned HOST_WIDE_INT access_size_b)
3155 {
3156   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3157   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3158   poly_uint64 const_length_a;
3159   poly_uint64 const_length_b;
3160 
3161   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3162      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3163      [a, a+12) */
3164   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3165     {
3166       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3167       offset_a = (offset_a + access_size_a) - const_length_a;
3168     }
3169   else
3170     const_length_a = tree_to_poly_uint64 (segment_length_a);
3171   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3172     {
3173       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3174       offset_b = (offset_b + access_size_b) - const_length_b;
3175     }
3176   else
3177     const_length_b = tree_to_poly_uint64 (segment_length_b);
3178 
3179   const_length_a += access_size_a;
3180   const_length_b += access_size_b;
3181 
3182   if (ranges_known_overlap_p (offset_a, const_length_a,
3183 			      offset_b, const_length_b))
3184     return 1;
3185 
3186   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3187 			       offset_b, const_length_b))
3188     return 0;
3189 
3190   return -1;
3191 }
3192 
3193 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3194    in DDR is >= VF.  */
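
/* For example (illustrative only): if the only distance vector for DDR has
   distance 8 at LOOP_DEPTH and the chosen VF is 4, every nonzero distance
   is >= VF and the function returns true; a distance of 2 would instead
   make it return false (unless that positive distance is for a reversed
   dependence).  */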
3195 
3196 static bool
3197 dependence_distance_ge_vf (data_dependence_relation *ddr,
3198 			   unsigned int loop_depth, poly_uint64 vf)
3199 {
3200   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3201       || DDR_NUM_DIST_VECTS (ddr) == 0)
3202     return false;
3203 
3204   /* If the dependence is exact, we should have limited the VF instead.  */
3205   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3206 
3207   unsigned int i;
3208   lambda_vector dist_v;
3209   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3210     {
3211       HOST_WIDE_INT dist = dist_v[loop_depth];
3212       if (dist != 0
3213 	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3214 	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3215 	return false;
3216     }
3217 
3218   if (dump_enabled_p ())
3219     {
3220       dump_printf_loc (MSG_NOTE, vect_location,
3221 		       "dependence distance between ");
3222       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3223       dump_printf (MSG_NOTE,  " and ");
3224       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3225       dump_printf (MSG_NOTE,  " is >= VF\n");
3226     }
3227 
3228   return true;
3229 }
3230 
3231 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3232 
3233 static void
3234 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3235 {
3236   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3237   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3238   dump_printf (dump_kind, ") >= ");
3239   dump_dec (dump_kind, lower_bound.min_value);
3240 }
3241 
3242 /* Record that the vectorized loop requires the vec_lower_bound described
3243    by EXPR, UNSIGNED_P and MIN_VALUE.  */
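
/* For instance (illustrative only): if a bound of 4 on unsigned EXPR has
   already been recorded and a new request arrives for the same EXPR with
   UNSIGNED_P false and MIN_VALUE 8, the existing entry is updated to the
   weaker "abs" form with the larger bound, i.e. abs (EXPR) >= 8.  */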
3244 
3245 static void
3246 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3247 			poly_uint64 min_value)
3248 {
3249   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3250   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3251     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3252       {
3253 	unsigned_p &= lower_bounds[i].unsigned_p;
3254 	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3255 	if (lower_bounds[i].unsigned_p != unsigned_p
3256 	    || maybe_lt (lower_bounds[i].min_value, min_value))
3257 	  {
3258 	    lower_bounds[i].unsigned_p = unsigned_p;
3259 	    lower_bounds[i].min_value = min_value;
3260 	    if (dump_enabled_p ())
3261 	      {
3262 		dump_printf_loc (MSG_NOTE, vect_location,
3263 				 "updating run-time check to ");
3264 		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3265 		dump_printf (MSG_NOTE, "\n");
3266 	      }
3267 	  }
3268 	return;
3269       }
3270 
3271   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3272   if (dump_enabled_p ())
3273     {
3274       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3275       dump_lower_bound (MSG_NOTE, lower_bound);
3276       dump_printf (MSG_NOTE, "\n");
3277     }
3278   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3279 }
3280 
3281 /* Return true if it's unlikely that the step of the vectorized form of DR
3282    will span fewer than GAP bytes.  */
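
/* For example (illustrative only): with an estimated VF of 4, no group and
   a 4-byte scalar access, the vector step is expected to cover about
   4 * 4 == 16 bytes, so a GAP of 12 bytes is considered small while a GAP
   of 64 bytes is not.  */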
3283 
3284 static bool
3285 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3286 {
3287   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3288   HOST_WIDE_INT count
3289     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3290   if (GROUP_FIRST_ELEMENT (stmt_info))
3291     count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3292   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3293 }
3294 
3295 /* Return true if we know that there is no alias between DR_A and DR_B
3296    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3297    *LOWER_BOUND_OUT to this N.  */
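
/* For instance (an illustrative sketch, with 4-byte ints): for accesses
   a[i * s] and a[i * s + 2] the bases, offsets and steps match and the
   inits are 0 and 8, so *LOWER_BOUND_OUT becomes 8 + 4 - 0 == 12: the two
   references cannot alias whenever abs (DR_STEP) >= 12, i.e. whenever
   abs (s) >= 3.  */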
3298 
3299 static bool
3300 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3301 				poly_uint64 *lower_bound_out)
3302 {
3303   /* Check that there is a constant gap of known sign between DR_A
3304      and DR_B.  */
3305   poly_int64 init_a, init_b;
3306   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3307       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3308       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3309       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3310       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3311       || !ordered_p (init_a, init_b))
3312     return false;
3313 
3314   /* Sort DR_A and DR_B by the address they access.  */
3315   if (maybe_lt (init_b, init_a))
3316     {
3317       std::swap (init_a, init_b);
3318       std::swap (dr_a, dr_b);
3319     }
3320 
3321   /* If the two accesses could be dependent within a scalar iteration,
3322      make sure that we'd retain their order.  */
3323   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3324       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3325     return false;
3326 
3327   /* There is no alias if abs (DR_STEP) is greater than or equal to
3328      the bytes spanned by the combination of the two accesses.  */
3329   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3330   return true;
3331 }
3332 
3333 /* Function vect_prune_runtime_alias_test_list.
3334 
3335    Prune a list of ddrs to be tested at run-time by versioning for alias.
3336    Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3339 
3340 bool
3341 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3342 {
3343   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3344   hash_set <tree_pair_hash> compared_objects;
3345 
3346   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3347   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3348     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3349   vec<vec_object_pair> &check_unequal_addrs
3350     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3351   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3352   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3353 
3354   ddr_p ddr;
3355   unsigned int i;
3356   tree length_factor;
3357 
3358   if (dump_enabled_p ())
3359     dump_printf_loc (MSG_NOTE, vect_location,
3360                      "=== vect_prune_runtime_alias_test_list ===\n");
3361 
3362   /* Step values are irrelevant for aliasing if the number of vector
3363      iterations is equal to the number of scalar iterations (which can
3364      happen for fully-SLP loops).  */
3365   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3366 
3367   if (!ignore_step_p)
3368     {
3369       /* Convert the checks for nonzero steps into bound tests.  */
3370       tree value;
3371       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3372 	vect_check_lower_bound (loop_vinfo, value, true, 1);
3373     }
3374 
3375   if (may_alias_ddrs.is_empty ())
3376     return true;
3377 
3378   comp_alias_ddrs.create (may_alias_ddrs.length ());
3379 
3380   unsigned int loop_depth
3381     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3382 			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3383 
3384   /* First, we collect all data ref pairs for aliasing checks.  */
3385   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3386     {
3387       int comp_res;
3388       poly_uint64 lower_bound;
3389       struct data_reference *dr_a, *dr_b;
3390       gimple *dr_group_first_a, *dr_group_first_b;
3391       tree segment_length_a, segment_length_b;
3392       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3393       unsigned int align_a, align_b;
3394       gimple *stmt_a, *stmt_b;
3395 
3396       /* Ignore the alias if the VF we chose ended up being no greater
3397 	 than the dependence distance.  */
3398       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3399 	continue;
3400 
3401       if (DDR_OBJECT_A (ddr))
3402 	{
3403 	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3404 	  if (!compared_objects.add (new_pair))
3405 	    {
3406 	      if (dump_enabled_p ())
3407 		{
3408 		  dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3409 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3410 		  dump_printf (MSG_NOTE, " and ");
3411 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3412 		  dump_printf (MSG_NOTE, " have different addresses\n");
3413 		}
3414 	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3415 	    }
3416 	  continue;
3417 	}
3418 
3419       dr_a = DDR_A (ddr);
3420       stmt_a = DR_STMT (DDR_A (ddr));
3421 
3422       dr_b = DDR_B (ddr);
3423       stmt_b = DR_STMT (DDR_B (ddr));
3424 
3425       /* Skip the pair if inter-iteration dependencies are irrelevant
3426 	 and intra-iteration dependencies are guaranteed to be honored.  */
3427       if (ignore_step_p
3428 	  && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3429 	      || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3430 	{
3431 	  if (dump_enabled_p ())
3432 	    {
3433 	      dump_printf_loc (MSG_NOTE, vect_location,
3434 			       "no need for alias check between ");
3435 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3436 	      dump_printf (MSG_NOTE, " and ");
3437 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3438 	      dump_printf (MSG_NOTE, " when VF is 1\n");
3439 	    }
3440 	  continue;
3441 	}
3442 
3443       /* See whether we can handle the alias using a bounds check on
3444 	 the step, and whether that's likely to be the best approach.
3445 	 (It might not be, for example, if the minimum step is much larger
3446 	 than the number of bytes handled by one vector iteration.)  */
3447       if (!ignore_step_p
3448 	  && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3449 	  && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3450 	  && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3451 	      || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3452 	{
3453 	  bool unsigned_p = dr_known_forward_stride_p (dr_a);
3454 	  if (dump_enabled_p ())
3455 	    {
3456 	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3457 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3458 	      dump_printf (MSG_NOTE, " and ");
3459 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3460 	      dump_printf (MSG_NOTE, " when the step ");
3461 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3462 	      dump_printf (MSG_NOTE, " is outside ");
3463 	      if (unsigned_p)
3464 		dump_printf (MSG_NOTE, "[0");
3465 	      else
3466 		{
3467 		  dump_printf (MSG_NOTE, "(");
3468 		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3469 		}
3470 	      dump_printf (MSG_NOTE, ", ");
3471 	      dump_dec (MSG_NOTE, lower_bound);
3472 	      dump_printf (MSG_NOTE, ")\n");
3473 	    }
3474 	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3475 				  lower_bound);
3476 	  continue;
3477 	}
3478 
3479       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3480       if (dr_group_first_a)
3481 	{
3482 	  stmt_a = dr_group_first_a;
3483 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3484 	}
3485 
3486       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3487       if (dr_group_first_b)
3488 	{
3489 	  stmt_b = dr_group_first_b;
3490 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3491 	}
3492 
3493       if (ignore_step_p)
3494 	{
3495 	  segment_length_a = size_zero_node;
3496 	  segment_length_b = size_zero_node;
3497 	}
3498       else
3499 	{
3500 	  if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3501 	    length_factor = scalar_loop_iters;
3502 	  else
3503 	    length_factor = size_int (vect_factor);
3504 	  segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3505 	  segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3506 	}
3507       access_size_a = vect_vfa_access_size (dr_a);
3508       access_size_b = vect_vfa_access_size (dr_b);
3509       align_a = vect_vfa_align (dr_a);
3510       align_b = vect_vfa_align (dr_b);
3511 
3512       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3513 					DR_BASE_ADDRESS (dr_b));
3514       if (comp_res == 0)
3515 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3516 					  DR_OFFSET (dr_b));
3517 
3518       /* See whether the alias is known at compilation time.  */
3519       if (comp_res == 0
3520 	  && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3521 	  && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3522 	  && poly_int_tree_p (segment_length_a)
3523 	  && poly_int_tree_p (segment_length_b))
3524 	{
3525 	  int res = vect_compile_time_alias (dr_a, dr_b,
3526 					     segment_length_a,
3527 					     segment_length_b,
3528 					     access_size_a,
3529 					     access_size_b);
3530 	  if (res >= 0 && dump_enabled_p ())
3531 	    {
3532 	      dump_printf_loc (MSG_NOTE, vect_location,
3533 			       "can tell at compile time that ");
3534 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3535 	      dump_printf (MSG_NOTE, " and ");
3536 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3537 	      if (res == 0)
3538 		dump_printf (MSG_NOTE, " do not alias\n");
3539 	      else
3540 		dump_printf (MSG_NOTE, " alias\n");
3541 	    }
3542 
3543 	  if (res == 0)
3544 	    continue;
3545 
3546 	  if (res == 1)
3547 	    {
3548 	      if (dump_enabled_p ())
3549 		dump_printf_loc (MSG_NOTE, vect_location,
3550 				 "not vectorized: compilation time alias.\n");
3551 	      return false;
3552 	    }
3553 	}
3554 
3555       dr_with_seg_len_pair_t dr_with_seg_len_pair
3556 	(dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3557 	 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3558 
3559       /* Canonicalize pairs by sorting the two DR members.  */
3560       if (comp_res > 0)
3561 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3562 
3563       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3564     }
3565 
3566   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3567 
3568   unsigned int count = (comp_alias_ddrs.length ()
3569 			+ check_unequal_addrs.length ());
3570 
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "improved number of alias checks from %d to %d\n",
		     may_alias_ddrs.length (), count);
3574   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3575     {
3576       if (dump_enabled_p ())
3577 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3578 			 "number of versioning for alias "
3579 			 "run-time tests exceeds %d "
3580 			 "(--param vect-max-version-for-alias-checks)\n",
3581 			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3582       return false;
3583     }
3584 
3585   return true;
3586 }
3587 
3588 /* Check whether we can use an internal function for a gather load
3589    or scatter store.  READ_P is true for loads and false for stores.
   MASKED_P is true if the load or store is conditional.  VECTYPE is the
   vector type of the data being loaded or stored.  MEMORY_TYPE is
   the type of the memory elements being loaded or stored.  OFFSET_BITS
3592    is the number of bits in each scalar offset and OFFSET_SIGN is the
3593    sign of the offset.  SCALE is the amount by which the offset should
3594    be multiplied *after* it has been converted to address width.
3595 
3596    Return true if the function is supported, storing the function
3597    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
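
/* For example (illustrative only; whether the test succeeds depends on the
   target): a masked gather of a vector of 32-bit floats from 32-bit float
   memory elements with 32-bit signed offsets and SCALE 4 asks for
   IFN_MASK_GATHER_LOAD; the offset width matches the element width, so the
   result is true iff internal_gather_scatter_fn_supported_p reports that
   combination as supported.  */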
3598 
3599 bool
3600 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3601 			  tree memory_type, unsigned int offset_bits,
3602 			  signop offset_sign, int scale,
3603 			  internal_fn *ifn_out, tree *element_type_out)
3604 {
3605   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3606   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3607   if (offset_bits > element_bits)
3608     /* Internal functions require the offset to be the same width as
3609        the vector elements.  We can extend narrower offsets, but it isn't
3610        safe to truncate wider offsets.  */
3611     return false;
3612 
3613   if (element_bits != memory_bits)
3614     /* For now the vector elements must be the same width as the
3615        memory elements.  */
3616     return false;
3617 
3618   /* Work out which function we need.  */
3619   internal_fn ifn;
3620   if (read_p)
3621     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3622   else
3623     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3624 
3625   /* Test whether the target supports this combination.  */
3626   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3627 					       offset_sign, scale))
3628     return false;
3629 
3630   *ifn_out = ifn;
3631   *element_type_out = TREE_TYPE (vectype);
3632   return true;
3633 }
3634 
3635 /* CALL is a call to an internal gather load or scatter store function.
3636    Describe the operation in INFO.  */
3637 
3638 static void
3639 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3640 {
3641   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3642   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3643   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3644 
3645   info->ifn = gimple_call_internal_fn (call);
3646   info->decl = NULL_TREE;
3647   info->base = gimple_call_arg (call, 0);
3648   info->offset = gimple_call_arg (call, 1);
3649   info->offset_dt = vect_unknown_def_type;
3650   info->offset_vectype = NULL_TREE;
3651   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3652   info->element_type = TREE_TYPE (vectype);
3653   info->memory_type = TREE_TYPE (DR_REF (dr));
3654 }
3655 
3656 /* Return true if a non-affine read or write in STMT is suitable for a
3657    gather load or scatter store.  Describe the operation in *INFO if so.  */
3658 
3659 bool
3660 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3661 			   gather_scatter_info *info)
3662 {
3663   HOST_WIDE_INT scale = 1;
3664   poly_int64 pbitpos, pbitsize;
3665   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3666   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3667   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3668   tree offtype = NULL_TREE;
3669   tree decl = NULL_TREE, base, off;
3670   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3671   tree memory_type = TREE_TYPE (DR_REF (dr));
3672   machine_mode pmode;
3673   int punsignedp, reversep, pvolatilep = 0;
3674   internal_fn ifn;
3675   tree element_type;
3676   bool masked_p = false;
3677 
3678   /* See whether this is already a call to a gather/scatter internal function.
3679      If not, see whether it's a masked load or store.  */
3680   gcall *call = dyn_cast <gcall *> (stmt);
3681   if (call && gimple_call_internal_p (call))
3682     {
3683       ifn = gimple_call_internal_fn (stmt);
3684       if (internal_gather_scatter_fn_p (ifn))
3685 	{
3686 	  vect_describe_gather_scatter_call (call, info);
3687 	  return true;
3688 	}
3689       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3690     }
3691 
3692   /* True if we should aim to use internal functions rather than
3693      built-in functions.  */
3694   bool use_ifn_p = (DR_IS_READ (dr)
3695 		    ? supports_vec_gather_load_p ()
3696 		    : supports_vec_scatter_store_p ());
3697 
3698   base = DR_REF (dr);
  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
     see if we can use the def stmt of the address.  */
3701   if (masked_p
3702       && TREE_CODE (base) == MEM_REF
3703       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3704       && integer_zerop (TREE_OPERAND (base, 1))
3705       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3706     {
3707       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3708       if (is_gimple_assign (def_stmt)
3709 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3710 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3711     }
3712 
3713   /* The gather and scatter builtins need address of the form
3714      loop_invariant + vector * {1, 2, 4, 8}
3715      or
3716      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3717      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3718      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3719      multiplications and additions in it.  To get a vector, we need
3720      a single SSA_NAME that will be defined in the loop and will
3721      contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
3723      SSA_NAME OFF and put the loop invariants into a tree BASE
3724      that can be gimplified before the loop.  */
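  /* For instance (a rough illustration): for a gather like a[b[i]] with int
     elements, the code below typically ends up with BASE being the
     loop-invariant &a[0] (plus any constant byte offset), OFF being the
     SSA_NAME that holds the (converted) value of b[i], and SCALE being 4,
     peeled off the multiplication by the element size.  */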
3725   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3726 			      &punsignedp, &reversep, &pvolatilep);
3727   gcc_assert (base && !reversep);
3728   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3729 
3730   if (TREE_CODE (base) == MEM_REF)
3731     {
3732       if (!integer_zerop (TREE_OPERAND (base, 1)))
3733 	{
3734 	  if (off == NULL_TREE)
3735 	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3736 	  else
3737 	    off = size_binop (PLUS_EXPR, off,
3738 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3739 	}
3740       base = TREE_OPERAND (base, 0);
3741     }
3742   else
3743     base = build_fold_addr_expr (base);
3744 
3745   if (off == NULL_TREE)
3746     off = size_zero_node;
3747 
  /* If BASE is not loop invariant, then if OFF is 0 we start with just the
     constant offset in the loop-invariant BASE and continue with BASE as
     OFF; otherwise give up.
     We could handle that case by gimplifying the addition of BASE + OFF
     into some SSA_NAME and using that as OFF, but for now punt.  */
3753   if (!expr_invariant_in_loop_p (loop, base))
3754     {
3755       if (!integer_zerop (off))
3756 	return false;
3757       off = base;
3758       base = size_int (pbytepos);
3759     }
3760   /* Otherwise put base + constant offset into the loop invariant BASE
3761      and continue with OFF.  */
3762   else
3763     {
3764       base = fold_convert (sizetype, base);
3765       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3766     }
3767 
  /* OFF at this point may be either an SSA_NAME or some tree expression
3769      from get_inner_reference.  Try to peel off loop invariants from it
3770      into BASE as long as possible.  */
3771   STRIP_NOPS (off);
3772   while (offtype == NULL_TREE)
3773     {
3774       enum tree_code code;
3775       tree op0, op1, add = NULL_TREE;
3776 
3777       if (TREE_CODE (off) == SSA_NAME)
3778 	{
3779 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3780 
3781 	  if (expr_invariant_in_loop_p (loop, off))
3782 	    return false;
3783 
3784 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3785 	    break;
3786 
3787 	  op0 = gimple_assign_rhs1 (def_stmt);
3788 	  code = gimple_assign_rhs_code (def_stmt);
3789 	  op1 = gimple_assign_rhs2 (def_stmt);
3790 	}
3791       else
3792 	{
3793 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3794 	    return false;
3795 	  code = TREE_CODE (off);
3796 	  extract_ops_from_tree (off, &code, &op0, &op1);
3797 	}
3798       switch (code)
3799 	{
3800 	case POINTER_PLUS_EXPR:
3801 	case PLUS_EXPR:
3802 	  if (expr_invariant_in_loop_p (loop, op0))
3803 	    {
3804 	      add = op0;
3805 	      off = op1;
3806 	    do_add:
3807 	      add = fold_convert (sizetype, add);
3808 	      if (scale != 1)
3809 		add = size_binop (MULT_EXPR, add, size_int (scale));
3810 	      base = size_binop (PLUS_EXPR, base, add);
3811 	      continue;
3812 	    }
3813 	  if (expr_invariant_in_loop_p (loop, op1))
3814 	    {
3815 	      add = op1;
3816 	      off = op0;
3817 	      goto do_add;
3818 	    }
3819 	  break;
3820 	case MINUS_EXPR:
3821 	  if (expr_invariant_in_loop_p (loop, op1))
3822 	    {
3823 	      add = fold_convert (sizetype, op1);
3824 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3825 	      off = op0;
3826 	      goto do_add;
3827 	    }
3828 	  break;
3829 	case MULT_EXPR:
3830 	  if (scale == 1 && tree_fits_shwi_p (op1))
3831 	    {
3832 	      int new_scale = tree_to_shwi (op1);
3833 	      /* Only treat this as a scaling operation if the target
3834 		 supports it.  */
3835 	      if (use_ifn_p
3836 		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3837 						vectype, memory_type, 1,
3838 						TYPE_SIGN (TREE_TYPE (op0)),
3839 						new_scale, &ifn,
3840 						&element_type))
3841 		break;
3842 	      scale = new_scale;
3843 	      off = op0;
3844 	      continue;
3845 	    }
3846 	  break;
3847 	case SSA_NAME:
3848 	  off = op0;
3849 	  continue;
3850 	CASE_CONVERT:
3851 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3852 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3853 	    break;
3854 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3855 	      == TYPE_PRECISION (TREE_TYPE (off)))
3856 	    {
3857 	      off = op0;
3858 	      continue;
3859 	    }
3860 
3861 	  /* The internal functions need the offset to be the same width
3862 	     as the elements of VECTYPE.  Don't include operations that
3863 	     cast the offset from that width to a different width.  */
3864 	  if (use_ifn_p
3865 	      && (int_size_in_bytes (TREE_TYPE (vectype))
3866 		  == int_size_in_bytes (TREE_TYPE (off))))
3867 	    break;
3868 
3869 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3870 	      < TYPE_PRECISION (TREE_TYPE (off)))
3871 	    {
3872 	      off = op0;
3873 	      offtype = TREE_TYPE (off);
3874 	      STRIP_NOPS (off);
3875 	      continue;
3876 	    }
3877 	  break;
3878 	default:
3879 	  break;
3880 	}
3881       break;
3882     }
3883 
  /* If at the end OFF still isn't an SSA_NAME or isn't
     defined in the loop, punt.  */
3886   if (TREE_CODE (off) != SSA_NAME
3887       || expr_invariant_in_loop_p (loop, off))
3888     return false;
3889 
3890   if (offtype == NULL_TREE)
3891     offtype = TREE_TYPE (off);
3892 
3893   if (use_ifn_p)
3894     {
3895       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3896 				     memory_type, TYPE_PRECISION (offtype),
3897 				     TYPE_SIGN (offtype), scale, &ifn,
3898 				     &element_type))
3899 	return false;
3900     }
3901   else
3902     {
3903       if (DR_IS_READ (dr))
3904 	{
3905 	  if (targetm.vectorize.builtin_gather)
3906 	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3907 	}
3908       else
3909 	{
3910 	  if (targetm.vectorize.builtin_scatter)
3911 	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3912 	}
3913 
3914       if (!decl)
3915 	return false;
3916 
3917       ifn = IFN_LAST;
3918       element_type = TREE_TYPE (vectype);
3919     }
3920 
3921   info->ifn = ifn;
3922   info->decl = decl;
3923   info->base = base;
3924   info->offset = off;
3925   info->offset_dt = vect_unknown_def_type;
3926   info->offset_vectype = NULL_TREE;
3927   info->scale = scale;
3928   info->element_type = element_type;
3929   info->memory_type = memory_type;
3930   return true;
3931 }
3932 
/* Function vect_analyze_data_refs.

   Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.  */
3947 
3948 bool
3949 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3950 {
3951   struct loop *loop = NULL;
3952   unsigned int i;
3953   struct data_reference *dr;
3954   tree scalar_type;
3955 
3956   if (dump_enabled_p ())
3957     dump_printf_loc (MSG_NOTE, vect_location,
3958 		     "=== vect_analyze_data_refs ===\n");
3959 
3960   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3961     loop = LOOP_VINFO_LOOP (loop_vinfo);
3962 
  /* Go through the data-refs and check that the analysis succeeded.  Update
     the pointer from the stmt_vec_info struct to DR and vectype.  */
3965 
3966   vec<data_reference_p> datarefs = vinfo->datarefs;
3967   FOR_EACH_VEC_ELT (datarefs, i, dr)
3968     {
3969       gimple *stmt;
3970       stmt_vec_info stmt_info;
3971       tree base, offset, init;
3972       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3973       bool simd_lane_access = false;
3974       poly_uint64 vf;
3975 
3976 again:
3977       if (!dr || !DR_REF (dr))
3978         {
3979           if (dump_enabled_p ())
3980 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3981 	                     "not vectorized: unhandled data-ref\n");
3982           return false;
3983         }
3984 
3985       stmt = DR_STMT (dr);
3986       stmt_info = vinfo_for_stmt (stmt);
3987 
3988       /* Discard clobbers from the dataref vector.  We will remove
3989          clobber stmts during vectorization.  */
3990       if (gimple_clobber_p (stmt))
3991 	{
3992 	  free_data_ref (dr);
3993 	  if (i == datarefs.length () - 1)
3994 	    {
3995 	      datarefs.pop ();
3996 	      break;
3997 	    }
3998 	  datarefs.ordered_remove (i);
3999 	  dr = datarefs[i];
4000 	  goto again;
4001 	}
4002 
4003       /* Check that analysis of the data-ref succeeded.  */
4004       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4005 	  || !DR_STEP (dr))
4006         {
4007 	  bool maybe_gather
4008 	    = DR_IS_READ (dr)
4009 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4010 	      && (targetm.vectorize.builtin_gather != NULL
4011 		  || supports_vec_gather_load_p ());
4012 	  bool maybe_scatter
4013 	    = DR_IS_WRITE (dr)
4014 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4015 	      && (targetm.vectorize.builtin_scatter != NULL
4016 		  || supports_vec_scatter_store_p ());
4017 	  bool maybe_simd_lane_access
4018 	    = is_a <loop_vec_info> (vinfo) && loop->simduid;
4019 
	  /* If the target supports vector gather loads or scatter stores,
	     or if this might be a SIMD lane access, see whether they can
	     be used.  */
4022 	  if (is_a <loop_vec_info> (vinfo)
4023 	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4024 	      && !nested_in_vect_loop_p (loop, stmt))
4025 	    {
4026 	      struct data_reference *newdr
4027 		= create_data_ref (NULL, loop_containing_stmt (stmt),
4028 				   DR_REF (dr), stmt, !maybe_scatter,
4029 				   DR_IS_CONDITIONAL_IN_STMT (dr));
4030 	      gcc_assert (newdr != NULL && DR_REF (newdr));
4031 	      if (DR_BASE_ADDRESS (newdr)
4032 		  && DR_OFFSET (newdr)
4033 		  && DR_INIT (newdr)
4034 		  && DR_STEP (newdr)
4035 		  && integer_zerop (DR_STEP (newdr)))
4036 		{
4037 		  if (maybe_simd_lane_access)
4038 		    {
4039 		      tree off = DR_OFFSET (newdr);
4040 		      STRIP_NOPS (off);
4041 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4042 			  && TREE_CODE (off) == MULT_EXPR
4043 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4044 			{
4045 			  tree step = TREE_OPERAND (off, 1);
4046 			  off = TREE_OPERAND (off, 0);
4047 			  STRIP_NOPS (off);
4048 			  if (CONVERT_EXPR_P (off)
4049 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4050 									  0)))
4051 				 < TYPE_PRECISION (TREE_TYPE (off)))
4052 			    off = TREE_OPERAND (off, 0);
4053 			  if (TREE_CODE (off) == SSA_NAME)
4054 			    {
4055 			      gimple *def = SSA_NAME_DEF_STMT (off);
4056 			      tree reft = TREE_TYPE (DR_REF (newdr));
4057 			      if (is_gimple_call (def)
4058 				  && gimple_call_internal_p (def)
4059 				  && (gimple_call_internal_fn (def)
4060 				      == IFN_GOMP_SIMD_LANE))
4061 				{
4062 				  tree arg = gimple_call_arg (def, 0);
4063 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
4064 				  arg = SSA_NAME_VAR (arg);
4065 				  if (arg == loop->simduid
4066 				      /* For now.  */
4067 				      && tree_int_cst_equal
4068 					   (TYPE_SIZE_UNIT (reft),
4069 					    step))
4070 				    {
4071 				      DR_OFFSET (newdr) = ssize_int (0);
4072 				      DR_STEP (newdr) = step;
4073 				      DR_OFFSET_ALIGNMENT (newdr)
4074 					= BIGGEST_ALIGNMENT;
4075 				      DR_STEP_ALIGNMENT (newdr)
4076 					= highest_pow2_factor (step);
4077 				      dr = newdr;
4078 				      simd_lane_access = true;
4079 				    }
4080 				}
4081 			    }
4082 			}
4083 		    }
4084 		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
4085 		    {
4086 		      dr = newdr;
4087 		      if (maybe_gather)
4088 			gatherscatter = GATHER;
4089 		      else
4090 			gatherscatter = SCATTER;
4091 		    }
4092 		}
4093 	      if (gatherscatter == SG_NONE && !simd_lane_access)
4094 		free_data_ref (newdr);
4095 	    }
4096 
4097 	  if (gatherscatter == SG_NONE && !simd_lane_access)
4098 	    {
4099 	      if (dump_enabled_p ())
4100 		{
4101 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4102                                    "not vectorized: data ref analysis "
4103                                    "failed ");
4104 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4105 		}
4106 
4107 	      if (is_a <bb_vec_info> (vinfo))
4108 		break;
4109 
4110 	      return false;
4111 	    }
4112         }
4113 
4114       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4115         {
4116           if (dump_enabled_p ())
4117             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4118                              "not vectorized: base addr of dr is a "
4119                              "constant\n");
4120 
4121           if (is_a <bb_vec_info> (vinfo))
4122 	    break;
4123 
4124 	  if (gatherscatter != SG_NONE || simd_lane_access)
4125 	    free_data_ref (dr);
4126 	  return false;
4127         }
4128 
4129       if (TREE_THIS_VOLATILE (DR_REF (dr)))
4130         {
4131           if (dump_enabled_p ())
4132             {
4133               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4134                                "not vectorized: volatile type ");
4135               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4136             }
4137 
4138           if (is_a <bb_vec_info> (vinfo))
4139 	    break;
4140 
4141           return false;
4142         }
4143 
4144       if (stmt_can_throw_internal (stmt))
4145         {
4146           if (dump_enabled_p ())
4147             {
4148               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4149                                "not vectorized: statement can throw an "
4150                                "exception ");
4151               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4152             }
4153 
4154           if (is_a <bb_vec_info> (vinfo))
4155 	    break;
4156 
4157 	  if (gatherscatter != SG_NONE || simd_lane_access)
4158 	    free_data_ref (dr);
4159           return false;
4160         }
4161 
4162       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4163 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4164 	{
4165           if (dump_enabled_p ())
4166             {
4167               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4168                                "not vectorized: statement is bitfield "
4169                                "access ");
4170               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4171             }
4172 
4173           if (is_a <bb_vec_info> (vinfo))
4174 	    break;
4175 
4176 	  if (gatherscatter != SG_NONE || simd_lane_access)
4177 	    free_data_ref (dr);
4178           return false;
4179 	}
4180 
4181       base = unshare_expr (DR_BASE_ADDRESS (dr));
4182       offset = unshare_expr (DR_OFFSET (dr));
4183       init = unshare_expr (DR_INIT (dr));
4184 
4185       if (is_gimple_call (stmt)
4186 	  && (!gimple_call_internal_p (stmt)
4187 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4188 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4189 	{
4190 	  if (dump_enabled_p ())
4191 	    {
4192 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4193 	                       "not vectorized: dr in a call ");
4194 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4195 	    }
4196 
4197 	  if (is_a <bb_vec_info> (vinfo))
4198 	    break;
4199 
4200 	  if (gatherscatter != SG_NONE || simd_lane_access)
4201 	    free_data_ref (dr);
4202 	  return false;
4203 	}
4204 
4205       /* Update DR field in stmt_vec_info struct.  */
4206 
      /* If the dataref is in an inner-loop of the loop that is considered
	 for vectorization, we also want to analyze the access relative to
4209 	 the outer-loop (DR contains information only relative to the
4210 	 inner-most enclosing loop).  We do that by building a reference to the
4211 	 first location accessed by the inner-loop, and analyze it relative to
4212 	 the outer-loop.  */
4213       if (loop && nested_in_vect_loop_p (loop, stmt))
4214 	{
4215 	  /* Build a reference to the first location accessed by the
4216 	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4217 	     this address must be invariant in the inner loop, so we
4218 	     can consider it as being used in the outer loop.  */
4219 	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4220 					  init, offset);
4221 	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4222 	  tree init_ref = build_fold_indirect_ref (init_addr);
4223 
4224 	  if (dump_enabled_p ())
4225 	    {
4226 	      dump_printf_loc (MSG_NOTE, vect_location,
4227                                "analyze in outer loop: ");
4228 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4229 	      dump_printf (MSG_NOTE, "\n");
4230 	    }
4231 
4232 	  if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4233 				     init_ref, loop))
4234 	    /* dr_analyze_innermost already explained the failure.  */
4235 	    return false;
4236 
4237           if (dump_enabled_p ())
4238 	    {
4239 	      dump_printf_loc (MSG_NOTE, vect_location,
4240                                "\touter base_address: ");
4241 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4242                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4243 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4244 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4245                                  STMT_VINFO_DR_OFFSET (stmt_info));
4246 	      dump_printf (MSG_NOTE,
4247                            "\n\touter constant offset from base address: ");
4248 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4249                                  STMT_VINFO_DR_INIT (stmt_info));
4250 	      dump_printf (MSG_NOTE, "\n\touter step: ");
4251 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4252                                  STMT_VINFO_DR_STEP (stmt_info));
4253 	      dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4254 			   STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4255 	      dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4256 			   STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4257 	      dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4258 			   STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4259 	      dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4260 			   STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4261 	    }
4262 	}
4263 
4264       if (STMT_VINFO_DATA_REF (stmt_info))
4265         {
4266           if (dump_enabled_p ())
4267             {
4268               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4269                                "not vectorized: more than one data ref "
4270                                "in stmt: ");
4271               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4272             }
4273 
4274           if (is_a <bb_vec_info> (vinfo))
4275 	    break;
4276 
4277 	  if (gatherscatter != SG_NONE || simd_lane_access)
4278 	    free_data_ref (dr);
4279           return false;
4280         }
4281 
4282       STMT_VINFO_DATA_REF (stmt_info) = dr;
4283       if (simd_lane_access)
4284 	{
4285 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4286 	  free_data_ref (datarefs[i]);
4287 	  datarefs[i] = dr;
4288 	}
4289 
4290       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4291 	  && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4292 	  && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4293 	{
4294           if (dump_enabled_p ())
4295             {
4296               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4297                                "not vectorized: base object not addressable "
4298 			       "for stmt: ");
4299               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4300             }
4301           if (is_a <bb_vec_info> (vinfo))
4302 	    {
4303 	      /* In BB vectorization the ref can still participate
4304 	         in dependence analysis, we just can't vectorize it.  */
4305 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4306 	      continue;
4307 	    }
4308 	  return false;
4309 	}
4310 
4311       /* Set vectype for STMT.  */
4312       scalar_type = TREE_TYPE (DR_REF (dr));
4313       STMT_VINFO_VECTYPE (stmt_info)
4314 	= get_vectype_for_scalar_type (scalar_type);
4315       if (!STMT_VINFO_VECTYPE (stmt_info))
4316         {
4317           if (dump_enabled_p ())
4318             {
4319               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4320                                "not vectorized: no vectype for stmt: ");
4321               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4322               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4323               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4324                                  scalar_type);
4325               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4326             }
4327 
4328           if (is_a <bb_vec_info> (vinfo))
4329 	    {
	      /* No vector type is fine; the ref can still participate
	         in dependence analysis, we just can't vectorize it.  */
4332 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4333 	      continue;
4334 	    }
4335 
4336 	  if (gatherscatter != SG_NONE || simd_lane_access)
4337 	    {
4338 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4339 	      if (gatherscatter != SG_NONE)
4340 		free_data_ref (dr);
4341 	    }
4342 	  return false;
4343         }
4344       else
4345 	{
4346 	  if (dump_enabled_p ())
4347 	    {
4348 	      dump_printf_loc (MSG_NOTE, vect_location,
4349 			       "got vectype for stmt: ");
4350 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4351 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4352 				 STMT_VINFO_VECTYPE (stmt_info));
4353 	      dump_printf (MSG_NOTE, "\n");
4354 	    }
4355 	}
4356 
4357       /* Adjust the minimal vectorization factor according to the
4358 	 vector type.  */
4359       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4360       *min_vf = upper_bound (*min_vf, vf);
4361 
4362       if (gatherscatter != SG_NONE)
4363 	{
4364 	  gather_scatter_info gs_info;
4365 	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4366 					  &gs_info)
4367 	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4368 	    {
4369 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4370 	      free_data_ref (dr);
4371 	      if (dump_enabled_p ())
4372 		{
4373 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4374 				   (gatherscatter == GATHER) ?
4375 				   "not vectorized: not suitable for gather "
4376 				   "load " :
4377 				   "not vectorized: not suitable for scatter "
4378 				   "store ");
4379 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4380 		}
4381 	      return false;
4382 	    }
4383 
4384 	  free_data_ref (datarefs[i]);
4385 	  datarefs[i] = dr;
4386 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4387 	}
4388 
4389       else if (is_a <loop_vec_info> (vinfo)
4390 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4391 	{
4392 	  if (nested_in_vect_loop_p (loop, stmt))
4393 	    {
4394 	      if (dump_enabled_p ())
4395 		{
4396 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4397                                    "not vectorized: not suitable for strided "
4398                                    "load ");
4399 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4400 		}
4401 	      return false;
4402 	    }
4403 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4404 	}
4405     }
4406 
  /* If we stopped analysis at the first dataref we could not analyze
     when trying to vectorize a basic block, mark the rest of the datarefs
     as not vectorizable and truncate the vector of datarefs.  That
     avoids spending useless time analyzing their dependences.  */
4411   if (i != datarefs.length ())
4412     {
4413       gcc_assert (is_a <bb_vec_info> (vinfo));
4414       for (unsigned j = i; j < datarefs.length (); ++j)
4415 	{
4416 	  data_reference_p dr = datarefs[j];
4417           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4418 	  free_data_ref (dr);
4419 	}
4420       datarefs.truncate (i);
4421     }
4422 
4423   return true;
4424 }
4425 
4426 
4427 /* Function vect_get_new_vect_var.
4428 
   Returns a name for a new variable.  The current naming scheme prefixes
   vectorizer-generated variables with "vect", "stmp", "mask" or "vectp"
   (depending on the value of VAR_KIND), followed by "_" and NAME when
   NAME is provided.  */
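
/* For example (illustrative only): a call with VAR_KIND == vect_pointer_var
   and NAME == "a" creates a temporary whose name starts with "vectp_a",
   while a NULL NAME gives just the bare prefix; any uniquifying suffix seen
   in dumps is added by the tree temporary machinery, not here.  */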
4433 
4434 tree
4435 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4436 {
4437   const char *prefix;
4438   tree new_vect_var;
4439 
4440   switch (var_kind)
4441   {
4442   case vect_simple_var:
4443     prefix = "vect";
4444     break;
4445   case vect_scalar_var:
4446     prefix = "stmp";
4447     break;
4448   case vect_mask_var:
4449     prefix = "mask";
4450     break;
4451   case vect_pointer_var:
4452     prefix = "vectp";
4453     break;
4454   default:
4455     gcc_unreachable ();
4456   }
4457 
4458   if (name)
4459     {
4460       char* tmp = concat (prefix, "_", name, NULL);
4461       new_vect_var = create_tmp_reg (type, tmp);
4462       free (tmp);
4463     }
4464   else
4465     new_vect_var = create_tmp_reg (type, prefix);
4466 
4467   return new_vect_var;
4468 }
4469 
4470 /* Like vect_get_new_vect_var but return an SSA name.  */
4471 
4472 tree
4473 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4474 {
4475   const char *prefix;
4476   tree new_vect_var;
4477 
4478   switch (var_kind)
4479   {
4480   case vect_simple_var:
4481     prefix = "vect";
4482     break;
4483   case vect_scalar_var:
4484     prefix = "stmp";
4485     break;
4486   case vect_pointer_var:
4487     prefix = "vectp";
4488     break;
4489   default:
4490     gcc_unreachable ();
4491   }
4492 
4493   if (name)
4494     {
4495       char* tmp = concat (prefix, "_", name, NULL);
4496       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4497       free (tmp);
4498     }
4499   else
4500     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4501 
4502   return new_vect_var;
4503 }
4504 
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4506 
4507 static void
4508 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4509 {
4510   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4511   int misalign = DR_MISALIGNMENT (dr);
4512   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4513     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4514   else
4515     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4516 			    DR_TARGET_ALIGNMENT (dr), misalign);
4517 }
4518 
4519 /* Function vect_create_addr_base_for_vector_ref.
4520 
4521    Create an expression that computes the address of the first memory location
4522    that will be accessed for a data reference.
4523 
4524    Input:
4525    STMT: The statement containing the data reference.
4526    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest the address should be computed.
4529             For example, when the dataref is in an inner-loop nested in an
4530 	    outer-loop that is now being vectorized, LOOP can be either the
4531 	    outer-loop, or the inner-loop.  The first memory location accessed
4532 	    by the following dataref ('in' points to short):
4533 
4534 		for (i=0; i<N; i++)
4535 		   for (j=0; j<M; j++)
4536 		     s += in[i+j]
4537 
4538 	    is as follows:
4539 	    if LOOP=i_loop:	&in		(relative to i_loop)
4540 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4541    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is a number of elements to
4543 	    be added, BYTE_OFFSET is measured in bytes.
4544 
4545    Output:
4546    1. Return an SSA_NAME whose value is the address of the memory location of
4547       the first vector of the data reference.
   2. If NEW_STMT_LIST is not empty after return, then the caller must insert
      the statements it contains, which define the returned SSA_NAME.
4550 
4551    FORNOW: We are only handling array accesses with step 1.  */
4552 
4553 tree
4554 vect_create_addr_base_for_vector_ref (gimple *stmt,
4555 				      gimple_seq *new_stmt_list,
4556 				      tree offset,
4557 				      tree byte_offset)
4558 {
4559   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4560   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4561   const char *base_name;
4562   tree addr_base;
4563   tree dest;
4564   gimple_seq seq = NULL;
4565   tree vect_ptr_type;
4566   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4567   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4568   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4569 
4570   tree data_ref_base = unshare_expr (drb->base_address);
4571   tree base_offset = unshare_expr (drb->offset);
4572   tree init = unshare_expr (drb->init);
4573 
4574   if (loop_vinfo)
4575     base_name = get_name (data_ref_base);
4576   else
4577     {
4578       base_offset = ssize_int (0);
4579       init = ssize_int (0);
4580       base_name = get_name (DR_REF (dr));
4581     }
4582 
4583   /* Create base_offset */
4584   base_offset = size_binop (PLUS_EXPR,
4585 			    fold_convert (sizetype, base_offset),
4586 			    fold_convert (sizetype, init));
4587 
4588   if (offset)
4589     {
4590       offset = fold_build2 (MULT_EXPR, sizetype,
4591 			    fold_convert (sizetype, offset), step);
4592       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4593 				 base_offset, offset);
4594     }
4595   if (byte_offset)
4596     {
4597       byte_offset = fold_convert (sizetype, byte_offset);
4598       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4599 				 base_offset, byte_offset);
4600     }
4601 
4602   /* base + base_offset */
4603   if (loop_vinfo)
4604     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4605   else
4606     {
4607       addr_base = build1 (ADDR_EXPR,
4608 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4609 			  unshare_expr (DR_REF (dr)));
4610     }
4611 
4612   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4613   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4614   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4615   gimple_seq_add_seq (new_stmt_list, seq);
4616 
4617   if (DR_PTR_INFO (dr)
4618       && TREE_CODE (addr_base) == SSA_NAME
4619       && !SSA_NAME_PTR_INFO (addr_base))
4620     {
4621       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4622       if (offset || byte_offset)
4623 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4624     }
4625 
4626   if (dump_enabled_p ())
4627     {
4628       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4629       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4630       dump_printf (MSG_NOTE, "\n");
4631     }
4632 
4633   return addr_base;
4634 }
4635 
4636 
4637 /* Function vect_create_data_ref_ptr.
4638 
4639    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4640    location accessed in the loop by STMT, along with the def-use update
4641    chain to appropriately advance the pointer through the loop iterations.
4642    Also set aliasing information for the pointer.  This pointer is used by
4643    the callers to this function to create a memory reference expression for
4644    vector load/store access.
4645 
4646    Input:
4647    1. STMT: a stmt that references memory. Expected to be of the form
4648          GIMPLE_ASSIGN <name, data-ref> or
4649 	 GIMPLE_ASSIGN <data-ref, name>.
4650    2. AGGR_TYPE: the type of the reference, which should be either a vector
4651         or an array.
4652    3. AT_LOOP: the loop where the vector memref is to be created.
4653    4. OFFSET (optional): an offset to be added to the initial address accessed
4654         by the data-ref in STMT.
4655    5. BSI: location where the new stmts are to be placed if there is no loop
4656    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4657         pointing to the initial address.
4658    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4659 	to the initial address accessed by the data-ref in STMT.  This is
4660 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4661 	in bytes.
4662    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4663 	to the IV during each iteration of the loop.  NULL says to move
4664 	by one copy of AGGR_TYPE up or down, depending on the step of the
4665 	data reference.
4666 
4667    Output:
4668    1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (the initial address accessed by the data reference).
4670       For example, for vector of type V8HI, the following code is generated:
4671 
4672       v8hi *ap;
4673       ap = (v8hi *)initial_address;
4674 
4675       if OFFSET is not supplied:
4676          initial_address = &a[init];
4677       if OFFSET is supplied:
4678          initial_address = &a[init + OFFSET];
4679       if BYTE_OFFSET is supplied:
4680 	 initial_address = &a[init] + BYTE_OFFSET;
4681 
4682       Return the initial_address in INITIAL_ADDRESS.
4683 
4684    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4685       update the pointer in each iteration of the loop.
4686 
4687       Return the increment stmt that updates the pointer in PTR_INCR.
4688 
4689    3. Set INV_P to true if the access pattern of the data reference in the
4690       vectorized loop is invariant.  Set it to false otherwise.
4691 
4692    4. Return the pointer.  */
4693 
4694 tree
4695 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4696 			  tree offset, tree *initial_address,
4697 			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
4698 			  bool only_init, bool *inv_p, tree byte_offset,
4699 			  tree iv_step)
4700 {
4701   const char *base_name;
4702   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4703   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4704   struct loop *loop = NULL;
4705   bool nested_in_vect_loop = false;
4706   struct loop *containing_loop = NULL;
4707   tree aggr_ptr_type;
4708   tree aggr_ptr;
4709   tree new_temp;
4710   gimple_seq new_stmt_list = NULL;
4711   edge pe = NULL;
4712   basic_block new_bb;
4713   tree aggr_ptr_init;
4714   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4715   tree aptr;
4716   gimple_stmt_iterator incr_gsi;
4717   bool insert_after;
4718   tree indx_before_incr, indx_after_incr;
4719   gimple *incr;
4720   tree step;
4721   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4722 
4723   gcc_assert (iv_step != NULL_TREE
4724 	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4725 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4726 
4727   if (loop_vinfo)
4728     {
4729       loop = LOOP_VINFO_LOOP (loop_vinfo);
4730       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4731       containing_loop = (gimple_bb (stmt))->loop_father;
4732       pe = loop_preheader_edge (loop);
4733     }
4734   else
4735     {
4736       gcc_assert (bb_vinfo);
4737       only_init = true;
4738       *ptr_incr = NULL;
4739     }
4740 
4741   /* Check the step (evolution) of the load in LOOP, and record
4742      whether it's invariant.  */
4743   step = vect_dr_behavior (dr)->step;
4744   if (integer_zerop (step))
4745     *inv_p = true;
4746   else
4747     *inv_p = false;
4748 
4749   /* Create an expression for the first address accessed by this load
4750      in LOOP.  */
4751   base_name = get_name (DR_BASE_ADDRESS (dr));
4752 
4753   if (dump_enabled_p ())
4754     {
4755       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4756       dump_printf_loc (MSG_NOTE, vect_location,
4757                        "create %s-pointer variable to type: ",
4758 		       get_tree_code_name (TREE_CODE (aggr_type)));
4759       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4760       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4761         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4762       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4763         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4764       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4765         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4766       else
4767         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4768       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4769       dump_printf (MSG_NOTE, "\n");
4770     }
4771 
4772   /* (1) Create the new aggregate-pointer variable.
4773      Vector and array types inherit the alias set of their component
4774      type by default so we need to use a ref-all pointer if the data
4775      reference does not conflict with the created aggregated data
4776      reference because it is not addressable.  */
4777   bool need_ref_all = false;
4778   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4779 			      get_alias_set (DR_REF (dr))))
4780     need_ref_all = true;
4781   /* Likewise for any of the data references in the stmt group.  */
4782   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4783     {
4784       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4785       do
4786 	{
4787 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4788 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4789 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4790 				      get_alias_set (DR_REF (sdr))))
4791 	    {
4792 	      need_ref_all = true;
4793 	      break;
4794 	    }
4795 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4796 	}
4797       while (orig_stmt);
4798     }
4799   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4800 					       need_ref_all);
4801   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4802 
4803 
4804   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4805      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4806      def-use update cycles for the pointer: one relative to the outer-loop
4807      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4808      to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.
4810 
4811      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4812      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4813      redundant.  Steps (3),(4) create the following:
4814 
4815 	vp0 = &base_addr;
4816 	LOOP:	vp1 = phi(vp0,vp2)
4817 		...
4818 		...
4819 		vp2 = vp1 + step
4820 		goto LOOP
4821 
4822      If there is an inner-loop nested in loop, then step (5) will also be
4823      applied, and an additional update in the inner-loop will be created:
4824 
4825 	vp0 = &base_addr;
4826 	LOOP:   vp1 = phi(vp0,vp2)
4827 		...
4828         inner:     vp3 = phi(vp1,vp4)
4829 	           vp4 = vp3 + inner_step
4830 	           if () goto inner
4831 		...
4832 		vp2 = vp1 + step
4833 		if () goto LOOP   */
4834 
4835   /* (2) Calculate the initial address of the aggregate-pointer, and set
4836      the aggregate-pointer to point to it before the loop.  */
4837 
  /* Create: &(base[init_val + offset]) + byte_offset, in the loop preheader.  */
4839 
4840   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4841 						   offset, byte_offset);
4842   if (new_stmt_list)
4843     {
4844       if (pe)
4845         {
4846           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4847           gcc_assert (!new_bb);
4848         }
4849       else
4850         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4851     }
4852 
4853   *initial_address = new_temp;
4854   aggr_ptr_init = new_temp;
4855 
4856   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4857      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4858      inner-loop nested in LOOP (during outer-loop vectorization).  */
4859 
4860   /* No update in loop is required.  */
4861   if (only_init && (!loop_vinfo || at_loop == loop))
4862     aptr = aggr_ptr_init;
4863   else
4864     {
4865       if (iv_step == NULL_TREE)
4866 	{
4867 	  /* The step of the aggregate pointer is the type size.  */
4868 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
4869 	  /* One exception to the above is when the scalar step of the load in
4870 	     LOOP is zero. In this case the step here is also zero.  */
4871 	  if (*inv_p)
4872 	    iv_step = size_zero_node;
4873 	  else if (tree_int_cst_sgn (step) == -1)
4874 	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4875 	}
4876 
4877       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4878 
4879       create_iv (aggr_ptr_init,
4880 		 fold_convert (aggr_ptr_type, iv_step),
4881 		 aggr_ptr, loop, &incr_gsi, insert_after,
4882 		 &indx_before_incr, &indx_after_incr);
4883       incr = gsi_stmt (incr_gsi);
4884       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4885 
4886       /* Copy the points-to information if it exists. */
4887       if (DR_PTR_INFO (dr))
4888 	{
4889 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4890 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4891 	}
4892       if (ptr_incr)
4893 	*ptr_incr = incr;
4894 
4895       aptr = indx_before_incr;
4896     }
4897 
4898   if (!nested_in_vect_loop || only_init)
4899     return aptr;
4900 
4901 
4902   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if it exists.  */
4904 
4905   gcc_assert (nested_in_vect_loop);
4906   if (!only_init)
4907     {
4908       standard_iv_increment_position (containing_loop, &incr_gsi,
4909 				      &insert_after);
4910       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4911 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4912 		 &indx_after_incr);
4913       incr = gsi_stmt (incr_gsi);
4914       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4915 
4916       /* Copy the points-to information if it exists. */
4917       if (DR_PTR_INFO (dr))
4918 	{
4919 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4920 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4921 	}
4922       if (ptr_incr)
4923 	*ptr_incr = incr;
4924 
4925       return indx_before_incr;
4926     }
4927   else
4928     gcc_unreachable ();
4929 }
4930 
4931 
4932 /* Function bump_vector_ptr
4933 
4934    Increment a pointer (to a vector type) by vector-size. If requested,
   i.e. if PTR_INCR is given, then also connect the new increment stmt
4936    to the existing def-use update-chain of the pointer, by modifying
4937    the PTR_INCR as illustrated below:
4938 
4939    The pointer def-use update-chain before this function:
4940                         DATAREF_PTR = phi (p_0, p_2)
4941                         ....
4942         PTR_INCR:       p_2 = DATAREF_PTR + step
4943 
4944    The pointer def-use update-chain after this function:
4945                         DATAREF_PTR = phi (p_0, p_2)
4946                         ....
4947                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4948                         ....
4949         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4950 
4951    Input:
4952    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4953                  in the loop.
4954    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4955 	      the loop.  The increment amount across iterations is expected
4956 	      to be vector_size.
4957    BSI - location where the new update stmt is to be placed.
4958    STMT - the original scalar memory-access stmt that is being vectorized.
4959    BUMP - optional. The offset by which to bump the pointer. If not given,
4960 	  the offset is assumed to be vector_size.
4961 
4962    Output: Return NEW_DATAREF_PTR as illustrated above.
4963 
4964 */
4965 
4966 tree
4967 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4968 		 gimple *stmt, tree bump)
4969 {
4970   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4971   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4972   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4973   tree update = TYPE_SIZE_UNIT (vectype);
4974   gassign *incr_stmt;
4975   ssa_op_iter iter;
4976   use_operand_p use_p;
4977   tree new_dataref_ptr;
4978 
4979   if (bump)
4980     update = bump;
4981 
4982   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4983     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4984   else
4985     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4986   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4987 				   dataref_ptr, update);
4988   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4989 
4990   /* Copy the points-to information if it exists. */
4991   if (DR_PTR_INFO (dr))
4992     {
4993       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4994       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4995     }
4996 
4997   if (!ptr_incr)
4998     return new_dataref_ptr;
4999 
5000   /* Update the vector-pointer's cross-iteration increment.  */
5001   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5002     {
5003       tree use = USE_FROM_PTR (use_p);
5004 
5005       if (use == dataref_ptr)
5006         SET_USE (use_p, new_dataref_ptr);
5007       else
5008         gcc_assert (operand_equal_p (use, update, 0));
5009     }
5010 
5011   return new_dataref_ptr;
5012 }
5013 
5014 
5015 /* Copy memory reference info such as base/clique from the SRC reference
5016    to the DEST MEM_REF.  */
5017 
5018 void
5019 vect_copy_ref_info (tree dest, tree src)
5020 {
5021   if (TREE_CODE (dest) != MEM_REF)
5022     return;
5023 
5024   tree src_base = src;
5025   while (handled_component_p (src_base))
5026     src_base = TREE_OPERAND (src_base, 0);
5027   if (TREE_CODE (src_base) != MEM_REF
5028       && TREE_CODE (src_base) != TARGET_MEM_REF)
5029     return;
5030 
5031   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5032   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5033 }
5034 
5035 
5036 /* Function vect_create_destination_var.
5037 
5038    Create a new temporary of type VECTYPE.  */
5039 
5040 tree
5041 vect_create_destination_var (tree scalar_dest, tree vectype)
5042 {
5043   tree vec_dest;
5044   const char *name;
5045   char *new_name;
5046   tree type;
5047   enum vect_var_kind kind;
5048 
5049   kind = vectype
5050     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5051     ? vect_mask_var
5052     : vect_simple_var
5053     : vect_scalar_var;
5054   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5055 
5056   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5057 
5058   name = get_name (scalar_dest);
5059   if (name)
5060     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5061   else
5062     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5063   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5064   free (new_name);
5065 
5066   return vec_dest;
5067 }
5068 
5069 /* Function vect_grouped_store_supported.
5070 
5071    Returns TRUE if interleave high and interleave low permutations
5072    are supported, and FALSE otherwise.  */
5073 
5074 bool
5075 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5076 {
5077   machine_mode mode = TYPE_MODE (vectype);
5078 
5079   /* vect_permute_store_chain requires the group size to be equal to 3 or
5080      be a power of two.  */
5081   if (count != 3 && exact_log2 (count) == -1)
5082     {
5083       if (dump_enabled_p ())
5084 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5085 			 "the size of the group of accesses"
5086 			 " is not a power of 2 or not eqaul to 3\n");
5087       return false;
5088     }
5089 
5090   /* Check that the permutation is supported.  */
5091   if (VECTOR_MODE_P (mode))
5092     {
5093       unsigned int i;
5094       if (count == 3)
5095 	{
5096 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5097 	  unsigned int i, j;
5098 
5099 	  unsigned int nelt;
5100 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5101 	    {
5102 	      if (dump_enabled_p ())
5103 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5104 				 "cannot handle groups of 3 stores for"
5105 				 " variable-length vectors\n");
5106 	      return false;
5107 	    }
5108 
5109 	  vec_perm_builder sel (nelt, nelt, 1);
5110 	  sel.quick_grow (nelt);
5111 	  vec_perm_indices indices;
5112 	  for (j = 0; j < 3; j++)
5113 	    {
5114 	      int nelt0 = ((3 - j) * nelt) % 3;
5115 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5116 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5117 	      for (i = 0; i < nelt; i++)
5118 		{
5119 		  if (3 * i + nelt0 < nelt)
5120 		    sel[3 * i + nelt0] = j0++;
5121 		  if (3 * i + nelt1 < nelt)
5122 		    sel[3 * i + nelt1] = nelt + j1++;
5123 		  if (3 * i + nelt2 < nelt)
5124 		    sel[3 * i + nelt2] = 0;
5125 		}
5126 	      indices.new_vector (sel, 2, nelt);
5127 	      if (!can_vec_perm_const_p (mode, indices))
5128 		{
5129 		  if (dump_enabled_p ())
5130 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5131 				 "permutation op not supported by target.\n");
5132 		  return false;
5133 		}
5134 
5135 	      for (i = 0; i < nelt; i++)
5136 		{
5137 		  if (3 * i + nelt0 < nelt)
5138 		    sel[3 * i + nelt0] = 3 * i + nelt0;
5139 		  if (3 * i + nelt1 < nelt)
5140 		    sel[3 * i + nelt1] = 3 * i + nelt1;
5141 		  if (3 * i + nelt2 < nelt)
5142 		    sel[3 * i + nelt2] = nelt + j2++;
5143 		}
5144 	      indices.new_vector (sel, 2, nelt);
5145 	      if (!can_vec_perm_const_p (mode, indices))
5146 		{
5147 		  if (dump_enabled_p ())
5148 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5149 				 "permutation op not supported by target.\n");
5150 		  return false;
5151 		}
5152 	    }
5153 	  return true;
5154 	}
5155       else
5156 	{
5157 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5158 	  gcc_assert (pow2p_hwi (count));
5159 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5160 
5161 	  /* The encoding has 2 interleaved stepped patterns.  */
5162 	  vec_perm_builder sel (nelt, 2, 3);
5163 	  sel.quick_grow (6);
5164 	  for (i = 0; i < 3; i++)
5165 	    {
5166 	      sel[i * 2] = i;
5167 	      sel[i * 2 + 1] = i + nelt;
5168 	    }
5169 	  vec_perm_indices indices (sel, 2, nelt);
5170 	  if (can_vec_perm_const_p (mode, indices))
5171 	    {
5172 	      for (i = 0; i < 6; i++)
5173 		sel[i] += exact_div (nelt, 2);
5174 	      indices.new_vector (sel, 2, nelt);
5175 	      if (can_vec_perm_const_p (mode, indices))
5176 		return true;
5177 	    }
5178 	}
5179     }
5180 
5181   if (dump_enabled_p ())
5182     dump_printf (MSG_MISSED_OPTIMIZATION,
5183 		 "permutaion op not supported by target.\n");
5184   return false;
5185 }
5186 
5187 
5188 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5189    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5190 
5191 bool
5192 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5193 			    bool masked_p)
5194 {
5195   if (masked_p)
5196     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5197 					 vec_mask_store_lanes_optab,
5198 					 vectype, count);
5199   else
5200     return vect_lanes_optab_supported_p ("vec_store_lanes",
5201 					 vec_store_lanes_optab,
5202 					 vectype, count);
5203 }
5204 
5205 
5206 /* Function vect_permute_store_chain.
5207 
5208    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5209    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5210    the data correctly for the stores.  Return the final references for stores
5211    in RESULT_CHAIN.
5212 
5213    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5214    The input is 4 vectors each containing 8 elements.  We assign a number to
5215    each element, the input sequence is:
5216 
5217    1st vec:   0  1  2  3  4  5  6  7
5218    2nd vec:   8  9 10 11 12 13 14 15
5219    3rd vec:  16 17 18 19 20 21 22 23
5220    4th vec:  24 25 26 27 28 29 30 31
5221 
5222    The output sequence should be:
5223 
5224    1st vec:  0  8 16 24  1  9 17 25
5225    2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
5227    4th vec:  6 14 22 30  7 15 23 31
5228 
5229    i.e., we interleave the contents of the four vectors in their order.
5230 
5231    We use interleave_high/low instructions to create such output.  The input of
5232    each interleave_high/low operation is two vectors:
5233    1st vec    2nd vec
5234    0 1 2 3    4 5 6 7
5235    the even elements of the result vector are obtained left-to-right from the
5236    high/low elements of the first vector.  The odd elements of the result are
5237    obtained left-to-right from the high/low elements of the second vector.
5238    The output of interleave_high will be:   0 4 1 5
5239    and of interleave_low:                   2 6 3 7
5240 
5241 
5242    The permutation is done in log LENGTH stages.  In each stage interleave_high
5243    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5244    where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
5246    In our example,
5247 
5248    I1: interleave_high (1st vec, 3rd vec)
5249    I2: interleave_low (1st vec, 3rd vec)
5250    I3: interleave_high (2nd vec, 4th vec)
5251    I4: interleave_low (2nd vec, 4th vec)
5252 
5253    The output for the first stage is:
5254 
5255    I1:  0 16  1 17  2 18  3 19
5256    I2:  4 20  5 21  6 22  7 23
5257    I3:  8 24  9 25 10 26 11 27
5258    I4: 12 28 13 29 14 30 15 31
5259 
5260    The output of the second stage, i.e. the final result is:
5261 
5262    I1:  0  8 16 24  1  9 17 25
5263    I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
5265    I4:  6 14 22 30  7 15 23 31.  */
5266 
5267 void
5268 vect_permute_store_chain (vec<tree> dr_chain,
5269 			  unsigned int length,
5270 			  gimple *stmt,
5271 			  gimple_stmt_iterator *gsi,
5272 			  vec<tree> *result_chain)
5273 {
5274   tree vect1, vect2, high, low;
5275   gimple *perm_stmt;
5276   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5277   tree perm_mask_low, perm_mask_high;
5278   tree data_ref;
5279   tree perm3_mask_low, perm3_mask_high;
5280   unsigned int i, j, n, log_length = exact_log2 (length);
5281 
5282   result_chain->quick_grow (length);
5283   memcpy (result_chain->address (), dr_chain.address (),
5284 	  length * sizeof (tree));
5285 
5286   if (length == 3)
5287     {
5288       /* vect_grouped_store_supported ensures that this is constant.  */
5289       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5290       unsigned int j0 = 0, j1 = 0, j2 = 0;
5291 
5292       vec_perm_builder sel (nelt, nelt, 1);
5293       sel.quick_grow (nelt);
5294       vec_perm_indices indices;
5295       for (j = 0; j < 3; j++)
5296         {
5297 	  int nelt0 = ((3 - j) * nelt) % 3;
5298 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5299 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5300 
5301 	  for (i = 0; i < nelt; i++)
5302 	    {
5303 	      if (3 * i + nelt0 < nelt)
5304 		sel[3 * i + nelt0] = j0++;
5305 	      if (3 * i + nelt1 < nelt)
5306 		sel[3 * i + nelt1] = nelt + j1++;
5307 	      if (3 * i + nelt2 < nelt)
5308 		sel[3 * i + nelt2] = 0;
5309 	    }
5310 	  indices.new_vector (sel, 2, nelt);
5311 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5312 
5313 	  for (i = 0; i < nelt; i++)
5314 	    {
5315 	      if (3 * i + nelt0 < nelt)
5316 		sel[3 * i + nelt0] = 3 * i + nelt0;
5317 	      if (3 * i + nelt1 < nelt)
5318 		sel[3 * i + nelt1] = 3 * i + nelt1;
5319 	      if (3 * i + nelt2 < nelt)
5320 		sel[3 * i + nelt2] = nelt + j2++;
5321 	    }
5322 	  indices.new_vector (sel, 2, nelt);
5323 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5324 
5325 	  vect1 = dr_chain[0];
5326 	  vect2 = dr_chain[1];
5327 
5328 	  /* Create interleaving stmt:
5329 	     low = VEC_PERM_EXPR <vect1, vect2,
5330 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5331 				   j + 2, nelt + j + 2, *, ...}>  */
5332 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5333 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5334 					   vect2, perm3_mask_low);
5335 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5336 
5337 	  vect1 = data_ref;
5338 	  vect2 = dr_chain[2];
5339 	  /* Create interleaving stmt:
5340 	     low = VEC_PERM_EXPR <vect1, vect2,
5341 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5342 				   6, 7, nelt + j + 2, ...}>  */
5343 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5344 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5345 					   vect2, perm3_mask_high);
5346 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5347 	  (*result_chain)[j] = data_ref;
5348 	}
5349     }
5350   else
5351     {
5352       /* If length is not equal to 3 then only power of 2 is supported.  */
5353       gcc_assert (pow2p_hwi (length));
5354 
5355       /* The encoding has 2 interleaved stepped patterns.  */
5356       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5357       vec_perm_builder sel (nelt, 2, 3);
5358       sel.quick_grow (6);
5359       for (i = 0; i < 3; i++)
5360 	{
5361 	  sel[i * 2] = i;
5362 	  sel[i * 2 + 1] = i + nelt;
5363 	}
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 6; i++)
	sel[i] += exact_div (nelt, 2);
      indices.new_vector (sel, 2, nelt);
      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0, n = log_length; i < n; i++)
	{
	  for (j = 0; j < length/2; j++)
	    {
	      vect1 = dr_chain[j];
	      vect2 = dr_chain[j+length/2];

	      /* Create interleaving stmt:
		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
						      ...}>  */
	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_high);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j] = high;

	      /* Create interleaving stmt:
		 low = VEC_PERM_EXPR <vect1, vect2,
				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
				       ...}>  */
	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_low);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j+1] = low;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
5401     }
5402 }
5403 
5404 /* Function vect_setup_realignment
5405 
5406    This function is called when vectorizing an unaligned load using
5407    the dr_explicit_realign[_optimized] scheme.
5408    This function generates the following code at the loop prolog:
5409 
5410       p = initial_addr;
5411    x  msq_init = *(floor(p));   # prolog load
5412       realignment_token = call target_builtin;
5413     loop:
5414    x  msq = phi (msq_init, ---)
5415 
5416    The stmts marked with x are generated only for the case of
5417    dr_explicit_realign_optimized.
5418 
5419    The code above sets up a new (vector) pointer, pointing to the first
5420    location accessed by STMT, and a "floor-aligned" load using that pointer.
5421    It also generates code to compute the "realignment-token" (if the relevant
5422    target hook was defined), and creates a phi-node at the loop-header bb
5423    whose arguments are the result of the prolog-load (created by this
5424    function) and the result of a load that takes place in the loop (to be
5425    created by the caller to this function).
5426 
5427    For the case of dr_explicit_realign_optimized:
5428    The caller to this function uses the phi-result (msq) to create the
5429    realignment code inside the loop, and sets up the missing phi argument,
5430    as follows:
5431     loop:
5432       msq = phi (msq_init, lsq)
5433       lsq = *(floor(p'));        # load in loop
5434       result = realign_load (msq, lsq, realignment_token);
5435 
5436    For the case of dr_explicit_realign:
5437     loop:
5438       msq = *(floor(p)); 	# load in loop
5439       p' = p + (VS-1);
5440       lsq = *(floor(p'));	# load in loop
5441       result = realign_load (msq, lsq, realignment_token);
5442 
5443    Input:
5444    STMT - (scalar) load stmt to be vectorized. This load accesses
5445           a memory location that may be unaligned.
5446    BSI - place where new code is to be inserted.
5447    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5448 			      is used.
5449 
5450    Output:
5451    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5452                        target hook, if defined.
5453    Return value - the result of the loop-header phi node.  */
5454 
5455 tree
5456 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5457                         tree *realignment_token,
5458 			enum dr_alignment_support alignment_support_scheme,
5459 			tree init_addr,
5460 			struct loop **at_loop)
5461 {
5462   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5463   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5464   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5465   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5466   struct loop *loop = NULL;
5467   edge pe = NULL;
5468   tree scalar_dest = gimple_assign_lhs (stmt);
5469   tree vec_dest;
5470   gimple *inc;
5471   tree ptr;
5472   tree data_ref;
5473   basic_block new_bb;
5474   tree msq_init = NULL_TREE;
5475   tree new_temp;
5476   gphi *phi_stmt;
5477   tree msq = NULL_TREE;
5478   gimple_seq stmts = NULL;
5479   bool inv_p;
5480   bool compute_in_loop = false;
5481   bool nested_in_vect_loop = false;
5482   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5483   struct loop *loop_for_initial_load = NULL;
5484 
5485   if (loop_vinfo)
5486     {
5487       loop = LOOP_VINFO_LOOP (loop_vinfo);
5488       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5489     }
5490 
5491   gcc_assert (alignment_support_scheme == dr_explicit_realign
5492 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5493 
5494   /* We need to generate three things:
5495      1. the misalignment computation
5496      2. the extra vector load (for the optimized realignment scheme).
5497      3. the phi node for the two vectors from which the realignment is
5498       done (for the optimized realignment scheme).  */
5499 
5500   /* 1. Determine where to generate the misalignment computation.
5501 
5502      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5503      calculation will be generated by this function, outside the loop (in the
5504      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5505      caller, inside the loop.
5506 
5507      Background: If the misalignment remains fixed throughout the iterations of
5508      the loop, then both realignment schemes are applicable, and also the
5509      misalignment computation can be done outside LOOP.  This is because we are
5510      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5511      are a multiple of VS (the Vector Size), and therefore the misalignment in
5512      different vectorized LOOP iterations is always the same.
5513      The problem arises only if the memory access is in an inner-loop nested
5514      inside LOOP, which is now being vectorized using outer-loop vectorization.
5515      This is the only case when the misalignment of the memory access may not
5516      remain fixed throughout the iterations of the inner-loop (as explained in
5517      detail in vect_supportable_dr_alignment).  In this case, not only is the
5518      optimized realignment scheme not applicable, but also the misalignment
5519      computation (and generation of the realignment token that is passed to
5520      REALIGN_LOAD) have to be done inside the loop.
5521 
5522      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5523      or not, which in turn determines if the misalignment is computed inside
5524      the inner-loop, or outside LOOP.  */
5525 
5526   if (init_addr != NULL_TREE || !loop_vinfo)
5527     {
5528       compute_in_loop = true;
5529       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5530     }
5531 
5532 
5533   /* 2. Determine where to generate the extra vector load.
5534 
5535      For the optimized realignment scheme, instead of generating two vector
5536      loads in each iteration, we generate a single extra vector load in the
5537      preheader of the loop, and in each iteration reuse the result of the
5538      vector load from the previous iteration.  In case the memory access is in
5539      an inner-loop nested inside LOOP, which is now being vectorized using
5540      outer-loop vectorization, we need to determine whether this initial vector
5541      load should be generated at the preheader of the inner-loop, or can be
5542      generated at the preheader of LOOP.  If the memory access has no evolution
5543      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5544      to be generated inside LOOP (in the preheader of the inner-loop).  */
5545 
5546   if (nested_in_vect_loop)
5547     {
5548       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5549       bool invariant_in_outerloop =
5550             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5551       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5552     }
5553   else
5554     loop_for_initial_load = loop;
5555   if (at_loop)
5556     *at_loop = loop_for_initial_load;
5557 
5558   if (loop_for_initial_load)
5559     pe = loop_preheader_edge (loop_for_initial_load);
5560 
5561   /* 3. For the case of the optimized realignment, create the first vector
5562       load at the loop preheader.  */
5563 
5564   if (alignment_support_scheme == dr_explicit_realign_optimized)
5565     {
5566       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5567       gassign *new_stmt;
5568 
5569       gcc_assert (!compute_in_loop);
5570       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5571       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5572 				      NULL_TREE, &init_addr, NULL, &inc,
5573 				      true, &inv_p);
5574       if (TREE_CODE (ptr) == SSA_NAME)
5575 	new_temp = copy_ssa_name (ptr);
5576       else
5577 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5578       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5579       new_stmt = gimple_build_assign
5580 		   (new_temp, BIT_AND_EXPR, ptr,
5581 		    build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5582       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5583       gcc_assert (!new_bb);
5584       data_ref
5585 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5586 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5587       vect_copy_ref_info (data_ref, DR_REF (dr));
5588       new_stmt = gimple_build_assign (vec_dest, data_ref);
5589       new_temp = make_ssa_name (vec_dest, new_stmt);
5590       gimple_assign_set_lhs (new_stmt, new_temp);
5591       if (pe)
5592         {
5593           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5594           gcc_assert (!new_bb);
5595         }
5596       else
5597          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5598 
5599       msq_init = gimple_assign_lhs (new_stmt);
5600     }
5601 
5602   /* 4. Create realignment token using a target builtin, if available.
5603       It is done either inside the containing loop, or before LOOP (as
5604       determined above).  */
5605 
5606   if (targetm.vectorize.builtin_mask_for_load)
5607     {
5608       gcall *new_stmt;
5609       tree builtin_decl;
5610 
      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5612       if (!init_addr)
5613 	{
5614 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5615 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5616 							    NULL_TREE);
5617           if (loop)
5618             {
5619    	      pe = loop_preheader_edge (loop);
5620 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5621 	      gcc_assert (!new_bb);
5622             }
5623           else
5624              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5625 	}
5626 
5627       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5628       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5629       vec_dest =
5630 	vect_create_destination_var (scalar_dest,
5631 				     gimple_call_return_type (new_stmt));
5632       new_temp = make_ssa_name (vec_dest, new_stmt);
5633       gimple_call_set_lhs (new_stmt, new_temp);
5634 
5635       if (compute_in_loop)
5636 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5637       else
5638 	{
5639 	  /* Generate the misalignment computation outside LOOP.  */
5640 	  pe = loop_preheader_edge (loop);
5641 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5642 	  gcc_assert (!new_bb);
5643 	}
5644 
5645       *realignment_token = gimple_call_lhs (new_stmt);
5646 
5647       /* The result of the CALL_EXPR to this builtin is determined from
5648          the value of the parameter and no global variables are touched
5649          which makes the builtin a "const" function.  Requiring the
5650          builtin to have the "const" attribute makes it unnecessary
5651          to call mark_call_clobbered.  */
5652       gcc_assert (TREE_READONLY (builtin_decl));
5653     }
5654 
5655   if (alignment_support_scheme == dr_explicit_realign)
5656     return msq;
5657 
5658   gcc_assert (!compute_in_loop);
5659   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5660 
5661 
5662   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5663 
5664   pe = loop_preheader_edge (containing_loop);
5665   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5666   msq = make_ssa_name (vec_dest);
5667   phi_stmt = create_phi_node (msq, containing_loop->header);
5668   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5669 
5670   return msq;
5671 }
5672 
5673 
5674 /* Function vect_grouped_load_supported.
5675 
5676    COUNT is the size of the load group (the number of statements plus the
5677    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5678    only one statement, with a gap of COUNT - 1.
5679 
5680    Returns true if a suitable permute exists.  */
5681 
5682 bool
5683 vect_grouped_load_supported (tree vectype, bool single_element_p,
5684 			     unsigned HOST_WIDE_INT count)
5685 {
5686   machine_mode mode = TYPE_MODE (vectype);
5687 
5688   /* If this is single-element interleaving with an element distance
5689      that leaves unused vector loads around punt - we at least create
5690      very sub-optimal code in that case (and blow up memory,
5691      see PR65518).  */
5692   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5693     {
5694       if (dump_enabled_p ())
5695 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696 			 "single-element interleaving not supported "
5697 			 "for not adjacent vector loads\n");
5698       return false;
5699     }
5700 
5701   /* vect_permute_load_chain requires the group size to be equal to 3 or
5702      be a power of two.  */
5703   if (count != 3 && exact_log2 (count) == -1)
5704     {
5705       if (dump_enabled_p ())
5706 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5707 			 "the size of the group of accesses"
5708 			 " is not a power of 2 or not equal to 3\n");
5709       return false;
5710     }
5711 
5712   /* Check that the permutation is supported.  */
5713   if (VECTOR_MODE_P (mode))
5714     {
5715       unsigned int i, j;
5716       if (count == 3)
5717 	{
5718 	  unsigned int nelt;
5719 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5720 	    {
5721 	      if (dump_enabled_p ())
5722 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5723 				 "cannot handle groups of 3 loads for"
5724 				 " variable-length vectors\n");
5725 	      return false;
5726 	    }
5727 
5728 	  vec_perm_builder sel (nelt, nelt, 1);
5729 	  sel.quick_grow (nelt);
5730 	  vec_perm_indices indices;
5731 	  unsigned int k;
5732 	  for (k = 0; k < 3; k++)
5733 	    {
5734 	      for (i = 0; i < nelt; i++)
5735 		if (3 * i + k < 2 * nelt)
5736 		  sel[i] = 3 * i + k;
5737 		else
5738 		  sel[i] = 0;
5739 	      indices.new_vector (sel, 2, nelt);
5740 	      if (!can_vec_perm_const_p (mode, indices))
5741 		{
5742 		  if (dump_enabled_p ())
5743 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5744 				     "shuffle of 3 loads is not supported by"
5745 				     " target\n");
5746 		  return false;
5747 		}
5748 	      for (i = 0, j = 0; i < nelt; i++)
5749 		if (3 * i + k < 2 * nelt)
5750 		  sel[i] = i;
5751 		else
5752 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5753 	      indices.new_vector (sel, 2, nelt);
5754 	      if (!can_vec_perm_const_p (mode, indices))
5755 		{
5756 		  if (dump_enabled_p ())
5757 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 				     "shuffle of 3 loads is not supported by"
5759 				     " target\n");
5760 		  return false;
5761 		}
5762 	    }
5763 	  return true;
5764 	}
5765       else
5766 	{
5767 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5768 	  gcc_assert (pow2p_hwi (count));
5769 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5770 
5771 	  /* The encoding has a single stepped pattern.  */
5772 	  vec_perm_builder sel (nelt, 1, 3);
5773 	  sel.quick_grow (3);
5774 	  for (i = 0; i < 3; i++)
5775 	    sel[i] = i * 2;
5776 	  vec_perm_indices indices (sel, 2, nelt);
5777 	  if (can_vec_perm_const_p (mode, indices))
5778 	    {
5779 	      for (i = 0; i < 3; i++)
5780 		sel[i] = i * 2 + 1;
5781 	      indices.new_vector (sel, 2, nelt);
5782 	      if (can_vec_perm_const_p (mode, indices))
5783 		return true;
5784 	    }
5785         }
5786     }
5787 
5788   if (dump_enabled_p ())
5789     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5790 		     "extract even/odd not supported by target\n");
5791   return false;
5792 }
5793 
5794 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5795    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5796 
5797 bool
5798 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5799 			   bool masked_p)
5800 {
5801   if (masked_p)
5802     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5803 					 vec_mask_load_lanes_optab,
5804 					 vectype, count);
5805   else
5806     return vect_lanes_optab_supported_p ("vec_load_lanes",
5807 					 vec_load_lanes_optab,
5808 					 vectype, count);
5809 }
5810 
5811 /* Function vect_permute_load_chain.
5812 
5813    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5814    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5815    the input data correctly.  Return the final references for loads in
5816    RESULT_CHAIN.
5817 
5818    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5819    The input is 4 vectors each containing 8 elements. We assign a number to each
5820    element, the input sequence is:
5821 
5822    1st vec:   0  1  2  3  4  5  6  7
5823    2nd vec:   8  9 10 11 12 13 14 15
5824    3rd vec:  16 17 18 19 20 21 22 23
5825    4th vec:  24 25 26 27 28 29 30 31
5826 
5827    The output sequence should be:
5828 
5829    1st vec:  0 4  8 12 16 20 24 28
5830    2nd vec:  1 5  9 13 17 21 25 29
5831    3rd vec:  2 6 10 14 18 22 26 30
5832    4th vec:  3 7 11 15 19 23 27 31
5833 
5834    i.e., the first output vector should contain the first elements of each
5835    interleaving group, etc.
5836 
5837    We use extract_even/odd instructions to create such output.  The input of
5838    each extract_even/odd operation is two vectors
5839    1st vec    2nd vec
5840    0 1 2 3    4 5 6 7
5841 
5842    and the output is the vector of extracted even/odd elements.  The output of
5843    extract_even will be:   0 2 4 6
5844    and of extract_odd:     1 3 5 7
5845 
5846 
5847    The permutation is done in log LENGTH stages.  In each stage extract_even
5848    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5849    their order.  In our example,
5850 
5851    E1: extract_even (1st vec, 2nd vec)
5852    E2: extract_odd (1st vec, 2nd vec)
5853    E3: extract_even (3rd vec, 4th vec)
5854    E4: extract_odd (3rd vec, 4th vec)
5855 
5856    The output for the first stage will be:
5857 
5858    E1:  0  2  4  6  8 10 12 14
5859    E2:  1  3  5  7  9 11 13 15
5860    E3: 16 18 20 22 24 26 28 30
5861    E4: 17 19 21 23 25 27 29 31
5862 
5863    In order to proceed and create the correct sequence for the next stage (or
5864    for the correct output, if the second stage is the last one, as in our
5865    example), we first put the output of extract_even operation and then the
5866    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5867    The input for the second stage is:
5868 
5869    1st vec (E1):  0  2  4  6  8 10 12 14
5870    2nd vec (E3): 16 18 20 22 24 26 28 30
5871    3rd vec (E2):  1  3  5  7  9 11 13 15
5872    4th vec (E4): 17 19 21 23 25 27 29 31
5873 
5874    The output of the second stage:
5875 
5876    E1: 0 4  8 12 16 20 24 28
5877    E2: 2 6 10 14 18 22 26 30
5878    E3: 1 5  9 13 17 21 25 29
5879    E4: 3 7 11 15 19 23 27 31
5880 
5881    And RESULT_CHAIN after reordering:
5882 
5883    1st vec (E1):  0 4  8 12 16 20 24 28
5884    2nd vec (E3):  1 5  9 13 17 21 25 29
5885    3rd vec (E2):  2 6 10 14 18 22 26 30
5886    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5887 
5888 static void
5889 vect_permute_load_chain (vec<tree> dr_chain,
5890 			 unsigned int length,
5891 			 gimple *stmt,
5892 			 gimple_stmt_iterator *gsi,
5893 			 vec<tree> *result_chain)
5894 {
5895   tree data_ref, first_vect, second_vect;
5896   tree perm_mask_even, perm_mask_odd;
5897   tree perm3_mask_low, perm3_mask_high;
5898   gimple *perm_stmt;
5899   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5900   unsigned int i, j, log_length = exact_log2 (length);
5901 
5902   result_chain->quick_grow (length);
5903   memcpy (result_chain->address (), dr_chain.address (),
5904 	  length * sizeof (tree));
5905 
5906   if (length == 3)
5907     {
5908       /* vect_grouped_load_supported ensures that this is constant.  */
5909       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5910       unsigned int k;
5911 
5912       vec_perm_builder sel (nelt, nelt, 1);
5913       sel.quick_grow (nelt);
5914       vec_perm_indices indices;
5915       for (k = 0; k < 3; k++)
5916 	{
5917 	  for (i = 0; i < nelt; i++)
5918 	    if (3 * i + k < 2 * nelt)
5919 	      sel[i] = 3 * i + k;
5920 	    else
5921 	      sel[i] = 0;
5922 	  indices.new_vector (sel, 2, nelt);
5923 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5924 
5925 	  for (i = 0, j = 0; i < nelt; i++)
5926 	    if (3 * i + k < 2 * nelt)
5927 	      sel[i] = i;
5928 	    else
5929 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5930 	  indices.new_vector (sel, 2, nelt);
5931 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5932 
5933 	  first_vect = dr_chain[0];
5934 	  second_vect = dr_chain[1];
5935 
5936 	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5938 							     ...}>  */
5939 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5940 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5941 					   second_vect, perm3_mask_low);
5942 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5943 
5944 	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5946 							      ...}>  */
5947 	  first_vect = data_ref;
5948 	  second_vect = dr_chain[2];
5949 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5950 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5951 					   second_vect, perm3_mask_high);
5952 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5953 	  (*result_chain)[k] = data_ref;
5954 	}
5955     }
5956   else
5957     {
5958       /* If length is not equal to 3 then only power of 2 is supported.  */
5959       gcc_assert (pow2p_hwi (length));
5960 
5961       /* The encoding has a single stepped pattern.  */
5962       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5963       vec_perm_builder sel (nelt, 1, 3);
5964       sel.quick_grow (3);
5965       for (i = 0; i < 3; ++i)
5966 	sel[i] = i * 2;
5967       vec_perm_indices indices (sel, 2, nelt);
5968       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5969 
5970       for (i = 0; i < 3; ++i)
5971 	sel[i] = i * 2 + 1;
5972       indices.new_vector (sel, 2, nelt);
5973       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5974 
5975       for (i = 0; i < log_length; i++)
5976 	{
5977 	  for (j = 0; j < length; j += 2)
5978 	    {
5979 	      first_vect = dr_chain[j];
5980 	      second_vect = dr_chain[j+1];
5981 
5982 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5983 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5984 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5985 					       first_vect, second_vect,
5986 					       perm_mask_even);
5987 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5988 	      (*result_chain)[j/2] = data_ref;
5989 
5990 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5991 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5992 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5993 					       first_vect, second_vect,
5994 					       perm_mask_odd);
5995 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5996 	      (*result_chain)[j/2+length/2] = data_ref;
5997 	    }
5998 	  memcpy (dr_chain.address (), result_chain->address (),
5999 		  length * sizeof (tree));
6000 	}
6001     }
6002 }
6003 
6004 /* Function vect_shift_permute_load_chain.
6005 
   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if it succeeded, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element; the input sequence is:
6014 
6015    1st vec:   0  1  2  3  4  5  6  7
6016    2nd vec:   8  9 10 11 12 13 14 15
6017    3rd vec:  16 17 18 19 20 21 22 23
6018 
6019    The output sequence should be:
6020 
6021    1st vec:  0 3 6  9 12 15 18 21
6022    2nd vec:  1 4 7 10 13 16 19 22
6023    3rd vec:  2 5 8 11 14 17 20 23
6024 
6025    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6026 
6027    First we shuffle all 3 vectors to get correct elements order:
6028 
6029    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6030    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6031    3rd vec:  (16 19 22) (17 20 23) (18 21)
6032 
   Next we combine and shift the vectors 3 times:
6034 
6035    1st step:
6036      shift right by 6 the concatenation of:
6037      "1st vec" and  "2nd vec"
6038        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6039      "2nd vec" and  "3rd vec"
6040        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6041      "3rd vec" and  "1st vec"
6042        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6043 			     | New vectors                   |
6044 
     So the new vectors are now:
6046 
6047      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6048      2nd vec:  (10 13) (16 19 22) (17 20 23)
6049      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6050 
6051    2nd step:
6052      shift right by 5 the concatenation of:
6053      "1st vec" and  "3rd vec"
6054        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6055      "2nd vec" and  "1st vec"
6056        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6057      "3rd vec" and  "2nd vec"
6058        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6059 			  | New vectors                   |
6060 
     So the new vectors are now:
6062 
6063      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6064      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6065      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6066 
6067    3rd step:
6068      shift right by 5 the concatenation of:
6069      "1st vec" and  "1st vec"
6070        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6071      shift right by 3 the concatenation of:
6072      "2nd vec" and  "2nd vec"
6073                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6074 			  | New vectors                   |
6075 
     So now all vectors are READY:
6077      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6078      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6079      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6080 
   This algorithm is faster than the one in vect_permute_load_chain if:
     1.  "Shift of a concatenation" is faster than a general permutation.
	 This is usually the case.
     2.  The target machine can't execute vector instructions in parallel.
	 Each step of this algorithm depends on the previous one, whereas
	 the algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only when the LOAD CHAIN LENGTH is less
   than VF.
6089 */
6090 
6091 static bool
6092 vect_shift_permute_load_chain (vec<tree> dr_chain,
6093 			       unsigned int length,
6094 			       gimple *stmt,
6095 			       gimple_stmt_iterator *gsi,
6096 			       vec<tree> *result_chain)
6097 {
6098   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6099   tree perm2_mask1, perm2_mask2, perm3_mask;
6100   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6101   gimple *perm_stmt;
6102 
6103   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6104   unsigned int i;
6105   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6106   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6107 
6108   unsigned HOST_WIDE_INT nelt, vf;
6109   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6110       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6111     /* Not supported for variable-length vectors.  */
6112     return false;
6113 
6114   vec_perm_builder sel (nelt, nelt, 1);
6115   sel.quick_grow (nelt);
6116 
6117   result_chain->quick_grow (length);
6118   memcpy (result_chain->address (), dr_chain.address (),
6119 	  length * sizeof (tree));
6120 
6121   if (pow2p_hwi (length) && vf > 4)
6122     {
6123       unsigned int j, log_length = exact_log2 (length);
6124       for (i = 0; i < nelt / 2; ++i)
6125 	sel[i] = i * 2;
6126       for (i = 0; i < nelt / 2; ++i)
6127 	sel[nelt / 2 + i] = i * 2 + 1;
6128       vec_perm_indices indices (sel, 2, nelt);
6129       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6130 	{
6131 	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not "
			     "supported by target\n");
6135 	  return false;
6136 	}
6137       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6138 
6139       for (i = 0; i < nelt / 2; ++i)
6140 	sel[i] = i * 2 + 1;
6141       for (i = 0; i < nelt / 2; ++i)
6142 	sel[nelt / 2 + i] = i * 2;
6143       indices.new_vector (sel, 2, nelt);
6144       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6145 	{
6146 	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not "
			     "supported by target\n");
6150 	  return false;
6151 	}
6152       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6153 
6154       /* Generating permutation constant to shift all elements.
6155 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6156       for (i = 0; i < nelt; i++)
6157 	sel[i] = nelt / 2 + i;
6158       indices.new_vector (sel, 2, nelt);
6159       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6160 	{
6161 	  if (dump_enabled_p ())
6162 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163 			     "shift permutation is not supported by target\n");
6164 	  return false;
6165 	}
6166       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6167 
      /* Generating permutation constant to select a vector from two.
	 It takes the low half from the first vector and the high half
	 from the second; for vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6170       for (i = 0; i < nelt / 2; i++)
6171 	sel[i] = i;
6172       for (i = nelt / 2; i < nelt; i++)
6173 	sel[i] = nelt + i;
6174       indices.new_vector (sel, 2, nelt);
6175       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6176 	{
6177 	  if (dump_enabled_p ())
6178 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6179 			     "select is not supported by target\n");
6180 	  return false;
6181 	}
6182       select_mask = vect_gen_perm_mask_checked (vectype, indices);
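      /* A worked example of one inner iteration below, assuming NELT == 8
	 and input vectors {0 ... 7} and {8 ... 15}:
	   vect[0] = {0 2 4 6 1 3 5 7} via perm2_mask1,
	   vect[1] = {9 11 13 15 8 10 12 14} via perm2_mask2;
	 the shift then extracts the odd elements {1 3 5 7 9 11 13 15} and
	 the select the even elements {0 2 4 6 8 10 12 14}.  */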
6183 
6184       for (i = 0; i < log_length; i++)
6185 	{
6186 	  for (j = 0; j < length; j += 2)
6187 	    {
6188 	      first_vect = dr_chain[j];
6189 	      second_vect = dr_chain[j + 1];
6190 
6191 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6192 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6193 					       first_vect, first_vect,
6194 					       perm2_mask1);
6195 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6196 	      vect[0] = data_ref;
6197 
6198 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6199 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6200 					       second_vect, second_vect,
6201 					       perm2_mask2);
6202 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6203 	      vect[1] = data_ref;
6204 
6205 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6206 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6207 					       vect[0], vect[1], shift1_mask);
6208 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6209 	      (*result_chain)[j/2 + length/2] = data_ref;
6210 
6211 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6212 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6213 					       vect[0], vect[1], select_mask);
6214 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6215 	      (*result_chain)[j/2] = data_ref;
6216 	    }
6217 	  memcpy (dr_chain.address (), result_chain->address (),
6218 		  length * sizeof (tree));
6219 	}
6220       return true;
6221     }
6222   if (length == 3 && vf > 2)
6223     {
6224       unsigned int k = 0, l = 0;
6225 
      /* Generating permutation constant to get all elements in the right order.
6227 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6228       for (i = 0; i < nelt; i++)
6229 	{
6230 	  if (3 * k + (l % 3) >= nelt)
6231 	    {
6232 	      k = 0;
6233 	      l += (3 - (nelt % 3));
6234 	    }
6235 	  sel[i] = 3 * k + (l % 3);
6236 	  k++;
6237 	}
6238       vec_perm_indices indices (sel, 2, nelt);
6239       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6240 	{
6241 	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not "
			     "supported by target\n");
6245 	  return false;
6246 	}
6247       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6248 
6249       /* Generating permutation constant to shift all elements.
6250 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6251       for (i = 0; i < nelt; i++)
6252 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6253       indices.new_vector (sel, 2, nelt);
6254       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6255 	{
6256 	  if (dump_enabled_p ())
6257 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 			     "shift permutation is not supported by target\n");
6259 	  return false;
6260 	}
6261       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6262 
6263       /* Generating permutation constant to shift all elements.
6264 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6265       for (i = 0; i < nelt; i++)
6266 	sel[i] = 2 * (nelt / 3) + 1 + i;
6267       indices.new_vector (sel, 2, nelt);
6268       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6269 	{
6270 	  if (dump_enabled_p ())
6271 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6272 			     "shift permutation is not supported by target\n");
6273 	  return false;
6274 	}
6275       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6276 
6277       /* Generating permutation constant to shift all elements.
6278 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6279       for (i = 0; i < nelt; i++)
6280 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6281       indices.new_vector (sel, 2, nelt);
6282       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6283 	{
6284 	  if (dump_enabled_p ())
6285 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6286 			     "shift permutation is not supported by target\n");
6287 	  return false;
6288 	}
6289       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6290 
6291       /* Generating permutation constant to shift all elements.
6292 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6293       for (i = 0; i < nelt; i++)
6294 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6295       indices.new_vector (sel, 2, nelt);
6296       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6297 	{
6298 	  if (dump_enabled_p ())
6299 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6300 			     "shift permutation is not supported by target\n");
6301 	  return false;
6302 	}
6303       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
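      /* Tracing the code below for NELT == 8 and input vectors {0 ... 7},
	 {8 ... 15}, {16 ... 23}: the perm3_mask shuffles give
	 {0 3 6 1 4 7 2 5}, {8 11 14 9 12 15 10 13}, {16 19 22 17 20 23 18 21};
	 after the shift1 and shift2 rounds, (*result_chain)[1] already holds
	 {1 4 7 10 13 16 19 22}, and the final shift3/shift4 permutes place
	 {2 5 8 11 14 17 20 23} at index 2 and {0 3 6 9 12 15 18 21} at
	 index 0.  */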
6304 
6305       for (k = 0; k < 3; k++)
6306 	{
6307 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6308 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6309 					   dr_chain[k], dr_chain[k],
6310 					   perm3_mask);
6311 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6312 	  vect[k] = data_ref;
6313 	}
6314 
6315       for (k = 0; k < 3; k++)
6316 	{
6317 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6318 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6319 					   vect[k % 3], vect[(k + 1) % 3],
6320 					   shift1_mask);
6321 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6322 	  vect_shift[k] = data_ref;
6323 	}
6324 
6325       for (k = 0; k < 3; k++)
6326 	{
6327 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6328 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6329 					   vect_shift[(4 - k) % 3],
6330 					   vect_shift[(3 - k) % 3],
6331 					   shift2_mask);
6332 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6333 	  vect[k] = data_ref;
6334 	}
6335 
6336       (*result_chain)[3 - (nelt % 3)] = vect[2];
6337 
6338       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6339       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6340 				       vect[0], shift3_mask);
6341       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6342       (*result_chain)[nelt % 3] = data_ref;
6343 
6344       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6345       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6346 				       vect[1], shift4_mask);
6347       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6348       (*result_chain)[0] = data_ref;
6349       return true;
6350     }
6351   return false;
6352 }
6353 
6354 /* Function vect_transform_grouped_load.
6355 
6356    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6357    to perform their permutation and ascribe the result vectorized statements to
6358    the scalar statements.
6359 */
6360 
6361 void
6362 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6363 			     gimple_stmt_iterator *gsi)
6364 {
6365   machine_mode mode;
6366   vec<tree> result_chain = vNULL;
6367 
  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains
     permuted vectors that are ready for vector computation.  */
6371   result_chain.create (size);
6372 
  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get the chain for the load group using
     vect_shift_permute_load_chain.  */
6376   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6377   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6378       || pow2p_hwi (size)
6379       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6380 					 gsi, &result_chain))
6381     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6382   vect_record_grouped_load_vectors (stmt, result_chain);
6383   result_chain.release ();
6384 }
6385 
6386 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6387    generated as part of the vectorization of STMT.  Assign the statement
6388    for each vector to the associated scalar statement.  */
6389 
6390 void
6391 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6392 {
6393   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6394   gimple *next_stmt, *new_stmt;
6395   unsigned int i, gap_count;
6396   tree tmp_data_ref;
6397 
6398   /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
6401   next_stmt = first_stmt;
6402   gap_count = 1;
6403   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6404     {
6405       if (!next_stmt)
6406 	break;
6407 
      /* Skip the gaps.  Loads created for the gaps will be removed by the
	 dead code elimination pass later.  No need to check for the first
	 stmt in the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap, GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
6414       if (next_stmt != first_stmt
6415           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6416       {
6417         gap_count++;
6418         continue;
6419       }
6420 
6421       while (next_stmt)
6422         {
6423 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6424 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6425 	     copies, and we put the new vector statement in the first available
6426 	     RELATED_STMT.  */
6427 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6428 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6429 	  else
6430             {
6431               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6432                 {
6433 		  gimple *prev_stmt =
6434 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6435 		  gimple *rel_stmt =
6436 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6437 	          while (rel_stmt)
6438 		    {
6439 		      prev_stmt = rel_stmt;
6440 		      rel_stmt =
6441                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6442 		    }
6443 
		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
6446                 }
6447             }
6448 
6449 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6450 	  gap_count = 1;
6451 	  /* If NEXT_STMT accesses the same DR as the previous statement,
6452 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
6453 	     get the next data-ref from RESULT_CHAIN.  */
6454 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6455 	    break;
6456         }
6457     }
6458 }
6459 
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */
6464 
6465 bool
6466 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6467 {
6468   if (!VAR_P (decl))
6469     return false;
6470 
6471   if (decl_in_symtab_p (decl)
6472       && !symtab_node::get (decl)->can_increase_alignment_p ())
6473     return false;
6474 
6475   if (TREE_STATIC (decl))
6476     return (alignment <= MAX_OFILE_ALIGNMENT);
6477   else
6478     return (alignment <= MAX_STACK_ALIGNMENT);
6479 }
6480 
6481 
6482 /* Return whether the data reference DR is supported with respect to its
6483    alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check whether it is possible to vectorize it with
   a different alignment.  */
6487 
6488 enum dr_alignment_support
6489 vect_supportable_dr_alignment (struct data_reference *dr,
6490                                bool check_aligned_accesses)
6491 {
6492   gimple *stmt = DR_STMT (dr);
6493   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6494   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6495   machine_mode mode = TYPE_MODE (vectype);
6496   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6497   struct loop *vect_loop = NULL;
6498   bool nested_in_vect_loop = false;
6499 
6500   if (aligned_access_p (dr) && !check_aligned_accesses)
6501     return dr_aligned;
6502 
6503   /* For now assume all conditional loads/stores support unaligned
6504      access without any special code.  */
6505   if (is_gimple_call (stmt)
6506       && gimple_call_internal_p (stmt)
6507       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6508 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6509     return dr_unaligned_supported;
6510 
6511   if (loop_vinfo)
6512     {
6513       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6514       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6515     }
6516 
6517   /* Possibly unaligned access.  */
6518 
6519   /* We can choose between using the implicit realignment scheme (generating
6520      a misaligned_move stmt) and the explicit realignment scheme (generating
6521      aligned loads with a REALIGN_LOAD).  There are two variants to the
6522      explicit realignment scheme: optimized, and unoptimized.
6523      We can optimize the realignment only if the step between consecutive
6524      vector loads is equal to the vector size.  Since the vector memory
6525      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6526      is guaranteed that the misalignment amount remains the same throughout the
6527      execution of the vectorized loop.  Therefore, we can create the
6528      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6529      at the loop preheader.
6530 
6531      However, in the case of outer-loop vectorization, when vectorizing a
6532      memory access in the inner-loop nested within the LOOP that is now being
6533      vectorized, while it is guaranteed that the misalignment of the
6534      vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
6536      the execution of the inner-loop.  This is because the inner-loop advances
6537      with the original scalar step (and not in steps of VS).  If the inner-loop
6538      step happens to be a multiple of VS, then the misalignment remains fixed
6539      and we can use the optimized realignment scheme.  For example:
6540 
6541       for (i=0; i<N; i++)
6542         for (j=0; j<M; j++)
6543           s += a[i+j];
6544 
6545      When vectorizing the i-loop in the above example, the step between
6546      consecutive vector loads is 1, and so the misalignment does not remain
6547      fixed across the execution of the inner-loop, and the realignment cannot
6548      be optimized (as illustrated in the following pseudo vectorized loop):
6549 
6550       for (i=0; i<N; i+=4)
6551         for (j=0; j<M; j++){
6552           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6553                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6554                          // (assuming that we start from an aligned address).
6555           }
6556 
6557      We therefore have to use the unoptimized realignment scheme:
6558 
      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).
6564 
6565      The loop can then be vectorized as follows:
6566 
6567       for (k=0; k<4; k++){
6568         rt = get_realignment_token (&vp[k]);
6569         for (i=0; i<N; i+=4){
6570           v1 = vp[i+k];
6571           for (j=k; j<M; j+=4){
6572             v2 = vp[i+j+VS-1];
6573             va = REALIGN_LOAD <v1,v2,rt>;
6574             vs += va;
6575             v1 = v2;
6576           }
6577         }
6578     } */
6579 
6580   if (DR_IS_READ (dr))
6581     {
6582       bool is_packed = false;
6583       tree type = (TREE_TYPE (DR_REF (dr)));
6584 
6585       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6586 	  && (!targetm.vectorize.builtin_mask_for_load
6587 	      || targetm.vectorize.builtin_mask_for_load ()))
6588 	{
6589 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6590 
	  /* If we are doing SLP then the accesses need not have the
	     same alignment; instead it depends on the SLP group size.  */
6593 	  if (loop_vinfo
6594 	      && STMT_SLP_TYPE (stmt_info)
6595 	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6596 			      * GROUP_SIZE (vinfo_for_stmt
6597 					    (GROUP_FIRST_ELEMENT (stmt_info))),
6598 			      TYPE_VECTOR_SUBPARTS (vectype)))
6599 	    ;
6600 	  else if (!loop_vinfo
6601 		   || (nested_in_vect_loop
6602 		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6603 				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6604 	    return dr_explicit_realign;
6605 	  else
6606 	    return dr_explicit_realign_optimized;
6607 	}
6608       if (!known_alignment_for_access_p (dr))
6609 	is_packed = not_size_aligned (DR_REF (dr));
6610 
6611       if (targetm.vectorize.support_vector_misalignment
6612 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6613 	/* Can't software pipeline the loads, but can at least do them.  */
6614 	return dr_unaligned_supported;
6615     }
6616   else
6617     {
6618       bool is_packed = false;
6619       tree type = (TREE_TYPE (DR_REF (dr)));
6620 
6621       if (!known_alignment_for_access_p (dr))
6622 	is_packed = not_size_aligned (DR_REF (dr));
6623 
      if (targetm.vectorize.support_vector_misalignment
	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
	return dr_unaligned_supported;
6627     }
6628 
6629   /* Unsupported.  */
6630   return dr_unaligned_unsupported;
6631 }
6632