1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
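/* As a rough illustration (the target details are hypothetical): for a
   group of COUNT = 4 accesses using V4SI vectors, the target must provide
   an array mode covering 4 * 128 bits and a handler for the corresponding
   vec_load_lanes / vec_store_lanes optab, conceptually

     { v0, v1, v2, v3 } = vec_load_lanes (array);   <-- one lane-wise load

   If either the array mode or the optab handler is missing, the check
   below reports that load/store-lanes cannot be used.  */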
60 
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 			      tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 	{
75 	  if (dump_enabled_p ())
76 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 			     "no array mode for %s[%wu]\n",
78 			     GET_MODE_NAME (mode), count);
79 	  return false;
80 	}
81     }
82 
83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84     {
85       if (dump_enabled_p ())
86 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87                          "cannot use %s<%s><%s>\n", name,
88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89       return false;
90     }
91 
92   if (dump_enabled_p ())
93     dump_printf_loc (MSG_NOTE, vect_location,
94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95                      GET_MODE_NAME (mode));
96 
97   return true;
98 }
99 
100 
101 /* Return the smallest scalar part of STMT_INFO.
102    This is used to determine the vectype of the stmt.  We generally set the
103    vectype according to the type of the result (lhs).  For stmts whose
104    result type differs from the type of the arguments (e.g., demotion,
105    promotion), vectype will be reset appropriately (later).  Note that we
106    have to visit the smallest datatype in this function, because that
107    determines the VF.  If the smallest datatype in the loop is present only
108    as the rhs of a promotion operation, we would miss it here.
109    Such a case, where a variable of this datatype does not appear in the lhs
110    anywhere in the loop, can only occur if it's an invariant: e.g.
111    'int_x = (int) short_inv', which we'd expect to have been optimized away
112    by invariant motion.  However, we cannot rely on invariant motion to
113    always take invariants out of the loop, and so in the case of promotion
114    we also have to check the rhs.
115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116    types.  */
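/* A small illustrative example: in a loop body such as

     short_b = b[i];
     int_a = (int) short_b;      <-- promotion: the rhs type is smaller
     a[i] = int_a;

   the smallest scalar type is 'short', and it is that type which has to
   determine the vectorization factor, even though the result type of the
   cast statement is 'int'.  */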
117 
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 			       HOST_WIDE_INT *lhs_size_unit,
121 			       HOST_WIDE_INT *rhs_size_unit)
122 {
123   tree scalar_type = gimple_expr_type (stmt_info->stmt);
124   HOST_WIDE_INT lhs, rhs;
125 
126   /* During the analysis phase, this function is called on arbitrary
127      statements that might not have scalar results.  */
128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129     return scalar_type;
130 
131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 
133   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134   if (assign
135       && (gimple_assign_cast_p (assign)
136 	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141     {
142       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 
144       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145       if (rhs < lhs)
146         scalar_type = rhs_type;
147     }
148   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149     {
150       unsigned int i = 0;
151       if (gimple_call_internal_p (call))
152 	{
153 	  internal_fn ifn = gimple_call_internal_fn (call);
154 	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
155 	    /* gimple_expr_type already picked the type of the loaded
156 	       or stored data.  */
157 	    i = ~0U;
158 	  else if (internal_fn_mask_index (ifn) == 0)
159 	    i = 1;
160 	}
161       if (i < gimple_call_num_args (call))
162 	{
163 	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
164 	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 	    {
166 	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
167 	      if (rhs < lhs)
168 		scalar_type = rhs_type;
169 	    }
170 	}
171     }
172 
173   *lhs_size_unit = lhs;
174   *rhs_size_unit = rhs;
175   return scalar_type;
176 }
177 
178 
179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
180    tested at run-time.  Return a success result if DDR was successfully
181    inserted, or a failure result if versioning is not supported.  */
182 
183 static opt_result
184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 {
186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 
188   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
189     return opt_result::failure_at (vect_location,
190 				   "will not create alias checks, as"
191 				   " --param vect-max-version-for-alias-checks"
192 				   " == 0\n");
193 
194   opt_result res
195     = runtime_alias_check_p (ddr, loop,
196 			     optimize_loop_nest_for_speed_p (loop));
197   if (!res)
198     return res;
199 
200   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201   return opt_result::success ();
202 }
203 
204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
205 
206 static void
207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 {
209   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
210   for (unsigned int i = 0; i < checks.length(); ++i)
211     if (checks[i] == value)
212       return;
213 
214   if (dump_enabled_p ())
215     dump_printf_loc (MSG_NOTE, vect_location,
216 		     "need run-time check that %T is nonzero\n",
217 		     value);
218   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
219 }
220 
221 /* Return true if we know that the order of vectorized DR_INFO_A and
222    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
223    DR_INFO_B.  At least one of the accesses is a write.  */
224 
225 static bool
226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 {
228   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
229   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230 
231   /* Single statements are always kept in their original order.  */
232   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
233       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
234     return true;
235 
236   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
237      SLP group are emitted at the position of the last scalar load and
238      all loads in an interleaving group are emitted at the position
239      of the first scalar load.
240      Stores in a group are emitted at the position of the last scalar store.
241      Compute that position and check whether the resulting order matches
242      the current one.
243      We have not yet decided between SLP and interleaving so we have
244      to conservatively assume both.  */
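  /* For example (illustration only), given the scalar sequence

	a[i]   = x;
	t      = a[i];
	a[i+1] = y;

     the vectorized store of { a[i], a[i+1] } is emitted at the position of
     'a[i+1] = y', i.e. after the scalar load of a[i], so the original
     scalar order is not preserved and the caller has to reject the
     dependence.  */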
245   stmt_vec_info il_a;
246   stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
247   if (last_a)
248     {
249       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
250 	   s = DR_GROUP_NEXT_ELEMENT (s))
251 	last_a = get_later_stmt (last_a, s);
252       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
253 	{
254 	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
255 	       s = DR_GROUP_NEXT_ELEMENT (s))
256 	    if (get_later_stmt (il_a, s) == il_a)
257 	      il_a = s;
258 	}
259       else
260 	il_a = last_a;
261     }
262   else
263     last_a = il_a = stmtinfo_a;
264   stmt_vec_info il_b;
265   stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
266   if (last_b)
267     {
268       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
269 	   s = DR_GROUP_NEXT_ELEMENT (s))
270 	last_b = get_later_stmt (last_b, s);
271       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
272 	{
273 	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
274 	       s = DR_GROUP_NEXT_ELEMENT (s))
275 	    if (get_later_stmt (il_b, s) == il_b)
276 	      il_b = s;
277 	}
278       else
279 	il_b = last_b;
280     }
281   else
282     last_b = il_b = stmtinfo_b;
283   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
284   return (/* SLP */
285 	  (get_later_stmt (last_a, last_b) == last_a) == a_after_b
286 	  /* Interleaving */
287 	  && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
288 	  /* Mixed */
289 	  && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
290 	  && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
291 }
292 
293 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
294    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
295    distances.  These distances are conservatively correct but they don't
296    reflect a guaranteed dependence.
297 
298    Return true if this function does all the work necessary to avoid
299    an alias or false if the caller should use the dependence distances
300    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
301    the depth of the loop described by LOOP_VINFO and the other arguments
302    are as for vect_analyze_data_ref_dependence.  */
303 
304 static bool
305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
306 				       loop_vec_info loop_vinfo,
307 				       int loop_depth, unsigned int *max_vf)
308 {
309   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
310   lambda_vector dist_v;
311   unsigned int i;
312   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
313     {
314       int dist = dist_v[loop_depth];
315       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
316 	{
317 	  /* If the user asserted safelen >= DIST consecutive iterations
318 	     can be executed concurrently, assume independence.
319 
320 	     ??? An alternative would be to add the alias check even
321 	     in this case, and vectorize the fallback loop with the
322 	     maximum VF set to safelen.  However, if the user has
323 	     explicitly given a length, it's less likely that that
324 	     would be a win.  */
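	  /* For example, a loop annotated with '#pragma omp simd safelen(8)'
	     sets loop->safelen to 8, asserting that any 8 consecutive
	     iterations may execute concurrently; a conservative dependence
	     distance of, say, 4 is then treated as independent here, with
	     the maximum VF capped at 8.  */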
325 	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
326 	    {
327 	      if ((unsigned int) loop->safelen < *max_vf)
328 		*max_vf = loop->safelen;
329 	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 	      continue;
331 	    }
332 
333 	  /* For dependence distances of 2 or more, we have the option
334 	     of limiting VF or checking for an alias at runtime.
335 	     Prefer to check at runtime if we can, to avoid limiting
336 	     the VF unnecessarily when the bases are in fact independent.
337 
338 	     Note that the alias checks will be removed if the VF ends up
339 	     being small enough.  */
340 	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
341 	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
342 	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
343 		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
344 		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
345 	}
346     }
347   return true;
348 }
349 
350 
351 /* Function vect_analyze_data_ref_dependence.
352 
353    Return a failure result if there (might) exist an unhandled dependence
354    between memory-reference DRA and memory-reference DRB.  When versioning
355    for alias can check the dependence at run time instead, record the
356    check and return success.  Adjust *MAX_VF according to the data
357    dependence.  */
359 
360 static opt_result
361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
362 				  loop_vec_info loop_vinfo,
363 				  unsigned int *max_vf)
364 {
365   unsigned int i;
366   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
367   struct data_reference *dra = DDR_A (ddr);
368   struct data_reference *drb = DDR_B (ddr);
369   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
370   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
371   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
372   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
373   lambda_vector dist_v;
374   unsigned int loop_depth;
375 
376   /* In loop analysis all data references should be vectorizable.  */
377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379     gcc_unreachable ();
380 
381   /* Independent data accesses.  */
382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383     return opt_result::success ();
384 
385   if (dra == drb
386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387     return opt_result::success ();
388 
389   /* We do not have to consider dependences between accesses that belong
390      to the same group, unless the stride could be smaller than the
391      group size.  */
392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396     return opt_result::success ();
397 
398   /* Even if we have an anti-dependence then, as the vectorized loop covers at
399      least two scalar iterations, there is always also a true dependence.
400      As the vectorizer does not re-order loads and stores we can ignore
401      the anti-dependence if TBAA can disambiguate both DRs similar to the
402      case with known negative distance anti-dependences (positive
403      distance anti-dependences would violate TBAA constraints).  */
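  /* For instance (an illustration only), an anti-dependence between a read
     of '*float_p' and a write to '*int_p' can be ignored when the alias
     sets of 'float' and 'int' do not conflict, because TBAA already
     guarantees that the two accesses do not overlap.  */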
404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 				 get_alias_set (DR_REF (drb))))
408     return opt_result::success ();
409 
410   /* Unknown data dependence.  */
411   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
412     {
413       /* If user asserted safelen consecutive iterations can be
414 	 executed concurrently, assume independence.  */
415       if (loop->safelen >= 2)
416 	{
417 	  if ((unsigned int) loop->safelen < *max_vf)
418 	    *max_vf = loop->safelen;
419 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 	  return opt_result::success ();
421 	}
422 
423       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
424 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
425 	return opt_result::failure_at
426 	  (stmtinfo_a->stmt,
427 	   "versioning for alias not supported for: "
428 	   "can't determine dependence between %T and %T\n",
429 	   DR_REF (dra), DR_REF (drb));
430 
431       if (dump_enabled_p ())
432 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
433 			 "versioning for alias required: "
434 			 "can't determine dependence between %T and %T\n",
435 			 DR_REF (dra), DR_REF (drb));
436 
437       /* Add to list of ddrs that need to be tested at run-time.  */
438       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
439     }
440 
441   /* Known data dependence.  */
442   if (DDR_NUM_DIST_VECTS (ddr) == 0)
443     {
444       /* If user asserted safelen consecutive iterations can be
445 	 executed concurrently, assume independence.  */
446       if (loop->safelen >= 2)
447 	{
448 	  if ((unsigned int) loop->safelen < *max_vf)
449 	    *max_vf = loop->safelen;
450 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
451 	  return opt_result::success ();
452 	}
453 
454       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
455 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 	return opt_result::failure_at
457 	  (stmtinfo_a->stmt,
458 	   "versioning for alias not supported for: "
459 	   "bad dist vector for %T and %T\n",
460 	   DR_REF (dra), DR_REF (drb));
461 
462       if (dump_enabled_p ())
463 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
464 			 "versioning for alias required: "
465 			 "bad dist vector for %T and %T\n",
466 			 DR_REF (dra), DR_REF (drb));
467       /* Add to list of ddrs that need to be tested at run-time.  */
468       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
469     }
470 
471   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
472 
473   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
474       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
475 						loop_depth, max_vf))
476     return opt_result::success ();
477 
478   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
479     {
480       int dist = dist_v[loop_depth];
481 
482       if (dump_enabled_p ())
483 	dump_printf_loc (MSG_NOTE, vect_location,
484                          "dependence distance  = %d.\n", dist);
485 
486       if (dist == 0)
487 	{
488 	  if (dump_enabled_p ())
489 	    dump_printf_loc (MSG_NOTE, vect_location,
490 			     "dependence distance == 0 between %T and %T\n",
491 			     DR_REF (dra), DR_REF (drb));
492 
493 	  /* When we perform grouped accesses and perform implicit CSE
494 	     by detecting equal accesses and doing disambiguation with
495 	     runtime alias tests like for
496 	        .. = a[i];
497 		.. = a[i+1];
498 		a[i] = ..;
499 		a[i+1] = ..;
500 		*p = ..;
501 		.. = a[i];
502 		.. = a[i+1];
503 	     where we will end up loading { a[i], a[i+1] } once, make
504 	     sure that inserting group loads before the first load and
505 	     stores after the last store will do the right thing.
506 	     Similar for groups like
507 	        a[i] = ...;
508 		... = a[i];
509 		a[i+1] = ...;
510 	     where loads from the group interleave with the store.  */
511 	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
512 	    return opt_result::failure_at (stmtinfo_a->stmt,
513 					   "READ_WRITE dependence"
514 					   " in interleaving.\n");
515 
516 	  if (loop->safelen < 2)
517 	    {
518 	      tree indicator = dr_zero_step_indicator (dra);
519 	      if (!indicator || integer_zerop (indicator))
520 		return opt_result::failure_at (stmtinfo_a->stmt,
521 					       "access also has a zero step\n");
522 	      else if (TREE_CODE (indicator) != INTEGER_CST)
523 		vect_check_nonzero_value (loop_vinfo, indicator);
524 	    }
525 	  continue;
526 	}
527 
528       if (dist > 0 && DDR_REVERSED_P (ddr))
529 	{
530 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
531 	     reversed (to make distance vector positive), and the actual
532 	     distance is negative.  */
533 	  if (dump_enabled_p ())
534 	    dump_printf_loc (MSG_NOTE, vect_location,
535 	                     "dependence distance negative.\n");
536 	  /* When doing outer loop vectorization, we need to check if there is
537 	     a backward dependence at the inner loop level if the dependence
538 	     at the outer loop is reversed.  See PR81740.  */
539 	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
540 	      || nested_in_vect_loop_p (loop, stmtinfo_b))
541 	    {
542 	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
543 							 DDR_LOOP_NEST (ddr));
544 	      if (dist_v[inner_depth] < 0)
545 		return opt_result::failure_at (stmtinfo_a->stmt,
546 					       "not vectorized, dependence "
547 					       "between data-refs %T and %T\n",
548 					       DR_REF (dra), DR_REF (drb));
549 	    }
550 	  /* Record a negative dependence distance to later limit the
551 	     amount of stmt copying / unrolling we can perform.
552 	     Only need to handle read-after-write dependence.  */
553 	  if (DR_IS_READ (drb)
554 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
555 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
556 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
557 	  continue;
558 	}
559 
560       unsigned int abs_dist = abs (dist);
561       if (abs_dist >= 2 && abs_dist < *max_vf)
562 	{
563 	  /* The dependence distance requires reduction of the maximal
564 	     vectorization factor.  */
565 	  *max_vf = abs_dist;
566 	  if (dump_enabled_p ())
567 	    dump_printf_loc (MSG_NOTE, vect_location,
568 	                     "adjusting maximal vectorization factor to %i\n",
569 	                     *max_vf);
570 	}
571 
572       if (abs_dist >= *max_vf)
573 	{
574 	  /* Dependence distance does not create dependence, as far as
575 	     vectorization is concerned, in this case.  */
576 	  if (dump_enabled_p ())
577 	    dump_printf_loc (MSG_NOTE, vect_location,
578 	                     "dependence distance >= VF.\n");
579 	  continue;
580 	}
581 
582       return opt_result::failure_at (stmtinfo_a->stmt,
583 				     "not vectorized, possible dependence "
584 				     "between data-refs %T and %T\n",
585 				     DR_REF (dra), DR_REF (drb));
586     }
587 
588   return opt_result::success ();
589 }
590 
591 /* Function vect_analyze_data_ref_dependences.
592 
593    Examine all the data references in the loop, and make sure there do not
594    exist any data dependences between them.  Set *MAX_VF according to
595    the maximum vectorization factor the data dependences allow.  */
596 
597 opt_result
598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
599 				   unsigned int *max_vf)
600 {
601   unsigned int i;
602   struct data_dependence_relation *ddr;
603 
604   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
605 
606   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
607     {
608       LOOP_VINFO_DDRS (loop_vinfo)
609 	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
610 		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
611       /* We need read-read dependences to compute
612 	 STMT_VINFO_SAME_ALIGN_REFS.  */
613       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
614 					  &LOOP_VINFO_DDRS (loop_vinfo),
615 					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
616 					  true);
617       gcc_assert (res);
618     }
619 
620   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
621 
622   /* For epilogues we either have no aliases or alias versioning
623      was applied to original loop.  Therefore we may just get max_vf
624      using VF of original loop.  */
625   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
626     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
627   else
628     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
629       {
630 	opt_result res
631 	  = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
632 	if (!res)
633 	  return res;
634       }
635 
636   return opt_result::success ();
637 }
638 
639 
640 /* Function vect_slp_analyze_data_ref_dependence.
641 
642    Return TRUE if there (might) exist a dependence between memory-reference
643    DRA and memory-reference DRB for VINFO that prohibits SLP vectorization.
644    Return FALSE if the accesses are independent or the dependence can be
645    ignored (e.g. read-read, or accesses in the same interleaving chain).  */
646 
647 static bool
648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
649 				      struct data_dependence_relation *ddr)
650 {
651   struct data_reference *dra = DDR_A (ddr);
652   struct data_reference *drb = DDR_B (ddr);
653   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
654   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
655 
656   /* We need to check dependences of statements marked as unvectorizable
657      as well, they still can prohibit vectorization.  */
658 
659   /* Independent data accesses.  */
660   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
661     return false;
662 
663   if (dra == drb)
664     return false;
665 
666   /* Read-read is OK.  */
667   if (DR_IS_READ (dra) && DR_IS_READ (drb))
668     return false;
669 
670   /* If dra and drb are part of the same interleaving chain consider
671      them independent.  */
672   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
673       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
674 	  == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
675     return false;
676 
677   /* Unknown data dependence.  */
678   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
679     {
680       if  (dump_enabled_p ())
681 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 			 "can't determine dependence between %T and %T\n",
683 			 DR_REF (dra), DR_REF (drb));
684     }
685   else if (dump_enabled_p ())
686     dump_printf_loc (MSG_NOTE, vect_location,
687 		     "determined dependence between %T and %T\n",
688 		     DR_REF (dra), DR_REF (drb));
689 
690   return true;
691 }
692 
693 
694 /* Analyze dependences involved in the transform of SLP NODE.  STORES
695    contain the vector of scalar stores of this instance if we are
696    disambiguating the loads.  */
697 
698 static bool
699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700 				   vec<stmt_vec_info> stores,
701 				   stmt_vec_info last_store_info)
702 {
703   /* This walks over all stmts involved in the SLP load/store done
704      in NODE verifying we can sink them up to the last stmt in the
705      group.  */
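  /* For example (illustration only), for an SLP store group

	a[0] = x;
	*p   = z;
	a[1] = y;

     the store to a[0] is effectively sunk to the position of 'a[1] = y',
     so we must verify that it can be moved past '*p = z', either via the
     recorded data references or, failing that, via the alias oracle.  */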
706   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
707   vec_info *vinfo = last_access_info->vinfo;
708   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
709     {
710       stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
711       if (access_info == last_access_info)
712 	continue;
713       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
714       ao_ref ref;
715       bool ref_initialized_p = false;
716       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
717 	   gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
718 	{
719 	  gimple *stmt = gsi_stmt (gsi);
720 	  if (! gimple_vuse (stmt)
721 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
722 	    continue;
723 
724 	  /* If we couldn't record a (single) data reference for this
725 	     stmt we have to resort to the alias oracle.  */
726 	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
727 	  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
728 	  if (!dr_b)
729 	    {
730 	      /* We are moving a store or sinking a load - this means
731 	         we cannot use TBAA for disambiguation.  */
732 	      if (!ref_initialized_p)
733 		ao_ref_init (&ref, DR_REF (dr_a));
734 	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
735 		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
736 		return false;
737 	      continue;
738 	    }
739 
740 	  bool dependent = false;
741 	  /* If we run into a store of this same instance (we've just
742 	     marked those) then delay dependence checking until we run
743 	     into the last store because this is where it will have
744 	     been sunk to (and we verify if we can do that as well).  */
745 	  if (gimple_visited_p (stmt))
746 	    {
747 	      if (stmt_info != last_store_info)
748 		continue;
749 	      unsigned i;
750 	      stmt_vec_info store_info;
751 	      FOR_EACH_VEC_ELT (stores, i, store_info)
752 		{
753 		  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
754 		  ddr_p ddr = initialize_data_dependence_relation
755 				(dr_a, store_dr, vNULL);
756 		  dependent
757 		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
758 		  free_dependence_relation (ddr);
759 		  if (dependent)
760 		    break;
761 		}
762 	    }
763 	  else
764 	    {
765 	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
766 							       dr_b, vNULL);
767 	      dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
768 	      free_dependence_relation (ddr);
769 	    }
770 	  if (dependent)
771 	    return false;
772 	}
773     }
774   return true;
775 }
776 
777 
778 /* Function vect_slp_analyze_instance_dependence.
779 
780    Examine all the data references in SLP instance INSTANCE and make sure
781    there do not exist any data dependences that would prevent the loads and
782    stores from being moved to the vectorized statement insert locations.  */
783 
784 bool
785 vect_slp_analyze_instance_dependence (slp_instance instance)
786 {
787   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
788 
789   /* The stores of this instance are at the root of the SLP tree.  */
790   slp_tree store = SLP_INSTANCE_TREE (instance);
791   if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
792     store = NULL;
793 
794   /* Verify we can sink stores to the vectorized stmt insert location.  */
795   stmt_vec_info last_store_info = NULL;
796   if (store)
797     {
798       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
799 	return false;
800 
801       /* Mark stores in this instance and remember the last one.  */
802       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
803       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
804 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
805     }
806 
807   bool res = true;
808 
809   /* Verify we can sink loads to the vectorized stmt insert location,
810      special-casing stores of this instance.  */
811   slp_tree load;
812   unsigned int i;
813   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
814     if (! vect_slp_analyze_node_dependences (instance, load,
815 					     store
816 					     ? SLP_TREE_SCALAR_STMTS (store)
817 					     : vNULL, last_store_info))
818       {
819 	res = false;
820 	break;
821       }
822 
823   /* Unset the visited flag.  */
824   if (store)
825     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
826       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
827 
828   return res;
829 }
830 
831 /* Record the base alignment guarantee given by DRB, which occurs
832    in STMT_INFO.  */
833 
834 static void
835 vect_record_base_alignment (stmt_vec_info stmt_info,
836 			    innermost_loop_behavior *drb)
837 {
838   vec_info *vinfo = stmt_info->vinfo;
839   bool existed;
840   innermost_loop_behavior *&entry
841     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
842   if (!existed || entry->base_alignment < drb->base_alignment)
843     {
844       entry = drb;
845       if (dump_enabled_p ())
846 	dump_printf_loc (MSG_NOTE, vect_location,
847 			 "recording new base alignment for %T\n"
848 			 "  alignment:    %d\n"
849 			 "  misalignment: %d\n"
850 			 "  based on:     %G",
851 			 drb->base_address,
852 			 drb->base_alignment,
853 			 drb->base_misalignment,
854 			 stmt_info->stmt);
855     }
856 }
857 
858 /* If the region we're going to vectorize is reached, all unconditional
859    data references occur at least once.  We can therefore pool the base
860    alignment guarantees from each unconditional reference.  Do this by
861    going through all the data references in VINFO and checking whether
862    the containing statement makes the reference unconditionally.  If so,
863    record the alignment of the base address in VINFO so that it can be
864    used for all other references with the same base.  */
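/* For example (the numbers are illustrative): if the region unconditionally
   executes '... = *p' and the innermost-loop behavior of that reference
   guarantees a 16-byte aligned base address for 'p', then every other
   reference in the region that shares the base 'p' may reuse that 16-byte
   guarantee.  */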
865 
866 void
867 vect_record_base_alignments (vec_info *vinfo)
868 {
869   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
870   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
871   data_reference *dr;
872   unsigned int i;
873   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
874     {
875       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
876       stmt_vec_info stmt_info = dr_info->stmt;
877       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
878 	  && STMT_VINFO_VECTORIZABLE (stmt_info)
879 	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
880 	{
881 	  vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
882 
883 	  /* If DR is nested in the loop that is being vectorized, we can also
884 	     record the alignment of the base wrt the outer loop.  */
885 	  if (loop && nested_in_vect_loop_p (loop, stmt_info))
886 	    vect_record_base_alignment
887 	      (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
888 	}
889     }
890 }
891 
892 /* Return the target alignment for the vectorized form of DR_INFO.  */
893 
894 static poly_uint64
895 vect_calculate_target_alignment (dr_vec_info *dr_info)
896 {
897   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
898   return targetm.vectorize.preferred_vector_alignment (vectype);
899 }
900 
901 /* Function vect_compute_data_ref_alignment
902 
903    Compute the misalignment of the data reference DR_INFO.
904 
905    Output:
906    1. DR_MISALIGNMENT (DR_INFO) is defined.
907 
908    FOR NOW: No analysis is actually performed. Misalignment is calculated
909    only for trivial cases. TODO.  */
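/* A rough worked example, assuming a 16-byte target vector alignment: for a
   reference to a[3] in an 'int' array whose base is known to be 16-byte
   aligned, the constant offset is 3 * 4 = 12 bytes, so DR_MISALIGNMENT is
   set to 12.  If the base alignment or the step were unknown, the
   misalignment would instead remain DR_MISALIGNMENT_UNKNOWN.  */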
910 
911 static void
912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
913 {
914   stmt_vec_info stmt_info = dr_info->stmt;
915   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
916   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
917   struct loop *loop = NULL;
918   tree ref = DR_REF (dr_info->dr);
919   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
920 
921   if (dump_enabled_p ())
922     dump_printf_loc (MSG_NOTE, vect_location,
923                      "vect_compute_data_ref_alignment:\n");
924 
925   if (loop_vinfo)
926     loop = LOOP_VINFO_LOOP (loop_vinfo);
927 
928   /* Initialize misalignment to unknown.  */
929   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
930 
931   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
932     return;
933 
934   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
935   bool step_preserves_misalignment_p;
936 
937   poly_uint64 vector_alignment
938     = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
939   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
940 
941   unsigned HOST_WIDE_INT vect_align_c;
942   if (!vector_alignment.is_constant (&vect_align_c))
943     return;
944 
945   /* No step for BB vectorization.  */
946   if (!loop)
947     {
948       gcc_assert (integer_zerop (drb->step));
949       step_preserves_misalignment_p = true;
950     }
951 
952   /* In case the dataref is in an inner-loop of the loop that is being
953      vectorized (LOOP), we use the base and misalignment information
954      relative to the outer-loop (LOOP).  This is ok only if the misalignment
955      stays the same throughout the execution of the inner-loop, which is why
956      we have to check that the stride of the dataref in the inner-loop is
957      divisible by the vector alignment.  */
958   else if (nested_in_vect_loop_p (loop, stmt_info))
959     {
960       step_preserves_misalignment_p
961 	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
962 
963       if (dump_enabled_p ())
964 	{
965 	  if (step_preserves_misalignment_p)
966 	    dump_printf_loc (MSG_NOTE, vect_location,
967 			     "inner step divides the vector alignment.\n");
968 	  else
969 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 			     "inner step doesn't divide the vector"
971 			     " alignment.\n");
972 	}
973     }
974 
975   /* Similarly we can only use base and misalignment information relative to
976      an innermost loop if the misalignment stays the same throughout the
977      execution of the loop.  As above, this is the case if the stride of
978      the dataref is divisible by the alignment.  */
979   else
980     {
981       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
982       step_preserves_misalignment_p
983 	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
984 
985       if (!step_preserves_misalignment_p && dump_enabled_p ())
986 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 			 "step doesn't divide the vector alignment.\n");
988     }
989 
990   unsigned int base_alignment = drb->base_alignment;
991   unsigned int base_misalignment = drb->base_misalignment;
992 
993   /* Calculate the maximum of the pooled base address alignment and the
994      alignment that we can compute for DR itself.  */
995   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
996   if (entry && base_alignment < (*entry)->base_alignment)
997     {
998       base_alignment = (*entry)->base_alignment;
999       base_misalignment = (*entry)->base_misalignment;
1000     }
1001 
1002   if (drb->offset_alignment < vect_align_c
1003       || !step_preserves_misalignment_p
1004       /* We need to know whether the step wrt the vectorized loop is
1005 	 negative when computing the starting misalignment below.  */
1006       || TREE_CODE (drb->step) != INTEGER_CST)
1007     {
1008       if (dump_enabled_p ())
1009 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 			 "Unknown alignment for access: %T\n", ref);
1011       return;
1012     }
1013 
1014   if (base_alignment < vect_align_c)
1015     {
1016       unsigned int max_alignment;
1017       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018       if (max_alignment < vect_align_c
1019 	  || !vect_can_force_dr_alignment_p (base,
1020 					     vect_align_c * BITS_PER_UNIT))
1021 	{
1022 	  if (dump_enabled_p ())
1023 	    dump_printf_loc (MSG_NOTE, vect_location,
1024 			     "can't force alignment of ref: %T\n", ref);
1025 	  return;
1026 	}
1027 
1028       /* Force the alignment of the decl.
1029 	 NOTE: This is the only change to the code we make during
1030 	 the analysis phase, before deciding to vectorize the loop.  */
1031       if (dump_enabled_p ())
1032 	dump_printf_loc (MSG_NOTE, vect_location,
1033 			 "force alignment of %T\n", ref);
1034 
1035       dr_info->base_decl = base;
1036       dr_info->base_misaligned = true;
1037       base_misalignment = 0;
1038     }
1039   poly_int64 misalignment
1040     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1041 
1042   /* If this is a backward running DR then the first access in the larger
1043      vectype is actually N-1 elements before the address in the DR.
1044      Adjust misalign accordingly.  */
1045   if (tree_int_cst_sgn (drb->step) < 0)
1046     /* PLUS because STEP is negative.  */
1047     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048 		     * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1049 
1050   unsigned int const_misalignment;
1051   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1052     {
1053       if (dump_enabled_p ())
1054 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 			 "Non-constant misalignment for access: %T\n", ref);
1056       return;
1057     }
1058 
1059   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1060 
1061   if (dump_enabled_p ())
1062     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 		     "misalign = %d bytes of ref %T\n",
1064 		     DR_MISALIGNMENT (dr_info), ref);
1065 
1066   return;
1067 }
1068 
1069 /* Function vect_update_misalignment_for_peel.
1070    Sets DR_INFO's misalignment
1071    - to 0 if it has the same alignment as DR_PEEL_INFO,
1072    - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1073    - to -1 (unknown) otherwise.
1074 
1075    DR_INFO - the data reference whose misalignment is to be adjusted.
1076    DR_PEEL_INFO - the data reference whose misalignment is being made
1077 		  zero in the vector loop by the peel.
1078    NPEEL - the number of iterations in the peel loop if the misalignment
1079            of DR_PEEL_INFO is known at compile time.  */
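/* Worked example (the numbers are illustrative): with DR_MISALIGNMENT 8,
   a step of 4 bytes, NPEEL 2 and a 16-byte target alignment, the updated
   misalignment is (8 + 2 * 4) & 15 == 0.  */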
1080 
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083 				   dr_vec_info *dr_peel_info, int npeel)
1084 {
1085   unsigned int i;
1086   vec<dr_p> same_aligned_drs;
1087   struct data_reference *current_dr;
1088   stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1089 
1090   /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091      it is aligned in the vector loop.  */
1092   same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1094     {
1095       if (current_dr != dr_info->dr)
1096         continue;
1097       gcc_assert (!known_alignment_for_access_p (dr_info)
1098 		  || !known_alignment_for_access_p (dr_peel_info)
1099 		  || (DR_MISALIGNMENT (dr_info)
1100 		      == DR_MISALIGNMENT (dr_peel_info)));
1101       SET_DR_MISALIGNMENT (dr_info, 0);
1102       return;
1103     }
1104 
1105   unsigned HOST_WIDE_INT alignment;
1106   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107       && known_alignment_for_access_p (dr_info)
1108       && known_alignment_for_access_p (dr_peel_info))
1109     {
1110       int misal = DR_MISALIGNMENT (dr_info);
1111       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112       misal &= alignment - 1;
1113       SET_DR_MISALIGNMENT (dr_info, misal);
1114       return;
1115     }
1116 
1117   if (dump_enabled_p ())
1118     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119 		     "to unknown (-1).\n");
1120   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1121 }
1122 
1123 
1124 /* Function verify_data_ref_alignment
1125 
1126    Return TRUE if DR_INFO can be handled with respect to alignment.  */
1127 
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1130 {
1131   enum dr_alignment_support supportable_dr_alignment
1132     = vect_supportable_dr_alignment (dr_info, false);
1133   if (!supportable_dr_alignment)
1134     return opt_result::failure_at
1135       (dr_info->stmt->stmt,
1136        DR_IS_READ (dr_info->dr)
1137 	? "not vectorized: unsupported unaligned load: %T\n"
1138 	: "not vectorized: unsupported unaligned store: %T\n",
1139        DR_REF (dr_info->dr));
1140 
1141   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142     dump_printf_loc (MSG_NOTE, vect_location,
1143 		     "Vectorizing an unaligned access.\n");
1144 
1145   return opt_result::success ();
1146 }
1147 
1148 /* Function vect_verify_datarefs_alignment
1149 
1150    Return TRUE if all data references in the loop can be
1151    handled with respect to alignment.  */
1152 
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1155 {
1156   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157   struct data_reference *dr;
1158   unsigned int i;
1159 
1160   FOR_EACH_VEC_ELT (datarefs, i, dr)
1161     {
1162       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163       stmt_vec_info stmt_info = dr_info->stmt;
1164 
1165       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166 	continue;
1167 
1168       /* For interleaving, only the alignment of the first access matters.   */
1169       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171 	continue;
1172 
1173       /* Strided accesses perform only component accesses, alignment is
1174 	 irrelevant for them.  */
1175       if (STMT_VINFO_STRIDED_P (stmt_info)
1176 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177 	continue;
1178 
1179       opt_result res = verify_data_ref_alignment (dr_info);
1180       if (!res)
1181 	return res;
1182     }
1183 
1184   return opt_result::success ();
1185 }
1186 
1187 /* Given a memory reference EXP, return whether its alignment is smaller
1188    than its size.  */
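/* E.g. a 4-byte 'int' member of a packed structure may be guaranteed only
   1-byte alignment; its size (32 bits) then exceeds its known alignment
   and the reference is considered not size-aligned.  */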
1189 
1190 static bool
1191 not_size_aligned (tree exp)
1192 {
1193   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194     return true;
1195 
1196   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197 	  > get_object_alignment (exp));
1198 }
1199 
1200 /* Function vector_alignment_reachable_p
1201 
1202    Return true if vector alignment for DR_INFO is reachable by peeling
1203    a few loop iterations.  Return false otherwise.  */
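/* Illustration: with 4-byte elements, a 16-byte target alignment and a known
   misalignment of 8 bytes, peeling two scalar iterations makes the access
   aligned, so alignment is reachable.  A misalignment of 6 bytes is not a
   multiple of the element size, and peeling whole iterations can never fix
   it.  */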
1204 
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1207 {
1208   stmt_vec_info stmt_info = dr_info->stmt;
1209   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1210 
1211   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212     {
1213       /* For interleaved access we peel only if the number of iterations in
1214 	 the prolog loop ({VF - misalignment}) is a multiple of the
1215 	 number of interleaved accesses.  */
1216       int elem_size, mis_in_elements;
1217 
1218       /* FORNOW: handle only known alignment.  */
1219       if (!known_alignment_for_access_p (dr_info))
1220 	return false;
1221 
1222       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224       elem_size = vector_element_size (vector_size, nelements);
1225       mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1226 
1227       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228 	return false;
1229     }
1230 
1231   /* If misalignment is known at the compile time then allow peeling
1232      only if natural alignment is reachable through peeling.  */
1233   if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1234     {
1235       HOST_WIDE_INT elmsize =
1236 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237       if (dump_enabled_p ())
1238 	{
1239 	  dump_printf_loc (MSG_NOTE, vect_location,
1240 	                   "data size = %wd. misalignment = %d.\n", elmsize,
1241 			   DR_MISALIGNMENT (dr_info));
1242 	}
1243       if (DR_MISALIGNMENT (dr_info) % elmsize)
1244 	{
1245 	  if (dump_enabled_p ())
1246 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 	                     "data size does not divide the misalignment.\n");
1248 	  return false;
1249 	}
1250     }
1251 
1252   if (!known_alignment_for_access_p (dr_info))
1253     {
1254       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256       if (dump_enabled_p ())
1257 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 	                 "Unknown misalignment, %snaturally aligned\n",
1259 			 is_packed ? "not " : "");
1260       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1261     }
1262 
1263   return true;
1264 }
1265 
1266 
1267 /* Calculate the cost of the memory access represented by DR_INFO.  */
1268 
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271                            unsigned int *inside_cost,
1272                            unsigned int *outside_cost,
1273 			   stmt_vector_for_cost *body_cost_vec,
1274 			   stmt_vector_for_cost *prologue_cost_vec)
1275 {
1276   stmt_vec_info stmt_info = dr_info->stmt;
1277   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278   int ncopies;
1279 
1280   if (PURE_SLP_STMT (stmt_info))
1281     ncopies = 1;
1282   else
1283     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284 
1285   if (DR_IS_READ (dr_info->dr))
1286     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287 			prologue_cost_vec, body_cost_vec, false);
1288   else
1289     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290 
1291   if (dump_enabled_p ())
1292     dump_printf_loc (MSG_NOTE, vect_location,
1293                      "vect_get_data_access_cost: inside_cost = %d, "
1294                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296 
1297 
1298 typedef struct _vect_peel_info
1299 {
1300   dr_vec_info *dr_info;
1301   int npeel;
1302   unsigned int count;
1303 } *vect_peel_info;
1304 
1305 typedef struct _vect_peel_extended_info
1306 {
1307   struct _vect_peel_info peel_info;
1308   unsigned int inside_cost;
1309   unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311 
1312 
1313 /* Peeling hashtable helpers.  */
1314 
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317   static inline hashval_t hash (const _vect_peel_info *);
1318   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320 
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324   return (hashval_t) peel_info->npeel;
1325 }
1326 
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330   return (a->npeel == b->npeel);
1331 }
1332 
1333 
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1335 
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339                           int npeel)
1340 {
1341   struct _vect_peel_info elem, *slot;
1342   _vect_peel_info **new_slot;
1343   bool supportable_dr_alignment
1344     = vect_supportable_dr_alignment (dr_info, true);
1345 
1346   elem.npeel = npeel;
1347   slot = peeling_htab->find (&elem);
1348   if (slot)
1349     slot->count++;
1350   else
1351     {
1352       slot = XNEW (struct _vect_peel_info);
1353       slot->npeel = npeel;
1354       slot->dr_info = dr_info;
1355       slot->count = 1;
1356       new_slot = peeling_htab->find_slot (slot, INSERT);
1357       *new_slot = slot;
1358     }
1359 
1360   if (!supportable_dr_alignment
1361       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362     slot->count += VECT_MAX_COST;
1363 }
1364 
1365 
1366 /* Traverse peeling hash table to find peeling option that aligns maximum
1367    number of data accesses.  */
1368 
1369 int
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371 				     _vect_peel_extended_info *max)
1372 {
1373   vect_peel_info elem = *slot;
1374 
1375   if (elem->count > max->peel_info.count
1376       || (elem->count == max->peel_info.count
1377           && max->peel_info.npeel > elem->npeel))
1378     {
1379       max->peel_info.npeel = elem->npeel;
1380       max->peel_info.count = elem->count;
1381       max->peel_info.dr_info = elem->dr_info;
1382     }
1383 
1384   return 1;
1385 }
1386 
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1389    we assume DR0_INFO's misalignment will be zero after peeling.  */
1390 
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393 				dr_vec_info *dr0_info,
1394 				unsigned int *inside_cost,
1395 				unsigned int *outside_cost,
1396 				stmt_vector_for_cost *body_cost_vec,
1397 				stmt_vector_for_cost *prologue_cost_vec,
1398 				unsigned int npeel,
1399 				bool unknown_misalignment)
1400 {
1401   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402   unsigned i;
1403   data_reference *dr;
1404 
1405   FOR_EACH_VEC_ELT (datarefs, i, dr)
1406     {
1407       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408       stmt_vec_info stmt_info = dr_info->stmt;
1409       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 	continue;
1411 
1412       /* For interleaving, only the alignment of the first access
1413          matters.  */
1414       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 	continue;
1417 
1418       /* Strided accesses perform only component accesses, alignment is
1419          irrelevant for them.  */
1420       if (STMT_VINFO_STRIDED_P (stmt_info)
1421 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 	continue;
1423 
1424       int save_misalignment;
1425       save_misalignment = DR_MISALIGNMENT (dr_info);
1426       if (npeel == 0)
1427 	;
1428       else if (unknown_misalignment && dr_info == dr0_info)
1429 	SET_DR_MISALIGNMENT (dr_info, 0);
1430       else
1431 	vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432       vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433 				 body_cost_vec, prologue_cost_vec);
1434       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1435     }
1436 }
1437 
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439    Find the one with the lowest cost.  */
1440 
1441 int
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 				   _vect_peel_extended_info *min)
1444 {
1445   vect_peel_info elem = *slot;
1446   int dummy;
1447   unsigned int inside_cost = 0, outside_cost = 0;
1448   stmt_vec_info stmt_info = elem->dr_info->stmt;
1449   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 		       epilogue_cost_vec;
1452 
1453   prologue_cost_vec.create (2);
1454   body_cost_vec.create (2);
1455   epilogue_cost_vec.create (2);
1456 
1457   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458 				  &outside_cost, &body_cost_vec,
1459 				  &prologue_cost_vec, elem->npeel, false);
1460 
1461   body_cost_vec.release ();
1462 
1463   outside_cost += vect_get_known_peeling_cost
1464     (loop_vinfo, elem->npeel, &dummy,
1465      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466      &prologue_cost_vec, &epilogue_cost_vec);
1467 
1468   /* Prologue and epilogue costs are added to the target model later.
1469      These costs depend only on the scalar iteration cost, the
1470      number of peeling iterations finally chosen, and the number of
1471      misaligned statements.  So discard the information found here.  */
1472   prologue_cost_vec.release ();
1473   epilogue_cost_vec.release ();
1474 
1475   if (inside_cost < min->inside_cost
1476       || (inside_cost == min->inside_cost
1477 	  && outside_cost < min->outside_cost))
1478     {
1479       min->inside_cost = inside_cost;
1480       min->outside_cost = outside_cost;
1481       min->peel_info.dr_info = elem->dr_info;
1482       min->peel_info.npeel = elem->npeel;
1483       min->peel_info.count = elem->count;
1484     }
1485 
1486   return 1;
1487 }
1488 
1489 
1490 /* Choose best peeling option by traversing peeling hash table and either
1491    choosing an option with the lowest cost (if cost model is enabled) or the
1492    option that aligns as many accesses as possible.  */
1493 
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 				       loop_vec_info loop_vinfo)
1497 {
1498    struct _vect_peel_extended_info res;
1499 
1500    res.peel_info.dr_info = NULL;
1501 
1502    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503      {
1504        res.inside_cost = INT_MAX;
1505        res.outside_cost = INT_MAX;
1506        peeling_htab->traverse <_vect_peel_extended_info *,
1507 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1508      }
1509    else
1510      {
1511        res.peel_info.count = 0;
1512        peeling_htab->traverse <_vect_peel_extended_info *,
1513 	   		       vect_peeling_hash_get_most_frequent> (&res);
1514        res.inside_cost = 0;
1515        res.outside_cost = 0;
1516      }
1517 
1518    return res;
1519 }
1520 
1521 /* Return true if peeling by NPEEL iterations is supported.  */
1522 
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525 			  unsigned npeel)
1526 {
1527   unsigned i;
1528   struct data_reference *dr = NULL;
1529   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530   enum dr_alignment_support supportable_dr_alignment;
1531 
1532   /* Ensure that all data refs can be vectorized after the peel.  */
1533   FOR_EACH_VEC_ELT (datarefs, i, dr)
1534     {
1535       int save_misalignment;
1536 
1537       if (dr == dr0_info->dr)
1538 	continue;
1539 
1540       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541       stmt_vec_info stmt_info = dr_info->stmt;
1542       /* For interleaving, only the alignment of the first access
1543 	 matters.  */
1544       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 	continue;
1547 
1548       /* Strided accesses perform only component accesses, alignment is
1549 	 irrelevant for them.  */
1550       if (STMT_VINFO_STRIDED_P (stmt_info)
1551 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 	continue;
1553 
1554       save_misalignment = DR_MISALIGNMENT (dr_info);
1555       vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556       supportable_dr_alignment
1557 	= vect_supportable_dr_alignment (dr_info, false);
1558       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1559 
1560       if (!supportable_dr_alignment)
1561 	return false;
1562     }
1563 
1564   return true;
1565 }
1566 
1567 /* Function vect_enhance_data_refs_alignment
1568 
1569    This pass will use loop versioning and loop peeling in order to enhance
1570    the alignment of data references in the loop.
1571 
1572    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573    original loop is to be vectorized.  Any other loops that are created by
1574    the transformations performed in this pass are not supposed to be
1575    vectorized.  This restriction will be relaxed.
1576 
1577    This pass requires a cost model to guide it in deciding whether to apply
1578    peeling, versioning, or a combination of the two.  For example, the scheme
1579    that Intel uses when given a loop with several memory accesses is as
1580    follows: choose one memory access ('p') whose alignment you want to force
1581    by peeling.  Then, either (1) generate a loop in which 'p' is aligned and
1582    all other accesses are not necessarily aligned, or (2) use loop versioning
1583    to generate one loop in which all accesses are aligned, and another loop
1584    in which only 'p' is necessarily aligned.
1585 
1586    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589 
1590    Devising a cost model is the most critical aspect of this work.  It will
1591    guide us on which access to peel for, whether to use loop versioning, how
1592    many versions to create, etc.  The cost model will probably consist of
1593    generic considerations as well as target specific considerations (on
1594    powerpc for example, misaligned stores are more painful than misaligned
1595    loads).
1596 
1597    Here are the general steps involved in alignment enhancements:
1598 
1599      -- original loop, before alignment analysis:
1600 	for (i=0; i<N; i++){
1601 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1602 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1603 	}
1604 
1605      -- After vect_compute_data_refs_alignment:
1606 	for (i=0; i<N; i++){
1607 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1608 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1609 	}
1610 
1611      -- Possibility 1: we do loop versioning:
1612      if (p is aligned) {
1613 	for (i=0; i<N; i++){	# loop 1A
1614 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1615 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1616 	}
1617      }
1618      else {
1619 	for (i=0; i<N; i++){	# loop 1B
1620 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1621 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1622 	}
1623      }
1624 
1625      -- Possibility 2: we do loop peeling:
1626      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1627 	x = q[i];
1628 	p[i] = y;
1629      }
1630      for (i = 3; i < N; i++){	# loop 2A
1631 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1632 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1633      }
1634 
1635      -- Possibility 3: combination of loop peeling and versioning:
1636      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1637 	x = q[i];
1638 	p[i] = y;
1639      }
1640      if (p is aligned) {
1641 	for (i = 3; i<N; i++){	# loop 3A
1642 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1643 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1644 	}
1645      }
1646      else {
1647 	for (i = 3; i<N; i++){	# loop 3B
1648 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1649 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1650 	}
1651      }
1652 
1653      These loops are later passed to loop_transform to be vectorized.  The
1654      vectorizer will use the alignment information to guide the transformation
1655      (whether to generate regular loads/stores, or with special handling for
1656      misalignment).  */
1657 
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 {
1661   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663   enum dr_alignment_support supportable_dr_alignment;
1664   dr_vec_info *first_store = NULL;
1665   dr_vec_info *dr0_info = NULL;
1666   struct data_reference *dr;
1667   unsigned int i, j;
1668   bool do_peeling = false;
1669   bool do_versioning = false;
1670   unsigned int npeel = 0;
1671   bool one_misalignment_known = false;
1672   bool one_misalignment_unknown = false;
1673   bool one_dr_unsupportable = false;
1674   dr_vec_info *unsupportable_dr_info = NULL;
1675   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676   unsigned possible_npeel_number = 1;
1677   tree vectype;
1678   unsigned int mis, same_align_drs_max = 0;
1679   hash_table<peel_info_hasher> peeling_htab (1);
1680 
1681   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682 
1683   /* Reset data so we can safely be called multiple times.  */
1684   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686 
1687   /* While cost model enhancements are expected in the future, the high level
1688      view of the code at this time is as follows:
1689 
1690      A) If there is a misaligned access then see if peeling to align
1691         this access can make all data references satisfy
1692         vect_supportable_dr_alignment.  If so, update data structures
1693         as needed and return true.
1694 
1695      B) If peeling wasn't possible and there is a data reference with an
1696         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697         then see if loop versioning checks can be used to make all data
1698         references satisfy vect_supportable_dr_alignment.  If so, update
1699         data structures as needed and return true.
1700 
1701      C) If neither peeling nor versioning were successful then return false if
1702         any data reference does not satisfy vect_supportable_dr_alignment.
1703 
1704      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705 
1706      Note, Possibility 3 above (which is peeling and versioning together) is not
1707      being done at this time.  */
1708 
1709   /* (1) Peeling to force alignment.  */
1710 
1711   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712      Considerations:
1713      + How many accesses will become aligned due to the peeling
1714      - How many accesses will become unaligned due to the peeling,
1715        and the cost of misaligned accesses.
1716      - The cost of peeling (the extra runtime checks, the increase
1717        in code size).  */
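
  /* A purely illustrative, hypothetical trade-off (not tied to any target):
     if peeling 2 scalar iterations would align 3 of 4 relevant accesses,
     the per-vector-iteration saving of
       3 * (misaligned access cost - aligned access cost)
     has to outweigh the cost of the 2 extra scalar iterations plus the guard
     code.  The candidate NPEEL values collected below feed this kind of
     comparison in the cost model.  */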
1718 
1719   FOR_EACH_VEC_ELT (datarefs, i, dr)
1720     {
1721       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722       stmt_vec_info stmt_info = dr_info->stmt;
1723 
1724       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 	continue;
1726 
1727       /* For interleaving, only the alignment of the first access
1728          matters.  */
1729       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731 	continue;
1732 
1733       /* For scatter-gather or invariant accesses there is nothing
1734 	 to enhance.  */
1735       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736 	  || integer_zerop (DR_STEP (dr)))
1737 	continue;
1738 
1739       /* Strided accesses perform only component accesses, alignment is
1740 	 irrelevant for them.  */
1741       if (STMT_VINFO_STRIDED_P (stmt_info)
1742 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743 	continue;
1744 
1745       supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746       do_peeling = vector_alignment_reachable_p (dr_info);
1747       if (do_peeling)
1748         {
1749           if (known_alignment_for_access_p (dr_info))
1750             {
1751 	      unsigned int npeel_tmp = 0;
1752 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1753 						    size_zero_node) < 0;
1754 
1755 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
1756 	      /* If known_alignment_for_access_p then we have set
1757 	         DR_MISALIGNMENT, which is only done if we know it at compile
1758 	         time, so it is safe to assume the target alignment is
1759 	         constant.  */
1760 	      unsigned int target_align =
1761 		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762 	      unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763 	      mis = (negative
1764 		     ? DR_MISALIGNMENT (dr_info)
1765 		     : -DR_MISALIGNMENT (dr_info));
1766 	      if (DR_MISALIGNMENT (dr_info) != 0)
1767 		npeel_tmp = (mis & (target_align - 1)) / dr_size;
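	      /* Worked example with purely hypothetical numbers: for a
	         16-byte target alignment, 4-byte elements and
	         DR_MISALIGNMENT == 12 on a positive-step access, MIS is -12
	         and npeel_tmp = (-12 & 15) / 4 = 1, i.e. one peeled scalar
	         iteration brings the access to a 16-byte boundary.  Every
	         further target_align / dr_size = 4 iterations preserve that
	         alignment, which is the step used when enumerating further
	         candidates below.  */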
1768 
1769              /* For multiple types, it is possible that the bigger type access
1770                 will have more than one peeling option.  E.g., a loop with two
1771                 types: one of size (vector size / 4), and the other one of
1772                 size (vector size / 8).  The vectorization factor will be 8.
1773                 If both accesses are misaligned by 3, the first one needs one
1774                 scalar iteration to be aligned, and the second one needs 5.
1775                 But the first one will also be aligned by peeling 5 scalar
1776                 iterations, and in that case both accesses will be aligned.
1777                 Hence, besides the immediate peeling amount, we also want to
1778                 try adding a full vector's worth of elements, as long as we
1779                 do not exceed the vectorization factor.
1780                 We do this automatically for the cost model, since we
1781                 calculate the cost for every peeling option.  */
1782               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783 		{
1784 		  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 					  ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 		  possible_npeel_number
1787 		    = vect_get_num_vectors (nscalars, vectype);
1788 
1789 		  /* NPEEL_TMP is 0 when there is no misalignment; in that case
1790 		     also allow peeling a full vector's worth of elements.  */
1791 		  if (DR_MISALIGNMENT (dr_info) == 0)
1792 		    possible_npeel_number++;
1793 		}
1794 
1795 	      /* Save info about DR in the hash table.  Also include peeling
1796 	         amounts according to the explanation above.  */
1797               for (j = 0; j < possible_npeel_number; j++)
1798                 {
1799                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 					    dr_info, npeel_tmp);
1801 		  npeel_tmp += target_align / dr_size;
1802                 }
1803 
1804 	      one_misalignment_known = true;
1805             }
1806           else
1807             {
1808               /* If we don't know any misalignment values, we prefer
1809                  peeling for the data-ref that has the maximum number of
1810                  data-refs with the same alignment, unless the target prefers
1811                  to align stores over loads.  */
1812 	      unsigned same_align_drs
1813 		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 	      if (!dr0_info
1815 		  || same_align_drs_max < same_align_drs)
1816 		{
1817 		  same_align_drs_max = same_align_drs;
1818 		  dr0_info = dr_info;
1819 		}
1820 	      /* For data-refs with the same number of related
1821 		 accesses prefer the one where the misalign
1822 		 computation will be invariant in the outermost loop.  */
1823 	      else if (same_align_drs_max == same_align_drs)
1824 		{
1825 		  struct loop *ivloop0, *ivloop;
1826 		  ivloop0 = outermost_invariant_loop_for_expr
1827 		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828 		  ivloop = outermost_invariant_loop_for_expr
1829 		    (loop, DR_BASE_ADDRESS (dr));
1830 		  if ((ivloop && !ivloop0)
1831 		      || (ivloop && ivloop0
1832 			  && flow_loop_nested_p (ivloop, ivloop0)))
1833 		    dr0_info = dr_info;
1834 		}
1835 
1836 	      one_misalignment_unknown = true;
1837 
1838 	      /* Check for data refs with unsupportable alignment that
1839 	         can be peeled.  */
1840 	      if (!supportable_dr_alignment)
1841 	      {
1842 		one_dr_unsupportable = true;
1843 		unsupportable_dr_info = dr_info;
1844 	      }
1845 
1846 	      if (!first_store && DR_IS_WRITE (dr))
1847 		first_store = dr_info;
1848             }
1849         }
1850       else
1851         {
1852           if (!aligned_access_p (dr_info))
1853             {
1854               if (dump_enabled_p ())
1855                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856                                  "vector alignment may not be reachable\n");
1857               break;
1858             }
1859         }
1860     }
1861 
1862   /* Check if we can possibly peel the loop.  */
1863   if (!vect_can_advance_ivs_p (loop_vinfo)
1864       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865       || loop->inner)
1866     do_peeling = false;
1867 
1868   struct _vect_peel_extended_info peel_for_known_alignment;
1869   struct _vect_peel_extended_info peel_for_unknown_alignment;
1870   struct _vect_peel_extended_info best_peel;
1871 
1872   peel_for_unknown_alignment.inside_cost = INT_MAX;
1873   peel_for_unknown_alignment.outside_cost = INT_MAX;
1874   peel_for_unknown_alignment.peel_info.count = 0;
1875 
1876   if (do_peeling
1877       && one_misalignment_unknown)
1878     {
1879       /* Check whether the target prefers to align stores over loads, i.e.,
1880          whether misaligned stores are more expensive than misaligned loads
1881          (taking drs with the same alignment into account).  */
1882       unsigned int load_inside_cost = 0;
1883       unsigned int load_outside_cost = 0;
1884       unsigned int store_inside_cost = 0;
1885       unsigned int store_outside_cost = 0;
1886       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887 
1888       stmt_vector_for_cost dummy;
1889       dummy.create (2);
1890       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891 				      &load_inside_cost,
1892 				      &load_outside_cost,
1893 				      &dummy, &dummy, estimated_npeels, true);
1894       dummy.release ();
1895 
1896       if (first_store)
1897 	{
1898 	  dummy.create (2);
1899 	  vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900 					  &store_inside_cost,
1901 					  &store_outside_cost,
1902 					  &dummy, &dummy,
1903 					  estimated_npeels, true);
1904 	  dummy.release ();
1905 	}
1906       else
1907 	{
1908 	  store_inside_cost = INT_MAX;
1909 	  store_outside_cost = INT_MAX;
1910 	}
1911 
1912       if (load_inside_cost > store_inside_cost
1913 	  || (load_inside_cost == store_inside_cost
1914 	      && load_outside_cost > store_outside_cost))
1915 	{
1916 	  dr0_info = first_store;
1917 	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919 	}
1920       else
1921 	{
1922 	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924 	}
1925 
1926       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927       prologue_cost_vec.create (2);
1928       epilogue_cost_vec.create (2);
1929 
1930       int dummy2;
1931       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 	(loop_vinfo, estimated_npeels, &dummy2,
1933 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 	 &prologue_cost_vec, &epilogue_cost_vec);
1935 
1936       prologue_cost_vec.release ();
1937       epilogue_cost_vec.release ();
1938 
1939       peel_for_unknown_alignment.peel_info.count = 1
1940 	+ STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1941     }
1942 
1943   peel_for_unknown_alignment.peel_info.npeel = 0;
1944   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1945 
1946   best_peel = peel_for_unknown_alignment;
1947 
1948   peel_for_known_alignment.inside_cost = INT_MAX;
1949   peel_for_known_alignment.outside_cost = INT_MAX;
1950   peel_for_known_alignment.peel_info.count = 0;
1951   peel_for_known_alignment.peel_info.dr_info = NULL;
1952 
1953   if (do_peeling && one_misalignment_known)
1954     {
1955       /* Peeling is possible, but there is no data access that is unsupported
1956          when misaligned.  So we try to choose the best possible peeling from
1957 	 the hash table.  */
1958       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959 	(&peeling_htab, loop_vinfo);
1960     }
1961 
1962   /* Compare costs of peeling for known and unknown alignment. */
1963   if (peel_for_known_alignment.peel_info.dr_info != NULL
1964       && peel_for_unknown_alignment.inside_cost
1965       >= peel_for_known_alignment.inside_cost)
1966     {
1967       best_peel = peel_for_known_alignment;
1968 
1969       /* If the best peeling for known alignment has NPEEL == 0, perform no
1970          peeling at all except if there is an unsupportable dr that we can
1971          align.  */
1972       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973 	do_peeling = false;
1974     }
1975 
1976   /* If there is an unsupportable data ref, prefer peeling for it over all
1977      choices so far, since we would have to discard a chosen peeling anyway
1978      unless it happened to align the unsupportable data ref.  */
1979   if (one_dr_unsupportable)
1980     dr0_info = unsupportable_dr_info;
1981   else if (do_peeling)
1982     {
1983       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984 	 TODO: Use nopeel_outside_cost or get rid of it?  */
1985       unsigned nopeel_inside_cost = 0;
1986       unsigned nopeel_outside_cost = 0;
1987 
1988       stmt_vector_for_cost dummy;
1989       dummy.create (2);
1990       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991 				      &nopeel_outside_cost, &dummy, &dummy,
1992 				      0, false);
1993       dummy.release ();
1994 
1995       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1996 	 costs will be recorded.  */
1997       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998       prologue_cost_vec.create (2);
1999       epilogue_cost_vec.create (2);
2000 
2001       int dummy2;
2002       nopeel_outside_cost += vect_get_known_peeling_cost
2003 	(loop_vinfo, 0, &dummy2,
2004 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005 	 &prologue_cost_vec, &epilogue_cost_vec);
2006 
2007       prologue_cost_vec.release ();
2008       epilogue_cost_vec.release ();
2009 
2010       npeel = best_peel.peel_info.npeel;
2011       dr0_info = best_peel.peel_info.dr_info;
2012 
2013       /* If not peeling at all is no more expensive than the best peeling
2014 	 found so far, don't perform any peeling.  */
2015       if (nopeel_inside_cost <= best_peel.inside_cost)
2016 	do_peeling = false;
2017     }
2018 
2019   if (do_peeling)
2020     {
2021       stmt_vec_info stmt_info = dr0_info->stmt;
2022       vectype = STMT_VINFO_VECTYPE (stmt_info);
2023 
2024       if (known_alignment_for_access_p (dr0_info))
2025         {
2026 	  bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027 						size_zero_node) < 0;
2028           if (!npeel)
2029             {
2030               /* Since it's known at compile time, compute the number of
2031                  iterations in the peeled loop (the peeling factor) for use in
2032                  updating DR_MISALIGNMENT values.  The peeling factor is the
2033                  vectorization factor minus the misalignment as an element
2034                  count.  */
2035 	      mis = (negative
2036 		     ? DR_MISALIGNMENT (dr0_info)
2037 		     : -DR_MISALIGNMENT (dr0_info));
2038 	      /* If known_alignment_for_access_p then we have set
2039 	         DR_MISALIGNMENT, which is only done if we know it at compile
2040 	         time, so it is safe to assume the target alignment is
2041 	         constant.  */
2042 	      unsigned int target_align =
2043 		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044 	      npeel = ((mis & (target_align - 1))
2045 		       / vect_get_scalar_dr_size (dr0_info));
2046             }
2047 
2048 	  /* For interleaved data access every iteration accesses all the
2049 	     members of the group, therefore we divide the number of iterations
2050 	     by the group size.  */
2051 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 	    npeel /= DR_GROUP_SIZE (stmt_info);
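	  /* Illustrative, hypothetical numbers: with a 16-byte target
	     alignment, 4-byte elements and DR_MISALIGNMENT (dr0_info) == 8,
	     the computation above gives npeel = ((-8) & 15) / 4 = 2; if dr0
	     heads an interleaving group of size 2, each scalar iteration
	     covers both members, so the final npeel is 2 / 2 = 1.  */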
2053 
2054           if (dump_enabled_p ())
2055             dump_printf_loc (MSG_NOTE, vect_location,
2056                              "Try peeling by %d\n", npeel);
2057         }
2058 
2059       /* Ensure that all datarefs can be vectorized after the peel.  */
2060       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061 	do_peeling = false;
2062 
2063       /* Check if all datarefs are supportable and log.  */
2064       if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2065         {
2066           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067           if (!stat)
2068             do_peeling = false;
2069           else
2070 	    return stat;
2071         }
2072 
2073       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2074       if (do_peeling)
2075         {
2076           unsigned max_allowed_peel
2077             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078           if (max_allowed_peel != (unsigned)-1)
2079             {
2080               unsigned max_peel = npeel;
2081               if (max_peel == 0)
2082                 {
2083 		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2084 		  unsigned HOST_WIDE_INT target_align_c;
2085 		  if (target_align.is_constant (&target_align_c))
2086 		    max_peel =
2087 		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2088 		  else
2089 		    {
2090 		      do_peeling = false;
2091 		      if (dump_enabled_p ())
2092 			dump_printf_loc (MSG_NOTE, vect_location,
2093 			  "Disable peeling, max peels set and vector"
2094 			  " alignment unknown\n");
2095 		    }
2096                 }
2097               if (max_peel > max_allowed_peel)
2098                 {
2099                   do_peeling = false;
2100                   if (dump_enabled_p ())
2101                     dump_printf_loc (MSG_NOTE, vect_location,
2102                         "Disable peeling, max peels reached: %d\n", max_peel);
2103                 }
2104             }
2105         }
2106 
2107       /* Cost model #2 - if peeling may result in a remaining loop not
2108 	 iterating enough to be vectorized then do not peel.  Since this
2109 	 is a cost heuristic rather than a correctness decision, use the
2110 	 most likely runtime value for variable vectorization factors.  */
2111       if (do_peeling
2112 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2113 	{
2114 	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2115 	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2116 	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2117 	      < assumed_vf + max_peel)
2118 	    do_peeling = false;
2119 	}
2120 
2121       if (do_peeling)
2122         {
2123           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2124              If the misalignment of DR_i is identical to that of dr0 then set
2125              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2126              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2127              by the peeling factor times the element size of DR_i (MOD the
2128              vectorization factor times the size).  Otherwise, the
2129              misalignment of DR_i must be set to unknown.  */
2130 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2131 	    if (dr != dr0_info->dr)
2132 	      {
2133 		/* Strided accesses perform only component accesses, alignment
2134 		   is irrelevant for them.  */
2135 		dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2136 		stmt_info = dr_info->stmt;
2137 		if (STMT_VINFO_STRIDED_P (stmt_info)
2138 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2139 		  continue;
2140 
2141 		vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2142 	      }
2143 
2144           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2145           if (npeel)
2146             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2147           else
2148             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149 	      = DR_MISALIGNMENT (dr0_info);
2150 	  SET_DR_MISALIGNMENT (dr0_info, 0);
2151 	  if (dump_enabled_p ())
2152             {
2153               dump_printf_loc (MSG_NOTE, vect_location,
2154                                "Alignment of access forced using peeling.\n");
2155               dump_printf_loc (MSG_NOTE, vect_location,
2156                                "Peeling for alignment will be applied.\n");
2157             }
2158 
2159 	  /* The inside-loop cost will be accounted for in vectorizable_load
2160 	     and vectorizable_store correctly with adjusted alignments.
2161 	     Drop the body cost vector on the floor here.  */
2162 	  opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2163 	  gcc_assert (stat);
2164           return stat;
2165         }
2166     }
2167 
2168   /* (2) Versioning to force alignment.  */
2169 
2170   /* Try versioning if:
2171      1) optimize loop for speed
2172      2) there is at least one unsupported misaligned data ref with an unknown
2173         misalignment, and
2174      3) all misaligned data refs with a known misalignment are supported, and
2175      4) the number of runtime alignment checks is within reason.  */
2176 
2177   do_versioning =
2178 	optimize_loop_nest_for_speed_p (loop)
2179 	&& (!loop->inner); /* FORNOW */
2180 
2181   if (do_versioning)
2182     {
2183       FOR_EACH_VEC_ELT (datarefs, i, dr)
2184         {
2185 	  dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2186 	  stmt_vec_info stmt_info = dr_info->stmt;
2187 
2188 	  /* For interleaving, only the alignment of the first access
2189 	     matters.  */
2190 	  if (aligned_access_p (dr_info)
2191 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2192 		  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2193 	    continue;
2194 
2195 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2196 	    {
2197 	      /* Strided loads perform only component accesses, alignment is
2198 		 irrelevant for them.  */
2199 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2200 		continue;
2201 	      do_versioning = false;
2202 	      break;
2203 	    }
2204 
2205 	  supportable_dr_alignment
2206 	    = vect_supportable_dr_alignment (dr_info, false);
2207 
2208           if (!supportable_dr_alignment)
2209             {
2210               int mask;
2211               tree vectype;
2212 
2213               if (known_alignment_for_access_p (dr_info)
2214                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2215                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2216                 {
2217                   do_versioning = false;
2218                   break;
2219                 }
2220 
2221 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
2222 	      gcc_assert (vectype);
2223 
2224 	      /* At present we don't support versioning for alignment
2225 		 with variable VF, since there's no guarantee that the
2226 		 VF is a power of two.  We could relax this if we added
2227 		 a way of enforcing a power-of-two size.  */
2228 	      unsigned HOST_WIDE_INT size;
2229 	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2230 		{
2231 		  do_versioning = false;
2232 		  break;
2233 		}
2234 
2235 	      /* Forcing alignment in the first iteration is no good if
2236 		 we don't keep it across iterations.  For now, just disable
2237 		 versioning in this case.
2238 		 ?? We could actually unroll the loop to achieve the required
2239 		 overall step alignment, and forcing the alignment could be
2240 		 done by doing some iterations of the non-vectorized loop.  */
2241 	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2242 			       * DR_STEP_ALIGNMENT (dr),
2243 			       DR_TARGET_ALIGNMENT (dr_info)))
2244 		{
2245 		  do_versioning = false;
2246 		  break;
2247 		}
2248 
2249               /* The rightmost bits of an aligned address must be zeros.
2250                  Construct the mask needed for this test.  For example,
2251                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2252                  mask must be 15 = 0xf. */
2253 	      mask = size - 1;
2254 
2255               /* FORNOW: use the same mask to test all potentially unaligned
2256                  references in the loop.  The vectorizer currently supports
2257                  a single vector size, see the reference to
2258                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2259                  vectorization factor is computed.  */
2260               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2261                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2262               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2263 	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2264             }
2265         }
2266 
2267       /* Versioning requires at least one misaligned data reference.  */
2268       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2269         do_versioning = false;
2270       else if (!do_versioning)
2271         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2272     }
2273 
2274   if (do_versioning)
2275     {
2276       vec<stmt_vec_info> may_misalign_stmts
2277         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2278       stmt_vec_info stmt_info;
2279 
2280       /* It can now be assumed that the data references in the statements
2281          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2282          of the loop being vectorized.  */
2283       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2284         {
2285 	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2286 	  SET_DR_MISALIGNMENT (dr_info, 0);
2287 	  if (dump_enabled_p ())
2288             dump_printf_loc (MSG_NOTE, vect_location,
2289                              "Alignment of access forced using versioning.\n");
2290         }
2291 
2292       if (dump_enabled_p ())
2293         dump_printf_loc (MSG_NOTE, vect_location,
2294                          "Versioning for alignment will be applied.\n");
2295 
2296       /* Peeling and versioning can't be done together at this time.  */
2297       gcc_assert (! (do_peeling && do_versioning));
2298 
2299       opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2300       gcc_assert (stat);
2301       return stat;
2302     }
2303 
2304   /* This point is reached if neither peeling nor versioning is being done.  */
2305   gcc_assert (! (do_peeling || do_versioning));
2306 
2307   opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2308   return stat;
2309 }
2310 
2311 
2312 /* Function vect_find_same_alignment_drs.
2313 
2314    Update group and alignment relations in VINFO according to the chosen
2315    vectorization factor.  */
2316 
2317 static void
2318 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2319 {
2320   struct data_reference *dra = DDR_A (ddr);
2321   struct data_reference *drb = DDR_B (ddr);
2322   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2323   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2324   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2325   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2326 
2327   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2328     return;
2329 
2330   if (dra == drb)
2331     return;
2332 
2333   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2334       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2335     return;
2336 
2337   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2338       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2339       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2340     return;
2341 
2342   /* Two references with distance zero have the same alignment.  */
2343   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2344 			  - wi::to_poly_offset (DR_INIT (drb)));
2345   if (maybe_ne (diff, 0))
2346     {
2347       /* Get the wider of the two alignments.  */
2348       poly_uint64 align_a =
2349 	exact_div (vect_calculate_target_alignment (dr_info_a),
2350 		   BITS_PER_UNIT);
2351       poly_uint64 align_b =
2352 	exact_div (vect_calculate_target_alignment (dr_info_b),
2353 		   BITS_PER_UNIT);
2354       unsigned HOST_WIDE_INT align_a_c, align_b_c;
2355       if (!align_a.is_constant (&align_a_c)
2356 	  || !align_b.is_constant (&align_b_c))
2357 	return;
2358 
2359       unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2360 
2361       /* Require the gap to be a multiple of the larger vector alignment.  */
2362       if (!multiple_p (diff, max_align))
2363 	return;
2364     }
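
  /* Example with hypothetical values: two accesses to a[i] and a[i + 4]
     with 4-byte elements differ by 16 bytes in DR_INIT; if both have a
     16-byte target alignment, the gap is a multiple of that alignment,
     so the two references are guaranteed to share the same runtime
     misalignment and are recorded as such below.  */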
2365 
2366   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2367   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2368   if (dump_enabled_p ())
2369     dump_printf_loc (MSG_NOTE, vect_location,
2370 		     "accesses have the same alignment: %T and %T\n",
2371 		     DR_REF (dra), DR_REF (drb));
2372 }
2373 
2374 
2375 /* Function vect_analyze_data_refs_alignment
2376 
2377    Analyze the alignment of the data-references in the loop.
2378    Return FALSE if a data reference is found that cannot be vectorized.  */
2379 
2380 opt_result
2381 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2382 {
2383   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2384 
2385   /* Mark groups of data references with same alignment using
2386      data dependence information.  */
2387   vec<ddr_p> ddrs = vinfo->shared->ddrs;
2388   struct data_dependence_relation *ddr;
2389   unsigned int i;
2390 
2391   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2392     vect_find_same_alignment_drs (vinfo, ddr);
2393 
2394   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2395   struct data_reference *dr;
2396 
2397   vect_record_base_alignments (vinfo);
2398   FOR_EACH_VEC_ELT (datarefs, i, dr)
2399     {
2400       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2401       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2402 	vect_compute_data_ref_alignment (dr_info);
2403     }
2404 
2405   return opt_result::success ();
2406 }
2407 
2408 
2409 /* Analyze alignment of DRs of stmts in NODE.  */
2410 
2411 static bool
2412 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2413 {
2414   /* We vectorize from the first scalar stmt in the node unless
2415      the node is permuted, in which case we start from the first
2416      element in the group.  */
2417   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2418   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2419   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2420     first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2421 
2422   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2423   vect_compute_data_ref_alignment (dr_info);
2424   /* For creating the data-ref pointer we need alignment of the
2425      first element anyway.  */
2426   if (dr_info != first_dr_info)
2427     vect_compute_data_ref_alignment (first_dr_info);
2428   if (! verify_data_ref_alignment (dr_info))
2429     {
2430       if (dump_enabled_p ())
2431 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 			 "not vectorized: bad data alignment in basic "
2433 			 "block.\n");
2434       return false;
2435     }
2436 
2437   return true;
2438 }
2439 
2440 /* Function vect_slp_analyze_instance_alignment
2441 
2442    Analyze the alignment of the data-references in the SLP instance.
2443    Return FALSE if a data reference is found that cannot be vectorized.  */
2444 
2445 bool
2446 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2447 {
2448   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2449 
2450   slp_tree node;
2451   unsigned i;
2452   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2453     if (! vect_slp_analyze_and_verify_node_alignment (node))
2454       return false;
2455 
2456   node = SLP_INSTANCE_TREE (instance);
2457   if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2458       && ! vect_slp_analyze_and_verify_node_alignment
2459 	     (SLP_INSTANCE_TREE (instance)))
2460     return false;
2461 
2462   return true;
2463 }
2464 
2465 
2466 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2467    accesses of legal size, step, etc.  Detect gaps, single element
2468    interleaving, and other special cases. Set grouped access info.
2469    Collect groups of strided stores for further use in SLP analysis.
2470    Worker for vect_analyze_group_access.  */
2471 
2472 static bool
2473 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2474 {
2475   data_reference *dr = dr_info->dr;
2476   tree step = DR_STEP (dr);
2477   tree scalar_type = TREE_TYPE (DR_REF (dr));
2478   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2479   stmt_vec_info stmt_info = dr_info->stmt;
2480   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2481   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2482   HOST_WIDE_INT dr_step = -1;
2483   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2484   bool slp_impossible = false;
2485 
2486   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2487      size of the interleaving group (including gaps).  */
2488   if (tree_fits_shwi_p (step))
2489     {
2490       dr_step = tree_to_shwi (step);
2491       /* Check that STEP is a multiple of type size.  Otherwise there is
2492          a non-element-sized gap at the end of the group which we
2493 	 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2494 	 ???  As we can handle non-constant step fine here we should
2495 	 simply remove uses of DR_GROUP_GAP between the last and first
2496 	 element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2497 	 simply not include that gap.  */
2498       if ((dr_step % type_size) != 0)
2499 	{
2500 	  if (dump_enabled_p ())
2501 	    dump_printf_loc (MSG_NOTE, vect_location,
2502 			     "Step %T is not a multiple of the element size"
2503 			     " for %T\n",
2504 			     step, DR_REF (dr));
2505 	  return false;
2506 	}
2507       groupsize = absu_hwi (dr_step) / type_size;
2508     }
2509   else
2510     groupsize = 0;
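
  /* For example (hypothetical access pattern): a 4-byte element accessed
     with a constant step of 32 bytes gives GROUPSIZE = 32 / 4 = 8, i.e. the
     interleaving group spans 8 element slots per scalar iteration, gaps
     included.  */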
2511 
2512   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2513   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2514     {
2515       /* Check whether this DR is part of an interleaving group but is the
2516 	 only element of that group accessed in the loop.  */
2517 
2518       /* Gaps are supported only for loads. STEP must be a multiple of the type
2519 	 size.  */
2520       if (DR_IS_READ (dr)
2521 	  && (dr_step % type_size) == 0
2522 	  && groupsize > 0)
2523 	{
2524 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2525 	  DR_GROUP_SIZE (stmt_info) = groupsize;
2526 	  DR_GROUP_GAP (stmt_info) = groupsize - 1;
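	  /* E.g. a load of a[8 * i] with 4-byte elements (hypothetical) has
	     step 32 and hence groupsize 8; only one element of the group is
	     accessed, so DR_GROUP_GAP is 7.  */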
2527 	  if (dump_enabled_p ())
2528 	    dump_printf_loc (MSG_NOTE, vect_location,
2529 			     "Detected single element interleaving %T"
2530 			     " step %T\n",
2531 			     DR_REF (dr), step);
2532 
2533 	  return true;
2534 	}
2535 
2536       if (dump_enabled_p ())
2537 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2538 			 "not consecutive access %G", stmt_info->stmt);
2539 
2540       if (bb_vinfo)
2541 	{
2542 	  /* Mark the statement as unvectorizable.  */
2543 	  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2544 	  return true;
2545 	}
2546 
2547       if (dump_enabled_p ())
2548 	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2549       STMT_VINFO_STRIDED_P (stmt_info) = true;
2550       return true;
2551     }
2552 
2553   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2554     {
2555       /* First stmt in the interleaving chain. Check the chain.  */
2556       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2557       struct data_reference *data_ref = dr;
2558       unsigned int count = 1;
2559       tree prev_init = DR_INIT (data_ref);
2560       HOST_WIDE_INT diff, gaps = 0;
2561 
2562       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2563       while (next)
2564         {
2565           /* We never have the same DR multiple times.  */
2566           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2567 				DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2568 
2569 	  data_ref = STMT_VINFO_DATA_REF (next);
2570 
2571 	  /* All group members have the same STEP by construction.  */
2572 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2573 
2574           /* Check that the distance between two accesses is equal to the type
2575              size. Otherwise, we have gaps.  */
2576           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2578 	  if (diff != 1)
2579 	    {
2580 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2581 	      slp_impossible = true;
2582 	      if (DR_IS_WRITE (data_ref))
2583 		{
2584                   if (dump_enabled_p ())
2585                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586                                      "interleaved store with gaps\n");
2587 		  return false;
2588 		}
2589 
2590               gaps += diff - 1;
2591 	    }
2592 
2593 	  last_accessed_element += diff;
2594 
2595           /* Store the gap from the previous member of the group. If there is no
2596              gap in the access, DR_GROUP_GAP is always 1.  */
2597 	  DR_GROUP_GAP (next) = diff;
2598 
2599 	  prev_init = DR_INIT (data_ref);
2600 	  next = DR_GROUP_NEXT_ELEMENT (next);
2601 	  /* Count the number of data-refs in the chain.  */
2602 	  count++;
2603         }
2604 
2605       if (groupsize == 0)
2606         groupsize = count + gaps;
2607 
2608       /* This could be UINT_MAX but as we are generating code in a very
2609          inefficient way we have to cap earlier.  See PR78699 for example.  */
2610       if (groupsize > 4096)
2611 	{
2612 	  if (dump_enabled_p ())
2613 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 			     "group is too large\n");
2615 	  return false;
2616 	}
2617 
2618       /* Check that the size of the interleaving is equal to count for stores,
2619          i.e., that there are no gaps.  */
2620       if (groupsize != count
2621 	  && !DR_IS_READ (dr))
2622         {
2623 	  groupsize = count;
2624 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
2625 	}
2626 
2627       /* If there is a gap after the last load in the group it is the
2628 	 difference between the groupsize and the last accessed
2629 	 element.
2630 	 When there is no gap, this difference should be 0.  */
2631       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
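      /* E.g. (hypothetical) loads of a[8*i], a[8*i + 1] and a[8*i + 2] with
	 4-byte elements and step 32: groupsize is 8 and the last accessed
	 element is 3, so the trailing DR_GROUP_GAP recorded here is 5.  */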
2632 
2633       DR_GROUP_SIZE (stmt_info) = groupsize;
2634       if (dump_enabled_p ())
2635 	{
2636 	  dump_printf_loc (MSG_NOTE, vect_location,
2637 			   "Detected interleaving ");
2638 	  if (DR_IS_READ (dr))
2639 	    dump_printf (MSG_NOTE, "load ");
2640 	  else if (STMT_VINFO_STRIDED_P (stmt_info))
2641 	    dump_printf (MSG_NOTE, "strided store ");
2642 	  else
2643 	    dump_printf (MSG_NOTE, "store ");
2644 	  dump_printf (MSG_NOTE, "of size %u\n",
2645 		       (unsigned)groupsize);
2646 	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2647 	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2648 	  while (next)
2649 	    {
2650 	      if (DR_GROUP_GAP (next) != 1)
2651 		dump_printf_loc (MSG_NOTE, vect_location,
2652 				 "\t<gap of %d elements>\n",
2653 				 DR_GROUP_GAP (next) - 1);
2654 	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2655 	      next = DR_GROUP_NEXT_ELEMENT (next);
2656 	    }
2657 	  if (DR_GROUP_GAP (stmt_info) != 0)
2658 	    dump_printf_loc (MSG_NOTE, vect_location,
2659 			     "\t<gap of %d elements>\n",
2660 			     DR_GROUP_GAP (stmt_info));
2661 	}
2662 
2663       /* SLP: create an SLP data structure for every interleaving group of
2664 	 stores for further analysis in vect_analyze_slp.  */
2665       if (DR_IS_WRITE (dr) && !slp_impossible)
2666 	{
2667 	  if (loop_vinfo)
2668 	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2669 	  if (bb_vinfo)
2670 	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2671 	}
2672     }
2673 
2674   return true;
2675 }
2676 
2677 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2678    accesses of legal size, step, etc.  Detect gaps, single element
2679    interleaving, and other special cases. Set grouped access info.
2680    Collect groups of strided stores for further use in SLP analysis.  */
2681 
2682 static bool
2683 vect_analyze_group_access (dr_vec_info *dr_info)
2684 {
2685   if (!vect_analyze_group_access_1 (dr_info))
2686     {
2687       /* Dissolve the group if present.  */
2688       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2689       while (stmt_info)
2690 	{
2691 	  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2692 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2693 	  DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2694 	  stmt_info = next;
2695 	}
2696       return false;
2697     }
2698   return true;
2699 }
2700 
2701 /* Analyze the access pattern of the data-reference DR_INFO.
2702    In case of non-consecutive accesses call vect_analyze_group_access() to
2703    analyze groups of accesses.  */
2704 
2705 static bool
2706 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2707 {
2708   data_reference *dr = dr_info->dr;
2709   tree step = DR_STEP (dr);
2710   tree scalar_type = TREE_TYPE (DR_REF (dr));
2711   stmt_vec_info stmt_info = dr_info->stmt;
2712   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2713   struct loop *loop = NULL;
2714 
2715   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2716     return true;
2717 
2718   if (loop_vinfo)
2719     loop = LOOP_VINFO_LOOP (loop_vinfo);
2720 
2721   if (loop_vinfo && !step)
2722     {
2723       if (dump_enabled_p ())
2724 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2725 	                 "bad data-ref access in loop\n");
2726       return false;
2727     }
2728 
2729   /* Allow loads with zero step in inner-loop vectorization.  */
2730   if (loop_vinfo && integer_zerop (step))
2731     {
2732       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733       if (!nested_in_vect_loop_p (loop, stmt_info))
2734 	return DR_IS_READ (dr);
2735       /* Allow references with zero step only for outer loops marked
2736 	 with pragma omp simd; the pragma guarantees the absence of
2737 	 loop-carried dependences between inner loop iterations.  */
2738       if (loop->safelen < 2)
2739 	{
2740 	  if (dump_enabled_p ())
2741 	    dump_printf_loc (MSG_NOTE, vect_location,
2742 			     "zero step in inner loop of nest\n");
2743 	  return false;
2744 	}
2745     }
2746 
2747   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2748     {
2749       /* Interleaved accesses are not yet supported within outer-loop
2750         vectorization for references in the inner-loop.  */
2751       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2752 
2753       /* For the rest of the analysis we use the outer-loop step.  */
2754       step = STMT_VINFO_DR_STEP (stmt_info);
2755       if (integer_zerop (step))
2756 	{
2757 	  if (dump_enabled_p ())
2758 	    dump_printf_loc (MSG_NOTE, vect_location,
2759 	                     "zero step in outer loop.\n");
2760 	  return DR_IS_READ (dr);
2761 	}
2762     }
2763 
2764   /* Consecutive?  */
2765   if (TREE_CODE (step) == INTEGER_CST)
2766     {
2767       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2768       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2769 	  || (dr_step < 0
2770 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2771 	{
2772 	  /* Mark that it is not interleaving.  */
2773 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2774 	  return true;
2775 	}
2776     }
2777 
2778   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2779     {
2780       if (dump_enabled_p ())
2781 	dump_printf_loc (MSG_NOTE, vect_location,
2782 	                 "grouped access in outer loop.\n");
2783       return false;
2784     }
2785 
2786 
2787   /* Assume this is a DR handled by the non-constant strided access case.  */
2788   if (TREE_CODE (step) != INTEGER_CST)
2789     return (STMT_VINFO_STRIDED_P (stmt_info)
2790 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2791 		|| vect_analyze_group_access (dr_info)));
2792 
2793   /* Not consecutive access - check if it's a part of interleaving group.  */
2794   return vect_analyze_group_access (dr_info);
2795 }
2796 
2797 /* Comparison function for sorting data-references DRA and DRB so that
2798    candidates for the same interleaving group become adjacent.  */
2799 
2800 static int
2801 dr_group_sort_cmp (const void *dra_, const void *drb_)
2802 {
2803   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2804   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2805   int cmp;
2806 
2807   /* Stabilize sort.  */
2808   if (dra == drb)
2809     return 0;
2810 
2811   /* DRs in different loops never belong to the same group.  */
2812   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2813   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2814   if (loopa != loopb)
2815     return loopa->num < loopb->num ? -1 : 1;
2816 
2817   /* Ordering of DRs according to base.  */
2818   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2819 			       DR_BASE_ADDRESS (drb));
2820   if (cmp != 0)
2821     return cmp;
2822 
2823   /* And according to DR_OFFSET.  */
2824   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2825   if (cmp != 0)
2826     return cmp;
2827 
2828   /* Put reads before writes.  */
2829   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2830     return DR_IS_READ (dra) ? -1 : 1;
2831 
2832   /* Then sort by access size.  */
2833   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2834 			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2835   if (cmp != 0)
2836     return cmp;
2837 
2838   /* And by step.  */
2839   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2840   if (cmp != 0)
2841     return cmp;
2842 
2843   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2844   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2845   if (cmp == 0)
2846     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2847   return cmp;
2848 }
2849 
2850 /* If OP is the result of a conversion, return the unconverted value,
2851    otherwise return null.  */
2852 
2853 static tree
2854 strip_conversion (tree op)
2855 {
2856   if (TREE_CODE (op) != SSA_NAME)
2857     return NULL_TREE;
2858   gimple *stmt = SSA_NAME_DEF_STMT (op);
2859   if (!is_gimple_assign (stmt)
2860       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2861     return NULL_TREE;
2862   return gimple_assign_rhs1 (stmt);
2863 }
2864 
2865 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2866    and STMT2_INFO being in a single group.  */
2867 
2868 static bool
2869 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2870 {
2871   if (gimple_assign_single_p (stmt1_info->stmt))
2872     return gimple_assign_single_p (stmt2_info->stmt);
2873 
2874   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2875   if (call1 && gimple_call_internal_p (call1))
2876     {
2877       /* Check for two masked loads or two masked stores.  */
2878       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2879       if (!call2 || !gimple_call_internal_p (call2))
2880 	return false;
2881       internal_fn ifn = gimple_call_internal_fn (call1);
2882       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2883 	return false;
2884       if (ifn != gimple_call_internal_fn (call2))
2885 	return false;
2886 
2887       /* Check that the masks are the same.  Cope with casts of masks,
2888 	 like those created by build_mask_conversion.  */
2889       tree mask1 = gimple_call_arg (call1, 2);
2890       tree mask2 = gimple_call_arg (call2, 2);
2891       if (!operand_equal_p (mask1, mask2, 0))
2892 	{
2893 	  mask1 = strip_conversion (mask1);
2894 	  if (!mask1)
2895 	    return false;
2896 	  mask2 = strip_conversion (mask2);
2897 	  if (!mask2)
2898 	    return false;
2899 	  if (!operand_equal_p (mask1, mask2, 0))
2900 	    return false;
2901 	}
2902       return true;
2903     }
2904 
2905   return false;
2906 }
2907 
2908 /* Function vect_analyze_data_ref_accesses.
2909 
2910    Analyze the access pattern of all the data references in the loop.
2911 
2912    FORNOW: the only access pattern that is considered vectorizable is a
2913 	   simple step 1 (consecutive) access.
2914 
2915    FORNOW: handle only arrays and pointer accesses.  */
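/* For example, for the loop body
     p[4*i+0] = a;  p[4*i+1] = b;  p[4*i+2] = c;  p[4*i+3] = d;
   all four stores share DR_BASE_ADDRESS, DR_OFFSET and DR_STEP and differ
   only in DR_INIT, so they are linked into a single interleaving group
   whose DR_GROUP_FIRST_ELEMENT is the first store.  */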
2916 
2917 opt_result
2918 vect_analyze_data_ref_accesses (vec_info *vinfo)
2919 {
2920   unsigned int i;
2921   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2922   struct data_reference *dr;
2923 
2924   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2925 
2926   if (datarefs.is_empty ())
2927     return opt_result::success ();
2928 
2929   /* Sort the array of datarefs to make building the interleaving chains
2930      linear.  Don't modify the original vector's order; it is needed for
2931      determining what dependencies are reversed.  */
2932   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2933   datarefs_copy.qsort (dr_group_sort_cmp);
2934   hash_set<stmt_vec_info> to_fixup;
2935 
2936   /* Build the interleaving chains.  */
2937   for (i = 0; i < datarefs_copy.length () - 1;)
2938     {
2939       data_reference_p dra = datarefs_copy[i];
2940       dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2941       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2942       stmt_vec_info lastinfo = NULL;
2943       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2944 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2945 	{
2946 	  ++i;
2947 	  continue;
2948 	}
2949       for (i = i + 1; i < datarefs_copy.length (); ++i)
2950 	{
2951 	  data_reference_p drb = datarefs_copy[i];
2952 	  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2953 	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2954 	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2955 	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2956 	    break;
2957 
2958 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2959 	     accesses, same accesses) can lead to a group being artificially
2960 	     split here as we don't just skip over those.  If it really
2961 	     matters we can push those to a worklist and re-iterate
2962 	     over them.  Then we can just skip ahead to the next DR here.  */
2963 
2964 	  /* DRs in different loops should not be put into the same
2965 	     interleaving group.  */
2966 	  if (gimple_bb (DR_STMT (dra))->loop_father
2967 	      != gimple_bb (DR_STMT (drb))->loop_father)
2968 	    break;
2969 
2970 	  /* Check that the data-refs have the same first location (except
2971 	     init) and that they are both either stores or loads (not one
2972 	     load and one store, and not masked loads or stores).  */
2973 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2974 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2975 					DR_BASE_ADDRESS (drb)) != 0
2976 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2977 	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2978 	    break;
2979 
2980 	  /* Check that the data-refs have the same constant size.  */
2981 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2982 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2983 	  if (!tree_fits_uhwi_p (sza)
2984 	      || !tree_fits_uhwi_p (szb)
2985 	      || !tree_int_cst_equal (sza, szb))
2986 	    break;
2987 
2988 	  /* Check that the data-refs have the same step.  */
2989 	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2990 	    break;
2991 
2992 	  /* Check the types are compatible.
2993 	     ???  We don't distinguish this during sorting.  */
2994 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2995 				   TREE_TYPE (DR_REF (drb))))
2996 	    break;
2997 
2998 	  /* Check that the DR_INITs are compile-time constants.  */
2999 	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3000 	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3001 	    break;
3002 
3003 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3004 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3005 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3006 	  HOST_WIDE_INT init_prev
3007 	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3008 	  gcc_assert (init_a <= init_b
3009 		      && init_a <= init_prev
3010 		      && init_prev <= init_b);
3011 
3012 	  /* Do not place the same access in the interleaving chain twice.  */
3013 	  if (init_b == init_prev)
3014 	    {
3015 	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3016 			  < gimple_uid (DR_STMT (drb)));
3017 	      /* Simply link in duplicates and fix up the chain below.  */
3018 	    }
3019 	  else
3020 	    {
3021 	      /* If init_b == init_a + the size of the type * k, we have an
3022 		 interleaving, and DRA is accessed before DRB.  */
3023 	      HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3024 	      if (type_size_a == 0
3025 		  || (init_b - init_a) % type_size_a != 0)
3026 		break;
3027 
3028 	      /* For stores, require the accesses to be adjacent.  This
3029 		 splits groups into chunks we support (we don't support
3030 		 vectorization of stores with gaps).  */
3031 	      if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3032 		break;
3033 
3034 	      /* If the step is constant and nonzero, require it to be
3035 		 greater than the difference between the data-refs' inits;
3036 		 otherwise split the group here so groups get suitable sizes.  */
3037 	      if (tree_fits_shwi_p (DR_STEP (dra)))
3038 		{
3039 		  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3040 		  if (step != 0 && step <= (init_b - init_a))
3041 		    break;
3042 		}
3043 	    }
3044 
3045 	  if (dump_enabled_p ())
3046 	    dump_printf_loc (MSG_NOTE, vect_location,
3047 			     DR_IS_READ (dra)
3048 			     ? "Detected interleaving load %T and %T\n"
3049 			     : "Detected interleaving store %T and %T\n",
3050 			     DR_REF (dra), DR_REF (drb));
3051 
3052 	  /* Link the found element into the group list.  */
3053 	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3054 	    {
3055 	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3056 	      lastinfo = stmtinfo_a;
3057 	    }
3058 	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3059 	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3060 	  lastinfo = stmtinfo_b;
3061 
3062 	  if (init_b == init_prev
3063 	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3064 	      && dump_enabled_p ())
3065 	    dump_printf_loc (MSG_NOTE, vect_location,
3066 			     "Queuing group with duplicate access for fixup\n");
3067 	}
3068     }
3069 
3070   /* Fix up groups with duplicate entries by splitting them.  */
3071   while (1)
3072     {
3073       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3074       if (!(it != to_fixup.end ()))
3075 	break;
3076       stmt_vec_info grp = *it;
3077       to_fixup.remove (grp);
3078 
3079       /* Find the earliest duplicate group member.  */
3080       unsigned first_duplicate = -1u;
3081       stmt_vec_info next, g = grp;
3082       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3083 	{
3084 	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3085 				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3086 	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3087 	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3088 	  g = next;
3089 	}
3090       if (first_duplicate == -1U)
3091 	continue;
3092 
3093       /* Then move all stmts after the first duplicate to a new group.
3094          Note this is a heuristic, but one with the property that the
3095 	 group GRP itself is fixed up completely.  */
3096       g = grp;
3097       stmt_vec_info newgroup = NULL, ng = grp;
3098       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3099 	{
3100 	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3101 	    {
3102 	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3103 	      if (!newgroup)
3104 		newgroup = next;
3105 	      else
3106 		DR_GROUP_NEXT_ELEMENT (ng) = next;
3107 	      ng = next;
3108 	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3109 	    }
3110 	  else
3111 	    g = DR_GROUP_NEXT_ELEMENT (g);
3112 	}
3113       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3114 
3115       /* Fix up the new group, which may still contain duplicates.  */
3116       to_fixup.add (newgroup);
3117     }
3118 
3119   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3120     {
3121       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3122       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3123 	  && !vect_analyze_data_ref_access (dr_info))
3124 	{
3125 	  if (dump_enabled_p ())
3126 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3127 			     "not vectorized: complicated access pattern.\n");
3128 
3129 	  if (is_a <bb_vec_info> (vinfo))
3130 	    {
3131 	      /* Mark the statement as not vectorizable.  */
3132 	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3133 	      continue;
3134 	    }
3135 	  else
3136 	    {
3137 	      datarefs_copy.release ();
3138 	      return opt_result::failure_at (dr_info->stmt->stmt,
3139 					     "not vectorized:"
3140 					     " complicated access pattern.\n");
3141 	    }
3142 	}
3143     }
3144 
3145   datarefs_copy.release ();
3146   return opt_result::success ();
3147 }
3148 
3149 /* Function vect_vfa_segment_size.
3150 
3151    Input:
3152      DR_INFO: The data reference.
3153      LENGTH_FACTOR: segment length to consider.
3154 
3155    Return a value suitable for the dr_with_seg_len::seg_len field.
3156    This is the "distance travelled" by the pointer from the first
3157    iteration in the segment to the last.  Note that it does not include
3158    the size of the access; in effect it only describes the first byte.  */
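/* For example, with DR_STEP == 4 and LENGTH_FACTOR == 8 the segment length
   is 4 * (8 - 1) == 28 bytes: the distance between the first byte accessed
   in the first iteration of the segment and the first byte accessed in the
   last one.  */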
3159 
3160 static tree
3161 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3162 {
3163   length_factor = size_binop (MINUS_EXPR,
3164 			      fold_convert (sizetype, length_factor),
3165 			      size_one_node);
3166   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3167 		     length_factor);
3168 }
3169 
3170 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3171    gives the worst-case number of bytes covered by the segment.  */
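/* For example, a group of four 4-byte accesses with no gap has an access
   size of 16 bytes.  With the dr_explicit_realign_optimized scheme a full
   vector's worth may be touched, so the difference between the vector size
   and the scalar size is added on top.  */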
3172 
3173 static unsigned HOST_WIDE_INT
3174 vect_vfa_access_size (dr_vec_info *dr_info)
3175 {
3176   stmt_vec_info stmt_vinfo = dr_info->stmt;
3177   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3178   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3179   unsigned HOST_WIDE_INT access_size = ref_size;
3180   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3181     {
3182       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3183       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3184     }
3185   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3186       && (vect_supportable_dr_alignment (dr_info, false)
3187 	  == dr_explicit_realign_optimized))
3188     {
3189       /* We might access a full vector's worth.  */
3190       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3191       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3192     }
3193   return access_size;
3194 }
3195 
3196 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3197    describes.  */
3198 
3199 static unsigned int
3200 vect_vfa_align (dr_vec_info *dr_info)
3201 {
3202   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3203 }
3204 
3205 /* Function vect_compile_time_alias.
3206 
3207    Given data references A and B with equal base and offset, see whether
3208    the alias relation can be decided at compilation time.  Return 1 if
3209    it can and the references alias, 0 if it can and the references do
3210    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3211    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3212    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
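/* For example, two 4-byte accesses with DR_INITs 0 and 16, segment lengths
   of 12 bytes and access sizes of 4 bytes cover the ranges [0, 16) and
   [16, 32); the ranges cannot overlap, so the function returns 0.  */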
3213 
3214 static int
3215 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3216 			 tree segment_length_a, tree segment_length_b,
3217 			 unsigned HOST_WIDE_INT access_size_a,
3218 			 unsigned HOST_WIDE_INT access_size_b)
3219 {
3220   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3221   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3222   poly_uint64 const_length_a;
3223   poly_uint64 const_length_b;
3224 
3225   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3226      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3227      [a, a+12) */
3228   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3229     {
3230       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3231       offset_a -= const_length_a;
3232     }
3233   else
3234     const_length_a = tree_to_poly_uint64 (segment_length_a);
3235   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3236     {
3237       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3238       offset_b -= const_length_b;
3239     }
3240   else
3241     const_length_b = tree_to_poly_uint64 (segment_length_b);
3242 
3243   const_length_a += access_size_a;
3244   const_length_b += access_size_b;
3245 
3246   if (ranges_known_overlap_p (offset_a, const_length_a,
3247 			      offset_b, const_length_b))
3248     return 1;
3249 
3250   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3251 			       offset_b, const_length_b))
3252     return 0;
3253 
3254   return -1;
3255 }
3256 
3257 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3258    in DDR is >= VF.  */
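/* For example, with VF == 4 a distance of 0 or an absolute distance of at
   least 4 is acceptable, whereas a smaller nonzero distance (unless hidden
   by DDR_REVERSED_P) makes the function return false.  */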
3259 
3260 static bool
3261 dependence_distance_ge_vf (data_dependence_relation *ddr,
3262 			   unsigned int loop_depth, poly_uint64 vf)
3263 {
3264   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3265       || DDR_NUM_DIST_VECTS (ddr) == 0)
3266     return false;
3267 
3268   /* If the dependence is exact, we should have limited the VF instead.  */
3269   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3270 
3271   unsigned int i;
3272   lambda_vector dist_v;
3273   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3274     {
3275       HOST_WIDE_INT dist = dist_v[loop_depth];
3276       if (dist != 0
3277 	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3278 	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3279 	return false;
3280     }
3281 
3282   if (dump_enabled_p ())
3283     dump_printf_loc (MSG_NOTE, vect_location,
3284 		     "dependence distance between %T and %T is >= VF\n",
3285 		     DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3286 
3287   return true;
3288 }
3289 
3290 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3291 
3292 static void
3293 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3294 {
3295   dump_printf (dump_kind, "%s (%T) >= ",
3296 	       lower_bound.unsigned_p ? "unsigned" : "abs",
3297 	       lower_bound.expr);
3298   dump_dec (dump_kind, lower_bound.min_value);
3299 }
3300 
3301 /* Record that the vectorized loop requires the vec_lower_bound described
3302    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3303 
3304 static void
3305 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3306 			poly_uint64 min_value)
3307 {
3308   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3309   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3310     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3311       {
3312 	unsigned_p &= lower_bounds[i].unsigned_p;
3313 	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3314 	if (lower_bounds[i].unsigned_p != unsigned_p
3315 	    || maybe_lt (lower_bounds[i].min_value, min_value))
3316 	  {
3317 	    lower_bounds[i].unsigned_p = unsigned_p;
3318 	    lower_bounds[i].min_value = min_value;
3319 	    if (dump_enabled_p ())
3320 	      {
3321 		dump_printf_loc (MSG_NOTE, vect_location,
3322 				 "updating run-time check to ");
3323 		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3324 		dump_printf (MSG_NOTE, "\n");
3325 	      }
3326 	  }
3327 	return;
3328       }
3329 
3330   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3331   if (dump_enabled_p ())
3332     {
3333       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3334       dump_lower_bound (MSG_NOTE, lower_bound);
3335       dump_printf (MSG_NOTE, "\n");
3336     }
3337   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3338 }
3339 
3340 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3341    will span fewer than GAP bytes.  */
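/* For example, with an estimated vectorization factor of 4 and an ungrouped
   4-byte access, gaps of up to 16 bytes count as small; for a grouped access
   the group size scales the bound accordingly.  */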
3342 
3343 static bool
3344 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3345 		  poly_int64 gap)
3346 {
3347   stmt_vec_info stmt_info = dr_info->stmt;
3348   HOST_WIDE_INT count
3349     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3350   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3351     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3352   return (estimated_poly_value (gap)
3353 	  <= count * vect_get_scalar_dr_size (dr_info));
3354 }
3355 
3356 /* Return true if we know that there is no alias between DR_INFO_A and
3357    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3358    When returning true, set *LOWER_BOUND_OUT to this N.  */
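/* For example, for same-base accesses to p[i] and p[i + 3] with 4-byte
   elements, *LOWER_BOUND_OUT becomes 3 * 4 + 4 == 16: any step of at least
   16 bytes in absolute value keeps the two accesses from overlapping.  */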
3359 
3360 static bool
3361 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3362 				poly_uint64 *lower_bound_out)
3363 {
3364   /* Check that there is a constant gap of known sign between DR_A
3365      and DR_B.  */
3366   data_reference *dr_a = dr_info_a->dr;
3367   data_reference *dr_b = dr_info_b->dr;
3368   poly_int64 init_a, init_b;
3369   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3370       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3371       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3372       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3373       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3374       || !ordered_p (init_a, init_b))
3375     return false;
3376 
3377   /* Sort DR_A and DR_B by the address they access.  */
3378   if (maybe_lt (init_b, init_a))
3379     {
3380       std::swap (init_a, init_b);
3381       std::swap (dr_info_a, dr_info_b);
3382       std::swap (dr_a, dr_b);
3383     }
3384 
3385   /* If the two accesses could be dependent within a scalar iteration,
3386      make sure that we'd retain their order.  */
3387   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3388       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3389     return false;
3390 
3391   /* There is no alias if abs (DR_STEP) is greater than or equal to
3392      the bytes spanned by the combination of the two accesses.  */
3393   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3394   return true;
3395 }
3396 
3397 /* Function vect_prune_runtime_alias_test_list.
3398 
3399    Prune a list of ddrs to be tested at run-time by versioning for alias.
3400    Merge several alias checks into one if possible.
3401    Return a failure if the resulting list of ddrs is longer than allowed by
3402    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return success.  */
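/* The pruning below proceeds roughly as follows: drop pairs whose dependence
   distance already covers the VF, turn object pairs into address-inequality
   checks, use a lower bound on the step where that is likely cheaper, decide
   the remaining pairs at compile time where possible, and only then emit
   segment-overlap checks, finally comparing their number against
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS.  */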
3403 
3404 opt_result
3405 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3406 {
3407   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3408   hash_set <tree_pair_hash> compared_objects;
3409 
3410   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3411   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3412     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3413   vec<vec_object_pair> &check_unequal_addrs
3414     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3415   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3416   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3417 
3418   ddr_p ddr;
3419   unsigned int i;
3420   tree length_factor;
3421 
3422   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3423 
3424   /* Step values are irrelevant for aliasing if the number of vector
3425      iterations is equal to the number of scalar iterations (which can
3426      happen for fully-SLP loops).  */
3427   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3428 
3429   if (!ignore_step_p)
3430     {
3431       /* Convert the checks for nonzero steps into bound tests.  */
3432       tree value;
3433       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3434 	vect_check_lower_bound (loop_vinfo, value, true, 1);
3435     }
3436 
3437   if (may_alias_ddrs.is_empty ())
3438     return opt_result::success ();
3439 
3440   comp_alias_ddrs.create (may_alias_ddrs.length ());
3441 
3442   unsigned int loop_depth
3443     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3444 			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3445 
3446   /* First, we collect all data ref pairs for aliasing checks.  */
3447   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3448     {
3449       int comp_res;
3450       poly_uint64 lower_bound;
3451       tree segment_length_a, segment_length_b;
3452       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3453       unsigned int align_a, align_b;
3454 
3455       /* Ignore the alias if the VF we chose ended up being no greater
3456 	 than the dependence distance.  */
3457       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3458 	continue;
3459 
3460       if (DDR_OBJECT_A (ddr))
3461 	{
3462 	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3463 	  if (!compared_objects.add (new_pair))
3464 	    {
3465 	      if (dump_enabled_p ())
3466 		dump_printf_loc (MSG_NOTE, vect_location,
3467 				 "checking that %T and %T"
3468 				 " have different addresses\n",
3469 				 new_pair.first, new_pair.second);
3470 	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3471 	    }
3472 	  continue;
3473 	}
3474 
3475       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3476       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3477 
3478       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3479       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3480 
3481       /* Skip the pair if inter-iteration dependencies are irrelevant
3482 	 and intra-iteration dependencies are guaranteed to be honored.  */
3483       if (ignore_step_p
3484 	  && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3485 	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3486 						 &lower_bound)))
3487 	{
3488 	  if (dump_enabled_p ())
3489 	    dump_printf_loc (MSG_NOTE, vect_location,
3490 			     "no need for alias check between "
3491 			     "%T and %T when VF is 1\n",
3492 			     DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3493 	  continue;
3494 	}
3495 
3496       /* See whether we can handle the alias using a bounds check on
3497 	 the step, and whether that's likely to be the best approach.
3498 	 (It might not be, for example, if the minimum step is much larger
3499 	 than the number of bytes handled by one vector iteration.)  */
3500       if (!ignore_step_p
3501 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3502 	  && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3503 					     &lower_bound)
3504 	  && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3505 	      || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3506 	{
3507 	  bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3508 	  if (dump_enabled_p ())
3509 	    {
3510 	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3511 			       "%T and %T when the step %T is outside ",
3512 			       DR_REF (dr_info_a->dr),
3513 			       DR_REF (dr_info_b->dr),
3514 			       DR_STEP (dr_info_a->dr));
3515 	      if (unsigned_p)
3516 		dump_printf (MSG_NOTE, "[0");
3517 	      else
3518 		{
3519 		  dump_printf (MSG_NOTE, "(");
3520 		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3521 		}
3522 	      dump_printf (MSG_NOTE, ", ");
3523 	      dump_dec (MSG_NOTE, lower_bound);
3524 	      dump_printf (MSG_NOTE, ")\n");
3525 	    }
3526 	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3527 				  unsigned_p, lower_bound);
3528 	  continue;
3529 	}
3530 
3531       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3532       if (dr_group_first_a)
3533 	{
3534 	  stmt_info_a = dr_group_first_a;
3535 	  dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3536 	}
3537 
3538       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3539       if (dr_group_first_b)
3540 	{
3541 	  stmt_info_b = dr_group_first_b;
3542 	  dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3543 	}
3544 
3545       if (ignore_step_p)
3546 	{
3547 	  segment_length_a = size_zero_node;
3548 	  segment_length_b = size_zero_node;
3549 	}
3550       else
3551 	{
3552 	  if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3553 				DR_STEP (dr_info_b->dr), 0))
3554 	    length_factor = scalar_loop_iters;
3555 	  else
3556 	    length_factor = size_int (vect_factor);
3557 	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3558 	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3559 	}
3560       access_size_a = vect_vfa_access_size (dr_info_a);
3561       access_size_b = vect_vfa_access_size (dr_info_b);
3562       align_a = vect_vfa_align (dr_info_a);
3563       align_b = vect_vfa_align (dr_info_b);
3564 
3565       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3566 					DR_BASE_ADDRESS (dr_info_b->dr));
3567       if (comp_res == 0)
3568 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3569 					  DR_OFFSET (dr_info_b->dr));
3570 
3571       /* See whether the alias is known at compilation time.  */
3572       if (comp_res == 0
3573 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3574 	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3575 	  && poly_int_tree_p (segment_length_a)
3576 	  && poly_int_tree_p (segment_length_b))
3577 	{
3578 	  int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3579 					     segment_length_a,
3580 					     segment_length_b,
3581 					     access_size_a,
3582 					     access_size_b);
3583 	  if (res >= 0 && dump_enabled_p ())
3584 	    {
3585 	      dump_printf_loc (MSG_NOTE, vect_location,
3586 			       "can tell at compile time that %T and %T",
3587 			       DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3588 	      if (res == 0)
3589 		dump_printf (MSG_NOTE, " do not alias\n");
3590 	      else
3591 		dump_printf (MSG_NOTE, " alias\n");
3592 	    }
3593 
3594 	  if (res == 0)
3595 	    continue;
3596 
3597 	  if (res == 1)
3598 	    return opt_result::failure_at (stmt_info_b->stmt,
3599 					   "not vectorized:"
3600 					   " compilation time alias: %G%G",
3601 					   stmt_info_a->stmt,
3602 					   stmt_info_b->stmt);
3603 	}
3604 
3605       dr_with_seg_len_pair_t dr_with_seg_len_pair
3606 	(dr_with_seg_len (dr_info_a->dr, segment_length_a,
3607 			  access_size_a, align_a),
3608 	 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3609 			  access_size_b, align_b));
3610 
3611       /* Canonicalize pairs by sorting the two DR members.  */
3612       if (comp_res > 0)
3613 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3614 
3615       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3616     }
3617 
3618   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3619 
3620   unsigned int count = (comp_alias_ddrs.length ()
3621 			+ check_unequal_addrs.length ());
3622 
3623   if (dump_enabled_p ())
3624     dump_printf_loc (MSG_NOTE, vect_location,
3625 		     "improved number of alias checks from %d to %d\n",
3626 		     may_alias_ddrs.length (), count);
3627   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3628     return opt_result::failure_at
3629       (vect_location,
3630        "number of versioning for alias "
3631        "run-time tests exceeds %d "
3632        "(--param vect-max-version-for-alias-checks)\n",
3633        PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3634 
3635   return opt_result::success ();
3636 }
3637 
3638 /* Check whether we can use an internal function for a gather load
3639    or scatter store.  READ_P is true for loads and false for stores.
3640    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3641    the type of the memory elements being loaded or stored.  OFFSET_BITS
3642    is the number of bits in each scalar offset and OFFSET_SIGN is the
3643    sign of the offset.  SCALE is the amount by which the offset should
3644    be multiplied *after* it has been converted to address width.
3645 
3646    Return true if the function is supported, storing the function
3647    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
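/* For example, a gather of 32-bit vector elements with 64-bit scalar offsets
   is rejected, because the offsets cannot safely be truncated to the element
   width that the internal functions require.  */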
3648 
3649 bool
3650 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3651 			  tree memory_type, unsigned int offset_bits,
3652 			  signop offset_sign, int scale,
3653 			  internal_fn *ifn_out, tree *element_type_out)
3654 {
3655   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3656   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3657   if (offset_bits > element_bits)
3658     /* Internal functions require the offset to be the same width as
3659        the vector elements.  We can extend narrower offsets, but it isn't
3660        safe to truncate wider offsets.  */
3661     return false;
3662 
3663   if (element_bits != memory_bits)
3664     /* For now the vector elements must be the same width as the
3665        memory elements.  */
3666     return false;
3667 
3668   /* Work out which function we need.  */
3669   internal_fn ifn;
3670   if (read_p)
3671     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3672   else
3673     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3674 
3675   /* Test whether the target supports this combination.  */
3676   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3677 					       offset_sign, scale))
3678     return false;
3679 
3680   *ifn_out = ifn;
3681   *element_type_out = TREE_TYPE (vectype);
3682   return true;
3683 }
3684 
3685 /* STMT_INFO is a call to an internal gather load or scatter store function.
3686    Describe the operation in INFO.  */
3687 
3688 static void
3689 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3690 				   gather_scatter_info *info)
3691 {
3692   gcall *call = as_a <gcall *> (stmt_info->stmt);
3693   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3694   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3695 
3696   info->ifn = gimple_call_internal_fn (call);
3697   info->decl = NULL_TREE;
3698   info->base = gimple_call_arg (call, 0);
3699   info->offset = gimple_call_arg (call, 1);
3700   info->offset_dt = vect_unknown_def_type;
3701   info->offset_vectype = NULL_TREE;
3702   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3703   info->element_type = TREE_TYPE (vectype);
3704   info->memory_type = TREE_TYPE (DR_REF (dr));
3705 }
3706 
3707 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3708    gather load or scatter store.  Describe the operation in *INFO if so.  */
3709 
3710 bool
3711 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3712 			   gather_scatter_info *info)
3713 {
3714   HOST_WIDE_INT scale = 1;
3715   poly_int64 pbitpos, pbitsize;
3716   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3717   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3718   tree offtype = NULL_TREE;
3719   tree decl = NULL_TREE, base, off;
3720   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3721   tree memory_type = TREE_TYPE (DR_REF (dr));
3722   machine_mode pmode;
3723   int punsignedp, reversep, pvolatilep = 0;
3724   internal_fn ifn;
3725   tree element_type;
3726   bool masked_p = false;
3727 
3728   /* See whether this is already a call to a gather/scatter internal function.
3729      If not, see whether it's a masked load or store.  */
3730   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3731   if (call && gimple_call_internal_p (call))
3732     {
3733       ifn = gimple_call_internal_fn (call);
3734       if (internal_gather_scatter_fn_p (ifn))
3735 	{
3736 	  vect_describe_gather_scatter_call (stmt_info, info);
3737 	  return true;
3738 	}
3739       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3740     }
3741 
3742   /* True if we should aim to use internal functions rather than
3743      built-in functions.  */
3744   bool use_ifn_p = (DR_IS_READ (dr)
3745 		    ? supports_vec_gather_load_p ()
3746 		    : supports_vec_scatter_store_p ());
3747 
3748   base = DR_REF (dr);
3749   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3750      see if we can use the def stmt of the address.  */
3751   if (masked_p
3752       && TREE_CODE (base) == MEM_REF
3753       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3754       && integer_zerop (TREE_OPERAND (base, 1))
3755       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3756     {
3757       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3758       if (is_gimple_assign (def_stmt)
3759 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3760 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3761     }
3762 
3763   /* The gather and scatter builtins need address of the form
3764      loop_invariant + vector * {1, 2, 4, 8}
3765      or
3766      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3767      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3768      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3769      multiplications and additions in it.  To get a vector, we need
3770      a single SSA_NAME that will be defined in the loop and will
3771      contain everything that is not loop invariant and that can be
3772      vectorized.  The following code attempts to find such a preexisting
3773      SSA_NAME OFF and put the loop invariants into a tree BASE
3774      that can be gimplified before the loop.  */
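  /* For example, if OFF is originally i * 4 + 16 with loop-invariant 16,
     the constant 16 is folded into BASE, the multiplication becomes
     SCALE == 4 (when the target supports that scale) and OFF ends up
     being the SSA_NAME for i.  */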
3775   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3776 			      &punsignedp, &reversep, &pvolatilep);
3777   if (reversep)
3778     return false;
3779 
3780   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3781 
3782   if (TREE_CODE (base) == MEM_REF)
3783     {
3784       if (!integer_zerop (TREE_OPERAND (base, 1)))
3785 	{
3786 	  if (off == NULL_TREE)
3787 	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3788 	  else
3789 	    off = size_binop (PLUS_EXPR, off,
3790 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3791 	}
3792       base = TREE_OPERAND (base, 0);
3793     }
3794   else
3795     base = build_fold_addr_expr (base);
3796 
3797   if (off == NULL_TREE)
3798     off = size_zero_node;
3799 
3800   /* If BASE is not loop invariant, then either OFF is 0 and we start with
3801      just the constant offset in the loop-invariant BASE and continue with
3802      base as OFF, or we give up.
3803      We could handle the latter case by gimplifying the addition of
3804      base + off into some SSA_NAME and using that as OFF, but for now punt.  */
3805   if (!expr_invariant_in_loop_p (loop, base))
3806     {
3807       if (!integer_zerop (off))
3808 	return false;
3809       off = base;
3810       base = size_int (pbytepos);
3811     }
3812   /* Otherwise put base + constant offset into the loop invariant BASE
3813      and continue with OFF.  */
3814   else
3815     {
3816       base = fold_convert (sizetype, base);
3817       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3818     }
3819 
3820   /* OFF at this point may be either a SSA_NAME or some tree expression
3821      from get_inner_reference.  Try to peel off loop invariants from it
3822      into BASE as long as possible.  */
3823   STRIP_NOPS (off);
3824   while (offtype == NULL_TREE)
3825     {
3826       enum tree_code code;
3827       tree op0, op1, add = NULL_TREE;
3828 
3829       if (TREE_CODE (off) == SSA_NAME)
3830 	{
3831 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3832 
3833 	  if (expr_invariant_in_loop_p (loop, off))
3834 	    return false;
3835 
3836 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3837 	    break;
3838 
3839 	  op0 = gimple_assign_rhs1 (def_stmt);
3840 	  code = gimple_assign_rhs_code (def_stmt);
3841 	  op1 = gimple_assign_rhs2 (def_stmt);
3842 	}
3843       else
3844 	{
3845 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3846 	    return false;
3847 	  code = TREE_CODE (off);
3848 	  extract_ops_from_tree (off, &code, &op0, &op1);
3849 	}
3850       switch (code)
3851 	{
3852 	case POINTER_PLUS_EXPR:
3853 	case PLUS_EXPR:
3854 	  if (expr_invariant_in_loop_p (loop, op0))
3855 	    {
3856 	      add = op0;
3857 	      off = op1;
3858 	    do_add:
3859 	      add = fold_convert (sizetype, add);
3860 	      if (scale != 1)
3861 		add = size_binop (MULT_EXPR, add, size_int (scale));
3862 	      base = size_binop (PLUS_EXPR, base, add);
3863 	      continue;
3864 	    }
3865 	  if (expr_invariant_in_loop_p (loop, op1))
3866 	    {
3867 	      add = op1;
3868 	      off = op0;
3869 	      goto do_add;
3870 	    }
3871 	  break;
3872 	case MINUS_EXPR:
3873 	  if (expr_invariant_in_loop_p (loop, op1))
3874 	    {
3875 	      add = fold_convert (sizetype, op1);
3876 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3877 	      off = op0;
3878 	      goto do_add;
3879 	    }
3880 	  break;
3881 	case MULT_EXPR:
3882 	  if (scale == 1 && tree_fits_shwi_p (op1))
3883 	    {
3884 	      int new_scale = tree_to_shwi (op1);
3885 	      /* Only treat this as a scaling operation if the target
3886 		 supports it.  */
3887 	      if (use_ifn_p
3888 		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3889 						vectype, memory_type, 1,
3890 						TYPE_SIGN (TREE_TYPE (op0)),
3891 						new_scale, &ifn,
3892 						&element_type))
3893 		break;
3894 	      scale = new_scale;
3895 	      off = op0;
3896 	      continue;
3897 	    }
3898 	  break;
3899 	case SSA_NAME:
3900 	  off = op0;
3901 	  continue;
3902 	CASE_CONVERT:
3903 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3904 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3905 	    break;
3906 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3907 	      == TYPE_PRECISION (TREE_TYPE (off)))
3908 	    {
3909 	      off = op0;
3910 	      continue;
3911 	    }
3912 
3913 	  /* The internal functions need the offset to be the same width
3914 	     as the elements of VECTYPE.  Don't include operations that
3915 	     cast the offset from that width to a different width.  */
3916 	  if (use_ifn_p
3917 	      && (int_size_in_bytes (TREE_TYPE (vectype))
3918 		  == int_size_in_bytes (TREE_TYPE (off))))
3919 	    break;
3920 
3921 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3922 	      < TYPE_PRECISION (TREE_TYPE (off)))
3923 	    {
3924 	      off = op0;
3925 	      offtype = TREE_TYPE (off);
3926 	      STRIP_NOPS (off);
3927 	      continue;
3928 	    }
3929 	  break;
3930 	default:
3931 	  break;
3932 	}
3933       break;
3934     }
3935 
3936   /* If at the end OFF still isn't a SSA_NAME or isn't
3937      defined in the loop, punt.  */
3938   if (TREE_CODE (off) != SSA_NAME
3939       || expr_invariant_in_loop_p (loop, off))
3940     return false;
3941 
3942   if (offtype == NULL_TREE)
3943     offtype = TREE_TYPE (off);
3944 
3945   if (use_ifn_p)
3946     {
3947       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3948 				     memory_type, TYPE_PRECISION (offtype),
3949 				     TYPE_SIGN (offtype), scale, &ifn,
3950 				     &element_type))
3951 	return false;
3952     }
3953   else
3954     {
3955       if (DR_IS_READ (dr))
3956 	{
3957 	  if (targetm.vectorize.builtin_gather)
3958 	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3959 	}
3960       else
3961 	{
3962 	  if (targetm.vectorize.builtin_scatter)
3963 	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3964 	}
3965 
3966       if (!decl)
3967 	return false;
3968 
3969       ifn = IFN_LAST;
3970       element_type = TREE_TYPE (vectype);
3971     }
3972 
3973   info->ifn = ifn;
3974   info->decl = decl;
3975   info->base = base;
3976   info->offset = off;
3977   info->offset_dt = vect_unknown_def_type;
3978   info->offset_vectype = NULL_TREE;
3979   info->scale = scale;
3980   info->element_type = element_type;
3981   info->memory_type = memory_type;
3982   return true;
3983 }
3984 
3985 /* Find the data references in STMT, analyze them with respect to LOOP and
3986    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3987    be handled.  */
3988 
3989 opt_result
3990 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3991 			       vec<data_reference_p> *datarefs)
3992 {
3993   /* We can ignore clobbers for dataref analysis - they are removed during
3994      loop vectorization and BB vectorization checks dependences with a
3995      stmt walk.  */
3996   if (gimple_clobber_p (stmt))
3997     return opt_result::success ();
3998 
3999   if (gimple_has_volatile_ops (stmt))
4000     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4001 				   stmt);
4002 
4003   if (stmt_can_throw_internal (cfun, stmt))
4004     return opt_result::failure_at (stmt,
4005 				   "not vectorized:"
4006 				   " statement can throw an exception: %G",
4007 				   stmt);
4008 
4009   auto_vec<data_reference_p, 2> refs;
4010   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4011   if (!res)
4012     return res;
4013 
4014   if (refs.is_empty ())
4015     return opt_result::success ();
4016 
4017   if (refs.length () > 1)
4018     return opt_result::failure_at (stmt,
4019 				   "not vectorized:"
4020 				   " more than one data ref in stmt: %G", stmt);
4021 
4022   if (gcall *call = dyn_cast <gcall *> (stmt))
4023     if (!gimple_call_internal_p (call)
4024 	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4025 	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4026       return opt_result::failure_at (stmt,
4027 				     "not vectorized: dr in a call %G", stmt);
4028 
4029   data_reference_p dr = refs.pop ();
4030   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4031       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4032     return opt_result::failure_at (stmt,
4033 				   "not vectorized:"
4034 				   " statement is bitfield access %G", stmt);
4035 
4036   if (DR_BASE_ADDRESS (dr)
4037       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4038     return opt_result::failure_at (stmt,
4039 				   "not vectorized:"
4040 				   " base addr of dr is a constant\n");
4041 
4042   /* Check whether this may be a SIMD lane access and adjust the
4043      DR to make it easier for us to handle it.  */
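  /* The pattern recognized below is an offset of the form
     SSA_NAME * TYPE_SIZE_UNIT (ref), where SSA_NAME is defined by an
     IFN_GOMP_SIMD_LANE call for this loop's simduid; such a DR is rewritten
     to have a zero offset and a step of one element and is marked as a
     simd-lane access.  */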
4044   if (loop
4045       && loop->simduid
4046       && (!DR_BASE_ADDRESS (dr)
4047 	  || !DR_OFFSET (dr)
4048 	  || !DR_INIT (dr)
4049 	  || !DR_STEP (dr)))
4050     {
4051       struct data_reference *newdr
4052 	= create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4053 			   DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4054       if (DR_BASE_ADDRESS (newdr)
4055 	  && DR_OFFSET (newdr)
4056 	  && DR_INIT (newdr)
4057 	  && DR_STEP (newdr)
4058 	  && integer_zerop (DR_STEP (newdr)))
4059 	{
4060 	  tree off = DR_OFFSET (newdr);
4061 	  STRIP_NOPS (off);
4062 	  if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4063 	      && TREE_CODE (off) == MULT_EXPR
4064 	      && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4065 	    {
4066 	      tree step = TREE_OPERAND (off, 1);
4067 	      off = TREE_OPERAND (off, 0);
4068 	      STRIP_NOPS (off);
4069 	      if (CONVERT_EXPR_P (off)
4070 		  && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4071 		      < TYPE_PRECISION (TREE_TYPE (off))))
4072 		off = TREE_OPERAND (off, 0);
4073 	      if (TREE_CODE (off) == SSA_NAME)
4074 		{
4075 		  gimple *def = SSA_NAME_DEF_STMT (off);
4076 		  tree reft = TREE_TYPE (DR_REF (newdr));
4077 		  if (is_gimple_call (def)
4078 		      && gimple_call_internal_p (def)
4079 		      && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4080 		    {
4081 		      tree arg = gimple_call_arg (def, 0);
4082 		      gcc_assert (TREE_CODE (arg) == SSA_NAME);
4083 		      arg = SSA_NAME_VAR (arg);
4084 		      if (arg == loop->simduid
4085 			  /* For now.  */
4086 			  && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4087 			{
4088 			  DR_OFFSET (newdr) = ssize_int (0);
4089 			  DR_STEP (newdr) = step;
4090 			  DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4091 			  DR_STEP_ALIGNMENT (newdr)
4092 			    = highest_pow2_factor (step);
4093 			  /* Mark as simd-lane access.  */
4094 			  newdr->aux = (void *)-1;
4095 			  free_data_ref (dr);
4096 			  datarefs->safe_push (newdr);
4097 			  return opt_result::success ();
4098 			}
4099 		    }
4100 		}
4101 	    }
4102 	}
4103       free_data_ref (newdr);
4104     }
4105 
4106   datarefs->safe_push (dr);
4107   return opt_result::success ();
4108 }
4109 
4110 /* Function vect_analyze_data_refs.
4111 
4112   Find all the data references in the loop or basic block.
4113 
4114    The general structure of the analysis of data refs in the vectorizer is as
4115    follows:
4116    1- vect_analyze_data_refs(loop/bb): call
4117       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4118       in the loop/bb and their dependences.
4119    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4120    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4121    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4122 
4123 */
4124 
4125 opt_result
4126 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4127 {
4128   struct loop *loop = NULL;
4129   unsigned int i;
4130   struct data_reference *dr;
4131   tree scalar_type;
4132 
4133   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4134 
4135   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4136     loop = LOOP_VINFO_LOOP (loop_vinfo);
4137 
4138   /* Go through the data-refs, check that the analysis succeeded.  Update
4139      pointer from stmt_vec_info struct to DR and vectype.  */
4140 
4141   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4142   FOR_EACH_VEC_ELT (datarefs, i, dr)
4143     {
4144       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4145       poly_uint64 vf;
4146 
4147       gcc_assert (DR_REF (dr));
4148       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4149       gcc_assert (!stmt_info->dr_aux.dr);
4150       stmt_info->dr_aux.dr = dr;
4151       stmt_info->dr_aux.stmt = stmt_info;
4152 
4153       /* Check that analysis of the data-ref succeeded.  */
4154       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4155 	  || !DR_STEP (dr))
4156         {
4157 	  bool maybe_gather
4158 	    = DR_IS_READ (dr)
4159 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4160 	      && (targetm.vectorize.builtin_gather != NULL
4161 		  || supports_vec_gather_load_p ());
4162 	  bool maybe_scatter
4163 	    = DR_IS_WRITE (dr)
4164 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4165 	      && (targetm.vectorize.builtin_scatter != NULL
4166 		  || supports_vec_scatter_store_p ());
4167 
4168 	  /* If the target supports vector gather loads or scatter stores,
4169 	     see whether they can be used.  */
4170 	  if (is_a <loop_vec_info> (vinfo)
4171 	      && !nested_in_vect_loop_p (loop, stmt_info))
4172 	    {
4173 	      if (maybe_gather || maybe_scatter)
4174 		{
4175 		  if (maybe_gather)
4176 		    gatherscatter = GATHER;
4177 		  else
4178 		    gatherscatter = SCATTER;
4179 		}
4180 	    }
4181 
4182 	  if (gatherscatter == SG_NONE)
4183 	    {
4184 	      if (dump_enabled_p ())
4185 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4186 				 "not vectorized: data ref analysis "
4187 				 "failed %G", stmt_info->stmt);
4188 	      if (is_a <bb_vec_info> (vinfo))
4189 		{
4190 		  /* In BB vectorization the ref can still participate
4191 		     in dependence analysis, we just can't vectorize it.  */
4192 		  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4193 		  continue;
4194 		}
4195 	      return opt_result::failure_at (stmt_info->stmt,
4196 					     "not vectorized:"
4197 					     " data ref analysis failed: %G",
4198 					     stmt_info->stmt);
4199 	    }
4200         }
4201 
4202       /* See if this was detected as a SIMD lane access.  */
4203       if (dr->aux == (void *)-1)
4204 	{
4205 	  if (nested_in_vect_loop_p (loop, stmt_info))
4206 	    return opt_result::failure_at (stmt_info->stmt,
4207 					   "not vectorized:"
4208 					   " data ref analysis failed: %G",
4209 					   stmt_info->stmt);
4210 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4211 	}
4212 
4213       tree base = get_base_address (DR_REF (dr));
4214       if (base && VAR_P (base) && DECL_NONALIASED (base))
4215 	{
4216           if (dump_enabled_p ())
4217 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4218 			     "not vectorized: base object not addressable "
4219 			     "for stmt: %G", stmt_info->stmt);
4220           if (is_a <bb_vec_info> (vinfo))
4221 	    {
4222 	      /* In BB vectorization the ref can still participate
4223 	         in dependence analysis, we just can't vectorize it.  */
4224 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4225 	      continue;
4226 	    }
4227 	  return opt_result::failure_at (stmt_info->stmt,
4228 					 "not vectorized: base object not"
4229 					 " addressable for stmt: %G",
4230 					 stmt_info->stmt);
4231 	}
4232 
4233       if (is_a <loop_vec_info> (vinfo)
4234 	  && DR_STEP (dr)
4235 	  && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4236 	{
4237 	  if (nested_in_vect_loop_p (loop, stmt_info))
4238 	    return opt_result::failure_at (stmt_info->stmt,
4239 					   "not vectorized:"
4240 					   " not suitable for strided load %G",
4241 					   stmt_info->stmt);
4242 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4243 	}
4244 
4245       /* Update DR field in stmt_vec_info struct.  */
4246 
4247       /* If the dataref is in an inner-loop of the loop that is considered
4248 	 for vectorization, we also want to analyze the access relative to
4249 	 the outer-loop (DR contains information only relative to the
4250 	 inner-most enclosing loop).  We do that by building a reference to the
4251 	 first location accessed by the inner-loop, and analyze it relative to
4252 	 the outer-loop.  */
4253       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4254 	{
4255 	  /* Build a reference to the first location accessed by the
4256 	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4257 	     this address must be invariant in the inner loop, so we
4258 	     can consider it as being used in the outer loop.  */
4259 	  tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4260 	  tree offset = unshare_expr (DR_OFFSET (dr));
4261 	  tree init = unshare_expr (DR_INIT (dr));
4262 	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4263 					  init, offset);
4264 	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4265 	  tree init_ref = build_fold_indirect_ref (init_addr);
4266 
4267 	  if (dump_enabled_p ())
4268 	    dump_printf_loc (MSG_NOTE, vect_location,
4269 			     "analyze in outer loop: %T\n", init_ref);
4270 
4271 	  opt_result res
4272 	    = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4273 				    init_ref, loop, stmt_info->stmt);
4274 	  if (!res)
4275 	    /* dr_analyze_innermost already explained the failure.  */
4276 	    return res;
4277 
4278           if (dump_enabled_p ())
4279 	    dump_printf_loc (MSG_NOTE, vect_location,
4280 			     "\touter base_address: %T\n"
4281 			     "\touter offset from base address: %T\n"
4282 			     "\touter constant offset from base address: %T\n"
4283 			     "\touter step: %T\n"
4284 			     "\touter base alignment: %d\n\n"
4285 			     "\touter base misalignment: %d\n"
4286 			     "\touter offset alignment: %d\n"
4287 			     "\touter step alignment: %d\n",
4288 			     STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4289 			     STMT_VINFO_DR_OFFSET (stmt_info),
4290 			     STMT_VINFO_DR_INIT (stmt_info),
4291 			     STMT_VINFO_DR_STEP (stmt_info),
4292 			     STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4293 			     STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4294 			     STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4295 			     STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4296 	}
4297 
4298       /* Set vectype for STMT.  */
4299       scalar_type = TREE_TYPE (DR_REF (dr));
4300       STMT_VINFO_VECTYPE (stmt_info)
4301 	= get_vectype_for_scalar_type (scalar_type);
4302       if (!STMT_VINFO_VECTYPE (stmt_info))
4303         {
4304           if (dump_enabled_p ())
4305             {
4306               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4307                                "not vectorized: no vectype for stmt: %G",
4308 			       stmt_info->stmt);
4309               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4310               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4311                                  scalar_type);
4312               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4313             }
4314 
4315           if (is_a <bb_vec_info> (vinfo))
4316 	    {
4317 	      /* No vector type is fine, the ref can still participate
4318 	         in dependence analysis, we just can't vectorize it.  */
4319 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4320 	      continue;
4321 	    }
4322 	  return opt_result::failure_at (stmt_info->stmt,
4323 					 "not vectorized:"
4324 					 " no vectype for stmt: %G"
4325 					 " scalar_type: %T\n",
4326 					 stmt_info->stmt, scalar_type);
4327         }
4328       else
4329 	{
4330 	  if (dump_enabled_p ())
4331 	    dump_printf_loc (MSG_NOTE, vect_location,
4332 			     "got vectype for stmt: %G%T\n",
4333 			     stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4334 	}
4335 
4336       /* Adjust the minimal vectorization factor according to the
4337 	 vector type.  */
4338       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4339       *min_vf = upper_bound (*min_vf, vf);
4340 
4341       if (gatherscatter != SG_NONE)
4342 	{
4343 	  gather_scatter_info gs_info;
4344 	  if (!vect_check_gather_scatter (stmt_info,
4345 					  as_a <loop_vec_info> (vinfo),
4346 					  &gs_info)
4347 	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4348 	    return opt_result::failure_at
4349 	      (stmt_info->stmt,
4350 	       (gatherscatter == GATHER) ?
4351 	       "not vectorized: not suitable for gather load %G" :
4352 	       "not vectorized: not suitable for scatter store %G",
4353 	       stmt_info->stmt);
4354 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4355 	}
4356     }
4357 
4358   /* We used to stop processing and prune the list here.  Verify we no
4359      longer need to.  */
4360   gcc_assert (i == datarefs.length ());
4361 
4362   return opt_result::success ();
4363 }
4364 
4365 
4366 /* Function vect_get_new_vect_var.
4367 
4368    Returns a name for a new variable.  The current naming scheme prepends
4369    one of the prefixes "vect_", "vectp_", "stmp_" or "mask_" (depending on
4370    the value of VAR_KIND) to NAME if provided, and otherwise uses the bare
4371    prefix.  */
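/* For example, vect_get_new_vect_var (vectype, vect_simple_var, "x") creates
   a temporary whose name is based on "vect_x", while a NULL NAME yields a
   temporary named after the bare prefix alone.  */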
4372 
4373 tree
4374 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4375 {
4376   const char *prefix;
4377   tree new_vect_var;
4378 
4379   switch (var_kind)
4380   {
4381   case vect_simple_var:
4382     prefix = "vect";
4383     break;
4384   case vect_scalar_var:
4385     prefix = "stmp";
4386     break;
4387   case vect_mask_var:
4388     prefix = "mask";
4389     break;
4390   case vect_pointer_var:
4391     prefix = "vectp";
4392     break;
4393   default:
4394     gcc_unreachable ();
4395   }
4396 
4397   if (name)
4398     {
4399       char* tmp = concat (prefix, "_", name, NULL);
4400       new_vect_var = create_tmp_reg (type, tmp);
4401       free (tmp);
4402     }
4403   else
4404     new_vect_var = create_tmp_reg (type, prefix);
4405 
4406   return new_vect_var;
4407 }
4408 
4409 /* Like vect_get_new_vect_var but return an SSA name.  */
4410 
4411 tree
4412 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4413 {
4414   const char *prefix;
4415   tree new_vect_var;
4416 
4417   switch (var_kind)
4418   {
4419   case vect_simple_var:
4420     prefix = "vect";
4421     break;
4422   case vect_scalar_var:
4423     prefix = "stmp";
4424     break;
4425   case vect_pointer_var:
4426     prefix = "vectp";
4427     break;
4428   default:
4429     gcc_unreachable ();
4430   }
4431 
4432   if (name)
4433     {
4434       char* tmp = concat (prefix, "_", name, NULL);
4435       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4436       free (tmp);
4437     }
4438   else
4439     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4440 
4441   return new_vect_var;
4442 }
4443 
4444 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO.  */
4445 
4446 static void
4447 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4448 {
4449   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4450   int misalign = DR_MISALIGNMENT (dr_info);
4451   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4452     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4453   else
4454     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4455 			    known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4456 			    misalign);
4457 }
4458 
4459 /* Function vect_create_addr_base_for_vector_ref.
4460 
4461    Create an expression that computes the address of the first memory location
4462    that will be accessed for a data reference.
4463 
4464    Input:
4465    STMT_INFO: The statement containing the data reference.
4466    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4467    OFFSET: Optional.  If supplied, it is added to the initial address.
4468    LOOP:    Specify relative to which loop-nest should the address be computed.
4469             For example, when the dataref is in an inner-loop nested in an
4470 	    outer-loop that is now being vectorized, LOOP can be either the
4471 	    outer-loop, or the inner-loop.  The first memory location accessed
4472 	    by the following dataref ('in' points to short):
4473 
4474 		for (i=0; i<N; i++)
4475 		   for (j=0; j<M; j++)
4476 		     s += in[i+j]
4477 
4478 	    is as follows:
4479 	    if LOOP=i_loop:	&in		(relative to i_loop)
4480 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4481    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4482 	    initial address.  Unlike OFFSET, which is number of elements to
4483 	    be added, BYTE_OFFSET is measured in bytes.
4484 
4485    Output:
4486    1. Return an SSA_NAME whose value is the address of the memory location of
4487       the first vector of the data reference.
4488    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4489       these statement(s) which define the returned SSA_NAME.
4490 
4491    FORNOW: We are only handling array accesses with step 1.  */
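/* For example (an illustration, not generated code): for a dataref A[i] with
   int elements, base address &A, offset 0 and init 0, calling this function
   with OFFSET == 4 yields the address &A p+ (0 + 4 * 4), i.e. &A[4]; passing
   BYTE_OFFSET == 16 instead gives the same result.  */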
4492 
4493 tree
4494 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4495 				      gimple_seq *new_stmt_list,
4496 				      tree offset,
4497 				      tree byte_offset)
4498 {
4499   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4500   struct data_reference *dr = dr_info->dr;
4501   const char *base_name;
4502   tree addr_base;
4503   tree dest;
4504   gimple_seq seq = NULL;
4505   tree vect_ptr_type;
4506   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4507   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4508   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4509 
4510   tree data_ref_base = unshare_expr (drb->base_address);
4511   tree base_offset = unshare_expr (drb->offset);
4512   tree init = unshare_expr (drb->init);
4513 
4514   if (loop_vinfo)
4515     base_name = get_name (data_ref_base);
4516   else
4517     {
4518       base_offset = ssize_int (0);
4519       init = ssize_int (0);
4520       base_name = get_name (DR_REF (dr));
4521     }
4522 
4523   /* Create base_offset */
4524   base_offset = size_binop (PLUS_EXPR,
4525 			    fold_convert (sizetype, base_offset),
4526 			    fold_convert (sizetype, init));
4527 
4528   if (offset)
4529     {
4530       offset = fold_build2 (MULT_EXPR, sizetype,
4531 			    fold_convert (sizetype, offset), step);
4532       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4533 				 base_offset, offset);
4534     }
4535   if (byte_offset)
4536     {
4537       byte_offset = fold_convert (sizetype, byte_offset);
4538       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4539 				 base_offset, byte_offset);
4540     }
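  /* BASE_OFFSET now holds the sum of the dataref's offset and init parts
     plus OFFSET scaled by the element size and BYTE_OFFSET, all in bytes.  */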
4541 
4542   /* base + base_offset */
4543   if (loop_vinfo)
4544     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4545   else
4546     {
4547       addr_base = build1 (ADDR_EXPR,
4548 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4549 			  unshare_expr (DR_REF (dr)));
4550     }
4551 
4552   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4553   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4554   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4555   gimple_seq_add_seq (new_stmt_list, seq);
4556 
4557   if (DR_PTR_INFO (dr)
4558       && TREE_CODE (addr_base) == SSA_NAME
4559       && !SSA_NAME_PTR_INFO (addr_base))
4560     {
4561       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4562       if (offset || byte_offset)
4563 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4564     }
4565 
4566   if (dump_enabled_p ())
4567     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4568 
4569   return addr_base;
4570 }
4571 
4572 
4573 /* Function vect_create_data_ref_ptr.
4574 
4575    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4576    location accessed in the loop by STMT_INFO, along with the def-use update
4577    chain to appropriately advance the pointer through the loop iterations.
4578    Also set aliasing information for the pointer.  This pointer is used by
4579    the callers to this function to create a memory reference expression for
4580    vector load/store access.
4581 
4582    Input:
4583    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4584          GIMPLE_ASSIGN <name, data-ref> or
4585 	 GIMPLE_ASSIGN <data-ref, name>.
4586    2. AGGR_TYPE: the type of the reference, which should be either a vector
4587         or an array.
4588    3. AT_LOOP: the loop where the vector memref is to be created.
4589    4. OFFSET (optional): an offset to be added to the initial address accessed
4590 	by the data-ref in STMT_INFO.
4591    5. BSI: location where the new stmts are to be placed if there is no loop.
4592    6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remains
4593         pointing to the initial address.
4594    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4595 	to the initial address accessed by the data-ref in STMT_INFO.  This is
4596 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4597 	in bytes.
4598    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4599 	to the IV during each iteration of the loop.  NULL says to move
4600 	by one copy of AGGR_TYPE up or down, depending on the step of the
4601 	data reference.
4602 
4603    Output:
4604    1. Declare a new ptr to vector_type, and have it point to the base of the
4605       data reference (initial address accessed by the data reference).
4606       For example, for vector of type V8HI, the following code is generated:
4607 
4608       v8hi *ap;
4609       ap = (v8hi *)initial_address;
4610 
4611       if OFFSET is not supplied:
4612          initial_address = &a[init];
4613       if OFFSET is supplied:
4614          initial_address = &a[init + OFFSET];
4615       if BYTE_OFFSET is supplied:
4616 	 initial_address = &a[init] + BYTE_OFFSET;
4617 
4618       Return the initial_address in INITIAL_ADDRESS.
4619 
4620    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4621       update the pointer in each iteration of the loop.
4622 
4623       Return the increment stmt that updates the pointer in PTR_INCR.
4624 
4625    3. Return the pointer.  */
4626 
4627 tree
4628 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4629 			  struct loop *at_loop, tree offset,
4630 			  tree *initial_address, gimple_stmt_iterator *gsi,
4631 			  gimple **ptr_incr, bool only_init,
4632 			  tree byte_offset, tree iv_step)
4633 {
4634   const char *base_name;
4635   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4636   struct loop *loop = NULL;
4637   bool nested_in_vect_loop = false;
4638   struct loop *containing_loop = NULL;
4639   tree aggr_ptr_type;
4640   tree aggr_ptr;
4641   tree new_temp;
4642   gimple_seq new_stmt_list = NULL;
4643   edge pe = NULL;
4644   basic_block new_bb;
4645   tree aggr_ptr_init;
4646   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4647   struct data_reference *dr = dr_info->dr;
4648   tree aptr;
4649   gimple_stmt_iterator incr_gsi;
4650   bool insert_after;
4651   tree indx_before_incr, indx_after_incr;
4652   gimple *incr;
4653   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4654 
4655   gcc_assert (iv_step != NULL_TREE
4656 	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4657 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4658 
4659   if (loop_vinfo)
4660     {
4661       loop = LOOP_VINFO_LOOP (loop_vinfo);
4662       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4663       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4664       pe = loop_preheader_edge (loop);
4665     }
4666   else
4667     {
4668       gcc_assert (bb_vinfo);
4669       only_init = true;
4670       *ptr_incr = NULL;
4671     }
4672 
4673   /* Create an expression for the first address accessed by this load
4674      in LOOP.  */
4675   base_name = get_name (DR_BASE_ADDRESS (dr));
4676 
4677   if (dump_enabled_p ())
4678     {
4679       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4680       dump_printf_loc (MSG_NOTE, vect_location,
4681                        "create %s-pointer variable to type: %T",
4682 		       get_tree_code_name (TREE_CODE (aggr_type)),
4683 		       aggr_type);
4684       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4685         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4686       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4687         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4688       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4689         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4690       else
4691         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4692       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4693     }
4694 
4695   /* (1) Create the new aggregate-pointer variable.
4696      Vector and array types inherit the alias set of their component
4697      type by default, so if the alias set of the original data reference
4698      does not conflict with that of the new aggregate reference (e.g.
4699      because the accessed object is not addressable), use a ref-all pointer.  */
4700   bool need_ref_all = false;
4701   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4702 			      get_alias_set (DR_REF (dr))))
4703     need_ref_all = true;
4704   /* Likewise for any of the data references in the stmt group.  */
4705   else if (DR_GROUP_SIZE (stmt_info) > 1)
4706     {
4707       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4708       do
4709 	{
4710 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4711 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4712 				      get_alias_set (DR_REF (sdr))))
4713 	    {
4714 	      need_ref_all = true;
4715 	      break;
4716 	    }
4717 	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4718 	}
4719       while (sinfo);
4720     }
4721   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4722 					       need_ref_all);
4723   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4724 
4725 
4726   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4727      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4728      def-use update cycles for the pointer: one relative to the outer-loop
4729      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4730      to the inner-loop (which is the inner-most loop containing the dataref),
4731      and this is done by step (5) below.
4732 
4733      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4734      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4735      redundant.  Steps (3),(4) create the following:
4736 
4737 	vp0 = &base_addr;
4738 	LOOP:	vp1 = phi(vp0,vp2)
4739 		...
4740 		...
4741 		vp2 = vp1 + step
4742 		goto LOOP
4743 
4744      If there is an inner-loop nested in loop, then step (5) will also be
4745      applied, and an additional update in the inner-loop will be created:
4746 
4747 	vp0 = &base_addr;
4748 	LOOP:   vp1 = phi(vp0,vp2)
4749 		...
4750         inner:     vp3 = phi(vp1,vp4)
4751 	           vp4 = vp3 + inner_step
4752 	           if () goto inner
4753 		...
4754 		vp2 = vp1 + step
4755 		if () goto LOOP   */
4756 
4757   /* (2) Calculate the initial address of the aggregate-pointer, and set
4758      the aggregate-pointer to point to it before the loop.  */
4759 
4760   /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader.  */
4761 
4762   new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4763 						   offset, byte_offset);
4764   if (new_stmt_list)
4765     {
4766       if (pe)
4767         {
4768           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4769           gcc_assert (!new_bb);
4770         }
4771       else
4772         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4773     }
4774 
4775   *initial_address = new_temp;
4776   aggr_ptr_init = new_temp;
4777 
4778   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4779      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4780      inner-loop nested in LOOP (during outer-loop vectorization).  */
4781 
4782   /* No update in loop is required.  */
4783   if (only_init && (!loop_vinfo || at_loop == loop))
4784     aptr = aggr_ptr_init;
4785   else
4786     {
4787       /* Accesses to invariant addresses should be handled specially
4788 	 by the caller.  */
4789       tree step = vect_dr_behavior (dr_info)->step;
4790       gcc_assert (!integer_zerop (step));
4791 
4792       if (iv_step == NULL_TREE)
4793 	{
4794 	  /* The step of the aggregate pointer is the type size,
4795 	     negated for downward accesses.  */
4796 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
4797 	  if (tree_int_cst_sgn (step) == -1)
4798 	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4799 	}
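      /* For example, with a V4SI aggregate and a forward step this advances
	 the pointer by 16 bytes per vectorized iteration.  */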
4800 
4801       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4802 
4803       create_iv (aggr_ptr_init,
4804 		 fold_convert (aggr_ptr_type, iv_step),
4805 		 aggr_ptr, loop, &incr_gsi, insert_after,
4806 		 &indx_before_incr, &indx_after_incr);
4807       incr = gsi_stmt (incr_gsi);
4808       loop_vinfo->add_stmt (incr);
4809 
4810       /* Copy the points-to information if it exists. */
4811       if (DR_PTR_INFO (dr))
4812 	{
4813 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4814 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4815 	}
4816       if (ptr_incr)
4817 	*ptr_incr = incr;
4818 
4819       aptr = indx_before_incr;
4820     }
4821 
4822   if (!nested_in_vect_loop || only_init)
4823     return aptr;
4824 
4825 
4826   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4827      nested in LOOP, if it exists.  */
4828 
4829   gcc_assert (nested_in_vect_loop);
4830   if (!only_init)
4831     {
4832       standard_iv_increment_position (containing_loop, &incr_gsi,
4833 				      &insert_after);
4834       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4835 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4836 		 &indx_after_incr);
4837       incr = gsi_stmt (incr_gsi);
4838       loop_vinfo->add_stmt (incr);
4839 
4840       /* Copy the points-to information if it exists. */
4841       if (DR_PTR_INFO (dr))
4842 	{
4843 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4844 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4845 	}
4846       if (ptr_incr)
4847 	*ptr_incr = incr;
4848 
4849       return indx_before_incr;
4850     }
4851   else
4852     gcc_unreachable ();
4853 }
4854 
4855 
4856 /* Function bump_vector_ptr
4857 
4858    Increment a pointer (to a vector type) by vector-size. If requested,
4859    i.e. if PTR-INCR is given, then also connect the new increment stmt
4860    to the existing def-use update-chain of the pointer, by modifying
4861    the PTR_INCR as illustrated below:
4862 
4863    The pointer def-use update-chain before this function:
4864                         DATAREF_PTR = phi (p_0, p_2)
4865                         ....
4866         PTR_INCR:       p_2 = DATAREF_PTR + step
4867 
4868    The pointer def-use update-chain after this function:
4869                         DATAREF_PTR = phi (p_0, p_2)
4870                         ....
4871                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4872                         ....
4873         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4874 
4875    Input:
4876    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4877                  in the loop.
4878    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4879 	      the loop.  The increment amount across iterations is expected
4880 	      to be vector_size.
4881    BSI - location where the new update stmt is to be placed.
4882    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4883    BUMP - optional. The offset by which to bump the pointer. If not given,
4884 	  the offset is assumed to be vector_size.
4885 
4886    Output: Return NEW_DATAREF_PTR as illustrated above.
4887 
4888 */
4889 
4890 tree
4891 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4892 		 stmt_vec_info stmt_info, tree bump)
4893 {
4894   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4895   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4896   tree update = TYPE_SIZE_UNIT (vectype);
4897   gassign *incr_stmt;
4898   ssa_op_iter iter;
4899   use_operand_p use_p;
4900   tree new_dataref_ptr;
4901 
4902   if (bump)
4903     update = bump;
4904 
4905   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4906     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4907   else
4908     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4909   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4910 				   dataref_ptr, update);
4911   vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4912 
4913   /* Copy the points-to information if it exists. */
4914   if (DR_PTR_INFO (dr))
4915     {
4916       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4917       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4918     }
4919 
4920   if (!ptr_incr)
4921     return new_dataref_ptr;
4922 
4923   /* Update the vector-pointer's cross-iteration increment.  */
4924   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4925     {
4926       tree use = USE_FROM_PTR (use_p);
4927 
4928       if (use == dataref_ptr)
4929         SET_USE (use_p, new_dataref_ptr);
4930       else
4931         gcc_assert (operand_equal_p (use, update, 0));
4932     }
4933 
4934   return new_dataref_ptr;
4935 }
4936 
4937 
4938 /* Copy memory reference info such as base/clique from the SRC reference
4939    to the DEST MEM_REF.  */
4940 
4941 void
4942 vect_copy_ref_info (tree dest, tree src)
4943 {
4944   if (TREE_CODE (dest) != MEM_REF)
4945     return;
4946 
4947   tree src_base = src;
4948   while (handled_component_p (src_base))
4949     src_base = TREE_OPERAND (src_base, 0);
4950   if (TREE_CODE (src_base) != MEM_REF
4951       && TREE_CODE (src_base) != TARGET_MEM_REF)
4952     return;
4953 
4954   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4955   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4956 }
4957 
4958 
4959 /* Function vect_create_destination_var.
4960 
4961    Create a new temporary of type VECTYPE.  */
4962 
4963 tree
4964 vect_create_destination_var (tree scalar_dest, tree vectype)
4965 {
4966   tree vec_dest;
4967   const char *name;
4968   char *new_name;
4969   tree type;
4970   enum vect_var_kind kind;
4971 
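  /* Use a mask temporary for vector booleans, a plain vector temporary for
     other vector types, and a scalar temporary when no VECTYPE is given.  */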
4972   kind = vectype
4973     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4974     ? vect_mask_var
4975     : vect_simple_var
4976     : vect_scalar_var;
4977   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4978 
4979   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4980 
4981   name = get_name (scalar_dest);
4982   if (name)
4983     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4984   else
4985     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4986   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4987   free (new_name);
4988 
4989   return vec_dest;
4990 }
4991 
4992 /* Function vect_grouped_store_supported.
4993 
4994    Returns TRUE if interleave high and interleave low permutations
4995    are supported, and FALSE otherwise.  */
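/* For example, with V4SI vectors the two interleave masks checked for a
   power-of-two COUNT are {0, 4, 1, 5} (interleave high) and {2, 6, 3, 7}
   (interleave low).  */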
4996 
4997 bool
4998 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4999 {
5000   machine_mode mode = TYPE_MODE (vectype);
5001 
5002   /* vect_permute_store_chain requires the group size to be equal to 3 or
5003      be a power of two.  */
5004   if (count != 3 && exact_log2 (count) == -1)
5005     {
5006       if (dump_enabled_p ())
5007 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5008 			 "the size of the group of accesses"
5009 			 " is not a power of 2 or not equal to 3\n");
5010       return false;
5011     }
5012 
5013   /* Check that the permutation is supported.  */
5014   if (VECTOR_MODE_P (mode))
5015     {
5016       unsigned int i;
5017       if (count == 3)
5018 	{
5019 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5020 	  unsigned int i, j;
5021 
5022 	  unsigned int nelt;
5023 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5024 	    {
5025 	      if (dump_enabled_p ())
5026 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5027 				 "cannot handle groups of 3 stores for"
5028 				 " variable-length vectors\n");
5029 	      return false;
5030 	    }
5031 
5032 	  vec_perm_builder sel (nelt, nelt, 1);
5033 	  sel.quick_grow (nelt);
5034 	  vec_perm_indices indices;
5035 	  for (j = 0; j < 3; j++)
5036 	    {
5037 	      int nelt0 = ((3 - j) * nelt) % 3;
5038 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5039 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5040 	      for (i = 0; i < nelt; i++)
5041 		{
5042 		  if (3 * i + nelt0 < nelt)
5043 		    sel[3 * i + nelt0] = j0++;
5044 		  if (3 * i + nelt1 < nelt)
5045 		    sel[3 * i + nelt1] = nelt + j1++;
5046 		  if (3 * i + nelt2 < nelt)
5047 		    sel[3 * i + nelt2] = 0;
5048 		}
5049 	      indices.new_vector (sel, 2, nelt);
5050 	      if (!can_vec_perm_const_p (mode, indices))
5051 		{
5052 		  if (dump_enabled_p ())
5053 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5054 				 "permutation op not supported by target.\n");
5055 		  return false;
5056 		}
5057 
5058 	      for (i = 0; i < nelt; i++)
5059 		{
5060 		  if (3 * i + nelt0 < nelt)
5061 		    sel[3 * i + nelt0] = 3 * i + nelt0;
5062 		  if (3 * i + nelt1 < nelt)
5063 		    sel[3 * i + nelt1] = 3 * i + nelt1;
5064 		  if (3 * i + nelt2 < nelt)
5065 		    sel[3 * i + nelt2] = nelt + j2++;
5066 		}
5067 	      indices.new_vector (sel, 2, nelt);
5068 	      if (!can_vec_perm_const_p (mode, indices))
5069 		{
5070 		  if (dump_enabled_p ())
5071 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5072 				 "permutation op not supported by target.\n");
5073 		  return false;
5074 		}
5075 	    }
5076 	  return true;
5077 	}
5078       else
5079 	{
5080 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5081 	  gcc_assert (pow2p_hwi (count));
5082 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5083 
5084 	  /* The encoding has 2 interleaved stepped patterns.  */
5085 	  vec_perm_builder sel (nelt, 2, 3);
5086 	  sel.quick_grow (6);
5087 	  for (i = 0; i < 3; i++)
5088 	    {
5089 	      sel[i * 2] = i;
5090 	      sel[i * 2 + 1] = i + nelt;
5091 	    }
5092 	  vec_perm_indices indices (sel, 2, nelt);
5093 	  if (can_vec_perm_const_p (mode, indices))
5094 	    {
5095 	      for (i = 0; i < 6; i++)
5096 		sel[i] += exact_div (nelt, 2);
5097 	      indices.new_vector (sel, 2, nelt);
5098 	      if (can_vec_perm_const_p (mode, indices))
5099 		return true;
5100 	    }
5101 	}
5102     }
5103 
5104   if (dump_enabled_p ())
5105     dump_printf (MSG_MISSED_OPTIMIZATION,
5106 		 "permutation op not supported by target.\n");
5107   return false;
5108 }
5109 
5110 
5111 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5112    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
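/* (These optabs are typically provided by targets with dedicated structure
   load/store instructions, e.g. the LD2/ST2 .. LD4/ST4 families on Arm.)  */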
5113 
5114 bool
5115 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5116 			    bool masked_p)
5117 {
5118   if (masked_p)
5119     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5120 					 vec_mask_store_lanes_optab,
5121 					 vectype, count);
5122   else
5123     return vect_lanes_optab_supported_p ("vec_store_lanes",
5124 					 vec_store_lanes_optab,
5125 					 vectype, count);
5126 }
5127 
5128 
5129 /* Function vect_permute_store_chain.
5130 
5131    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5132    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5133    the data correctly for the stores.  Return the final references for stores
5134    in RESULT_CHAIN.
5135 
5136    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5137    The input is 4 vectors each containing 8 elements.  We assign a number to
5138    each element, the input sequence is:
5139 
5140    1st vec:   0  1  2  3  4  5  6  7
5141    2nd vec:   8  9 10 11 12 13 14 15
5142    3rd vec:  16 17 18 19 20 21 22 23
5143    4th vec:  24 25 26 27 28 29 30 31
5144 
5145    The output sequence should be:
5146 
5147    1st vec:  0  8 16 24  1  9 17 25
5148    2nd vec:  2 10 18 26  3 11 19 27
5149    3rd vec:  4 12 20 28  5 13 21 29
5150    4th vec:  6 14 22 30  7 15 23 31
5151 
5152    i.e., we interleave the contents of the four vectors in their order.
5153 
5154    We use interleave_high/low instructions to create such output.  The input of
5155    each interleave_high/low operation is two vectors:
5156    1st vec    2nd vec
5157    0 1 2 3    4 5 6 7
5158    the even elements of the result vector are obtained left-to-right from the
5159    high/low elements of the first vector.  The odd elements of the result are
5160    obtained left-to-right from the high/low elements of the second vector.
5161    The output of interleave_high will be:   0 4 1 5
5162    and of interleave_low:                   2 6 3 7
5163 
5164 
5165    The permutation is done in log LENGTH stages.  In each stage interleave_high
5166    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5167    where the first argument is taken from the first half of DR_CHAIN and the
5168    second argument from its second half.
5169    In our example,
5170 
5171    I1: interleave_high (1st vec, 3rd vec)
5172    I2: interleave_low (1st vec, 3rd vec)
5173    I3: interleave_high (2nd vec, 4th vec)
5174    I4: interleave_low (2nd vec, 4th vec)
5175 
5176    The output for the first stage is:
5177 
5178    I1:  0 16  1 17  2 18  3 19
5179    I2:  4 20  5 21  6 22  7 23
5180    I3:  8 24  9 25 10 26 11 27
5181    I4: 12 28 13 29 14 30 15 31
5182 
5183    The output of the second stage, i.e. the final result is:
5184 
5185    I1:  0  8 16 24  1  9 17 25
5186    I2:  2 10 18 26  3 11 19 27
5187    I3:  4 12 20 28  5 13 21 29
5188    I4:  6 14 22 30  7 15 23 31.  */
5189 
5190 void
5191 vect_permute_store_chain (vec<tree> dr_chain,
5192 			  unsigned int length,
5193 			  stmt_vec_info stmt_info,
5194 			  gimple_stmt_iterator *gsi,
5195 			  vec<tree> *result_chain)
5196 {
5197   tree vect1, vect2, high, low;
5198   gimple *perm_stmt;
5199   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5200   tree perm_mask_low, perm_mask_high;
5201   tree data_ref;
5202   tree perm3_mask_low, perm3_mask_high;
5203   unsigned int i, j, n, log_length = exact_log2 (length);
5204 
5205   result_chain->quick_grow (length);
5206   memcpy (result_chain->address (), dr_chain.address (),
5207 	  length * sizeof (tree));
5208 
5209   if (length == 3)
5210     {
5211       /* vect_grouped_store_supported ensures that this is constant.  */
5212       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5213       unsigned int j0 = 0, j1 = 0, j2 = 0;
5214 
5215       vec_perm_builder sel (nelt, nelt, 1);
5216       sel.quick_grow (nelt);
5217       vec_perm_indices indices;
5218       for (j = 0; j < 3; j++)
5219         {
5220 	  int nelt0 = ((3 - j) * nelt) % 3;
5221 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5222 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5223 
5224 	  for (i = 0; i < nelt; i++)
5225 	    {
5226 	      if (3 * i + nelt0 < nelt)
5227 		sel[3 * i + nelt0] = j0++;
5228 	      if (3 * i + nelt1 < nelt)
5229 		sel[3 * i + nelt1] = nelt + j1++;
5230 	      if (3 * i + nelt2 < nelt)
5231 		sel[3 * i + nelt2] = 0;
5232 	    }
5233 	  indices.new_vector (sel, 2, nelt);
5234 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5235 
5236 	  for (i = 0; i < nelt; i++)
5237 	    {
5238 	      if (3 * i + nelt0 < nelt)
5239 		sel[3 * i + nelt0] = 3 * i + nelt0;
5240 	      if (3 * i + nelt1 < nelt)
5241 		sel[3 * i + nelt1] = 3 * i + nelt1;
5242 	      if (3 * i + nelt2 < nelt)
5243 		sel[3 * i + nelt2] = nelt + j2++;
5244 	    }
5245 	  indices.new_vector (sel, 2, nelt);
5246 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5247 
5248 	  vect1 = dr_chain[0];
5249 	  vect2 = dr_chain[1];
5250 
5251 	  /* Create interleaving stmt:
5252 	     low = VEC_PERM_EXPR <vect1, vect2,
5253 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5254 				   j + 2, nelt + j + 2, *, ...}>  */
5255 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5256 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5257 					   vect2, perm3_mask_low);
5258 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5259 
5260 	  vect1 = data_ref;
5261 	  vect2 = dr_chain[2];
5262 	  /* Create interleaving stmt:
5263 	     low = VEC_PERM_EXPR <vect1, vect2,
5264 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5265 				   6, 7, nelt + j + 2, ...}>  */
5266 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5267 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5268 					   vect2, perm3_mask_high);
5269 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5270 	  (*result_chain)[j] = data_ref;
5271 	}
5272     }
5273   else
5274     {
5275       /* If length is not equal to 3 then only power of 2 is supported.  */
5276       gcc_assert (pow2p_hwi (length));
5277 
5278       /* The encoding has 2 interleaved stepped patterns.  */
5279       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5280       vec_perm_builder sel (nelt, 2, 3);
5281       sel.quick_grow (6);
5282       for (i = 0; i < 3; i++)
5283 	{
5284 	  sel[i * 2] = i;
5285 	  sel[i * 2 + 1] = i + nelt;
5286 	}
5287 	vec_perm_indices indices (sel, 2, nelt);
5288 	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5289 
5290 	for (i = 0; i < 6; i++)
5291 	  sel[i] += exact_div (nelt, 2);
5292 	indices.new_vector (sel, 2, nelt);
5293 	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
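	/* For V8HI, for example, PERM_MASK_HIGH selects
	   {0, 8, 1, 9, 2, 10, 3, 11} and PERM_MASK_LOW selects
	   {4, 12, 5, 13, 6, 14, 7, 15}.  */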
5294 
5295 	for (i = 0, n = log_length; i < n; i++)
5296 	  {
5297 	    for (j = 0; j < length/2; j++)
5298 	      {
5299 		vect1 = dr_chain[j];
5300 		vect2 = dr_chain[j+length/2];
5301 
5302 		/* Create interleaving stmt:
5303 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5304 							...}>  */
5305 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5306 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5307 						 vect2, perm_mask_high);
5308 		vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5309 		(*result_chain)[2*j] = high;
5310 
5311 		/* Create interleaving stmt:
5312 		   low = VEC_PERM_EXPR <vect1, vect2,
5313 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5314 					 ...}>  */
5315 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5316 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5317 						 vect2, perm_mask_low);
5318 		vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5319 		(*result_chain)[2*j+1] = low;
5320 	      }
5321 	    memcpy (dr_chain.address (), result_chain->address (),
5322 		    length * sizeof (tree));
5323 	  }
5324     }
5325 }
5326 
5327 /* Function vect_setup_realignment
5328 
5329    This function is called when vectorizing an unaligned load using
5330    the dr_explicit_realign[_optimized] scheme.
5331    This function generates the following code at the loop prolog:
5332 
5333       p = initial_addr;
5334    x  msq_init = *(floor(p));   # prolog load
5335       realignment_token = call target_builtin;
5336     loop:
5337    x  msq = phi (msq_init, ---)
5338 
5339    The stmts marked with x are generated only for the case of
5340    dr_explicit_realign_optimized.
5341 
5342    The code above sets up a new (vector) pointer, pointing to the first
5343    location accessed by STMT_INFO, and a "floor-aligned" load using that
5344    pointer.  It also generates code to compute the "realignment-token"
5345    (if the relevant target hook was defined), and creates a phi-node at the
5346    loop-header bb whose arguments are the result of the prolog-load (created
5347    by this function) and the result of a load that takes place in the loop
5348    (to be created by the caller to this function).
5349 
5350    For the case of dr_explicit_realign_optimized:
5351    The caller to this function uses the phi-result (msq) to create the
5352    realignment code inside the loop, and sets up the missing phi argument,
5353    as follows:
5354     loop:
5355       msq = phi (msq_init, lsq)
5356       lsq = *(floor(p'));        # load in loop
5357       result = realign_load (msq, lsq, realignment_token);
5358 
5359    For the case of dr_explicit_realign:
5360     loop:
5361       msq = *(floor(p)); 	# load in loop
5362       p' = p + (VS-1);
5363       lsq = *(floor(p'));	# load in loop
5364       result = realign_load (msq, lsq, realignment_token);
5365 
5366    Input:
5367    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5368 	       a memory location that may be unaligned.
5369    BSI - place where new code is to be inserted.
5370    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5371 			      is used.
5372 
5373    Output:
5374    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5375                        target hook, if defined.
5376    Return value - the result of the loop-header phi node.  */
5377 
5378 tree
5379 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5380                         tree *realignment_token,
5381 			enum dr_alignment_support alignment_support_scheme,
5382 			tree init_addr,
5383 			struct loop **at_loop)
5384 {
5385   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5386   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5387   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5388   struct data_reference *dr = dr_info->dr;
5389   struct loop *loop = NULL;
5390   edge pe = NULL;
5391   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5392   tree vec_dest;
5393   gimple *inc;
5394   tree ptr;
5395   tree data_ref;
5396   basic_block new_bb;
5397   tree msq_init = NULL_TREE;
5398   tree new_temp;
5399   gphi *phi_stmt;
5400   tree msq = NULL_TREE;
5401   gimple_seq stmts = NULL;
5402   bool compute_in_loop = false;
5403   bool nested_in_vect_loop = false;
5404   struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5405   struct loop *loop_for_initial_load = NULL;
5406 
5407   if (loop_vinfo)
5408     {
5409       loop = LOOP_VINFO_LOOP (loop_vinfo);
5410       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5411     }
5412 
5413   gcc_assert (alignment_support_scheme == dr_explicit_realign
5414 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5415 
5416   /* We need to generate three things:
5417      1. the misalignment computation
5418      2. the extra vector load (for the optimized realignment scheme).
5419      3. the phi node for the two vectors from which the realignment is
5420       done (for the optimized realignment scheme).  */
5421 
5422   /* 1. Determine where to generate the misalignment computation.
5423 
5424      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5425      calculation will be generated by this function, outside the loop (in the
5426      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5427      caller, inside the loop.
5428 
5429      Background: If the misalignment remains fixed throughout the iterations of
5430      the loop, then both realignment schemes are applicable, and also the
5431      misalignment computation can be done outside LOOP.  This is because we are
5432      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5433      are a multiple of VS (the Vector Size), and therefore the misalignment in
5434      different vectorized LOOP iterations is always the same.
5435      The problem arises only if the memory access is in an inner-loop nested
5436      inside LOOP, which is now being vectorized using outer-loop vectorization.
5437      This is the only case when the misalignment of the memory access may not
5438      remain fixed throughout the iterations of the inner-loop (as explained in
5439      detail in vect_supportable_dr_alignment).  In this case, not only is the
5440      optimized realignment scheme not applicable, but also the misalignment
5441      computation (and generation of the realignment token that is passed to
5442      REALIGN_LOAD) have to be done inside the loop.
5443 
5444      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5445      or not, which in turn determines if the misalignment is computed inside
5446      the inner-loop, or outside LOOP.  */
5447 
5448   if (init_addr != NULL_TREE || !loop_vinfo)
5449     {
5450       compute_in_loop = true;
5451       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5452     }
5453 
5454 
5455   /* 2. Determine where to generate the extra vector load.
5456 
5457      For the optimized realignment scheme, instead of generating two vector
5458      loads in each iteration, we generate a single extra vector load in the
5459      preheader of the loop, and in each iteration reuse the result of the
5460      vector load from the previous iteration.  In case the memory access is in
5461      an inner-loop nested inside LOOP, which is now being vectorized using
5462      outer-loop vectorization, we need to determine whether this initial vector
5463      load should be generated at the preheader of the inner-loop, or can be
5464      generated at the preheader of LOOP.  If the memory access has no evolution
5465      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5466      to be generated inside LOOP (in the preheader of the inner-loop).  */
5467 
5468   if (nested_in_vect_loop)
5469     {
5470       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5471       bool invariant_in_outerloop =
5472             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5473       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5474     }
5475   else
5476     loop_for_initial_load = loop;
5477   if (at_loop)
5478     *at_loop = loop_for_initial_load;
5479 
5480   if (loop_for_initial_load)
5481     pe = loop_preheader_edge (loop_for_initial_load);
5482 
5483   /* 3. For the case of the optimized realignment, create the first vector
5484       load at the loop preheader.  */
5485 
5486   if (alignment_support_scheme == dr_explicit_realign_optimized)
5487     {
5488       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5489       gassign *new_stmt;
5490 
5491       gcc_assert (!compute_in_loop);
5492       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5493       ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5494 				      loop_for_initial_load, NULL_TREE,
5495 				      &init_addr, NULL, &inc, true);
5496       if (TREE_CODE (ptr) == SSA_NAME)
5497 	new_temp = copy_ssa_name (ptr);
5498       else
5499 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5500       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5501       tree type = TREE_TYPE (ptr);
5502       new_stmt = gimple_build_assign
5503 		   (new_temp, BIT_AND_EXPR, ptr,
5504 		    fold_build2 (MINUS_EXPR, type,
5505 				 build_int_cst (type, 0),
5506 				 build_int_cst (type, align)));
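      /* I.e. NEW_TEMP = PTR & -ALIGN, rounding the address down to the
	 target alignment (the floor (p) of the scheme described above).  */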
5507       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5508       gcc_assert (!new_bb);
5509       data_ref
5510 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5511 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5512       vect_copy_ref_info (data_ref, DR_REF (dr));
5513       new_stmt = gimple_build_assign (vec_dest, data_ref);
5514       new_temp = make_ssa_name (vec_dest, new_stmt);
5515       gimple_assign_set_lhs (new_stmt, new_temp);
5516       if (pe)
5517         {
5518           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5519           gcc_assert (!new_bb);
5520         }
5521       else
5522          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5523 
5524       msq_init = gimple_assign_lhs (new_stmt);
5525     }
5526 
5527   /* 4. Create realignment token using a target builtin, if available.
5528       It is done either inside the containing loop, or before LOOP (as
5529       determined above).  */
5530 
5531   if (targetm.vectorize.builtin_mask_for_load)
5532     {
5533       gcall *new_stmt;
5534       tree builtin_decl;
5535 
5536       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5537       if (!init_addr)
5538 	{
5539 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5540 	  init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5541 							    NULL_TREE);
5542           if (loop)
5543             {
5544    	      pe = loop_preheader_edge (loop);
5545 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5546 	      gcc_assert (!new_bb);
5547             }
5548           else
5549              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5550 	}
5551 
5552       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5553       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5554       vec_dest =
5555 	vect_create_destination_var (scalar_dest,
5556 				     gimple_call_return_type (new_stmt));
5557       new_temp = make_ssa_name (vec_dest, new_stmt);
5558       gimple_call_set_lhs (new_stmt, new_temp);
5559 
5560       if (compute_in_loop)
5561 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5562       else
5563 	{
5564 	  /* Generate the misalignment computation outside LOOP.  */
5565 	  pe = loop_preheader_edge (loop);
5566 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5567 	  gcc_assert (!new_bb);
5568 	}
5569 
5570       *realignment_token = gimple_call_lhs (new_stmt);
5571 
5572       /* The result of the CALL_EXPR to this builtin is determined from
5573          the value of the parameter and no global variables are touched
5574          which makes the builtin a "const" function.  Requiring the
5575          builtin to have the "const" attribute makes it unnecessary
5576          to call mark_call_clobbered.  */
5577       gcc_assert (TREE_READONLY (builtin_decl));
5578     }
5579 
5580   if (alignment_support_scheme == dr_explicit_realign)
5581     return msq;
5582 
5583   gcc_assert (!compute_in_loop);
5584   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5585 
5586 
5587   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5588 
5589   pe = loop_preheader_edge (containing_loop);
5590   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5591   msq = make_ssa_name (vec_dest);
5592   phi_stmt = create_phi_node (msq, containing_loop->header);
5593   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5594 
5595   return msq;
5596 }
5597 
5598 
5599 /* Function vect_grouped_load_supported.
5600 
5601    COUNT is the size of the load group (the number of statements plus the
5602    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5603    only one statement, with a gap of COUNT - 1.
5604 
5605    Returns true if a suitable permute exists.  */
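/* For example, with V4SI vectors and a power-of-two COUNT the masks checked
   below are the extract-even mask {0, 2, 4, 6} and the extract-odd mask
   {1, 3, 5, 7}.  */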
5606 
5607 bool
5608 vect_grouped_load_supported (tree vectype, bool single_element_p,
5609 			     unsigned HOST_WIDE_INT count)
5610 {
5611   machine_mode mode = TYPE_MODE (vectype);
5612 
5613   /* If this is single-element interleaving with an element distance
5614      that leaves unused vector loads around, punt - we at least create
5615      very sub-optimal code in that case (and blow up memory,
5616      see PR65518).  */
5617   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5618     {
5619       if (dump_enabled_p ())
5620 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5621 			 "single-element interleaving not supported "
5622 			 "for not adjacent vector loads\n");
5623       return false;
5624     }
5625 
5626   /* vect_permute_load_chain requires the group size to be equal to 3 or
5627      be a power of two.  */
5628   if (count != 3 && exact_log2 (count) == -1)
5629     {
5630       if (dump_enabled_p ())
5631 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5632 			 "the size of the group of accesses"
5633 			 " is not a power of 2 or not equal to 3\n");
5634       return false;
5635     }
5636 
5637   /* Check that the permutation is supported.  */
5638   if (VECTOR_MODE_P (mode))
5639     {
5640       unsigned int i, j;
5641       if (count == 3)
5642 	{
5643 	  unsigned int nelt;
5644 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5645 	    {
5646 	      if (dump_enabled_p ())
5647 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5648 				 "cannot handle groups of 3 loads for"
5649 				 " variable-length vectors\n");
5650 	      return false;
5651 	    }
5652 
5653 	  vec_perm_builder sel (nelt, nelt, 1);
5654 	  sel.quick_grow (nelt);
5655 	  vec_perm_indices indices;
5656 	  unsigned int k;
5657 	  for (k = 0; k < 3; k++)
5658 	    {
5659 	      for (i = 0; i < nelt; i++)
5660 		if (3 * i + k < 2 * nelt)
5661 		  sel[i] = 3 * i + k;
5662 		else
5663 		  sel[i] = 0;
5664 	      indices.new_vector (sel, 2, nelt);
5665 	      if (!can_vec_perm_const_p (mode, indices))
5666 		{
5667 		  if (dump_enabled_p ())
5668 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 				     "shuffle of 3 loads is not supported by"
5670 				     " target\n");
5671 		  return false;
5672 		}
5673 	      for (i = 0, j = 0; i < nelt; i++)
5674 		if (3 * i + k < 2 * nelt)
5675 		  sel[i] = i;
5676 		else
5677 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5678 	      indices.new_vector (sel, 2, nelt);
5679 	      if (!can_vec_perm_const_p (mode, indices))
5680 		{
5681 		  if (dump_enabled_p ())
5682 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5683 				     "shuffle of 3 loads is not supported by"
5684 				     " target\n");
5685 		  return false;
5686 		}
5687 	    }
5688 	  return true;
5689 	}
5690       else
5691 	{
5692 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5693 	  gcc_assert (pow2p_hwi (count));
5694 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5695 
5696 	  /* The encoding has a single stepped pattern.  */
5697 	  vec_perm_builder sel (nelt, 1, 3);
5698 	  sel.quick_grow (3);
5699 	  for (i = 0; i < 3; i++)
5700 	    sel[i] = i * 2;
5701 	  vec_perm_indices indices (sel, 2, nelt);
5702 	  if (can_vec_perm_const_p (mode, indices))
5703 	    {
5704 	      for (i = 0; i < 3; i++)
5705 		sel[i] = i * 2 + 1;
5706 	      indices.new_vector (sel, 2, nelt);
5707 	      if (can_vec_perm_const_p (mode, indices))
5708 		return true;
5709 	    }
5710         }
5711     }
5712 
5713   if (dump_enabled_p ())
5714     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5715 		     "extract even/odd not supported by target\n");
5716   return false;
5717 }
5718 
5719 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5720    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5721 
5722 bool
5723 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5724 			   bool masked_p)
5725 {
5726   if (masked_p)
5727     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5728 					 vec_mask_load_lanes_optab,
5729 					 vectype, count);
5730   else
5731     return vect_lanes_optab_supported_p ("vec_load_lanes",
5732 					 vec_load_lanes_optab,
5733 					 vectype, count);
5734 }
5735 
5736 /* Function vect_permute_load_chain.
5737 
5738    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5739    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5740    the input data correctly.  Return the final references for loads in
5741    RESULT_CHAIN.
5742 
5743    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5744    The input is 4 vectors each containing 8 elements. We assign a number to each
5745    element, the input sequence is:
5746 
5747    1st vec:   0  1  2  3  4  5  6  7
5748    2nd vec:   8  9 10 11 12 13 14 15
5749    3rd vec:  16 17 18 19 20 21 22 23
5750    4th vec:  24 25 26 27 28 29 30 31
5751 
5752    The output sequence should be:
5753 
5754    1st vec:  0 4  8 12 16 20 24 28
5755    2nd vec:  1 5  9 13 17 21 25 29
5756    3rd vec:  2 6 10 14 18 22 26 30
5757    4th vec:  3 7 11 15 19 23 27 31
5758 
5759    i.e., the first output vector should contain the first elements of each
5760    interleaving group, etc.
5761 
5762    We use extract_even/odd instructions to create such output.  The input of
5763    each extract_even/odd operation is two vectors
5764    1st vec    2nd vec
5765    0 1 2 3    4 5 6 7
5766 
5767    and the output is the vector of extracted even/odd elements.  The output of
5768    extract_even will be:   0 2 4 6
5769    and of extract_odd:     1 3 5 7
5770 
5771 
5772    The permutation is done in log LENGTH stages.  In each stage extract_even
5773    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5774    their order.  In our example,
5775 
5776    E1: extract_even (1st vec, 2nd vec)
5777    E2: extract_odd (1st vec, 2nd vec)
5778    E3: extract_even (3rd vec, 4th vec)
5779    E4: extract_odd (3rd vec, 4th vec)
5780 
5781    The output for the first stage will be:
5782 
5783    E1:  0  2  4  6  8 10 12 14
5784    E2:  1  3  5  7  9 11 13 15
5785    E3: 16 18 20 22 24 26 28 30
5786    E4: 17 19 21 23 25 27 29 31
5787 
5788    In order to proceed and create the correct sequence for the next stage (or
5789    for the correct output, if the second stage is the last one, as in our
5790    example), we first put the output of extract_even operation and then the
5791    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5792    The input for the second stage is:
5793 
5794    1st vec (E1):  0  2  4  6  8 10 12 14
5795    2nd vec (E3): 16 18 20 22 24 26 28 30
5796    3rd vec (E2):  1  3  5  7  9 11 13 15
5797    4th vec (E4): 17 19 21 23 25 27 29 31
5798 
5799    The output of the second stage:
5800 
5801    E1: 0 4  8 12 16 20 24 28
5802    E2: 2 6 10 14 18 22 26 30
5803    E3: 1 5  9 13 17 21 25 29
5804    E4: 3 7 11 15 19 23 27 31
5805 
5806    And RESULT_CHAIN after reordering:
5807 
5808    1st vec (E1):  0 4  8 12 16 20 24 28
5809    2nd vec (E3):  1 5  9 13 17 21 25 29
5810    3rd vec (E2):  2 6 10 14 18 22 26 30
5811    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5812 
5813 static void
5814 vect_permute_load_chain (vec<tree> dr_chain,
5815 			 unsigned int length,
5816 			 stmt_vec_info stmt_info,
5817 			 gimple_stmt_iterator *gsi,
5818 			 vec<tree> *result_chain)
5819 {
5820   tree data_ref, first_vect, second_vect;
5821   tree perm_mask_even, perm_mask_odd;
5822   tree perm3_mask_low, perm3_mask_high;
5823   gimple *perm_stmt;
5824   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5825   unsigned int i, j, log_length = exact_log2 (length);
5826 
5827   result_chain->quick_grow (length);
5828   memcpy (result_chain->address (), dr_chain.address (),
5829 	  length * sizeof (tree));
5830 
5831   if (length == 3)
5832     {
5833       /* vect_grouped_load_supported ensures that this is constant.  */
5834       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5835       unsigned int k;
5836 
5837       vec_perm_builder sel (nelt, nelt, 1);
5838       sel.quick_grow (nelt);
5839       vec_perm_indices indices;
5840       for (k = 0; k < 3; k++)
5841 	{
5842 	  for (i = 0; i < nelt; i++)
5843 	    if (3 * i + k < 2 * nelt)
5844 	      sel[i] = 3 * i + k;
5845 	    else
5846 	      sel[i] = 0;
5847 	  indices.new_vector (sel, 2, nelt);
5848 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5849 
5850 	  for (i = 0, j = 0; i < nelt; i++)
5851 	    if (3 * i + k < 2 * nelt)
5852 	      sel[i] = i;
5853 	    else
5854 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5855 	  indices.new_vector (sel, 2, nelt);
5856 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
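	  /* For example, with nelt == 8 and k == 0 this gives
	     perm3_mask_low  = {0, 3, 6, 9, 12, 15, 0, 0} and
	     perm3_mask_high = {0, 1, 2, 3, 4, 5, 10, 13}: the low permute
	     picks every third element of the first two vectors, and the
	     high permute keeps those and fills the tail from the third
	     vector.  */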
5857 
5858 	  first_vect = dr_chain[0];
5859 	  second_vect = dr_chain[1];
5860 
5861 	  /* Create interleaving stmt (low part of):
5862 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5863 							     ...}>  */
5864 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5865 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5866 					   second_vect, perm3_mask_low);
5867 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5868 
5869 	  /* Create interleaving stmt (high part of):
5870 	     high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5871 							      nelt + ...}>  */
5872 	  first_vect = data_ref;
5873 	  second_vect = dr_chain[2];
5874 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5875 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5876 					   second_vect, perm3_mask_high);
5877 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5878 	  (*result_chain)[k] = data_ref;
5879 	}
5880     }
5881   else
5882     {
5883       /* If length is not equal to 3 then only a power of 2 is supported.  */
5884       gcc_assert (pow2p_hwi (length));
5885 
5886       /* The encoding has a single stepped pattern.  */
5887       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5888       vec_perm_builder sel (nelt, 1, 3);
5889       sel.quick_grow (3);
5890       for (i = 0; i < 3; ++i)
5891 	sel[i] = i * 2;
5892       vec_perm_indices indices (sel, 2, nelt);
5893       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5894 
5895       for (i = 0; i < 3; ++i)
5896 	sel[i] = i * 2 + 1;
5897       indices.new_vector (sel, 2, nelt);
5898       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
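      /* For nelt == 8 this expands to perm_mask_even = {0, 2, 4, 6, 8, 10,
	 12, 14} and perm_mask_odd = {1, 3, 5, 7, 9, 11, 13, 15}, indexing
	 into the 16-element concatenation of the two input vectors.  */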
5899 
5900       for (i = 0; i < log_length; i++)
5901 	{
5902 	  for (j = 0; j < length; j += 2)
5903 	    {
5904 	      first_vect = dr_chain[j];
5905 	      second_vect = dr_chain[j+1];
5906 
5907 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5908 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5909 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5910 					       first_vect, second_vect,
5911 					       perm_mask_even);
5912 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5913 	      (*result_chain)[j/2] = data_ref;
5914 
5915 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5916 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5917 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5918 					       first_vect, second_vect,
5919 					       perm_mask_odd);
5920 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5921 	      (*result_chain)[j/2+length/2] = data_ref;
5922 	    }
5923 	  memcpy (dr_chain.address (), result_chain->address (),
5924 		  length * sizeof (tree));
5925 	}
5926     }
5927 }
5928 
5929 /* Function vect_shift_permute_load_chain.
5930 
5931    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5932    sequence of stmts to reorder the input data accordingly.
5933    Return the final references for loads in RESULT_CHAIN.
5934    Return true if successful, false otherwise.
5935 
5936    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5937    The input is 3 vectors each containing 8 elements.  We assign a
5938    number to each element; the input sequence is:
5939 
5940    1st vec:   0  1  2  3  4  5  6  7
5941    2nd vec:   8  9 10 11 12 13 14 15
5942    3rd vec:  16 17 18 19 20 21 22 23
5943 
5944    The output sequence should be:
5945 
5946    1st vec:  0 3 6  9 12 15 18 21
5947    2nd vec:  1 4 7 10 13 16 19 22
5948    3rd vec:  2 5 8 11 14 17 20 23
5949 
5950    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5951 
5952    First we shuffle all 3 vectors to get correct elements order:
5953 
5954    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5955    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5956    3rd vec:  (16 19 22) (17 20 23) (18 21)
5957 
5958    Next we unite and shift vector 3 times:
5959 
5960    1st step:
5961      shift right by 6 the concatenation of:
5962      "1st vec" and  "2nd vec"
5963        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5964      "2nd vec" and  "3rd vec"
5965        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5966      "3rd vec" and  "1st vec"
5967        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5968 			     | New vectors                   |
5969 
5970      So that now new vectors are:
5971 
5972      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5973      2nd vec:  (10 13) (16 19 22) (17 20 23)
5974      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5975 
5976    2nd step:
5977      shift right by 5 the concatenation of:
5978      "1st vec" and  "3rd vec"
5979        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5980      "2nd vec" and  "1st vec"
5981        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5982      "3rd vec" and  "2nd vec"
5983        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5984 			  | New vectors                   |
5985 
5986      So that now new vectors are:
5987 
5988      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5989      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5990      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5991 
5992    3rd step:
5993      shift right by 5 the concatenation of:
5994      "1st vec" and  "1st vec"
5995        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5996      shift right by 3 the concatenation of:
5997      "2nd vec" and  "2nd vec"
5998                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5999 			  | New vectors                   |
6000 
6001      So that now all vectors are READY:
6002      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6003      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6004      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6005 
6006    This algorithm is faster than the one in vect_permute_load_chain if:
6007      1.  "shift of a concatenation" is faster than general permutation.
6008 	 This is usually so.
6009      2.  The TARGET machine can't execute vector instructions in parallel.
6010 	 This is because each step of the algorithm depends on the previous one.
6011 	 The algorithm in vect_permute_load_chain is much more parallel.
6012 
6013    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6014 */
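
/* Editor's illustration (not compiled as part of this file): a small,
   self-contained C sketch, assuming 8 elements per vector, of how a
   "shift right by N of a concatenation" is expressed as a VEC_PERM_EXPR
   whose mask is {N, N+1, ..., N+7}.  It reproduces the first new vector
   of the 1st step in the comment above.

   #include <stdio.h>

   enum { NELT = 8 };

   // Model of VEC_PERM_EXPR <A, B, SEL>: SEL indexes into the 2*NELT
   // concatenation of A and B.
   static void
   vec_perm (const int *a, const int *b, const int *sel, int *out)
   {
     for (int i = 0; i < NELT; i++)
       out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
   }

   int
   main (void)
   {
     // The first two vectors after the initial {0 3 6 1 4 7 2 5} shuffle.
     int v1[NELT] = { 0, 3, 6, 1, 4, 7, 2, 5 };
     int v2[NELT] = { 8, 11, 14, 9, 12, 15, 10, 13 };
     int shift1[NELT], out[NELT];

     // shift1_mask for NELT == 8 is {6, 7, 8, 9, 10, 11, 12, 13}, i.e.
     // drop the first six elements of the concatenation.
     for (int i = 0; i < NELT; i++)
       shift1[i] = 6 + i;

     vec_perm (v1, v2, shift1, out);
     for (int i = 0; i < NELT; i++)
       printf (" %2d", out[i]);   // prints  2  5  8 11 14  9 12 15
     printf ("\n");
     return 0;
   }  */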
6015 
6016 static bool
6017 vect_shift_permute_load_chain (vec<tree> dr_chain,
6018 			       unsigned int length,
6019 			       stmt_vec_info stmt_info,
6020 			       gimple_stmt_iterator *gsi,
6021 			       vec<tree> *result_chain)
6022 {
6023   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6024   tree perm2_mask1, perm2_mask2, perm3_mask;
6025   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6026   gimple *perm_stmt;
6027 
6028   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6029   unsigned int i;
6030   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6031 
6032   unsigned HOST_WIDE_INT nelt, vf;
6033   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6034       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6035     /* Not supported for variable-length vectors.  */
6036     return false;
6037 
6038   vec_perm_builder sel (nelt, nelt, 1);
6039   sel.quick_grow (nelt);
6040 
6041   result_chain->quick_grow (length);
6042   memcpy (result_chain->address (), dr_chain.address (),
6043 	  length * sizeof (tree));
6044 
6045   if (pow2p_hwi (length) && vf > 4)
6046     {
6047       unsigned int j, log_length = exact_log2 (length);
6048       for (i = 0; i < nelt / 2; ++i)
6049 	sel[i] = i * 2;
6050       for (i = 0; i < nelt / 2; ++i)
6051 	sel[nelt / 2 + i] = i * 2 + 1;
6052       vec_perm_indices indices (sel, 2, nelt);
6053       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6054 	{
6055 	  if (dump_enabled_p ())
6056 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6057 			     "shuffle of 2 fields structure is not "
6058 			     "supported by target\n");
6059 	  return false;
6060 	}
6061       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6062 
6063       for (i = 0; i < nelt / 2; ++i)
6064 	sel[i] = i * 2 + 1;
6065       for (i = 0; i < nelt / 2; ++i)
6066 	sel[nelt / 2 + i] = i * 2;
6067       indices.new_vector (sel, 2, nelt);
6068       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6069 	{
6070 	  if (dump_enabled_p ())
6071 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6072 			     "shuffle of 2 fields structure is not "
6073 			     "supported by target\n");
6074 	  return false;
6075 	}
6076       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
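      /* For vector length 8, perm2_mask1 is {0, 2, 4, 6, 1, 3, 5, 7} and
	 perm2_mask2 is {1, 3, 5, 7, 0, 2, 4, 6}.  */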
6077 
6078       /* Generating permutation constant to shift all elements.
6079 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6080       for (i = 0; i < nelt; i++)
6081 	sel[i] = nelt / 2 + i;
6082       indices.new_vector (sel, 2, nelt);
6083       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6084 	{
6085 	  if (dump_enabled_p ())
6086 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6087 			     "shift permutation is not supported by target\n");
6088 	  return false;
6089 	}
6090       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6091 
6092       /* Generating permutation constant to select vector from 2.
6093 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6094       for (i = 0; i < nelt / 2; i++)
6095 	sel[i] = i;
6096       for (i = nelt / 2; i < nelt; i++)
6097 	sel[i] = nelt + i;
6098       indices.new_vector (sel, 2, nelt);
6099       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6100 	{
6101 	  if (dump_enabled_p ())
6102 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6103 			     "select is not supported by target\n");
6104 	  return false;
6105 	}
6106       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6107 
6108       for (i = 0; i < log_length; i++)
6109 	{
6110 	  for (j = 0; j < length; j += 2)
6111 	    {
6112 	      first_vect = dr_chain[j];
6113 	      second_vect = dr_chain[j + 1];
6114 
6115 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6116 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6117 					       first_vect, first_vect,
6118 					       perm2_mask1);
6119 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6120 	      vect[0] = data_ref;
6121 
6122 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6123 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6124 					       second_vect, second_vect,
6125 					       perm2_mask2);
6126 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6127 	      vect[1] = data_ref;
6128 
6129 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6130 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6131 					       vect[0], vect[1], shift1_mask);
6132 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6133 	      (*result_chain)[j/2 + length/2] = data_ref;
6134 
6135 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6136 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6137 					       vect[0], vect[1], select_mask);
6138 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6139 	      (*result_chain)[j/2] = data_ref;
6140 	    }
6141 	  memcpy (dr_chain.address (), result_chain->address (),
6142 		  length * sizeof (tree));
6143 	}
6144       return true;
6145     }
6146   if (length == 3 && vf > 2)
6147     {
6148       unsigned int k = 0, l = 0;
6149 
6150       /* Generating permutation constant to get all elements in right order.
6151 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6152       for (i = 0; i < nelt; i++)
6153 	{
6154 	  if (3 * k + (l % 3) >= nelt)
6155 	    {
6156 	      k = 0;
6157 	      l += (3 - (nelt % 3));
6158 	    }
6159 	  sel[i] = 3 * k + (l % 3);
6160 	  k++;
6161 	}
6162       vec_perm_indices indices (sel, 2, nelt);
6163       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6164 	{
6165 	  if (dump_enabled_p ())
6166 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6167 			     "shuffle of 3 fields structure is not "
6168 			     "supported by target\n");
6169 	  return false;
6170 	}
6171       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6172 
6173       /* Generating permutation constant to shift all elements.
6174 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6175       for (i = 0; i < nelt; i++)
6176 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6177       indices.new_vector (sel, 2, nelt);
6178       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6179 	{
6180 	  if (dump_enabled_p ())
6181 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6182 			     "shift permutation is not supported by target\n");
6183 	  return false;
6184 	}
6185       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6186 
6187       /* Generating permutation constant to shift all elements.
6188 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6189       for (i = 0; i < nelt; i++)
6190 	sel[i] = 2 * (nelt / 3) + 1 + i;
6191       indices.new_vector (sel, 2, nelt);
6192       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6193 	{
6194 	  if (dump_enabled_p ())
6195 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 			     "shift permutation is not supported by target\n");
6197 	  return false;
6198 	}
6199       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6200 
6201       /* Generating permutation constant to shift all elements.
6202 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6203       for (i = 0; i < nelt; i++)
6204 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6205       indices.new_vector (sel, 2, nelt);
6206       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6207 	{
6208 	  if (dump_enabled_p ())
6209 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6210 			     "shift permutation is not supported by target\n");
6211 	  return false;
6212 	}
6213       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6214 
6215       /* Generating permutation constant to shift all elements.
6216 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6217       for (i = 0; i < nelt; i++)
6218 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6219       indices.new_vector (sel, 2, nelt);
6220       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6221 	{
6222 	  if (dump_enabled_p ())
6223 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6224 			     "shift permutation is not supported by target\n");
6225 	  return false;
6226 	}
6227       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6228 
6229       for (k = 0; k < 3; k++)
6230 	{
6231 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6232 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6233 					   dr_chain[k], dr_chain[k],
6234 					   perm3_mask);
6235 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6236 	  vect[k] = data_ref;
6237 	}
6238 
6239       for (k = 0; k < 3; k++)
6240 	{
6241 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6242 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6243 					   vect[k % 3], vect[(k + 1) % 3],
6244 					   shift1_mask);
6245 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6246 	  vect_shift[k] = data_ref;
6247 	}
6248 
6249       for (k = 0; k < 3; k++)
6250 	{
6251 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6252 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6253 					   vect_shift[(4 - k) % 3],
6254 					   vect_shift[(3 - k) % 3],
6255 					   shift2_mask);
6256 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6257 	  vect[k] = data_ref;
6258 	}
6259 
6260       (*result_chain)[3 - (nelt % 3)] = vect[2];
6261 
6262       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6263       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6264 				       vect[0], shift3_mask);
6265       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6266       (*result_chain)[nelt % 3] = data_ref;
6267 
6268       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6269       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6270 				       vect[1], shift4_mask);
6271       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6272       (*result_chain)[0] = data_ref;
6273       return true;
6274     }
6275   return false;
6276 }
6277 
6278 /* Function vect_transform_grouped_load.
6279 
6280    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6281    to perform their permutation and ascribe the resulting vectorized statements to
6282    the scalar statements.
6283 */
6284 
6285 void
6286 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6287 			     int size, gimple_stmt_iterator *gsi)
6288 {
6289   machine_mode mode;
6290   vec<tree> result_chain = vNULL;
6291 
6292   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6293      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6294      vectors that are ready for vector computation.  */
6295   result_chain.create (size);
6296 
6297   /* If the reassociation width for the vector type is 2 or greater, the target
6298      machine can execute 2 or more vector instructions in parallel.  Otherwise
6299      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6300   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6301   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6302       || pow2p_hwi (size)
6303       || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6304 					 gsi, &result_chain))
6305     vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6306   vect_record_grouped_load_vectors (stmt_info, result_chain);
6307   result_chain.release ();
6308 }
6309 
6310 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6311    generated as part of the vectorization of STMT_INFO.  Assign the statement
6312    for each vector to the associated scalar statement.  */
6313 
6314 void
6315 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6316 				  vec<tree> result_chain)
6317 {
6318   vec_info *vinfo = stmt_info->vinfo;
6319   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6320   unsigned int i, gap_count;
6321   tree tmp_data_ref;
6322 
6323   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6324      Since we scan the chain starting from its first node, their order
6325      corresponds to the order of data-refs in RESULT_CHAIN.  */
6326   stmt_vec_info next_stmt_info = first_stmt_info;
6327   gap_count = 1;
6328   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6329     {
6330       if (!next_stmt_info)
6331 	break;
6332 
6333       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6334        code elimination pass later.  No need to check for the first stmt in
6335        the group, since it always exists.
6336        DR_GROUP_GAP is the number of steps in elements from the previous
6337        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6338        correspond to the gaps.  */
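      /* For example, for a group accessing a[4*i], a[4*i + 1] and
	 a[4*i + 3], the member loading a[4*i + 3] has DR_GROUP_GAP 2, so
	 the vector in RESULT_CHAIN that would have held the a[4*i + 2]
	 elements is skipped here and later removed as dead code.  */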
6339       if (next_stmt_info != first_stmt_info
6340 	  && gap_count < DR_GROUP_GAP (next_stmt_info))
6341 	{
6342 	  gap_count++;
6343 	  continue;
6344 	}
6345 
6346       /* ???  The following needs cleanup after the removal of
6347          DR_GROUP_SAME_DR_STMT.  */
6348       if (next_stmt_info)
6349         {
6350 	  stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6351 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6352 	     copies, and we put the new vector statement in the first available
6353 	     RELATED_STMT.  */
6354 	  if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6355 	    STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6356 	  else
6357             {
6358 	      stmt_vec_info prev_stmt_info
6359 		= STMT_VINFO_VEC_STMT (next_stmt_info);
6360 	      stmt_vec_info rel_stmt_info
6361 		= STMT_VINFO_RELATED_STMT (prev_stmt_info);
6362 	      while (rel_stmt_info)
6363 		{
6364 		  prev_stmt_info = rel_stmt_info;
6365 		  rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6366 		}
6367 
6368 	      STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6369             }
6370 
6371 	  next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6372 	  gap_count = 1;
6373         }
6374     }
6375 }
6376 
6377 /* Function vect_can_force_dr_alignment_p.
6378 
6379    Returns whether the alignment of DECL can be forced so that it is aligned
6380    on an ALIGNMENT-bit boundary.  */
6381 
6382 bool
6383 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6384 {
6385   if (!VAR_P (decl))
6386     return false;
6387 
6388   if (decl_in_symtab_p (decl)
6389       && !symtab_node::get (decl)->can_increase_alignment_p ())
6390     return false;
6391 
6392   if (TREE_STATIC (decl))
6393     return (known_le (alignment,
6394 		      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6395   else
6396     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6397 }
6398 
6399 
6400 /* Return whether the data reference DR_INFO is supported with respect to its
6401    alignment.
6402    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6403    it is aligned, i.e., check if it is possible to vectorize it with different
6404    alignment.  */
6405 
6406 enum dr_alignment_support
6407 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6408                                bool check_aligned_accesses)
6409 {
6410   data_reference *dr = dr_info->dr;
6411   stmt_vec_info stmt_info = dr_info->stmt;
6412   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6413   machine_mode mode = TYPE_MODE (vectype);
6414   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6415   struct loop *vect_loop = NULL;
6416   bool nested_in_vect_loop = false;
6417 
6418   if (aligned_access_p (dr_info) && !check_aligned_accesses)
6419     return dr_aligned;
6420 
6421   /* For now assume all conditional loads/stores support unaligned
6422      access without any special code.  */
6423   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6424     if (gimple_call_internal_p (stmt)
6425 	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6426 	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6427       return dr_unaligned_supported;
6428 
6429   if (loop_vinfo)
6430     {
6431       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6432       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6433     }
6434 
6435   /* Possibly unaligned access.  */
6436 
6437   /* We can choose between using the implicit realignment scheme (generating
6438      a misaligned_move stmt) and the explicit realignment scheme (generating
6439      aligned loads with a REALIGN_LOAD).  There are two variants to the
6440      explicit realignment scheme: optimized, and unoptimized.
6441      We can optimize the realignment only if the step between consecutive
6442      vector loads is equal to the vector size.  Since the vector memory
6443      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6444      is guaranteed that the misalignment amount remains the same throughout the
6445      execution of the vectorized loop.  Therefore, we can create the
6446      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6447      at the loop preheader.
6448 
6449      However, in the case of outer-loop vectorization, when vectorizing a
6450      memory access in the inner-loop nested within the LOOP that is now being
6451      vectorized, while it is guaranteed that the misalignment of the
6452      vectorized memory access will remain the same in different outer-loop
6453      iterations, it is *not* guaranteed that it will remain the same throughout
6454      the execution of the inner-loop.  This is because the inner-loop advances
6455      with the original scalar step (and not in steps of VS).  If the inner-loop
6456      step happens to be a multiple of VS, then the misalignment remains fixed
6457      and we can use the optimized realignment scheme.  For example:
6458 
6459       for (i=0; i<N; i++)
6460         for (j=0; j<M; j++)
6461           s += a[i+j];
6462 
6463      When vectorizing the i-loop in the above example, the step between
6464      consecutive vector loads is 1, and so the misalignment does not remain
6465      fixed across the execution of the inner-loop, and the realignment cannot
6466      be optimized (as illustrated in the following pseudo vectorized loop):
6467 
6468       for (i=0; i<N; i+=4)
6469         for (j=0; j<M; j++){
6470           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6471                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6472                          // (assuming that we start from an aligned address).
6473           }
6474 
6475      We therefore have to use the unoptimized realignment scheme:
6476 
6477       for (i=0; i<N; i+=4)
6478           for (j=k; j<M; j+=4)
6479           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6480                            // that the misalignment of the initial address is
6481                            // 0).
6482 
6483      The loop can then be vectorized as follows:
6484 
6485       for (k=0; k<4; k++){
6486         rt = get_realignment_token (&vp[k]);
6487         for (i=0; i<N; i+=4){
6488           v1 = vp[i+k];
6489           for (j=k; j<M; j+=4){
6490             v2 = vp[i+j+VS-1];
6491             va = REALIGN_LOAD <v1,v2,rt>;
6492             vs += va;
6493             v1 = v2;
6494           }
6495         }
6496     } */
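
  /* Concretely, with VS == 4 elements and a zero initial misalignment, the
     misalignment of &vp[i+j] in the example above is j % VS elements
     (0, 1, 2, 3, 0, 1, 2, 3, ...), so it is invariant across the inner loop
     only when the scalar step is a multiple of VS.  */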
6497 
6498   if (DR_IS_READ (dr))
6499     {
6500       bool is_packed = false;
6501       tree type = (TREE_TYPE (DR_REF (dr)));
6502 
6503       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6504 	  && (!targetm.vectorize.builtin_mask_for_load
6505 	      || targetm.vectorize.builtin_mask_for_load ()))
6506 	{
6507 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6508 
6509 	  /* If we are doing SLP then the accesses need not have the
6510 	     same alignment; instead it depends on the SLP group size.  */
6511 	  if (loop_vinfo
6512 	      && STMT_SLP_TYPE (stmt_info)
6513 	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6514 			      * (DR_GROUP_SIZE
6515 				 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6516 			      TYPE_VECTOR_SUBPARTS (vectype)))
6517 	    ;
6518 	  else if (!loop_vinfo
6519 		   || (nested_in_vect_loop
6520 		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6521 				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6522 	    return dr_explicit_realign;
6523 	  else
6524 	    return dr_explicit_realign_optimized;
6525 	}
6526       if (!known_alignment_for_access_p (dr_info))
6527 	is_packed = not_size_aligned (DR_REF (dr));
6528 
6529       if (targetm.vectorize.support_vector_misalignment
6530 	    (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6531 	/* Can't software pipeline the loads, but can at least do them.  */
6532 	return dr_unaligned_supported;
6533     }
6534   else
6535     {
6536       bool is_packed = false;
6537       tree type = (TREE_TYPE (DR_REF (dr)));
6538 
6539       if (!known_alignment_for_access_p (dr_info))
6540 	is_packed = not_size_aligned (DR_REF (dr));
6541 
6542       if (targetm.vectorize.support_vector_misalignment
6543 	    (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6544 	return dr_unaligned_supported;
6545     }
6546 
6547   /* Unsupported.  */
6548   return dr_unaligned_unsupported;
6549 }
6550