1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
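
/* Illustrative note (not from the original sources): for a hypothetical
   target with a V4SI vector mode and COUNT == 2, this asks whether the
   target provides an array mode holding two V4SI vectors together with a
   matching vec_load_lanes / vec_store_lanes pattern, so that e.g.

     int32x4x2_t tmp = load_two_lanes (ptr);

   can become a single structured load (such as AArch64 LD2).  If not,
   the caller has to fall back to separate loads plus permutes.  */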
60 
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 			      tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 	{
75 	  if (dump_enabled_p ())
76 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 			     "no array mode for %s["
78 			     HOST_WIDE_INT_PRINT_DEC "]\n",
79 			     GET_MODE_NAME (mode), count);
80 	  return false;
81 	}
82     }
83 
84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85     {
86       if (dump_enabled_p ())
87 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88                          "cannot use %s<%s><%s>\n", name,
89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90       return false;
91     }
92 
93   if (dump_enabled_p ())
94     dump_printf_loc (MSG_NOTE, vect_location,
95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96                      GET_MODE_NAME (mode));
97 
98   return true;
99 }
100 
101 
102 /* Return the smallest scalar part of STMT.
103    This is used to determine the vectype of the stmt.  We generally set the
104    vectype according to the type of the result (lhs).  For stmts whose
105    result-type is different than the type of the arguments (e.g., demotion,
106    promotion), vectype will be reset appropriately (later).  Note that we have
107    to visit the smallest datatype in this function, because that determines the
108    VF.  If the smallest datatype in the loop is present only as the rhs of a
109    promotion operation - we'd miss it.
110    Such a case, where a variable of this datatype does not appear in the lhs
111    anywhere in the loop, can only occur if it's an invariant: e.g.:
112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
113    invariant motion.  However, we cannot rely on invariant motion to always
114    take invariants out of the loop, and so in the case of promotion we also
115    have to check the rhs.
116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117    types.  */
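
/* Illustrative example (hypothetical, not from the original sources): in

     short *a;  int *b;
     for (i = 0; i < n; i++)
       b[i] = (int) a[i] + 1;

   the conversion statement produces an 'int' result, but its rhs has type
   'short'; returning 'short' here is what lets the smaller type determine
   the VF even though no lhs in the loop has that type.  */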
118 
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121                                HOST_WIDE_INT *rhs_size_unit)
122 {
123   tree scalar_type = gimple_expr_type (stmt);
124   HOST_WIDE_INT lhs, rhs;
125 
126   /* During the analysis phase, this function is called on arbitrary
127      statements that might not have scalar results.  */
128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129     return scalar_type;
130 
131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 
133   if (is_gimple_assign (stmt)
134       && (gimple_assign_cast_p (stmt)
135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
140     {
141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
142 
143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144       if (rhs < lhs)
145         scalar_type = rhs_type;
146     }
147   else if (gcall *call = dyn_cast <gcall *> (stmt))
148     {
149       unsigned int i = 0;
150       if (gimple_call_internal_p (call))
151 	{
152 	  internal_fn ifn = gimple_call_internal_fn (call);
153 	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 	    /* gimple_expr_type already picked the type of the loaded
155 	       or stored data.  */
156 	    i = ~0U;
157 	  else if (internal_fn_mask_index (ifn) == 0)
158 	    i = 1;
159 	}
160       if (i < gimple_call_num_args (call))
161 	{
162 	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
164 	    {
165 	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 	      if (rhs < lhs)
167 		scalar_type = rhs_type;
168 	    }
169 	}
170     }
171 
172   *lhs_size_unit = lhs;
173   *rhs_size_unit = rhs;
174   return scalar_type;
175 }
176 
177 
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179    tested at run-time.  Return TRUE if DDR was successfully inserted.
180    Return false if versioning is not supported.  */
181 
182 static bool
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184 {
185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 
187   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
188     return false;
189 
190   if (!runtime_alias_check_p (ddr, loop,
191 			      optimize_loop_nest_for_speed_p (loop)))
192     return false;
193 
194   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
195   return true;
196 }
197 
198 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
199 
200 static void
201 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
202 {
203   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
204   for (unsigned int i = 0; i < checks.length(); ++i)
205     if (checks[i] == value)
206       return;
207 
208   if (dump_enabled_p ())
209     {
210       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
211       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
212       dump_printf (MSG_NOTE, " is nonzero\n");
213     }
214   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
215 }
216 
217 /* Return true if we know that the order of vectorized STMT_A and
218    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
219    At least one of the statements is a write.  */
220 
221 static bool
222 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
223 {
224   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
225   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
226 
227   /* Single statements are always kept in their original order.  */
228   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
229       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
230     return true;
231 
232   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
233      SLP group are emitted at the position of the last scalar load and
234      all loads in an interleaving group are emitted at the position
235      of the first scalar load.
236      Stores in a group are emitted at the position of the last scalar store.
237      Compute that position and check whether the resulting order matches
238      the current one.
239      We have not yet decided between SLP and interleaving so we have
240      to conservatively assume both.  */
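  /* For illustration, a hypothetical scalar group such as

	a[i]   = ...;	<- first scalar store
	...    = a[i];
	a[i+1] = ...;	<- last scalar store

     would emit the vector store { a[i], a[i+1] } at the position of the
     last scalar store, i.e. after the load of a[i], whereas the scalar
     code stores a[i] before that load; the checks below detect such a
     reordering and make this function return false for that pair.  */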
241   gimple *il_a;
242   gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
243   if (last_a)
244     {
245       for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
246 	   s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
247 	last_a = get_later_stmt (last_a, s);
248       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 	{
250 	  for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s;
251 	       s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
252 	    if (get_later_stmt (il_a, s) == il_a)
253 	      il_a = s;
254 	}
255       else
256 	il_a = last_a;
257     }
258   else
259     last_a = il_a = stmt_a;
260   gimple *il_b;
261   gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
262   if (last_b)
263     {
264       for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
265 	   s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
266 	last_b = get_later_stmt (last_b, s);
267       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
268 	{
269 	  for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s;
270 	       s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
271 	    if (get_later_stmt (il_b, s) == il_b)
272 	      il_b = s;
273 	}
274       else
275 	il_b = last_b;
276     }
277   else
278     last_b = il_b = stmt_b;
279   bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a);
280   return (/* SLP */
281 	  (get_later_stmt (last_a, last_b) == last_a) == a_after_b
282 	  /* Interleaving */
283 	  && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
284 	  /* Mixed */
285 	  && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
286 	  && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
287 }
288 
289 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
290    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
291    distances.  These distances are conservatively correct but they don't
292    reflect a guaranteed dependence.
293 
294    Return true if this function does all the work necessary to avoid
295    an alias or false if the caller should use the dependence distances
296    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
297    the depth of the loop described by LOOP_VINFO and the other arguments
298    are as for vect_analyze_data_ref_dependence.  */
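
/* A hypothetical example of such a DDR (illustration only):

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
	 a[i] = b[i] + 1;
     }

   If A and B point to distinct objects the accesses are independent; if
   they happen to be equal the dependence distance is 0.  The recorded
   distances are thus conservatively correct without reflecting a
   guaranteed dependence.  */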
299 
300 static bool
301 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
302 				       loop_vec_info loop_vinfo,
303 				       int loop_depth, unsigned int *max_vf)
304 {
305   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
306   lambda_vector dist_v;
307   unsigned int i;
308   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
309     {
310       int dist = dist_v[loop_depth];
311       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
312 	{
313 	  /* If the user asserted safelen >= DIST consecutive iterations
314 	     can be executed concurrently, assume independence.
315 
316 	     ??? An alternative would be to add the alias check even
317 	     in this case, and vectorize the fallback loop with the
318 	     maximum VF set to safelen.  However, if the user has
319 	     explicitly given a length, it's less likely that that
320 	     would be a win.  */
321 	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
322 	    {
323 	      if ((unsigned int) loop->safelen < *max_vf)
324 		*max_vf = loop->safelen;
325 	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 	      continue;
327 	    }
328 
329 	  /* For dependence distances of 2 or more, we have the option
330 	     of limiting VF or checking for an alias at runtime.
331 	     Prefer to check at runtime if we can, to avoid limiting
332 	     the VF unnecessarily when the bases are in fact independent.
333 
334 	     Note that the alias checks will be removed if the VF ends up
335 	     being small enough.  */
336 	  return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
337 	}
338     }
339   return true;
340 }
341 
342 
343 /* Function vect_analyze_data_ref_dependence.
344 
345    Return TRUE if there (might) exist a dependence between a memory-reference
346    DRA and a memory-reference DRB.  Return FALSE if the dependence can
347    instead be checked by a run-time alias test (versioning for alias).
348    Adjust *MAX_VF according to the data dependence.  */
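
/* A worked example (hypothetical) of the *MAX_VF adjustment: for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the read-after-write dependence distance is 3.  Vectorizing with a
   factor of 4 or more would load a[i + 3] before the scalar code has
   stored it, so *MAX_VF is lowered to 3 below.  */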
349 
350 static bool
351 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
352 				  loop_vec_info loop_vinfo,
353 				  unsigned int *max_vf)
354 {
355   unsigned int i;
356   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
357   struct data_reference *dra = DDR_A (ddr);
358   struct data_reference *drb = DDR_B (ddr);
359   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
360   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
361   lambda_vector dist_v;
362   unsigned int loop_depth;
363 
364   /* In loop analysis all data references should be vectorizable.  */
365   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
366       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
367     gcc_unreachable ();
368 
369   /* Independent data accesses.  */
370   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
371     return false;
372 
373   if (dra == drb
374       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
375     return false;
376 
377   /* We do not have to consider dependences between accesses that belong
378      to the same group, unless the stride could be smaller than the
379      group size.  */
380   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
381       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
382       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
383     return false;
384 
385   /* Even if we have an anti-dependence then, as the vectorized loop covers at
386      least two scalar iterations, there is always also a true dependence.
387      As the vectorizer does not re-order loads and stores we can ignore
388      the anti-dependence if TBAA can disambiguate both DRs similar to the
389      case with known negative distance anti-dependences (positive
390      distance anti-dependences would violate TBAA constraints).  */
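  /* E.g. (illustration only): for an anti-dependence between

	... = *q;	<- float load
	*p  = ...;	<- int store

     where P has type 'int *' and Q has type 'float *', the alias sets do
     not conflict, so TBAA guarantees the two accesses cannot overlap and
     the dependence can be ignored.  */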
391   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
392        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
393       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
394 				 get_alias_set (DR_REF (drb))))
395     return false;
396 
397   /* Unknown data dependence.  */
398   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
399     {
400       /* If user asserted safelen consecutive iterations can be
401 	 executed concurrently, assume independence.  */
402       if (loop->safelen >= 2)
403 	{
404 	  if ((unsigned int) loop->safelen < *max_vf)
405 	    *max_vf = loop->safelen;
406 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
407 	  return false;
408 	}
409 
410       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
411 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
412 	{
413 	  if (dump_enabled_p ())
414 	    {
415 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 			       "versioning for alias not supported for: "
417 			       "can't determine dependence between ");
418 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
419 				 DR_REF (dra));
420 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
421 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
422 				 DR_REF (drb));
423 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
424 	    }
425 	  return true;
426 	}
427 
428       if (dump_enabled_p ())
429 	{
430 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 			   "versioning for alias required: "
432 			   "can't determine dependence between ");
433 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
434 			     DR_REF (dra));
435 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
436 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
437 			     DR_REF (drb));
438 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
439 	}
440 
441       /* Add to list of ddrs that need to be tested at run-time.  */
442       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
443     }
444 
445   /* Known data dependence.  */
446   if (DDR_NUM_DIST_VECTS (ddr) == 0)
447     {
448       /* If user asserted safelen consecutive iterations can be
449 	 executed concurrently, assume independence.  */
450       if (loop->safelen >= 2)
451 	{
452 	  if ((unsigned int) loop->safelen < *max_vf)
453 	    *max_vf = loop->safelen;
454 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
455 	  return false;
456 	}
457 
458       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
459 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
460 	{
461 	  if (dump_enabled_p ())
462 	    {
463 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
464 			       "versioning for alias not supported for: "
465 			       "bad dist vector for ");
466 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
467 				 DR_REF (dra));
468 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
469 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
470 				 DR_REF (drb));
471 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
472 	    }
473 	  return true;
474 	}
475 
476       if (dump_enabled_p ())
477         {
478           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479                            "versioning for alias required: "
480                            "bad dist vector for ");
481           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
482           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
483           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
484           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
485         }
486       /* Add to list of ddrs that need to be tested at run-time.  */
487       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
488     }
489 
490   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
491 
492   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
493       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
494 						loop_depth, max_vf))
495     return false;
496 
497   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
498     {
499       int dist = dist_v[loop_depth];
500 
501       if (dump_enabled_p ())
502 	dump_printf_loc (MSG_NOTE, vect_location,
503                          "dependence distance  = %d.\n", dist);
504 
505       if (dist == 0)
506 	{
507 	  if (dump_enabled_p ())
508 	    {
509 	      dump_printf_loc (MSG_NOTE, vect_location,
510 	                       "dependence distance == 0 between ");
511 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
512 	      dump_printf (MSG_NOTE, " and ");
513 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
514 	      dump_printf (MSG_NOTE, "\n");
515 	    }
516 
517 	  /* When we perform grouped accesses and perform implicit CSE
518 	     by detecting equal accesses and doing disambiguation with
519 	     runtime alias tests like for
520 	        .. = a[i];
521 		.. = a[i+1];
522 		a[i] = ..;
523 		a[i+1] = ..;
524 		*p = ..;
525 		.. = a[i];
526 		.. = a[i+1];
527 	     where we will end up loading { a[i], a[i+1] } once, make
528 	     sure that inserting group loads before the first load and
529 	     stores after the last store will do the right thing.
530 	     Similar for groups like
531 	        a[i] = ...;
532 		... = a[i];
533 		a[i+1] = ...;
534 	     where loads from the group interleave with the store.  */
535 	  if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
536 	    {
537 	      if (dump_enabled_p ())
538 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
539 				 "READ_WRITE dependence in interleaving.\n");
540 	      return true;
541 	    }
542 
543 	  if (loop->safelen < 2)
544 	    {
545 	      tree indicator = dr_zero_step_indicator (dra);
546 	      if (TREE_CODE (indicator) != INTEGER_CST)
547 		vect_check_nonzero_value (loop_vinfo, indicator);
548 	      else if (integer_zerop (indicator))
549 		{
550 		  if (dump_enabled_p ())
551 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
552 				 "access also has a zero step\n");
553 		  return true;
554 		}
555 	    }
556 	  continue;
557 	}
558 
559       if (dist > 0 && DDR_REVERSED_P (ddr))
560 	{
561 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 	     reversed (to make distance vector positive), and the actual
563 	     distance is negative.  */
564 	  if (dump_enabled_p ())
565 	    dump_printf_loc (MSG_NOTE, vect_location,
566 	                     "dependence distance negative.\n");
567 	  /* When doing outer loop vectorization, we need to check if there is
568 	     a backward dependence at the inner loop level if the dependence
569 	     at the outer loop is reversed.  See PR81740.  */
570 	  if (nested_in_vect_loop_p (loop, DR_STMT (dra))
571 	      || nested_in_vect_loop_p (loop, DR_STMT (drb)))
572 	    {
573 	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 							 DDR_LOOP_NEST (ddr));
575 	      if (dist_v[inner_depth] < 0)
576 		return true;
577 	    }
578 	  /* Record a negative dependence distance to later limit the
579 	     amount of stmt copying / unrolling we can perform.
580 	     Only need to handle read-after-write dependence.  */
581 	  if (DR_IS_READ (drb)
582 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
583 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
584 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
585 	  continue;
586 	}
587 
588       unsigned int abs_dist = abs (dist);
589       if (abs_dist >= 2 && abs_dist < *max_vf)
590 	{
591 	  /* The dependence distance requires reduction of the maximal
592 	     vectorization factor.  */
593 	  *max_vf = abs_dist;
594 	  if (dump_enabled_p ())
595 	    dump_printf_loc (MSG_NOTE, vect_location,
596 	                     "adjusting maximal vectorization factor to %i\n",
597 	                     *max_vf);
598 	}
599 
600       if (abs_dist >= *max_vf)
601 	{
602 	  /* Dependence distance does not create dependence, as far as
603 	     vectorization is concerned, in this case.  */
604 	  if (dump_enabled_p ())
605 	    dump_printf_loc (MSG_NOTE, vect_location,
606 	                     "dependence distance >= VF.\n");
607 	  continue;
608 	}
609 
610       if (dump_enabled_p ())
611 	{
612 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
613 	               "not vectorized, possible dependence "
614 	               "between data-refs ");
615 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
616 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
617 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
618 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
619 	}
620 
621       return true;
622     }
623 
624   return false;
625 }
626 
627 /* Function vect_analyze_data_ref_dependences.
628 
629    Examine all the data references in the loop, and make sure there do not
630    exist any data dependences between them.  Set *MAX_VF according to
631    the maximum vectorization factor the data dependences allow.  */
632 
633 bool
634 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
635 				   unsigned int *max_vf)
636 {
637   unsigned int i;
638   struct data_dependence_relation *ddr;
639 
640   if (dump_enabled_p ())
641     dump_printf_loc (MSG_NOTE, vect_location,
642                      "=== vect_analyze_data_ref_dependences ===\n");
643 
644   LOOP_VINFO_DDRS (loop_vinfo)
645     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
646 	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
647   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
648   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
649   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
650 				&LOOP_VINFO_DDRS (loop_vinfo),
651 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
652     return false;
653 
654   /* For epilogues we either have no aliases or alias versioning
655      was applied to original loop.  Therefore we may just get max_vf
656      using VF of original loop.  */
657   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
658     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
659   else
660     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
661       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
662 	return false;
663 
664   return true;
665 }
666 
667 
668 /* Function vect_slp_analyze_data_ref_dependence.
669 
670    Return TRUE if there (might) exist a dependence between a memory-reference
671    DRA and a memory-reference DRB that would prevent vectorizing the SLP
672    instance.  Return FALSE if the accesses cannot conflict for this purpose;
673    unlike the loop variant there is no run-time alias check or *MAX_VF.  */
674 
675 static bool
676 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
677 {
678   struct data_reference *dra = DDR_A (ddr);
679   struct data_reference *drb = DDR_B (ddr);
680 
681   /* We need to check dependences of statements marked as unvectorizable
682      as well, they still can prohibit vectorization.  */
683 
684   /* Independent data accesses.  */
685   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
686     return false;
687 
688   if (dra == drb)
689     return false;
690 
691   /* Read-read is OK.  */
692   if (DR_IS_READ (dra) && DR_IS_READ (drb))
693     return false;
694 
695   /* If dra and drb are part of the same interleaving chain consider
696      them independent.  */
697   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
698       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
699 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
700     return false;
701 
702   /* Unknown data dependence.  */
703   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
704     {
705       if  (dump_enabled_p ())
706 	{
707 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
708 			   "can't determine dependence between ");
709 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
710 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
711 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
712 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
713 	}
714     }
715   else if (dump_enabled_p ())
716     {
717       dump_printf_loc (MSG_NOTE, vect_location,
718 		       "determined dependence between ");
719       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
720       dump_printf (MSG_NOTE, " and ");
721       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
722       dump_printf (MSG_NOTE,  "\n");
723     }
724 
725   return true;
726 }
727 
728 
729 /* Analyze dependences involved in the transform of SLP NODE.  STORES
730    contains the vector of scalar stores of this instance if we are
731    disambiguating the loads.  */
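
/* For illustration, a hypothetical basic-block SLP instance with the
   store group

     a[0] = x;
     t    = a[0];	<- intervening load of a location in the group
     a[1] = y;

   cannot sink the first store to the position of the last one without
   moving it past the load of a[0]; the dependence check below notices
   this and the instance is rejected.  */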
732 
733 static bool
734 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
735 				   vec<gimple *> stores, gimple *last_store)
736 {
737   /* This walks over all stmts involved in the SLP load/store done
738      in NODE verifying we can sink them up to the last stmt in the
739      group.  */
740   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
741   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
742     {
743       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
744       if (access == last_access)
745 	continue;
746       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
747       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
748 	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
749 	{
750 	  gimple *stmt = gsi_stmt (gsi);
751 	  if (! gimple_vuse (stmt)
752 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
753 	    continue;
754 
755 	  /* If we couldn't record a (single) data reference for this
756 	     stmt we have to give up.  */
757 	  /* ???  Here and below if dependence analysis fails we can resort
758 	     to the alias oracle which can handle more kinds of stmts.  */
759 	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
760 	  if (!dr_b)
761 	    return false;
762 
763 	  bool dependent = false;
764 	  /* If we run into a store of this same instance (we've just
765 	     marked those) then delay dependence checking until we run
766 	     into the last store because this is where it will have
767 	     been sunk to (and we verify if we can do that as well).  */
768 	  if (gimple_visited_p (stmt))
769 	    {
770 	      if (stmt != last_store)
771 		continue;
772 	      unsigned i;
773 	      gimple *store;
774 	      FOR_EACH_VEC_ELT (stores, i, store)
775 		{
776 		  data_reference *store_dr
777 		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
778 		  ddr_p ddr = initialize_data_dependence_relation
779 				(dr_a, store_dr, vNULL);
780 		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
781 		  free_dependence_relation (ddr);
782 		  if (dependent)
783 		    break;
784 		}
785 	    }
786 	  else
787 	    {
788 	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
789 							       dr_b, vNULL);
790 	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
791 	      free_dependence_relation (ddr);
792 	    }
793 	  if (dependent)
794 	    return false;
795 	}
796     }
797   return true;
798 }
799 
800 
801 /* Function vect_slp_analyze_instance_dependence.
802 
803    Examine all the data references in the SLP instance INSTANCE and make
804    sure there do not exist any data dependences between them that would
805    prevent the instance from being vectorized.  */
806 
807 bool
808 vect_slp_analyze_instance_dependence (slp_instance instance)
809 {
810   if (dump_enabled_p ())
811     dump_printf_loc (MSG_NOTE, vect_location,
812                      "=== vect_slp_analyze_instance_dependence ===\n");
813 
814   /* The stores of this instance are at the root of the SLP tree.  */
815   slp_tree store = SLP_INSTANCE_TREE (instance);
816   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
817     store = NULL;
818 
819   /* Verify we can sink stores to the vectorized stmt insert location.  */
820   gimple *last_store = NULL;
821   if (store)
822     {
823       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
824 	return false;
825 
826       /* Mark stores in this instance and remember the last one.  */
827       last_store = vect_find_last_scalar_stmt_in_slp (store);
828       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
829 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
830     }
831 
832   bool res = true;
833 
834   /* Verify we can sink loads to the vectorized stmt insert location,
835      special-casing stores of this instance.  */
836   slp_tree load;
837   unsigned int i;
838   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
839     if (! vect_slp_analyze_node_dependences (instance, load,
840 					     store
841 					     ? SLP_TREE_SCALAR_STMTS (store)
842 					     : vNULL, last_store))
843       {
844 	res = false;
845 	break;
846       }
847 
848   /* Unset the visited flag.  */
849   if (store)
850     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
851       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
852 
853   return res;
854 }
855 
856 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
857    the statement that contains DRB, which is useful for recording in the
858    dump file.  */
859 
860 static void
861 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
862 			    innermost_loop_behavior *drb)
863 {
864   bool existed;
865   innermost_loop_behavior *&entry
866     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
867   if (!existed || entry->base_alignment < drb->base_alignment)
868     {
869       entry = drb;
870       if (dump_enabled_p ())
871 	{
872 	  dump_printf_loc (MSG_NOTE, vect_location,
873 			   "recording new base alignment for ");
874 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
875 	  dump_printf (MSG_NOTE, "\n");
876 	  dump_printf_loc (MSG_NOTE, vect_location,
877 			   "  alignment:    %d\n", drb->base_alignment);
878 	  dump_printf_loc (MSG_NOTE, vect_location,
879 			   "  misalignment: %d\n", drb->base_misalignment);
880 	  dump_printf_loc (MSG_NOTE, vect_location,
881 			   "  based on:     ");
882 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
883 	}
884     }
885 }
886 
887 /* If the region we're going to vectorize is reached, all unconditional
888    data references occur at least once.  We can therefore pool the base
889    alignment guarantees from each unconditional reference.  Do this by
890    going through all the data references in VINFO and checking whether
891    the containing statement makes the reference unconditionally.  If so,
892    record the alignment of the base address in VINFO so that it can be
893    used for all other references with the same base.  */
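
/* For illustration (conceptual example): a plain load

     x_1 = a[i];

   must execute whenever the vectorized region is reached, so the alignment
   guarantee on A's base address can be pooled here and reused by other
   references with the same base.  A reference that is conditional within
   its statement, such as a masked access

     x_2 = .MASK_LOAD (&b[i], align, mask_3);

   may not touch memory at all and is therefore skipped.  */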
894 
895 void
896 vect_record_base_alignments (vec_info *vinfo)
897 {
898   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
899   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
900   data_reference *dr;
901   unsigned int i;
902   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
903     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
904       {
905 	gimple *stmt = DR_STMT (dr);
906 	vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
907 
908 	/* If DR is nested in the loop that is being vectorized, we can also
909 	   record the alignment of the base wrt the outer loop.  */
910 	if (loop && nested_in_vect_loop_p (loop, stmt))
911 	  {
912 	    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
913 	    vect_record_base_alignment
914 	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
915 	  }
916       }
917 }
918 
919 /* Return the target alignment for the vectorized form of DR.  */
920 
921 static unsigned int
922 vect_calculate_target_alignment (struct data_reference *dr)
923 {
924   gimple *stmt = DR_STMT (dr);
925   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
926   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
927   return targetm.vectorize.preferred_vector_alignment (vectype);
928 }
929 
930 /* Function vect_compute_data_ref_alignment
931 
932    Compute the misalignment of the data reference DR.
933 
934    Output:
935    1. If during the misalignment computation it is found that the data reference
936       cannot be vectorized then false is returned.
937    2. DR_MISALIGNMENT (DR) is defined.
938 
939    FOR NOW: No analysis is actually performed. Misalignment is calculated
940    only for trivial cases. TODO.  */
941 
942 bool
943 vect_compute_data_ref_alignment (struct data_reference *dr)
944 {
945   gimple *stmt = DR_STMT (dr);
946   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
947   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
948   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
949   struct loop *loop = NULL;
950   tree ref = DR_REF (dr);
951   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
952 
953   if (dump_enabled_p ())
954     dump_printf_loc (MSG_NOTE, vect_location,
955                      "vect_compute_data_ref_alignment:\n");
956 
957   if (loop_vinfo)
958     loop = LOOP_VINFO_LOOP (loop_vinfo);
959 
960   /* Initialize misalignment to unknown.  */
961   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
962 
963   innermost_loop_behavior *drb = vect_dr_behavior (dr);
964   bool step_preserves_misalignment_p;
965 
966   unsigned HOST_WIDE_INT vector_alignment
967     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
968   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
969 
970   /* No step for BB vectorization.  */
971   if (!loop)
972     {
973       gcc_assert (integer_zerop (drb->step));
974       step_preserves_misalignment_p = true;
975     }
976 
977   /* In case the dataref is in an inner-loop of the loop that is being
978      vectorized (LOOP), we use the base and misalignment information
979      relative to the outer-loop (LOOP).  This is ok only if the misalignment
980      stays the same throughout the execution of the inner-loop, which is why
981      we have to check that the stride of the dataref in the inner-loop evenly
982      divides by the vector alignment.  */
983   else if (nested_in_vect_loop_p (loop, stmt))
984     {
985       step_preserves_misalignment_p
986 	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
987 
988       if (dump_enabled_p ())
989 	{
990 	  if (step_preserves_misalignment_p)
991 	    dump_printf_loc (MSG_NOTE, vect_location,
992 			     "inner step divides the vector alignment.\n");
993 	  else
994 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
995 			     "inner step doesn't divide the vector"
996 			     " alignment.\n");
997 	}
998     }
999 
1000   /* Similarly we can only use base and misalignment information relative to
1001      an innermost loop if the misalignment stays the same throughout the
1002      execution of the loop.  As above, this is the case if the stride of
1003      the dataref evenly divides by the alignment.  */
1004   else
1005     {
1006       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1007       step_preserves_misalignment_p
1008 	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
1009 
1010       if (!step_preserves_misalignment_p && dump_enabled_p ())
1011 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 			 "step doesn't divide the vector alignment.\n");
1013     }
1014 
1015   unsigned int base_alignment = drb->base_alignment;
1016   unsigned int base_misalignment = drb->base_misalignment;
1017 
1018   /* Calculate the maximum of the pooled base address alignment and the
1019      alignment that we can compute for DR itself.  */
1020   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1021   if (entry && base_alignment < (*entry)->base_alignment)
1022     {
1023       base_alignment = (*entry)->base_alignment;
1024       base_misalignment = (*entry)->base_misalignment;
1025     }
1026 
1027   if (drb->offset_alignment < vector_alignment
1028       || !step_preserves_misalignment_p
1029       /* We need to know whether the step wrt the vectorized loop is
1030 	 negative when computing the starting misalignment below.  */
1031       || TREE_CODE (drb->step) != INTEGER_CST)
1032     {
1033       if (dump_enabled_p ())
1034 	{
1035 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036 	                   "Unknown alignment for access: ");
1037 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1038 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1039 	}
1040       return true;
1041     }
1042 
1043   if (base_alignment < vector_alignment)
1044     {
1045       unsigned int max_alignment;
1046       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1047       if (max_alignment < vector_alignment
1048 	  || !vect_can_force_dr_alignment_p (base,
1049 					     vector_alignment * BITS_PER_UNIT))
1050 	{
1051 	  if (dump_enabled_p ())
1052 	    {
1053 	      dump_printf_loc (MSG_NOTE, vect_location,
1054 	                       "can't force alignment of ref: ");
1055 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1056 	      dump_printf (MSG_NOTE, "\n");
1057 	    }
1058 	  return true;
1059 	}
1060 
1061       /* Force the alignment of the decl.
1062 	 NOTE: This is the only change to the code we make during
1063 	 the analysis phase, before deciding to vectorize the loop.  */
1064       if (dump_enabled_p ())
1065         {
1066           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1067           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1068           dump_printf (MSG_NOTE, "\n");
1069         }
1070 
1071       DR_VECT_AUX (dr)->base_decl = base;
1072       DR_VECT_AUX (dr)->base_misaligned = true;
1073       base_misalignment = 0;
1074     }
1075   poly_int64 misalignment
1076     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1077 
1078   /* If this is a backward running DR then first access in the larger
1079      vectype actually is N-1 elements before the address in the DR.
1080      Adjust misalign accordingly.  */
1081   if (tree_int_cst_sgn (drb->step) < 0)
1082     /* PLUS because STEP is negative.  */
1083     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1084 		     * TREE_INT_CST_LOW (drb->step));
1085 
1086   unsigned int const_misalignment;
1087   if (!known_misalignment (misalignment, vector_alignment,
1088 			   &const_misalignment))
1089     {
1090       if (dump_enabled_p ())
1091 	{
1092 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1093 			   "Non-constant misalignment for access: ");
1094 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1095 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1096 	}
1097       return true;
1098     }
1099 
1100   SET_DR_MISALIGNMENT (dr, const_misalignment);
1101 
1102   if (dump_enabled_p ())
1103     {
1104       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1105                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1106       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1107       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1108     }
1109 
1110   return true;
1111 }
1112 
1113 /* Function vect_update_misalignment_for_peel.
1114    Sets DR's misalignment
1115    - to 0 if it has the same alignment as DR_PEEL,
1116    - to the misalignment computed using NPEEL if DR's misalignment is known,
1117    - to -1 (unknown) otherwise.
1118 
1119    DR - the data reference whose misalignment is to be adjusted.
1120    DR_PEEL - the data reference whose misalignment is being made
1121              zero in the vector loop by the peel.
1122    NPEEL - the number of iterations in the peel loop if the misalignment
1123            of DR_PEEL is known at compile time.  */
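
/* A worked example with hypothetical numbers: suppose DR has a known
   misalignment of 8 bytes, a 4-byte scalar element, a positive step and a
   16-byte target alignment, and NPEEL is 3.  Peeling three scalar
   iterations advances DR by 12 bytes, so its new misalignment becomes
   (8 + 3 * 4) & (16 - 1) = 4 bytes.  */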
1124 
1125 static void
1126 vect_update_misalignment_for_peel (struct data_reference *dr,
1127                                    struct data_reference *dr_peel, int npeel)
1128 {
1129   unsigned int i;
1130   vec<dr_p> same_aligned_drs;
1131   struct data_reference *current_dr;
1132   int dr_size = vect_get_scalar_dr_size (dr);
1133   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1134   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1135   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1136 
1137  /* For interleaved data accesses the step in the loop must be multiplied by
1138      the size of the interleaving group.  */
1139   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1140     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1141   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1142     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1143 
1144   /* It can be assumed that the data refs with the same alignment as dr_peel
1145      are aligned in the vector loop.  */
1146   same_aligned_drs
1147     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1148   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1149     {
1150       if (current_dr != dr)
1151         continue;
1152       gcc_assert (!known_alignment_for_access_p (dr)
1153 		  || !known_alignment_for_access_p (dr_peel)
1154 		  || (DR_MISALIGNMENT (dr) / dr_size
1155 		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1156       SET_DR_MISALIGNMENT (dr, 0);
1157       return;
1158     }
1159 
1160   if (known_alignment_for_access_p (dr)
1161       && known_alignment_for_access_p (dr_peel))
1162     {
1163       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1164       int misal = DR_MISALIGNMENT (dr);
1165       misal += negative ? -npeel * dr_size : npeel * dr_size;
1166       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1167       SET_DR_MISALIGNMENT (dr, misal);
1168       return;
1169     }
1170 
1171   if (dump_enabled_p ())
1172     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1173 		     "to unknown (-1).\n");
1174   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1175 }
1176 
1177 
1178 /* Function verify_data_ref_alignment
1179 
1180    Return TRUE if DR can be handled with respect to alignment.  */
1181 
1182 static bool
1183 verify_data_ref_alignment (data_reference_p dr)
1184 {
1185   enum dr_alignment_support supportable_dr_alignment
1186     = vect_supportable_dr_alignment (dr, false);
1187   if (!supportable_dr_alignment)
1188     {
1189       if (dump_enabled_p ())
1190 	{
1191 	  if (DR_IS_READ (dr))
1192 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 			     "not vectorized: unsupported unaligned load.");
1194 	  else
1195 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 			     "not vectorized: unsupported unaligned "
1197 			     "store.");
1198 
1199 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1200 			     DR_REF (dr));
1201 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1202 	}
1203       return false;
1204     }
1205 
1206   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1207     dump_printf_loc (MSG_NOTE, vect_location,
1208 		     "Vectorizing an unaligned access.\n");
1209 
1210   return true;
1211 }
1212 
1213 /* Function vect_verify_datarefs_alignment
1214 
1215    Return TRUE if all data references in the loop can be
1216    handled with respect to alignment.  */
1217 
1218 bool
1219 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1220 {
1221   vec<data_reference_p> datarefs = vinfo->datarefs;
1222   struct data_reference *dr;
1223   unsigned int i;
1224 
1225   FOR_EACH_VEC_ELT (datarefs, i, dr)
1226     {
1227       gimple *stmt = DR_STMT (dr);
1228       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1229 
1230       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1231 	continue;
1232 
1233       /* For interleaving, only the alignment of the first access matters.   */
1234       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1235 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1236 	continue;
1237 
1238       /* Strided accesses perform only component accesses, alignment is
1239 	 irrelevant for them.  */
1240       if (STMT_VINFO_STRIDED_P (stmt_info)
1241 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 	continue;
1243 
1244       if (! verify_data_ref_alignment (dr))
1245 	return false;
1246     }
1247 
1248   return true;
1249 }
1250 
1251 /* Given a memory reference EXP return whether its alignment is less
1252    than its size.  */
1253 
1254 static bool
1255 not_size_aligned (tree exp)
1256 {
1257   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1258     return true;
1259 
1260   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1261 	  > get_object_alignment (exp));
1262 }
1263 
1264 /* Function vector_alignment_reachable_p
1265 
1266    Return true if vector alignment for DR is reachable by peeling
1267    a few loop iterations.  Return false otherwise.  */
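
/* A worked example with hypothetical numbers: with 4-byte elements and a
   16-byte target alignment, an access with a known misalignment of 8 bytes
   becomes aligned after peeling 2 scalar iterations, so alignment is
   reachable.  A misalignment of 2 bytes can never be removed by peeling
   whole elements (2 is not a multiple of the element size), and the check
   below rejects it.  */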
1268 
1269 static bool
1270 vector_alignment_reachable_p (struct data_reference *dr)
1271 {
1272   gimple *stmt = DR_STMT (dr);
1273   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1274   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1275 
1276   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1277     {
1278       /* For interleaved accesses we peel only if the number of iterations
1279 	 in the prolog loop ({VF - misalignment}) is a multiple of the
1280 	 number of interleaved accesses.  */
1281       int elem_size, mis_in_elements;
1282 
1283       /* FORNOW: handle only known alignment.  */
1284       if (!known_alignment_for_access_p (dr))
1285 	return false;
1286 
1287       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1288       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1289       elem_size = vector_element_size (vector_size, nelements);
1290       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1291 
1292       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1293 	return false;
1294     }
1295 
1296   /* If misalignment is known at compile time then allow peeling
1297      only if natural alignment is reachable through peeling.  */
1298   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1299     {
1300       HOST_WIDE_INT elmsize =
1301 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1302       if (dump_enabled_p ())
1303 	{
1304 	  dump_printf_loc (MSG_NOTE, vect_location,
1305 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1306 	  dump_printf (MSG_NOTE,
1307 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1308 	}
1309       if (DR_MISALIGNMENT (dr) % elmsize)
1310 	{
1311 	  if (dump_enabled_p ())
1312 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 	                     "data size does not divide the misalignment.\n");
1314 	  return false;
1315 	}
1316     }
1317 
1318   if (!known_alignment_for_access_p (dr))
1319     {
1320       tree type = TREE_TYPE (DR_REF (dr));
1321       bool is_packed = not_size_aligned (DR_REF (dr));
1322       if (dump_enabled_p ())
1323 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324 	                 "Unknown misalignment, %snaturally aligned\n",
1325 			 is_packed ? "not " : "");
1326       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1327     }
1328 
1329   return true;
1330 }
1331 
1332 
1333 /* Calculate the cost of the memory access represented by DR.  */
1334 
1335 static void
1336 vect_get_data_access_cost (struct data_reference *dr,
1337                            unsigned int *inside_cost,
1338                            unsigned int *outside_cost,
1339 			   stmt_vector_for_cost *body_cost_vec)
1340 {
1341   gimple *stmt = DR_STMT (dr);
1342   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1343   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1344   int ncopies;
1345 
1346   if (PURE_SLP_STMT (stmt_info))
1347     ncopies = 1;
1348   else
1349     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1350 
1351   if (DR_IS_READ (dr))
1352     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1353 			NULL, body_cost_vec, false);
1354   else
1355     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1356 
1357   if (dump_enabled_p ())
1358     dump_printf_loc (MSG_NOTE, vect_location,
1359                      "vect_get_data_access_cost: inside_cost = %d, "
1360                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1361 }
1362 
1363 
1364 typedef struct _vect_peel_info
1365 {
1366   struct data_reference *dr;
1367   int npeel;
1368   unsigned int count;
1369 } *vect_peel_info;
1370 
1371 typedef struct _vect_peel_extended_info
1372 {
1373   struct _vect_peel_info peel_info;
1374   unsigned int inside_cost;
1375   unsigned int outside_cost;
1376 } *vect_peel_extended_info;
1377 
1378 
1379 /* Peeling hashtable helpers.  */
1380 
1381 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1382 {
1383   static inline hashval_t hash (const _vect_peel_info *);
1384   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1385 };
1386 
1387 inline hashval_t
1388 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1389 {
1390   return (hashval_t) peel_info->npeel;
1391 }
1392 
1393 inline bool
1394 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1395 {
1396   return (a->npeel == b->npeel);
1397 }
1398 
1399 
1400 /* Insert DR into peeling hash table with NPEEL as key.  */
1401 
1402 static void
1403 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1404 			  loop_vec_info loop_vinfo, struct data_reference *dr,
1405                           int npeel)
1406 {
1407   struct _vect_peel_info elem, *slot;
1408   _vect_peel_info **new_slot;
1409   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1410 
1411   elem.npeel = npeel;
1412   slot = peeling_htab->find (&elem);
1413   if (slot)
1414     slot->count++;
1415   else
1416     {
1417       slot = XNEW (struct _vect_peel_info);
1418       slot->npeel = npeel;
1419       slot->dr = dr;
1420       slot->count = 1;
1421       new_slot = peeling_htab->find_slot (slot, INSERT);
1422       *new_slot = slot;
1423     }
1424 
1425   if (!supportable_dr_alignment
1426       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1427     slot->count += VECT_MAX_COST;
1428 }
1429 
1430 
1431 /* Traverse the peeling hash table to find the peeling option that aligns
1432    the maximum number of data accesses.  */
1433 
1434 int
1435 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1436 				     _vect_peel_extended_info *max)
1437 {
1438   vect_peel_info elem = *slot;
1439 
1440   if (elem->count > max->peel_info.count
1441       || (elem->count == max->peel_info.count
1442           && max->peel_info.npeel > elem->npeel))
1443     {
1444       max->peel_info.npeel = elem->npeel;
1445       max->peel_info.count = elem->count;
1446       max->peel_info.dr = elem->dr;
1447     }
1448 
1449   return 1;
1450 }
1451 
1452 /* Get the costs of peeling NPEEL iterations checking data access costs
1453    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1454    misalignment will be zero after peeling.  */
1455 
1456 static void
1457 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1458 				struct data_reference *dr0,
1459 				unsigned int *inside_cost,
1460 				unsigned int *outside_cost,
1461 				stmt_vector_for_cost *body_cost_vec,
1462 				unsigned int npeel,
1463 				bool unknown_misalignment)
1464 {
1465   unsigned i;
1466   data_reference *dr;
1467 
1468   FOR_EACH_VEC_ELT (datarefs, i, dr)
1469     {
1470       gimple *stmt = DR_STMT (dr);
1471       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1472       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1473 	continue;
1474 
1475       /* For interleaving, only the alignment of the first access
1476          matters.  */
1477       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1478           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1479         continue;
1480 
1481       /* Strided accesses perform only component accesses, alignment is
1482          irrelevant for them.  */
1483       if (STMT_VINFO_STRIDED_P (stmt_info)
1484 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1485 	continue;
1486 
1487       int save_misalignment;
1488       save_misalignment = DR_MISALIGNMENT (dr);
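      /* Temporarily adjust DR's misalignment to reflect peeling NPEEL
	 iterations for DR0: with no peeling it stays unchanged, peeling for
	 an unknown misalignment makes DR0 itself aligned, and otherwise the
	 effect of the peel on DR is computed explicitly.  */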
1489       if (npeel == 0)
1490 	;
1491       else if (unknown_misalignment && dr == dr0)
1492 	SET_DR_MISALIGNMENT (dr, 0);
1493       else
1494 	vect_update_misalignment_for_peel (dr, dr0, npeel);
1495       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1496 				 body_cost_vec);
1497       SET_DR_MISALIGNMENT (dr, save_misalignment);
1498     }
1499 }
1500 
1501 /* Traverse peeling hash table and calculate cost for each peeling option.
1502    Find the one with the lowest cost.  */
1503 
1504 int
1505 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1506 				   _vect_peel_extended_info *min)
1507 {
1508   vect_peel_info elem = *slot;
1509   int dummy;
1510   unsigned int inside_cost = 0, outside_cost = 0;
1511   gimple *stmt = DR_STMT (elem->dr);
1512   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1513   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1514   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1515 		       epilogue_cost_vec;
1516 
1517   prologue_cost_vec.create (2);
1518   body_cost_vec.create (2);
1519   epilogue_cost_vec.create (2);
1520 
1521   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1522 				  elem->dr, &inside_cost, &outside_cost,
1523 				  &body_cost_vec, elem->npeel, false);
1524 
1525   body_cost_vec.release ();
1526 
1527   outside_cost += vect_get_known_peeling_cost
1528     (loop_vinfo, elem->npeel, &dummy,
1529      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1530      &prologue_cost_vec, &epilogue_cost_vec);
1531 
1532   /* Prologue and epilogue costs are added to the target model later.
1533      These costs depend only on the scalar iteration cost, the
1534      number of peeling iterations finally chosen, and the number of
1535      misaligned statements.  So discard the information found here.  */
1536   prologue_cost_vec.release ();
1537   epilogue_cost_vec.release ();
1538 
1539   if (inside_cost < min->inside_cost
1540       || (inside_cost == min->inside_cost
1541 	  && outside_cost < min->outside_cost))
1542     {
1543       min->inside_cost = inside_cost;
1544       min->outside_cost = outside_cost;
1545       min->peel_info.dr = elem->dr;
1546       min->peel_info.npeel = elem->npeel;
1547       min->peel_info.count = elem->count;
1548     }
1549 
1550   return 1;
1551 }
1552 
1553 
1554 /* Choose the best peeling option by traversing the peeling hash table and
1555    either choosing the option with the lowest cost (if the cost model is
1556    enabled) or the option that aligns as many accesses as possible.  */
1557 
1558 static struct _vect_peel_extended_info
1559 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1560 				       loop_vec_info loop_vinfo)
1561 {
1562    struct _vect_peel_extended_info res;
1563 
1564    res.peel_info.dr = NULL;
1565 
1566    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1567      {
1568        res.inside_cost = INT_MAX;
1569        res.outside_cost = INT_MAX;
1570        peeling_htab->traverse <_vect_peel_extended_info *,
1571 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1572      }
1573    else
1574      {
1575        res.peel_info.count = 0;
1576        peeling_htab->traverse <_vect_peel_extended_info *,
1577 	   		       vect_peeling_hash_get_most_frequent> (&res);
1578        res.inside_cost = 0;
1579        res.outside_cost = 0;
1580      }
1581 
1582    return res;
1583 }
1584 
1585 /* Return true if peeling by NPEEL iterations is supported.  */
1586 
1587 static bool
1588 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1589 			  unsigned npeel)
1590 {
1591   unsigned i;
1592   struct data_reference *dr = NULL;
1593   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1594   gimple *stmt;
1595   stmt_vec_info stmt_info;
1596   enum dr_alignment_support supportable_dr_alignment;
1597 
1598   /* Ensure that all data refs can be vectorized after the peel.  */
1599   FOR_EACH_VEC_ELT (datarefs, i, dr)
1600     {
1601       int save_misalignment;
1602 
1603       if (dr == dr0)
1604 	continue;
1605 
1606       stmt = DR_STMT (dr);
1607       stmt_info = vinfo_for_stmt (stmt);
1608       /* For interleaving, only the alignment of the first access
1609 	 matters.  */
1610       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1611 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1612 	continue;
1613 
1614       /* Strided accesses perform only component accesses, alignment is
1615 	 irrelevant for them.  */
1616       if (STMT_VINFO_STRIDED_P (stmt_info)
1617 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1618 	continue;
1619 
1620       save_misalignment = DR_MISALIGNMENT (dr);
1621       vect_update_misalignment_for_peel (dr, dr0, npeel);
1622       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1623       SET_DR_MISALIGNMENT (dr, save_misalignment);
1624 
1625       if (!supportable_dr_alignment)
1626 	return false;
1627     }
1628 
1629   return true;
1630 }
1631 
1632 /* Function vect_enhance_data_refs_alignment
1633 
1634    This pass will use loop versioning and loop peeling in order to enhance
1635    the alignment of data references in the loop.
1636 
1637    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1638    original loop is to be vectorized.  Any other loops that are created by
1639    the transformations performed in this pass are not supposed to be
1640    vectorized.  This restriction will be relaxed.
1641 
1642    This pass will require a cost model to guide it on whether to apply
1643    peeling or versioning, or a combination of the two.  For example, the
1644    scheme that Intel uses when given a loop with several memory accesses is
1645    as follows: choose one memory access ('p') whose alignment you want to
1646    force by peeling.  Then, either (1) generate a loop in which 'p' is
1647    aligned and all other accesses are not necessarily aligned, or (2) use
1648    loop versioning to generate one loop in which all accesses are aligned,
1649    and another loop in which only 'p' is necessarily aligned.
1650 
1651    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1652    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1653    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1654 
1655    Devising a cost model is the most critical aspect of this work.  It will
1656    guide us on which access to peel for, whether to use loop versioning, how
1657    many versions to create, etc.  The cost model will probably consist of
1658    generic considerations as well as target specific considerations (on
1659    powerpc for example, misaligned stores are more painful than misaligned
1660    loads).
1661 
1662    Here are the general steps involved in alignment enhancements:
1663 
1664      -- original loop, before alignment analysis:
1665 	for (i=0; i<N; i++){
1666 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1667 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1668 	}
1669 
1670      -- After vect_compute_data_refs_alignment:
1671 	for (i=0; i<N; i++){
1672 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1673 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1674 	}
1675 
1676      -- Possibility 1: we do loop versioning:
1677      if (p is aligned) {
1678 	for (i=0; i<N; i++){	# loop 1A
1679 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1680 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1681 	}
1682      }
1683      else {
1684 	for (i=0; i<N; i++){	# loop 1B
1685 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1686 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1687 	}
1688      }
1689 
1690      -- Possibility 2: we do loop peeling:
1691      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1692 	x = q[i];
1693 	p[i] = y;
1694      }
1695      for (i = 3; i < N; i++){	# loop 2A
1696 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1697 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1698      }
1699 
1700      -- Possibility 3: combination of loop peeling and versioning:
1701      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1702 	x = q[i];
1703 	p[i] = y;
1704      }
1705      if (p is aligned) {
1706 	for (i = 3; i<N; i++){	# loop 3A
1707 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1708 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1709 	}
1710      }
1711      else {
1712 	for (i = 3; i<N; i++){	# loop 3B
1713 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1714 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1715 	}
1716      }
1717 
1718      These loops are later passed to loop_transform to be vectorized.  The
1719      vectorizer will use the alignment information to guide the transformation
1720      (whether to generate regular loads/stores, or with special handling for
1721      misalignment).  */
1722 
1723 bool
1724 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1725 {
1726   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1727   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1728   enum dr_alignment_support supportable_dr_alignment;
1729   struct data_reference *dr0 = NULL, *first_store = NULL;
1730   struct data_reference *dr;
1731   unsigned int i, j;
1732   bool do_peeling = false;
1733   bool do_versioning = false;
1734   bool stat;
1735   gimple *stmt;
1736   stmt_vec_info stmt_info;
1737   unsigned int npeel = 0;
1738   bool one_misalignment_known = false;
1739   bool one_misalignment_unknown = false;
1740   bool one_dr_unsupportable = false;
1741   struct data_reference *unsupportable_dr = NULL;
1742   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1743   unsigned possible_npeel_number = 1;
1744   tree vectype;
1745   unsigned int mis, same_align_drs_max = 0;
1746   hash_table<peel_info_hasher> peeling_htab (1);
1747 
1748   if (dump_enabled_p ())
1749     dump_printf_loc (MSG_NOTE, vect_location,
1750                      "=== vect_enhance_data_refs_alignment ===\n");
1751 
1752   /* Reset data so we can safely be called multiple times.  */
1753   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1754   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1755 
1756   /* While cost model enhancements are expected in the future, the high level
1757      view of the code at this time is as follows:
1758 
1759      A) If there is a misaligned access then see if peeling to align
1760         this access can make all data references satisfy
1761         vect_supportable_dr_alignment.  If so, update data structures
1762         as needed and return true.
1763 
1764      B) If peeling wasn't possible and there is a data reference with an
1765         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1766         then see if loop versioning checks can be used to make all data
1767         references satisfy vect_supportable_dr_alignment.  If so, update
1768         data structures as needed and return true.
1769 
1770      C) If neither peeling nor versioning were successful then return false if
1771         any data reference does not satisfy vect_supportable_dr_alignment.
1772 
1773      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1774 
1775      Note, Possibility 3 above (which is peeling and versioning together) is not
1776      being done at this time.  */
1777 
1778   /* (1) Peeling to force alignment.  */
1779 
1780   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1781      Considerations:
1782      + How many accesses will become aligned due to the peeling
1783      - How many accesses will become unaligned due to the peeling,
1784        and the cost of misaligned accesses.
1785      - The cost of peeling (the extra runtime checks, the increase
1786        in code size).  */
1787 
1788   FOR_EACH_VEC_ELT (datarefs, i, dr)
1789     {
1790       stmt = DR_STMT (dr);
1791       stmt_info = vinfo_for_stmt (stmt);
1792 
1793       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1794 	continue;
1795 
1796       /* For interleaving, only the alignment of the first access
1797          matters.  */
1798       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1799           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1800         continue;
1801 
1802       /* For invariant accesses there is nothing to enhance.  */
1803       if (integer_zerop (DR_STEP (dr)))
1804 	continue;
1805 
1806       /* Strided accesses perform only component accesses, alignment is
1807 	 irrelevant for them.  */
1808       if (STMT_VINFO_STRIDED_P (stmt_info)
1809 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1810 	continue;
1811 
1812       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1813       do_peeling = vector_alignment_reachable_p (dr);
1814       if (do_peeling)
1815         {
1816           if (known_alignment_for_access_p (dr))
1817             {
1818 	      unsigned int npeel_tmp = 0;
1819 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1820 						    size_zero_node) < 0;
1821 
1822 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
1823 	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1824 	      unsigned int dr_size = vect_get_scalar_dr_size (dr);
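	      /* The peeling amount that aligns DR is the sign-adjusted
		 misalignment in bytes, reduced modulo the target alignment
		 and expressed as a number of scalar elements.  */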
1825 	      mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1826 	      if (DR_MISALIGNMENT (dr) != 0)
1827 		npeel_tmp = (mis & (target_align - 1)) / dr_size;
1828 
1829               /* For multiple types, it is possible that the bigger type access
1830                  will have more than one peeling option.  E.g., a loop with two
1831                  types: one of size (vector size / 4), and the other one of
1832                  size (vector size / 8).  The vectorization factor will be 8.  If both
1833                  accesses are misaligned by 3, the first one needs one scalar
1834                  iteration to be aligned, and the second one needs 5.  But the
1835 		 first one will be aligned also by peeling 5 scalar
1836                  iterations, and in that case both accesses will be aligned.
1837                  Hence, except for the immediate peeling amount, we also want
1838                  to try to add full vector size, while we don't exceed
1839                  vectorization factor.
1840                  We do this automatically for the cost model, since we calculate
1841 		 cost for every peeling option.  */
1842               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1843 		{
1844 		  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1845 					  ? vf * GROUP_SIZE (stmt_info) : vf);
1846 		  possible_npeel_number
1847 		    = vect_get_num_vectors (nscalars, vectype);
1848 
1849 		  /* NPEEL_TMP is 0 when there is no misalignment, but in that
1850 		     case also allow the option of peeling a full NELEMENTS.  */
1851 		  if (DR_MISALIGNMENT (dr) == 0)
1852 		    possible_npeel_number++;
1853 		}
1854 
1855 	      /* Save info about DR in the hash table.  Also include peeling
1856 	         amounts according to the explanation above.  */
1857               for (j = 0; j < possible_npeel_number; j++)
1858                 {
1859                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1860 					    dr, npeel_tmp);
1861 		  npeel_tmp += target_align / dr_size;
1862                 }
1863 
1864 	      one_misalignment_known = true;
1865             }
1866           else
1867             {
1868               /* If we don't know any misalignment values, we prefer
1869                  peeling for the data-ref that has the maximum number of
1870                  data-refs with the same alignment, unless the target prefers
1871                  to align stores over loads.  */
1872 	      unsigned same_align_drs
1873 		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1874 	      if (!dr0
1875 		  || same_align_drs_max < same_align_drs)
1876 		{
1877 		  same_align_drs_max = same_align_drs;
1878 		  dr0 = dr;
1879 		}
1880 	      /* For data-refs with the same number of related
1881 		 accesses prefer the one where the misalign
1882 		 computation will be invariant in the outermost loop.  */
1883 	      else if (same_align_drs_max == same_align_drs)
1884 		{
1885 		  struct loop *ivloop0, *ivloop;
1886 		  ivloop0 = outermost_invariant_loop_for_expr
1887 		    (loop, DR_BASE_ADDRESS (dr0));
1888 		  ivloop = outermost_invariant_loop_for_expr
1889 		    (loop, DR_BASE_ADDRESS (dr));
1890 		  if ((ivloop && !ivloop0)
1891 		      || (ivloop && ivloop0
1892 			  && flow_loop_nested_p (ivloop, ivloop0)))
1893 		    dr0 = dr;
1894 		}
1895 
1896 	      one_misalignment_unknown = true;
1897 
1898 	      /* Check for data refs with unsupportable alignment that
1899 	         can be peeled.  */
1900 	      if (!supportable_dr_alignment)
1901 	      {
1902 		one_dr_unsupportable = true;
1903 		unsupportable_dr = dr;
1904 	      }
1905 
1906 	      if (!first_store && DR_IS_WRITE (dr))
1907 		first_store = dr;
1908             }
1909         }
1910       else
1911         {
1912           if (!aligned_access_p (dr))
1913             {
1914               if (dump_enabled_p ())
1915                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1916                                  "vector alignment may not be reachable\n");
1917               break;
1918             }
1919         }
1920     }
1921 
1922   /* Check if we can possibly peel the loop.  */
1923   if (!vect_can_advance_ivs_p (loop_vinfo)
1924       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1925       || loop->inner)
1926     do_peeling = false;
1927 
1928   struct _vect_peel_extended_info peel_for_known_alignment;
1929   struct _vect_peel_extended_info peel_for_unknown_alignment;
1930   struct _vect_peel_extended_info best_peel;
1931 
1932   peel_for_unknown_alignment.inside_cost = INT_MAX;
1933   peel_for_unknown_alignment.outside_cost = INT_MAX;
1934   peel_for_unknown_alignment.peel_info.count = 0;
1935 
1936   if (do_peeling
1937       && one_misalignment_unknown)
1938     {
1939       /* Check if the target prefers stores over loads, i.e., if misaligned
1940          stores are more expensive than misaligned loads (taking drs with
1941          the same alignment into account).  */
1942       unsigned int load_inside_cost = 0;
1943       unsigned int load_outside_cost = 0;
1944       unsigned int store_inside_cost = 0;
1945       unsigned int store_outside_cost = 0;
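      /* The final peeling amount is not known yet, so cost the peel using
	 half a vector's worth of iterations as an estimate.  */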
1946       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1947 
1948       stmt_vector_for_cost dummy;
1949       dummy.create (2);
1950       vect_get_peeling_costs_all_drs (datarefs, dr0,
1951 				      &load_inside_cost,
1952 				      &load_outside_cost,
1953 				      &dummy, estimated_npeels, true);
1954       dummy.release ();
1955 
1956       if (first_store)
1957 	{
1958 	  dummy.create (2);
1959 	  vect_get_peeling_costs_all_drs (datarefs, first_store,
1960 					  &store_inside_cost,
1961 					  &store_outside_cost,
1962 					  &dummy, estimated_npeels, true);
1963 	  dummy.release ();
1964 	}
1965       else
1966 	{
1967 	  store_inside_cost = INT_MAX;
1968 	  store_outside_cost = INT_MAX;
1969 	}
1970 
1971       if (load_inside_cost > store_inside_cost
1972 	  || (load_inside_cost == store_inside_cost
1973 	      && load_outside_cost > store_outside_cost))
1974 	{
1975 	  dr0 = first_store;
1976 	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
1977 	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
1978 	}
1979       else
1980 	{
1981 	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
1982 	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
1983 	}
1984 
1985       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1986       prologue_cost_vec.create (2);
1987       epilogue_cost_vec.create (2);
1988 
1989       int dummy2;
1990       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1991 	(loop_vinfo, estimated_npeels, &dummy2,
1992 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1993 	 &prologue_cost_vec, &epilogue_cost_vec);
1994 
1995       prologue_cost_vec.release ();
1996       epilogue_cost_vec.release ();
1997 
1998       peel_for_unknown_alignment.peel_info.count = 1
1999 	+ STMT_VINFO_SAME_ALIGN_REFS
2000 	(vinfo_for_stmt (DR_STMT (dr0))).length ();
2001     }
2002 
2003   peel_for_unknown_alignment.peel_info.npeel = 0;
2004   peel_for_unknown_alignment.peel_info.dr = dr0;
2005 
2006   best_peel = peel_for_unknown_alignment;
2007 
2008   peel_for_known_alignment.inside_cost = INT_MAX;
2009   peel_for_known_alignment.outside_cost = INT_MAX;
2010   peel_for_known_alignment.peel_info.count = 0;
2011   peel_for_known_alignment.peel_info.dr = NULL;
2012 
2013   if (do_peeling && one_misalignment_known)
2014     {
2015       /* Peeling is possible, but no data access requires alignment in order
2016          to be supported.  So we try to choose the best possible peeling from
2017 	 the hash table.  */
2018       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2019 	(&peeling_htab, loop_vinfo);
2020     }
2021 
2022   /* Compare costs of peeling for known and unknown alignment. */
2023   if (peel_for_known_alignment.peel_info.dr != NULL
2024       && peel_for_unknown_alignment.inside_cost
2025       >= peel_for_known_alignment.inside_cost)
2026     {
2027       best_peel = peel_for_known_alignment;
2028 
2029       /* If the best peeling for known alignment has NPEEL == 0, perform no
2030          peeling at all except if there is an unsupportable dr that we can
2031          align.  */
2032       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2033 	do_peeling = false;
2034     }
2035 
2036   /* If there is an unsupportable data ref, prefer this over all choices so far
2037      since we'd have to discard a chosen peeling except when it accidentally
2038      aligned the unsupportable data ref.  */
2039   if (one_dr_unsupportable)
2040     dr0 = unsupportable_dr;
2041   else if (do_peeling)
2042     {
2043       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2044 	 TODO: Use nopeel_outside_cost or get rid of it?  */
2045       unsigned nopeel_inside_cost = 0;
2046       unsigned nopeel_outside_cost = 0;
2047 
2048       stmt_vector_for_cost dummy;
2049       dummy.create (2);
2050       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
2051 				      &nopeel_outside_cost, &dummy, 0, false);
2052       dummy.release ();
2053 
2054       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2055 	 costs will be recorded.  */
2056       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2057       prologue_cost_vec.create (2);
2058       epilogue_cost_vec.create (2);
2059 
2060       int dummy2;
2061       nopeel_outside_cost += vect_get_known_peeling_cost
2062 	(loop_vinfo, 0, &dummy2,
2063 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2064 	 &prologue_cost_vec, &epilogue_cost_vec);
2065 
2066       prologue_cost_vec.release ();
2067       epilogue_cost_vec.release ();
2068 
2069       npeel = best_peel.peel_info.npeel;
2070       dr0 = best_peel.peel_info.dr;
2071 
2072       /* If not peeling at all is no more expensive than the best peeling
2073 	 we have so far, don't perform any peeling.  */
2074       if (nopeel_inside_cost <= best_peel.inside_cost)
2075 	do_peeling = false;
2076     }
2077 
2078   if (do_peeling)
2079     {
2080       stmt = DR_STMT (dr0);
2081       stmt_info = vinfo_for_stmt (stmt);
2082       vectype = STMT_VINFO_VECTYPE (stmt_info);
2083 
2084       if (known_alignment_for_access_p (dr0))
2085         {
2086 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
2087 						size_zero_node) < 0;
2088           if (!npeel)
2089             {
2090               /* Since it's known at compile time, compute the number of
2091                  iterations in the peeled loop (the peeling factor) for use in
2092                  updating DR_MISALIGNMENT values.  The peeling factor is the
2093                  negated misalignment, modulo the target alignment, expressed
2094                  as an element count.  */
2095 	      mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2096 	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2097 	      npeel = ((mis & (target_align - 1))
2098 		       / vect_get_scalar_dr_size (dr0));
2099             }
2100 
2101 	  /* For interleaved data access every iteration accesses all the
2102 	     members of the group, therefore we divide the number of iterations
2103 	     by the group size.  */
2104 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2105 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2106 	    npeel /= GROUP_SIZE (stmt_info);
2107 
2108           if (dump_enabled_p ())
2109             dump_printf_loc (MSG_NOTE, vect_location,
2110                              "Try peeling by %d\n", npeel);
2111         }
2112 
2113       /* Ensure that all datarefs can be vectorized after the peel.  */
2114       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2115 	do_peeling = false;
2116 
2117       /* Check if all datarefs are supportable and log.  */
2118       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2119         {
2120           stat = vect_verify_datarefs_alignment (loop_vinfo);
2121           if (!stat)
2122             do_peeling = false;
2123           else
2124 	    return stat;
2125         }
2126 
2127       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2128       if (do_peeling)
2129         {
2130           unsigned max_allowed_peel
2131             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2132           if (max_allowed_peel != (unsigned)-1)
2133             {
2134               unsigned max_peel = npeel;
2135               if (max_peel == 0)
2136                 {
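		  /* The peeling amount is only known at run time here, so
		     assume the worst case of one element less than a full
		     target-aligned chunk.  */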
2137 		  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2138 		  max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2139                 }
2140               if (max_peel > max_allowed_peel)
2141                 {
2142                   do_peeling = false;
2143                   if (dump_enabled_p ())
2144                     dump_printf_loc (MSG_NOTE, vect_location,
2145                         "Disable peeling, max peels reached: %d\n", max_peel);
2146                 }
2147             }
2148         }
2149 
2150       /* Cost model #2 - if peeling may result in a remaining loop not
2151 	 iterating enough to be vectorized then do not peel.  Since this
2152 	 is a cost heuristic rather than a correctness decision, use the
2153 	 most likely runtime value for variable vectorization factors.  */
2154       if (do_peeling
2155 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2156 	{
2157 	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2158 	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2159 	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2160 	      < assumed_vf + max_peel)
2161 	    do_peeling = false;
2162 	}
2163 
2164       if (do_peeling)
2165         {
2166           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2167              If the misalignment of DR_i is identical to that of dr0 then set
2168              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2169              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2170              by the peeling factor times the element size of DR_i (MOD the
2171              vectorization factor times the size).  Otherwise, the
2172              misalignment of DR_i must be set to unknown.  */
2173 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2174 	    if (dr != dr0)
2175 	      {
2176 		/* Strided accesses perform only component accesses, alignment
2177 		   is irrelevant for them.  */
2178 		stmt_info = vinfo_for_stmt (DR_STMT (dr));
2179 		if (STMT_VINFO_STRIDED_P (stmt_info)
2180 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2181 		  continue;
2182 
2183 		vect_update_misalignment_for_peel (dr, dr0, npeel);
2184 	      }
2185 
2186           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2187           if (npeel)
2188             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2189           else
2190             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2191 	      = DR_MISALIGNMENT (dr0);
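	  /* After the peel, DR0 itself is aligned by construction.  */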
2192 	  SET_DR_MISALIGNMENT (dr0, 0);
2193 	  if (dump_enabled_p ())
2194             {
2195               dump_printf_loc (MSG_NOTE, vect_location,
2196                                "Alignment of access forced using peeling.\n");
2197               dump_printf_loc (MSG_NOTE, vect_location,
2198                                "Peeling for alignment will be applied.\n");
2199             }
2200 
2201 	  /* The inside-loop cost will be accounted for in vectorizable_load
2202 	     and vectorizable_store correctly with adjusted alignments.
2203 	     Drop the body_cost_vec on the floor here.  */
2204 	  stat = vect_verify_datarefs_alignment (loop_vinfo);
2205 	  gcc_assert (stat);
2206           return stat;
2207         }
2208     }
2209 
2210   /* (2) Versioning to force alignment.  */
2211 
2212   /* Try versioning if:
2213      1) optimize loop for speed
2214      2) there is at least one unsupported misaligned data ref with an unknown
2215         misalignment, and
2216      3) all misaligned data refs with a known misalignment are supported, and
2217      4) the number of runtime alignment checks is within reason.  */
2218 
2219   do_versioning =
2220 	optimize_loop_nest_for_speed_p (loop)
2221 	&& (!loop->inner); /* FORNOW */
2222 
2223   if (do_versioning)
2224     {
2225       FOR_EACH_VEC_ELT (datarefs, i, dr)
2226         {
2227 	  stmt = DR_STMT (dr);
2228 	  stmt_info = vinfo_for_stmt (stmt);
2229 
2230 	  /* For interleaving, only the alignment of the first access
2231 	     matters.  */
2232 	  if (aligned_access_p (dr)
2233 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2234 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2235 	    continue;
2236 
2237 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2238 	    {
2239 	      /* Strided loads perform only component accesses, alignment is
2240 		 irrelevant for them.  */
2241 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2242 		continue;
2243 	      do_versioning = false;
2244 	      break;
2245 	    }
2246 
2247 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2248 
2249           if (!supportable_dr_alignment)
2250             {
2251 	      gimple *stmt;
2252               int mask;
2253               tree vectype;
2254 
2255               if (known_alignment_for_access_p (dr)
2256                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2257                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2258                 {
2259                   do_versioning = false;
2260                   break;
2261                 }
2262 
2263               stmt = DR_STMT (dr);
2264               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2265               gcc_assert (vectype);
2266 
2267 	      /* At present we don't support versioning for alignment
2268 		 with variable VF, since there's no guarantee that the
2269 		 VF is a power of two.  We could relax this if we added
2270 		 a way of enforcing a power-of-two size.  */
2271 	      unsigned HOST_WIDE_INT size;
2272 	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2273 		{
2274 		  do_versioning = false;
2275 		  break;
2276 		}
2277 
2278               /* The rightmost bits of an aligned address must be zeros.
2279                  Construct the mask needed for this test.  For example,
2280                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2281                  mask must be 15 = 0xf. */
2282 	      mask = size - 1;
2283 
2284               /* FORNOW: use the same mask to test all potentially unaligned
2285                  references in the loop.  The vectorizer currently supports
2286                  a single vector size, see the reference to
2287                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2288                  vectorization factor is computed.  */
2289               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2290                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2291               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2292               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2293 		      DR_STMT (dr));
2294             }
2295         }
2296 
2297       /* Versioning requires at least one misaligned data reference.  */
2298       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2299         do_versioning = false;
2300       else if (!do_versioning)
2301         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2302     }
2303 
2304   if (do_versioning)
2305     {
2306       vec<gimple *> may_misalign_stmts
2307         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2308       gimple *stmt;
2309 
2310       /* It can now be assumed that the data references in the statements
2311          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2312          of the loop being vectorized.  */
2313       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2314         {
2315           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2316           dr = STMT_VINFO_DATA_REF (stmt_info);
2317 	  SET_DR_MISALIGNMENT (dr, 0);
2318 	  if (dump_enabled_p ())
2319             dump_printf_loc (MSG_NOTE, vect_location,
2320                              "Alignment of access forced using versioning.\n");
2321         }
2322 
2323       if (dump_enabled_p ())
2324         dump_printf_loc (MSG_NOTE, vect_location,
2325                          "Versioning for alignment will be applied.\n");
2326 
2327       /* Peeling and versioning can't be done together at this time.  */
2328       gcc_assert (! (do_peeling && do_versioning));
2329 
2330       stat = vect_verify_datarefs_alignment (loop_vinfo);
2331       gcc_assert (stat);
2332       return stat;
2333     }
2334 
2335   /* This point is reached if neither peeling nor versioning is being done.  */
2336   gcc_assert (! (do_peeling || do_versioning));
2337 
2338   stat = vect_verify_datarefs_alignment (loop_vinfo);
2339   return stat;
2340 }
2341 
2342 
2343 /* Function vect_find_same_alignment_drs.
2344 
2345    Update group and alignment relations according to the chosen
2346    vectorization factor.  */
2347 
2348 static void
2349 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2350 {
2351   struct data_reference *dra = DDR_A (ddr);
2352   struct data_reference *drb = DDR_B (ddr);
2353   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2354   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2355 
2356   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2357     return;
2358 
2359   if (dra == drb)
2360     return;
2361 
2362   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2363       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2364       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2365     return;
2366 
2367   /* Two references with distance zero have the same alignment.  */
2368   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2369 			  - wi::to_poly_offset (DR_INIT (drb)));
2370   if (maybe_ne (diff, 0))
2371     {
2372       /* Get the wider of the two alignments.  */
2373       unsigned int align_a = (vect_calculate_target_alignment (dra)
2374 			      / BITS_PER_UNIT);
2375       unsigned int align_b = (vect_calculate_target_alignment (drb)
2376 			      / BITS_PER_UNIT);
2377       unsigned int max_align = MAX (align_a, align_b);
2378 
2379       /* Require the gap to be a multiple of the larger vector alignment.  */
2380       if (!multiple_p (diff, max_align))
2381 	return;
2382     }
2383 
2384   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2385   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2386   if (dump_enabled_p ())
2387     {
2388       dump_printf_loc (MSG_NOTE, vect_location,
2389 		       "accesses have the same alignment: ");
2390       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2391       dump_printf (MSG_NOTE,  " and ");
2392       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2393       dump_printf (MSG_NOTE, "\n");
2394     }
2395 }
2396 
2397 
2398 /* Function vect_analyze_data_refs_alignment
2399 
2400    Analyze the alignment of the data-references in the loop.
2401    Return FALSE if a data reference is found that cannot be vectorized.  */
2402 
2403 bool
2404 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2405 {
2406   if (dump_enabled_p ())
2407     dump_printf_loc (MSG_NOTE, vect_location,
2408                      "=== vect_analyze_data_refs_alignment ===\n");
2409 
2410   /* Mark groups of data references with same alignment using
2411      data dependence information.  */
2412   vec<ddr_p> ddrs = vinfo->ddrs;
2413   struct data_dependence_relation *ddr;
2414   unsigned int i;
2415 
2416   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2417     vect_find_same_alignment_drs (ddr);
2418 
2419   vec<data_reference_p> datarefs = vinfo->datarefs;
2420   struct data_reference *dr;
2421 
2422   vect_record_base_alignments (vinfo);
2423   FOR_EACH_VEC_ELT (datarefs, i, dr)
2424     {
2425       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2426       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2427 	  && !vect_compute_data_ref_alignment (dr))
2428 	{
2429 	  /* Strided accesses perform only component accesses, misalignment
2430 	     information is irrelevant for them.  */
2431 	  if (STMT_VINFO_STRIDED_P (stmt_info)
2432 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2433 	    continue;
2434 
2435 	  if (dump_enabled_p ())
2436 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2437 			     "not vectorized: can't calculate alignment "
2438 			     "for data ref.\n");
2439 
2440 	  return false;
2441 	}
2442     }
2443 
2444   return true;
2445 }
2446 
2447 
2448 /* Analyze alignment of DRs of stmts in NODE.  */
2449 
2450 static bool
2451 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2452 {
2453   /* We vectorize from the first scalar stmt in the node unless
2454      the node is permuted in which case we start from the first
2455      element in the group.  */
2456   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2457   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2458   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2459     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2460 
2461   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2462   if (! vect_compute_data_ref_alignment (dr)
2463       /* For creating the data-ref pointer we need alignment of the
2464 	 first element anyway.  */
2465       || (dr != first_dr
2466 	  && ! vect_compute_data_ref_alignment (first_dr))
2467       || ! verify_data_ref_alignment (dr))
2468     {
2469       if (dump_enabled_p ())
2470 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 			 "not vectorized: bad data alignment in basic "
2472 			 "block.\n");
2473       return false;
2474     }
2475 
2476   return true;
2477 }
2478 
2479 /* Function vect_slp_analyze_instance_alignment
2480 
2481    Analyze the alignment of the data-references in the SLP instance.
2482    Return FALSE if a data reference is found that cannot be vectorized.  */
2483 
2484 bool
2485 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2486 {
2487   if (dump_enabled_p ())
2488     dump_printf_loc (MSG_NOTE, vect_location,
2489                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2490 
2491   slp_tree node;
2492   unsigned i;
2493   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2494     if (! vect_slp_analyze_and_verify_node_alignment (node))
2495       return false;
2496 
2497   node = SLP_INSTANCE_TREE (instance);
2498   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2499       && ! vect_slp_analyze_and_verify_node_alignment
2500 	     (SLP_INSTANCE_TREE (instance)))
2501     return false;
2502 
2503   return true;
2504 }
2505 
2506 
2507 /* Analyze groups of accesses: check that DR belongs to a group of
2508    accesses of legal size, step, etc.  Detect gaps, single element
2509    interleaving, and other special cases. Set grouped access info.
2510    Collect groups of strided stores for further use in SLP analysis.
2511    Worker for vect_analyze_group_access.  */
2512 
2513 static bool
2514 vect_analyze_group_access_1 (struct data_reference *dr)
2515 {
2516   tree step = DR_STEP (dr);
2517   tree scalar_type = TREE_TYPE (DR_REF (dr));
2518   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2519   gimple *stmt = DR_STMT (dr);
2520   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2521   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2522   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2523   HOST_WIDE_INT dr_step = -1;
2524   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2525   bool slp_impossible = false;
2526 
2527   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2528      size of the interleaving group (including gaps).  */
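  /* For example, accesses to 4-byte elements with a 16-byte step form an
     interleaving group of size 4.  */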
2529   if (tree_fits_shwi_p (step))
2530     {
2531       dr_step = tree_to_shwi (step);
2532       /* Check that STEP is a multiple of type size.  Otherwise there is
2533          a non-element-sized gap at the end of the group which we
2534 	 cannot represent in GROUP_GAP or GROUP_SIZE.
2535 	 ???  As we can handle non-constant step fine here we should
2536 	 simply remove uses of GROUP_GAP between the last and first
2537 	 element and instead rely on DR_STEP.  GROUP_SIZE then would
2538 	 simply not include that gap.  */
2539       if ((dr_step % type_size) != 0)
2540 	{
2541 	  if (dump_enabled_p ())
2542 	    {
2543 	      dump_printf_loc (MSG_NOTE, vect_location,
2544 	                       "Step ");
2545 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2546 	      dump_printf (MSG_NOTE,
2547 			   " is not a multiple of the element size for ");
2548 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2549 	      dump_printf (MSG_NOTE, "\n");
2550 	    }
2551 	  return false;
2552 	}
2553       groupsize = absu_hwi (dr_step) / type_size;
2554     }
2555   else
2556     groupsize = 0;
2557 
2558   /* A non-consecutive access is possible only as part of an interleaving group.  */
2559   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2560     {
2561       /* Check if this DR is part of an interleaving group, and is a single
2562 	 element of the group that is accessed in the loop.  */
2563 
2564       /* Gaps are supported only for loads. STEP must be a multiple of the type
2565 	 size.  */
2566       if (DR_IS_READ (dr)
2567 	  && (dr_step % type_size) == 0
2568 	  && groupsize > 0)
2569 	{
2570 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2571 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2572 	  GROUP_GAP (stmt_info) = groupsize - 1;
2573 	  if (dump_enabled_p ())
2574 	    {
2575 	      dump_printf_loc (MSG_NOTE, vect_location,
2576 	                       "Detected single element interleaving ");
2577 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2578 	      dump_printf (MSG_NOTE, " step ");
2579 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2580 	      dump_printf (MSG_NOTE, "\n");
2581 	    }
2582 
2583 	  return true;
2584 	}
2585 
2586       if (dump_enabled_p ())
2587         {
2588  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2589 	                   "not consecutive access ");
2590 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2591         }
2592 
2593       if (bb_vinfo)
2594         {
2595           /* Mark the statement as unvectorizable.  */
2596           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2597           return true;
2598         }
2599 
2600       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2601       STMT_VINFO_STRIDED_P (stmt_info) = true;
2602       return true;
2603     }
2604 
2605   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2606     {
2607       /* First stmt in the interleaving chain. Check the chain.  */
2608       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2609       struct data_reference *data_ref = dr;
2610       unsigned int count = 1;
2611       tree prev_init = DR_INIT (data_ref);
2612       gimple *prev = stmt;
2613       HOST_WIDE_INT diff, gaps = 0;
2614 
2615       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2616       while (next)
2617         {
2618           /* Skip same data-refs.  In case two or more stmts share a
2619              data-ref (supported only for loads), we vectorize only the first
2620              stmt, and the rest get their vectorized loads from the first
2621              one.  */
2622           if (!tree_int_cst_compare (DR_INIT (data_ref),
2623                                      DR_INIT (STMT_VINFO_DATA_REF (
2624 						   vinfo_for_stmt (next)))))
2625             {
2626               if (DR_IS_WRITE (data_ref))
2627                 {
2628                   if (dump_enabled_p ())
2629                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630                                      "Two store stmts share the same dr.\n");
2631                   return false;
2632                 }
2633 
2634 	      if (dump_enabled_p ())
2635 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2636 				 "Two or more load stmts share the same dr.\n");
2637 
2638               /* For load use the same data-ref load.  */
2639               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2640 
2641               prev = next;
2642               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2643               continue;
2644             }
2645 
2646           prev = next;
2647           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2648 
2649 	  /* All group members have the same STEP by construction.  */
2650 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2651 
2652           /* Check that the distance between two accesses is equal to the type
2653              size. Otherwise, we have gaps.  */
2654           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2655                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2656 	  if (diff != 1)
2657 	    {
2658 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2659 	      slp_impossible = true;
2660 	      if (DR_IS_WRITE (data_ref))
2661 		{
2662                   if (dump_enabled_p ())
2663                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2664                                      "interleaved store with gaps\n");
2665 		  return false;
2666 		}
2667 
2668               gaps += diff - 1;
2669 	    }
2670 
2671 	  last_accessed_element += diff;
2672 
2673           /* Store the gap from the previous member of the group. If there is no
2674              gap in the access, GROUP_GAP is always 1.  */
2675           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2676 
2677           prev_init = DR_INIT (data_ref);
2678           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2679           /* Count the number of data-refs in the chain.  */
2680           count++;
2681         }
2682 
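      /* If the group size could not be derived from the step, it is the
	 number of members found plus the elements skipped by gaps.  */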
2683       if (groupsize == 0)
2684         groupsize = count + gaps;
2685 
2686       /* This could be UINT_MAX but as we are generating code in a very
2687          inefficient way we have to cap earlier.  See PR78699 for example.  */
2688       if (groupsize > 4096)
2689 	{
2690 	  if (dump_enabled_p ())
2691 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2692 			     "group is too large\n");
2693 	  return false;
2694 	}
2695 
2696       /* Check that the size of the interleaving is equal to count for stores,
2697          i.e., that there are no gaps.  */
2698       if (groupsize != count
2699 	  && !DR_IS_READ (dr))
2700         {
2701 	  if (dump_enabled_p ())
2702 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2703 			     "interleaved store with gaps\n");
2704 	  return false;
2705 	}
2706 
2707       /* If there is a gap after the last load in the group it is the
2708 	 difference between the groupsize and the last accessed
2709 	 element.
2710 	 When there is no gap, this difference should be 0.  */
2711       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2712 
2713       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2714       if (dump_enabled_p ())
2715 	{
2716 	  dump_printf_loc (MSG_NOTE, vect_location,
2717 			   "Detected interleaving ");
2718 	  if (DR_IS_READ (dr))
2719 	    dump_printf (MSG_NOTE, "load ");
2720 	  else
2721 	    dump_printf (MSG_NOTE, "store ");
2722 	  dump_printf (MSG_NOTE, "of size %u starting with ",
2723 		       (unsigned)groupsize);
2724 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2725 	  if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2726 	    dump_printf_loc (MSG_NOTE, vect_location,
2727 			     "There is a gap of %u elements after the group\n",
2728 			     GROUP_GAP (vinfo_for_stmt (stmt)));
2729 	}
2730 
2731       /* SLP: create an SLP data structure for every interleaving group of
2732 	 stores for further analysis in vect_analyze_slp.  */
2733       if (DR_IS_WRITE (dr) && !slp_impossible)
2734         {
2735           if (loop_vinfo)
2736             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2737           if (bb_vinfo)
2738             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2739         }
2740     }
2741 
2742   return true;
2743 }
2744 
2745 /* Analyze groups of accesses: check that DR belongs to a group of
2746    accesses of legal size, step, etc.  Detect gaps, single element
2747    interleaving, and other special cases. Set grouped access info.
2748    Collect groups of strided stores for further use in SLP analysis.  */
2749 
2750 static bool
2751 vect_analyze_group_access (struct data_reference *dr)
2752 {
2753   if (!vect_analyze_group_access_1 (dr))
2754     {
2755       /* Dissolve the group if present.  */
2756       gimple *next;
2757       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2758       while (stmt)
2759 	{
2760 	  stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2761 	  next = GROUP_NEXT_ELEMENT (vinfo);
2762 	  GROUP_FIRST_ELEMENT (vinfo) = NULL;
2763 	  GROUP_NEXT_ELEMENT (vinfo) = NULL;
2764 	  stmt = next;
2765 	}
2766       return false;
2767     }
2768   return true;
2769 }
2770 
2771 /* Analyze the access pattern of the data-reference DR.
2772    In case of non-consecutive accesses call vect_analyze_group_access() to
2773    analyze groups of accesses.  */
2774 
2775 static bool
2776 vect_analyze_data_ref_access (struct data_reference *dr)
2777 {
2778   tree step = DR_STEP (dr);
2779   tree scalar_type = TREE_TYPE (DR_REF (dr));
2780   gimple *stmt = DR_STMT (dr);
2781   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2782   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2783   struct loop *loop = NULL;
2784 
2785   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2786     return true;
2787 
2788   if (loop_vinfo)
2789     loop = LOOP_VINFO_LOOP (loop_vinfo);
2790 
2791   if (loop_vinfo && !step)
2792     {
2793       if (dump_enabled_p ())
2794 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2795 	                 "bad data-ref access in loop\n");
2796       return false;
2797     }
2798 
2799   /* Allow loads with zero step in inner-loop vectorization.  */
2800   if (loop_vinfo && integer_zerop (step))
2801     {
2802       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2803       if (!nested_in_vect_loop_p (loop, stmt))
2804 	return DR_IS_READ (dr);
2805       /* Allow references with zero step for outer loops marked
2806 	 with pragma omp simd only, since that guarantees the absence of
2807 	 loop-carried dependencies between inner loop iterations.  */
2808       if (loop->safelen < 2)
2809 	{
2810 	  if (dump_enabled_p ())
2811 	    dump_printf_loc (MSG_NOTE, vect_location,
2812 			     "zero step in inner loop of nest\n");
2813 	  return false;
2814 	}
2815     }
2816 
2817   if (loop && nested_in_vect_loop_p (loop, stmt))
2818     {
2819       /* Interleaved accesses are not yet supported within outer-loop
2820         vectorization for references in the inner-loop.  */
2821       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2822 
2823       /* For the rest of the analysis we use the outer-loop step.  */
2824       step = STMT_VINFO_DR_STEP (stmt_info);
2825       if (integer_zerop (step))
2826 	{
2827 	  if (dump_enabled_p ())
2828 	    dump_printf_loc (MSG_NOTE, vect_location,
2829 	                     "zero step in outer loop.\n");
2830 	  return DR_IS_READ (dr);
2831 	}
2832     }
2833 
2834   /* Consecutive?  */
2835   if (TREE_CODE (step) == INTEGER_CST)
2836     {
2837       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
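      /* The access is consecutive if the step equals the element size, or
	 minus the element size for an access that moves backwards.  */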
2838       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2839 	  || (dr_step < 0
2840 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2841 	{
2842 	  /* Mark that it is not interleaving.  */
2843 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2844 	  return true;
2845 	}
2846     }
2847 
2848   if (loop && nested_in_vect_loop_p (loop, stmt))
2849     {
2850       if (dump_enabled_p ())
2851 	dump_printf_loc (MSG_NOTE, vect_location,
2852 	                 "grouped access in outer loop.\n");
2853       return false;
2854     }
2855 
2856 
2857   /* Assume this is a DR handled by the non-constant strided load case.  */
2858   if (TREE_CODE (step) != INTEGER_CST)
2859     return (STMT_VINFO_STRIDED_P (stmt_info)
2860 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2861 		|| vect_analyze_group_access (dr)));
2862 
2863   /* Not consecutive access - check if it's a part of interleaving group.  */
2864   return vect_analyze_group_access (dr);
2865 }
2866 
2867 /* Compare two data-references DRA and DRB so that they can be sorted into
2868    chunks suitable for grouping.  */
2869 
2870 static int
2871 dr_group_sort_cmp (const void *dra_, const void *drb_)
2872 {
2873   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2874   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2875   int cmp;
2876 
2877   /* Stabilize sort.  */
2878   if (dra == drb)
2879     return 0;
2880 
2881   /* DRs in different loops never belong to the same group.  */
2882   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2883   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2884   if (loopa != loopb)
2885     return loopa->num < loopb->num ? -1 : 1;
2886 
2887   /* Ordering of DRs according to base.  */
2888   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2889 			       DR_BASE_ADDRESS (drb));
2890   if (cmp != 0)
2891     return cmp;
2892 
2893   /* And according to DR_OFFSET.  */
2894   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2895   if (cmp != 0)
2896     return cmp;
2897 
2898   /* Put reads before writes.  */
2899   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2900     return DR_IS_READ (dra) ? -1 : 1;
2901 
2902   /* Then sort after access size.  */
2903   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2904 			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2905   if (cmp != 0)
2906     return cmp;
2907 
2908   /* And after step.  */
2909   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2910   if (cmp != 0)
2911     return cmp;
2912 
2913   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2914   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2915   if (cmp == 0)
2916     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2917   return cmp;
2918 }
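
/* Illustrative sketch, not part of the original sources: for

     void
     pair_store (int *a, const int *b, int n)
     {
       for (int i = 0; i < n; i++)
         {
           a[2 * i] = b[i];
           a[2 * i + 1] = 0;
         }
     }

   the comparator above sorts the copy of the data-ref vector so that the
   read from B is kept apart from the stores to A (different base) and the
   two A stores become adjacent, ordered by DR_INIT (0 before 4 for 4-byte
   ints).  That adjacency is what lets vect_analyze_data_ref_accesses build
   the interleaving chains with a single linear scan.  */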
2919 
2920 /* If OP is the result of a conversion, return the unconverted value,
2921    otherwise return null.  */
2922 
2923 static tree
2924 strip_conversion (tree op)
2925 {
2926   if (TREE_CODE (op) != SSA_NAME)
2927     return NULL_TREE;
2928   gimple *stmt = SSA_NAME_DEF_STMT (op);
2929   if (!is_gimple_assign (stmt)
2930       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2931     return NULL_TREE;
2932   return gimple_assign_rhs1 (stmt);
2933 }
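
/* For example (illustrative): if _2 is defined by the conversion statement
   "_2 = (int) _1", then strip_conversion (_2) returns _1; for an SSA name
   defined by anything other than a conversion, or for a non-SSA operand,
   it returns NULL_TREE.  */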
2934 
2935 /* Return true if vectorizable_* routines can handle statements STMT1
2936    and STMT2 being in a single group.  */
2937 
2938 static bool
2939 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2940 {
2941   if (gimple_assign_single_p (stmt1))
2942     return gimple_assign_single_p (stmt2);
2943 
2944   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2945     {
2946       /* Check for two masked loads or two masked stores.  */
2947       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2948 	return false;
2949       internal_fn ifn = gimple_call_internal_fn (stmt1);
2950       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2951 	return false;
2952       if (ifn != gimple_call_internal_fn (stmt2))
2953 	return false;
2954 
2955       /* Check that the masks are the same.  Cope with casts of masks,
2956 	 like those created by build_mask_conversion.  */
2957       tree mask1 = gimple_call_arg (stmt1, 2);
2958       tree mask2 = gimple_call_arg (stmt2, 2);
2959       if (!operand_equal_p (mask1, mask2, 0))
2960 	{
2961 	  mask1 = strip_conversion (mask1);
2962 	  if (!mask1)
2963 	    return false;
2964 	  mask2 = strip_conversion (mask2);
2965 	  if (!mask2)
2966 	    return false;
2967 	  if (!operand_equal_p (mask1, mask2, 0))
2968 	    return false;
2969 	}
2970       return true;
2971     }
2972 
2973   return false;
2974 }
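
/* Illustrative sketch, not part of the original sources: two conditional
   loads such as

     vect__1 = .MASK_LOAD (addr_1, align, mask_1);
     vect__2 = .MASK_LOAD (addr_2, align, mask_2);

   can be grouped when MASK_1 and MASK_2 are operand_equal_p, or when both
   are conversions of the same SSA name (the form build_mask_conversion
   creates).  A masked access never groups with a plain assignment or with
   a masked access of the opposite kind (load vs. store).  */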
2975 
2976 /* Function vect_analyze_data_ref_accesses.
2977 
2978    Analyze the access pattern of all the data references in the loop.
2979 
2980    FORNOW: the only access pattern that is considered vectorizable is a
2981 	   simple step 1 (consecutive) access.
2982 
2983    FORNOW: handle only arrays and pointer accesses.  */
2984 
2985 bool
2986 vect_analyze_data_ref_accesses (vec_info *vinfo)
2987 {
2988   unsigned int i;
2989   vec<data_reference_p> datarefs = vinfo->datarefs;
2990   struct data_reference *dr;
2991 
2992   if (dump_enabled_p ())
2993     dump_printf_loc (MSG_NOTE, vect_location,
2994                      "=== vect_analyze_data_ref_accesses ===\n");
2995 
2996   if (datarefs.is_empty ())
2997     return true;
2998 
2999   /* Sort the array of datarefs to make building the interleaving chains
3000      linear.  Don't modify the original vector's order, it is needed for
3001      determining what dependencies are reversed.  */
3002   vec<data_reference_p> datarefs_copy = datarefs.copy ();
3003   datarefs_copy.qsort (dr_group_sort_cmp);
3004 
3005   /* Build the interleaving chains.  */
3006   for (i = 0; i < datarefs_copy.length () - 1;)
3007     {
3008       data_reference_p dra = datarefs_copy[i];
3009       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
3010       stmt_vec_info lastinfo = NULL;
3011       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3012 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3013 	{
3014 	  ++i;
3015 	  continue;
3016 	}
3017       for (i = i + 1; i < datarefs_copy.length (); ++i)
3018 	{
3019 	  data_reference_p drb = datarefs_copy[i];
3020 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
3021 	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3022 	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3023 	    break;
3024 
3025 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
3026 	     accesses, same accesses) can lead to a group to be artificially
3027 	     split here as we don't just skip over those.  If it really
3028 	     matters we can push those to a worklist and re-iterate
3029 	     over them.  Then we can just skip ahead to the next DR here.  */
3030 
3031 	  /* DRs in a different loop should not be put into the same
3032 	     interleaving group.  */
3033 	  if (gimple_bb (DR_STMT (dra))->loop_father
3034 	      != gimple_bb (DR_STMT (drb))->loop_father)
3035 	    break;
3036 
3037 	  /* Check that the data-refs have same first location (except init)
3038 	     and they are both either store or load (not load and store,
3039 	     not masked loads or stores).  */
3040 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
3041 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3042 					DR_BASE_ADDRESS (drb)) != 0
3043 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3044 	      || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
3045 	    break;
3046 
3047 	  /* Check that the data-refs have the same constant size.  */
3048 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3049 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3050 	  if (!tree_fits_uhwi_p (sza)
3051 	      || !tree_fits_uhwi_p (szb)
3052 	      || !tree_int_cst_equal (sza, szb))
3053 	    break;
3054 
3055 	  /* Check that the data-refs have the same step.  */
3056 	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3057 	    break;
3058 
3059 	  /* Check the types are compatible.
3060 	     ???  We don't distinguish this during sorting.  */
3061 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3062 				   TREE_TYPE (DR_REF (drb))))
3063 	    break;
3064 
3065 	  /* Check that the DR_INITs are compile-time constants.  */
3066 	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3067 	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3068 	    break;
3069 
3070 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3071 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3072 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3073 	  HOST_WIDE_INT init_prev
3074 	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3075 	  gcc_assert (init_a <= init_b
3076 		      && init_a <= init_prev
3077 		      && init_prev <= init_b);
3078 
3079 	  /* Do not place the same access in the interleaving chain twice.  */
3080 	  if (init_b == init_prev)
3081 	    {
3082 	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3083 			  < gimple_uid (DR_STMT (drb)));
3084 	      /* ???  For now we simply "drop" the later reference which is
3085 	         otherwise the same rather than finishing off this group.
3086 		 In the end we'd want to re-process duplicates forming
3087 		 multiple groups from the refs, likely by just collecting
3088 		 all candidates (including duplicates and split points
3089 		 below) in a vector and then processing them together.  */
3090 	      continue;
3091 	    }
3092 
3093 	  /* If init_b == init_a + the size of the type * k, we have an
3094 	     interleaving, and DRA is accessed before DRB.  */
3095 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3096 	  if (type_size_a == 0
3097 	      || (init_b - init_a) % type_size_a != 0)
3098 	    break;
3099 
3100 	  /* If we have a store, the accesses are adjacent.  This splits
3101 	     groups into chunks we support (we don't support vectorization
3102 	     of stores with gaps).  */
3103 	  if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3104 	    break;
3105 
3106 	  /* If the step (when nonzero and constant) is no greater than the
3107 	     difference between the data-refs' inits, DRB starts a new group;
3108 	     this splits groups into suitable sizes.  */
3109 	  if (tree_fits_shwi_p (DR_STEP (dra)))
3110 	    {
3111 	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3112 	      if (step != 0 && step <= (init_b - init_a))
3113 		break;
3114 	    }
3115 
3116 	  if (dump_enabled_p ())
3117 	    {
3118 	      dump_printf_loc (MSG_NOTE, vect_location,
3119 			       "Detected interleaving ");
3120 	      if (DR_IS_READ (dra))
3121 		dump_printf (MSG_NOTE, "load ");
3122 	      else
3123 		dump_printf (MSG_NOTE, "store ");
3124 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3125 	      dump_printf (MSG_NOTE,  " and ");
3126 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3127 	      dump_printf (MSG_NOTE, "\n");
3128 	    }
3129 
3130 	  /* Link the found element into the group list.  */
3131 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3132 	    {
3133 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3134 	      lastinfo = stmtinfo_a;
3135 	    }
3136 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3137 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3138 	  lastinfo = stmtinfo_b;
3139 	}
3140     }
3141 
3142   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3143     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3144         && !vect_analyze_data_ref_access (dr))
3145       {
3146 	if (dump_enabled_p ())
3147 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3148 	                   "not vectorized: complicated access pattern.\n");
3149 
3150         if (is_a <bb_vec_info> (vinfo))
3151           {
3152             /* Mark the statement as not vectorizable.  */
3153             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3154             continue;
3155           }
3156         else
3157 	  {
3158 	    datarefs_copy.release ();
3159 	    return false;
3160 	  }
3161       }
3162 
3163   datarefs_copy.release ();
3164   return true;
3165 }
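
/* Illustrative sketch, not part of the original sources: for

     void
     copy_swapped (int *restrict out, const int *restrict in, int n)
     {
       for (int i = 0; i < n; i++)
         {
           out[2 * i] = in[2 * i + 1];
           out[2 * i + 1] = in[2 * i];
         }
     }

   the code above links the two loads from IN into one interleaving chain
   and the two stores to OUT into another; each group has two members and
   a step of 2 * sizeof (int) bytes, and the subsequent group analysis
   records a GROUP_SIZE of 2 for each.  */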
3166 
3167 /* Function vect_vfa_segment_size.
3168 
3169    Input:
3170      DR: The data reference.
3171      LENGTH_FACTOR: segment length to consider.
3172 
3173    Return a value suitable for the dr_with_seg_len::seg_len field.
3174    This is the "distance travelled" by the pointer from the first
3175    iteration in the segment to the last.  Note that it does not include
3176    the size of the access; in effect it only describes the first byte.  */
3177 
3178 static tree
3179 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3180 {
3181   length_factor = size_binop (MINUS_EXPR,
3182 			      fold_convert (sizetype, length_factor),
3183 			      size_one_node);
3184   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3185 		     length_factor);
3186 }
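
/* For example (illustrative): with DR_STEP == 4 and LENGTH_FACTOR == 8,
   the segment length is 4 * (8 - 1) == 28 bytes: the distance the pointer
   travels from the first to the last of the 8 iterations, not counting
   the size of the final access itself (that is accounted for separately
   via dr_with_seg_len::access_size).  */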
3187 
3188 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3189    gives the worst-case number of bytes covered by the segment.  */
3190 
3191 static unsigned HOST_WIDE_INT
3192 vect_vfa_access_size (data_reference *dr)
3193 {
3194   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3195   tree ref_type = TREE_TYPE (DR_REF (dr));
3196   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3197   unsigned HOST_WIDE_INT access_size = ref_size;
3198   if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3199     {
3200       gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3201       access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3202     }
3203   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3204       && (vect_supportable_dr_alignment (dr, false)
3205 	  == dr_explicit_realign_optimized))
3206     {
3207       /* We might access a full vector's worth.  */
3208       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3209       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3210     }
3211   return access_size;
3212 }
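
/* For example (illustrative): a group of four adjacent 4-byte accesses
   with no gap yields an access size of 4 * 4 == 16 bytes; if the access
   additionally uses the dr_explicit_realign_optimized scheme with a
   16-byte vector type, another 16 - 4 == 12 bytes are added because a
   whole vector may be loaded.  */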
3213 
3214 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3215 
3216 static unsigned int
3217 vect_vfa_align (const data_reference *dr)
3218 {
3219   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3220 }
3221 
3222 /* Function vect_no_alias_p.
3223 
3224    Given data references A and B with equal base and offset, see whether
3225    the alias relation can be decided at compilation time.  Return 1 if
3226    it can and the references alias, 0 if it can and the references do
3227    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3228    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3229    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3230 
3231 static int
3232 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3233 			 tree segment_length_a, tree segment_length_b,
3234 			 unsigned HOST_WIDE_INT access_size_a,
3235 			 unsigned HOST_WIDE_INT access_size_b)
3236 {
3237   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3238   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3239   poly_uint64 const_length_a;
3240   poly_uint64 const_length_b;
3241 
3242   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3243      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3244      [a, a+12) */
3245   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3246     {
3247       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3248       offset_a -= const_length_a;
3249     }
3250   else
3251     const_length_a = tree_to_poly_uint64 (segment_length_a);
3252   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3253     {
3254       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3255       offset_b -= const_length_b;
3256     }
3257   else
3258     const_length_b = tree_to_poly_uint64 (segment_length_b);
3259 
3260   const_length_a += access_size_a;
3261   const_length_b += access_size_b;
3262 
3263   if (ranges_known_overlap_p (offset_a, const_length_a,
3264 			      offset_b, const_length_b))
3265     return 1;
3266 
3267   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3268 			       offset_b, const_length_b))
3269     return 0;
3270 
3271   return -1;
3272 }
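
/* Worked example (illustrative): two accesses with positive steps,
   DR_INITs of 0 and 64, segment lengths of 28 bytes and access sizes of
   4 bytes cover the byte ranges [0, 32) and [64, 96); those cannot
   overlap, so the function returns 0 and no runtime check is needed.
   With DR_INITs of 0 and 16 the ranges become [0, 32) and [16, 48),
   which are known to overlap, and the function returns 1.  */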
3273 
3274 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3275    in DDR is >= VF.  */
3276 
3277 static bool
3278 dependence_distance_ge_vf (data_dependence_relation *ddr,
3279 			   unsigned int loop_depth, poly_uint64 vf)
3280 {
3281   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3282       || DDR_NUM_DIST_VECTS (ddr) == 0)
3283     return false;
3284 
3285   /* If the dependence is exact, we should have limited the VF instead.  */
3286   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3287 
3288   unsigned int i;
3289   lambda_vector dist_v;
3290   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3291     {
3292       HOST_WIDE_INT dist = dist_v[loop_depth];
3293       if (dist != 0
3294 	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3295 	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3296 	return false;
3297     }
3298 
3299   if (dump_enabled_p ())
3300     {
3301       dump_printf_loc (MSG_NOTE, vect_location,
3302 		       "dependence distance between ");
3303       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3304       dump_printf (MSG_NOTE,  " and ");
3305       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3306       dump_printf (MSG_NOTE,  " is >= VF\n");
3307     }
3308 
3309   return true;
3310 }
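
/* For example (illustrative): if the only nonzero distance recorded at
   LOOP_DEPTH is 8, a vectorization factor that is known to be at most 8
   cannot violate the dependence, so the function returns true and the
   corresponding runtime alias check can be dropped; a (possibly runtime)
   VF that might exceed 8 makes it return false.  */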
3311 
3312 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3313 
3314 static void
3315 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3316 {
3317   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3318   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3319   dump_printf (dump_kind, ") >= ");
3320   dump_dec (dump_kind, lower_bound.min_value);
3321 }
3322 
3323 /* Record that the vectorized loop requires the vec_lower_bound described
3324    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3325 
3326 static void
3327 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3328 			poly_uint64 min_value)
3329 {
3330   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3331   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3332     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3333       {
3334 	unsigned_p &= lower_bounds[i].unsigned_p;
3335 	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3336 	if (lower_bounds[i].unsigned_p != unsigned_p
3337 	    || maybe_lt (lower_bounds[i].min_value, min_value))
3338 	  {
3339 	    lower_bounds[i].unsigned_p = unsigned_p;
3340 	    lower_bounds[i].min_value = min_value;
3341 	    if (dump_enabled_p ())
3342 	      {
3343 		dump_printf_loc (MSG_NOTE, vect_location,
3344 				 "updating run-time check to ");
3345 		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3346 		dump_printf (MSG_NOTE, "\n");
3347 	      }
3348 	  }
3349 	return;
3350       }
3351 
3352   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3353   if (dump_enabled_p ())
3354     {
3355       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3356       dump_lower_bound (MSG_NOTE, lower_bound);
3357       dump_printf (MSG_NOTE, "\n");
3358     }
3359   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3360 }
3361 
3362 /* Return true if it's unlikely that the step of the vectorized form of DR
3363    will span fewer than GAP bytes.  */
3364 
3365 static bool
3366 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3367 {
3368   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3369   HOST_WIDE_INT count
3370     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3371   if (GROUP_FIRST_ELEMENT (stmt_info))
3372     count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3373   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3374 }
3375 
3376 /* Return true if we know that there is no alias between DR_A and DR_B
3377    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3378    *LOWER_BOUND_OUT to this N.  */
3379 
3380 static bool
3381 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3382 				poly_uint64 *lower_bound_out)
3383 {
3384   /* Check that there is a constant gap of known sign between DR_A
3385      and DR_B.  */
3386   poly_int64 init_a, init_b;
3387   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3388       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3389       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3390       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3391       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3392       || !ordered_p (init_a, init_b))
3393     return false;
3394 
3395   /* Sort DR_A and DR_B by the address they access.  */
3396   if (maybe_lt (init_b, init_a))
3397     {
3398       std::swap (init_a, init_b);
3399       std::swap (dr_a, dr_b);
3400     }
3401 
3402   /* If the two accesses could be dependent within a scalar iteration,
3403      make sure that we'd retain their order.  */
3404   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3405       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3406     return false;
3407 
3408   /* There is no alias if abs (DR_STEP) is greater than or equal to
3409      the bytes spanned by the combination of the two accesses.  */
3410   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3411   return true;
3412 }
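
/* Worked example (illustrative): for two 4-byte accesses with the same
   base, offset and step and DR_INITs of 0 and 4, the scalar accesses
   cover bytes [0, 4) and [4, 8), so the two references cannot alias as
   long as abs (DR_STEP) >= 8; the function returns true and stores 8 in
   *LOWER_BOUND_OUT.  */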
3413 
3414 /* Function vect_prune_runtime_alias_test_list.
3415 
3416    Prune a list of ddrs to be tested at run-time by versioning for alias.
3417    Merge several alias checks into one if possible.
3418    Return FALSE if the resulting list of ddrs is longer than allowed by
3419    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3420 
3421 bool
3422 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3423 {
3424   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3425   hash_set <tree_pair_hash> compared_objects;
3426 
3427   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3428   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3429     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3430   vec<vec_object_pair> &check_unequal_addrs
3431     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3432   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3433   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3434 
3435   ddr_p ddr;
3436   unsigned int i;
3437   tree length_factor;
3438 
3439   if (dump_enabled_p ())
3440     dump_printf_loc (MSG_NOTE, vect_location,
3441                      "=== vect_prune_runtime_alias_test_list ===\n");
3442 
3443   /* Step values are irrelevant for aliasing if the number of vector
3444      iterations is equal to the number of scalar iterations (which can
3445      happen for fully-SLP loops).  */
3446   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3447 
3448   if (!ignore_step_p)
3449     {
3450       /* Convert the checks for nonzero steps into bound tests.  */
3451       tree value;
3452       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3453 	vect_check_lower_bound (loop_vinfo, value, true, 1);
3454     }
3455 
3456   if (may_alias_ddrs.is_empty ())
3457     return true;
3458 
3459   comp_alias_ddrs.create (may_alias_ddrs.length ());
3460 
3461   unsigned int loop_depth
3462     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3463 			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3464 
3465   /* First, we collect all data ref pairs for aliasing checks.  */
3466   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3467     {
3468       int comp_res;
3469       poly_uint64 lower_bound;
3470       struct data_reference *dr_a, *dr_b;
3471       gimple *dr_group_first_a, *dr_group_first_b;
3472       tree segment_length_a, segment_length_b;
3473       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3474       unsigned int align_a, align_b;
3475       gimple *stmt_a, *stmt_b;
3476 
3477       /* Ignore the alias if the VF we chose ended up being no greater
3478 	 than the dependence distance.  */
3479       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3480 	continue;
3481 
3482       if (DDR_OBJECT_A (ddr))
3483 	{
3484 	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3485 	  if (!compared_objects.add (new_pair))
3486 	    {
3487 	      if (dump_enabled_p ())
3488 		{
3489 		  dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3490 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3491 		  dump_printf (MSG_NOTE, " and ");
3492 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3493 		  dump_printf (MSG_NOTE, " have different addresses\n");
3494 		}
3495 	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3496 	    }
3497 	  continue;
3498 	}
3499 
3500       dr_a = DDR_A (ddr);
3501       stmt_a = DR_STMT (DDR_A (ddr));
3502 
3503       dr_b = DDR_B (ddr);
3504       stmt_b = DR_STMT (DDR_B (ddr));
3505 
3506       /* Skip the pair if inter-iteration dependencies are irrelevant
3507 	 and intra-iteration dependencies are guaranteed to be honored.  */
3508       if (ignore_step_p
3509 	  && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3510 	      || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3511 	{
3512 	  if (dump_enabled_p ())
3513 	    {
3514 	      dump_printf_loc (MSG_NOTE, vect_location,
3515 			       "no need for alias check between ");
3516 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3517 	      dump_printf (MSG_NOTE, " and ");
3518 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3519 	      dump_printf (MSG_NOTE, " when VF is 1\n");
3520 	    }
3521 	  continue;
3522 	}
3523 
3524       /* See whether we can handle the alias using a bounds check on
3525 	 the step, and whether that's likely to be the best approach.
3526 	 (It might not be, for example, if the minimum step is much larger
3527 	 than the number of bytes handled by one vector iteration.)  */
3528       if (!ignore_step_p
3529 	  && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3530 	  && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3531 	  && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3532 	      || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3533 	{
3534 	  bool unsigned_p = dr_known_forward_stride_p (dr_a);
3535 	  if (dump_enabled_p ())
3536 	    {
3537 	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3538 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3539 	      dump_printf (MSG_NOTE, " and ");
3540 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3541 	      dump_printf (MSG_NOTE, " when the step ");
3542 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3543 	      dump_printf (MSG_NOTE, " is outside ");
3544 	      if (unsigned_p)
3545 		dump_printf (MSG_NOTE, "[0");
3546 	      else
3547 		{
3548 		  dump_printf (MSG_NOTE, "(");
3549 		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3550 		}
3551 	      dump_printf (MSG_NOTE, ", ");
3552 	      dump_dec (MSG_NOTE, lower_bound);
3553 	      dump_printf (MSG_NOTE, ")\n");
3554 	    }
3555 	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3556 				  lower_bound);
3557 	  continue;
3558 	}
3559 
3560       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3561       if (dr_group_first_a)
3562 	{
3563 	  stmt_a = dr_group_first_a;
3564 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3565 	}
3566 
3567       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3568       if (dr_group_first_b)
3569 	{
3570 	  stmt_b = dr_group_first_b;
3571 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3572 	}
3573 
3574       if (ignore_step_p)
3575 	{
3576 	  segment_length_a = size_zero_node;
3577 	  segment_length_b = size_zero_node;
3578 	}
3579       else
3580 	{
3581 	  if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3582 	    length_factor = scalar_loop_iters;
3583 	  else
3584 	    length_factor = size_int (vect_factor);
3585 	  segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3586 	  segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3587 	}
3588       access_size_a = vect_vfa_access_size (dr_a);
3589       access_size_b = vect_vfa_access_size (dr_b);
3590       align_a = vect_vfa_align (dr_a);
3591       align_b = vect_vfa_align (dr_b);
3592 
3593       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3594 					DR_BASE_ADDRESS (dr_b));
3595       if (comp_res == 0)
3596 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3597 					  DR_OFFSET (dr_b));
3598 
3599       /* See whether the alias is known at compilation time.  */
3600       if (comp_res == 0
3601 	  && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3602 	  && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3603 	  && poly_int_tree_p (segment_length_a)
3604 	  && poly_int_tree_p (segment_length_b))
3605 	{
3606 	  int res = vect_compile_time_alias (dr_a, dr_b,
3607 					     segment_length_a,
3608 					     segment_length_b,
3609 					     access_size_a,
3610 					     access_size_b);
3611 	  if (res >= 0 && dump_enabled_p ())
3612 	    {
3613 	      dump_printf_loc (MSG_NOTE, vect_location,
3614 			       "can tell at compile time that ");
3615 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3616 	      dump_printf (MSG_NOTE, " and ");
3617 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3618 	      if (res == 0)
3619 		dump_printf (MSG_NOTE, " do not alias\n");
3620 	      else
3621 		dump_printf (MSG_NOTE, " alias\n");
3622 	    }
3623 
3624 	  if (res == 0)
3625 	    continue;
3626 
3627 	  if (res == 1)
3628 	    {
3629 	      if (dump_enabled_p ())
3630 		dump_printf_loc (MSG_NOTE, vect_location,
3631 				 "not vectorized: compilation time alias.\n");
3632 	      return false;
3633 	    }
3634 	}
3635 
3636       dr_with_seg_len_pair_t dr_with_seg_len_pair
3637 	(dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3638 	 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3639 
3640       /* Canonicalize pairs by sorting the two DR members.  */
3641       if (comp_res > 0)
3642 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3643 
3644       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3645     }
3646 
3647   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3648 
3649   unsigned int count = (comp_alias_ddrs.length ()
3650 			+ check_unequal_addrs.length ());
3651 
3652   dump_printf_loc (MSG_NOTE, vect_location,
3653 		   "improved number of alias checks from %d to %d\n",
3654 		   may_alias_ddrs.length (), count);
3655   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3656     {
3657       if (dump_enabled_p ())
3658 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3659 			 "number of versioning for alias "
3660 			 "run-time tests exceeds %d "
3661 			 "(--param vect-max-version-for-alias-checks)\n",
3662 			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3663       return false;
3664     }
3665 
3666   return true;
3667 }
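
/* Illustrative sketch, not part of the original sources: for

     void
     add_one (int *a, const int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   nothing relates A and B at compile time, so the ddr for (a[i], b[i])
   stays in the may-alias list.  The function above turns it into a single
   dr_with_seg_len pair, from which a runtime test that the two segments
   do not overlap is later generated to guard the vectorized loop
   version.  */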
3668 
3669 /* Check whether we can use an internal function for a gather load
3670    or scatter store.  READ_P is true for loads and false for stores.
3671    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3672    the type of the memory elements being loaded or stored.  OFFSET_BITS
3673    is the number of bits in each scalar offset and OFFSET_SIGN is the
3674    sign of the offset.  SCALE is the amount by which the offset should
3675    be multiplied *after* it has been converted to address width.
3676 
3677    Return true if the function is supported, storing the function
3678    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3679 
3680 bool
3681 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3682 			  tree memory_type, unsigned int offset_bits,
3683 			  signop offset_sign, int scale,
3684 			  internal_fn *ifn_out, tree *element_type_out)
3685 {
3686   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3687   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3688   if (offset_bits > element_bits)
3689     /* Internal functions require the offset to be the same width as
3690        the vector elements.  We can extend narrower offsets, but it isn't
3691        safe to truncate wider offsets.  */
3692     return false;
3693 
3694   if (element_bits != memory_bits)
3695     /* For now the vector elements must be the same width as the
3696        memory elements.  */
3697     return false;
3698 
3699   /* Work out which function we need.  */
3700   internal_fn ifn;
3701   if (read_p)
3702     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3703   else
3704     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3705 
3706   /* Test whether the target supports this combination.  */
3707   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3708 					       offset_sign, scale))
3709     return false;
3710 
3711   *ifn_out = ifn;
3712   *element_type_out = TREE_TYPE (vectype);
3713   return true;
3714 }
3715 
3716 /* CALL is a call to an internal gather load or scatter store function.
3717    Describe the operation in INFO.  */
3718 
3719 static void
3720 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3721 {
3722   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3723   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3724   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3725 
3726   info->ifn = gimple_call_internal_fn (call);
3727   info->decl = NULL_TREE;
3728   info->base = gimple_call_arg (call, 0);
3729   info->offset = gimple_call_arg (call, 1);
3730   info->offset_dt = vect_unknown_def_type;
3731   info->offset_vectype = NULL_TREE;
3732   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3733   info->element_type = TREE_TYPE (vectype);
3734   info->memory_type = TREE_TYPE (DR_REF (dr));
3735 }
3736 
3737 /* Return true if a non-affine read or write in STMT is suitable for a
3738    gather load or scatter store.  Describe the operation in *INFO if so.  */
3739 
3740 bool
3741 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3742 			   gather_scatter_info *info)
3743 {
3744   HOST_WIDE_INT scale = 1;
3745   poly_int64 pbitpos, pbitsize;
3746   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3747   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3748   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3749   tree offtype = NULL_TREE;
3750   tree decl = NULL_TREE, base, off;
3751   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3752   tree memory_type = TREE_TYPE (DR_REF (dr));
3753   machine_mode pmode;
3754   int punsignedp, reversep, pvolatilep = 0;
3755   internal_fn ifn;
3756   tree element_type;
3757   bool masked_p = false;
3758 
3759   /* See whether this is already a call to a gather/scatter internal function.
3760      If not, see whether it's a masked load or store.  */
3761   gcall *call = dyn_cast <gcall *> (stmt);
3762   if (call && gimple_call_internal_p (call))
3763     {
3764       ifn = gimple_call_internal_fn (stmt);
3765       if (internal_gather_scatter_fn_p (ifn))
3766 	{
3767 	  vect_describe_gather_scatter_call (call, info);
3768 	  return true;
3769 	}
3770       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3771     }
3772 
3773   /* True if we should aim to use internal functions rather than
3774      built-in functions.  */
3775   bool use_ifn_p = (DR_IS_READ (dr)
3776 		    ? supports_vec_gather_load_p ()
3777 		    : supports_vec_scatter_store_p ());
3778 
3779   base = DR_REF (dr);
3780   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3781      see if we can use the def stmt of the address.  */
3782   if (masked_p
3783       && TREE_CODE (base) == MEM_REF
3784       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3785       && integer_zerop (TREE_OPERAND (base, 1))
3786       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3787     {
3788       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3789       if (is_gimple_assign (def_stmt)
3790 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3791 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3792     }
3793 
3794   /* The gather and scatter builtins need address of the form
3795      loop_invariant + vector * {1, 2, 4, 8}
3796      or
3797      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3798      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3799      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3800      multiplications and additions in it.  To get a vector, we need
3801      a single SSA_NAME that will be defined in the loop and will
3802      contain everything that is not loop invariant and that can be
3803      vectorized.  The following code attempts to find such a preexisting
3804      SSA_NAME OFF and put the loop invariants into a tree BASE
3805      that can be gimplified before the loop.  */
3806   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3807 			      &punsignedp, &reversep, &pvolatilep);
3808   gcc_assert (base && !reversep);
3809   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3810 
3811   if (TREE_CODE (base) == MEM_REF)
3812     {
3813       if (!integer_zerop (TREE_OPERAND (base, 1)))
3814 	{
3815 	  if (off == NULL_TREE)
3816 	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3817 	  else
3818 	    off = size_binop (PLUS_EXPR, off,
3819 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3820 	}
3821       base = TREE_OPERAND (base, 0);
3822     }
3823   else
3824     base = build_fold_addr_expr (base);
3825 
3826   if (off == NULL_TREE)
3827     off = size_zero_node;
3828 
3829   /* If base is not loop invariant, then if off is 0 we start with just
3830      the constant offset in the loop invariant BASE and continue with base
3831      as OFF; otherwise give up.
3832      We could handle the latter case by gimplifying base + off into some
3833      SSA_NAME and using that as off, but for now punt.  */
3834   if (!expr_invariant_in_loop_p (loop, base))
3835     {
3836       if (!integer_zerop (off))
3837 	return false;
3838       off = base;
3839       base = size_int (pbytepos);
3840     }
3841   /* Otherwise put base + constant offset into the loop invariant BASE
3842      and continue with OFF.  */
3843   else
3844     {
3845       base = fold_convert (sizetype, base);
3846       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3847     }
3848 
3849   /* OFF at this point may be either a SSA_NAME or some tree expression
3850      from get_inner_reference.  Try to peel off loop invariants from it
3851      into BASE as long as possible.  */
3852   STRIP_NOPS (off);
3853   while (offtype == NULL_TREE)
3854     {
3855       enum tree_code code;
3856       tree op0, op1, add = NULL_TREE;
3857 
3858       if (TREE_CODE (off) == SSA_NAME)
3859 	{
3860 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3861 
3862 	  if (expr_invariant_in_loop_p (loop, off))
3863 	    return false;
3864 
3865 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3866 	    break;
3867 
3868 	  op0 = gimple_assign_rhs1 (def_stmt);
3869 	  code = gimple_assign_rhs_code (def_stmt);
3870 	  op1 = gimple_assign_rhs2 (def_stmt);
3871 	}
3872       else
3873 	{
3874 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3875 	    return false;
3876 	  code = TREE_CODE (off);
3877 	  extract_ops_from_tree (off, &code, &op0, &op1);
3878 	}
3879       switch (code)
3880 	{
3881 	case POINTER_PLUS_EXPR:
3882 	case PLUS_EXPR:
3883 	  if (expr_invariant_in_loop_p (loop, op0))
3884 	    {
3885 	      add = op0;
3886 	      off = op1;
3887 	    do_add:
3888 	      add = fold_convert (sizetype, add);
3889 	      if (scale != 1)
3890 		add = size_binop (MULT_EXPR, add, size_int (scale));
3891 	      base = size_binop (PLUS_EXPR, base, add);
3892 	      continue;
3893 	    }
3894 	  if (expr_invariant_in_loop_p (loop, op1))
3895 	    {
3896 	      add = op1;
3897 	      off = op0;
3898 	      goto do_add;
3899 	    }
3900 	  break;
3901 	case MINUS_EXPR:
3902 	  if (expr_invariant_in_loop_p (loop, op1))
3903 	    {
3904 	      add = fold_convert (sizetype, op1);
3905 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3906 	      off = op0;
3907 	      goto do_add;
3908 	    }
3909 	  break;
3910 	case MULT_EXPR:
3911 	  if (scale == 1 && tree_fits_shwi_p (op1))
3912 	    {
3913 	      int new_scale = tree_to_shwi (op1);
3914 	      /* Only treat this as a scaling operation if the target
3915 		 supports it.  */
3916 	      if (use_ifn_p
3917 		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3918 						vectype, memory_type, 1,
3919 						TYPE_SIGN (TREE_TYPE (op0)),
3920 						new_scale, &ifn,
3921 						&element_type))
3922 		break;
3923 	      scale = new_scale;
3924 	      off = op0;
3925 	      continue;
3926 	    }
3927 	  break;
3928 	case SSA_NAME:
3929 	  off = op0;
3930 	  continue;
3931 	CASE_CONVERT:
3932 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3933 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3934 	    break;
3935 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3936 	      == TYPE_PRECISION (TREE_TYPE (off)))
3937 	    {
3938 	      off = op0;
3939 	      continue;
3940 	    }
3941 
3942 	  /* The internal functions need the offset to be the same width
3943 	     as the elements of VECTYPE.  Don't include operations that
3944 	     cast the offset from that width to a different width.  */
3945 	  if (use_ifn_p
3946 	      && (int_size_in_bytes (TREE_TYPE (vectype))
3947 		  == int_size_in_bytes (TREE_TYPE (off))))
3948 	    break;
3949 
3950 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3951 	      < TYPE_PRECISION (TREE_TYPE (off)))
3952 	    {
3953 	      off = op0;
3954 	      offtype = TREE_TYPE (off);
3955 	      STRIP_NOPS (off);
3956 	      continue;
3957 	    }
3958 	  break;
3959 	default:
3960 	  break;
3961 	}
3962       break;
3963     }
3964 
3965   /* If at the end OFF still isn't a SSA_NAME or isn't
3966      defined in the loop, punt.  */
3967   if (TREE_CODE (off) != SSA_NAME
3968       || expr_invariant_in_loop_p (loop, off))
3969     return false;
3970 
3971   if (offtype == NULL_TREE)
3972     offtype = TREE_TYPE (off);
3973 
3974   if (use_ifn_p)
3975     {
3976       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3977 				     memory_type, TYPE_PRECISION (offtype),
3978 				     TYPE_SIGN (offtype), scale, &ifn,
3979 				     &element_type))
3980 	return false;
3981     }
3982   else
3983     {
3984       if (DR_IS_READ (dr))
3985 	{
3986 	  if (targetm.vectorize.builtin_gather)
3987 	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3988 	}
3989       else
3990 	{
3991 	  if (targetm.vectorize.builtin_scatter)
3992 	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3993 	}
3994 
3995       if (!decl)
3996 	return false;
3997 
3998       ifn = IFN_LAST;
3999       element_type = TREE_TYPE (vectype);
4000     }
4001 
4002   info->ifn = ifn;
4003   info->decl = decl;
4004   info->base = base;
4005   info->offset = off;
4006   info->offset_dt = vect_unknown_def_type;
4007   info->offset_vectype = NULL_TREE;
4008   info->scale = scale;
4009   info->element_type = element_type;
4010   info->memory_type = memory_type;
4011   return true;
4012 }
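
/* Illustrative sketch, not part of the original sources: for

     void
     gather_copy (float *restrict dst, const float *src, const int *idx, int n)
     {
       for (int i = 0; i < n; i++)
         dst[i] = src[idx[i]];
     }

   the read from src[idx[i]] has no affine step, so on targets with gather
   support the analysis above describes it with BASE == src (loop
   invariant), OFF == the SSA name holding the value of idx[i] and
   SCALE == sizeof (float), which is the shape a gather load with a
   32-bit offset vector expects.  */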
4013 
4014 /* Function vect_analyze_data_refs.
4015 
4016   Find all the data references in the loop or basic block.
4017 
4018    The general structure of the analysis of data refs in the vectorizer is as
4019    follows:
4020    1- vect_analyze_data_refs(loop/bb): call
4021       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4022       in the loop/bb and their dependences.
4023    2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
4024    3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
4025    4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
4026 
4027 */
4028 
4029 bool
4030 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4031 {
4032   struct loop *loop = NULL;
4033   unsigned int i;
4034   struct data_reference *dr;
4035   tree scalar_type;
4036 
4037   if (dump_enabled_p ())
4038     dump_printf_loc (MSG_NOTE, vect_location,
4039 		     "=== vect_analyze_data_refs ===\n");
4040 
4041   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4042     loop = LOOP_VINFO_LOOP (loop_vinfo);
4043 
4044   /* Go through the data-refs, check that the analysis succeeded.  Update
4045      pointer from stmt_vec_info struct to DR and vectype.  */
4046 
4047   vec<data_reference_p> datarefs = vinfo->datarefs;
4048   FOR_EACH_VEC_ELT (datarefs, i, dr)
4049     {
4050       gimple *stmt;
4051       stmt_vec_info stmt_info;
4052       tree base, offset, init;
4053       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4054       bool simd_lane_access = false;
4055       poly_uint64 vf;
4056 
4057 again:
4058       if (!dr || !DR_REF (dr))
4059         {
4060           if (dump_enabled_p ())
4061 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4062 	                     "not vectorized: unhandled data-ref\n");
4063           return false;
4064         }
4065 
4066       stmt = DR_STMT (dr);
4067       stmt_info = vinfo_for_stmt (stmt);
4068 
4069       /* Discard clobbers from the dataref vector.  We will remove
4070          clobber stmts during vectorization.  */
4071       if (gimple_clobber_p (stmt))
4072 	{
4073 	  free_data_ref (dr);
4074 	  if (i == datarefs.length () - 1)
4075 	    {
4076 	      datarefs.pop ();
4077 	      break;
4078 	    }
4079 	  datarefs.ordered_remove (i);
4080 	  dr = datarefs[i];
4081 	  goto again;
4082 	}
4083 
4084       /* Check that analysis of the data-ref succeeded.  */
4085       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4086 	  || !DR_STEP (dr))
4087         {
4088 	  bool maybe_gather
4089 	    = DR_IS_READ (dr)
4090 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4091 	      && (targetm.vectorize.builtin_gather != NULL
4092 		  || supports_vec_gather_load_p ());
4093 	  bool maybe_scatter
4094 	    = DR_IS_WRITE (dr)
4095 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4096 	      && (targetm.vectorize.builtin_scatter != NULL
4097 		  || supports_vec_scatter_store_p ());
4098 	  bool maybe_simd_lane_access
4099 	    = is_a <loop_vec_info> (vinfo) && loop->simduid;
4100 
4101 	  /* If the target supports vector gather loads or scatter stores, or
4102 	     if this might be a SIMD lane access, see whether one can be used.  */
4103 	  if (is_a <loop_vec_info> (vinfo)
4104 	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4105 	      && !nested_in_vect_loop_p (loop, stmt))
4106 	    {
4107 	      struct data_reference *newdr
4108 		= create_data_ref (NULL, loop_containing_stmt (stmt),
4109 				   DR_REF (dr), stmt, !maybe_scatter,
4110 				   DR_IS_CONDITIONAL_IN_STMT (dr));
4111 	      gcc_assert (newdr != NULL && DR_REF (newdr));
4112 	      if (DR_BASE_ADDRESS (newdr)
4113 		  && DR_OFFSET (newdr)
4114 		  && DR_INIT (newdr)
4115 		  && DR_STEP (newdr)
4116 		  && integer_zerop (DR_STEP (newdr)))
4117 		{
4118 		  if (maybe_simd_lane_access)
4119 		    {
4120 		      tree off = DR_OFFSET (newdr);
4121 		      STRIP_NOPS (off);
4122 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4123 			  && TREE_CODE (off) == MULT_EXPR
4124 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4125 			{
4126 			  tree step = TREE_OPERAND (off, 1);
4127 			  off = TREE_OPERAND (off, 0);
4128 			  STRIP_NOPS (off);
4129 			  if (CONVERT_EXPR_P (off)
4130 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4131 									  0)))
4132 				 < TYPE_PRECISION (TREE_TYPE (off)))
4133 			    off = TREE_OPERAND (off, 0);
4134 			  if (TREE_CODE (off) == SSA_NAME)
4135 			    {
4136 			      gimple *def = SSA_NAME_DEF_STMT (off);
4137 			      tree reft = TREE_TYPE (DR_REF (newdr));
4138 			      if (is_gimple_call (def)
4139 				  && gimple_call_internal_p (def)
4140 				  && (gimple_call_internal_fn (def)
4141 				      == IFN_GOMP_SIMD_LANE))
4142 				{
4143 				  tree arg = gimple_call_arg (def, 0);
4144 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
4145 				  arg = SSA_NAME_VAR (arg);
4146 				  if (arg == loop->simduid
4147 				      /* For now.  */
4148 				      && tree_int_cst_equal
4149 					   (TYPE_SIZE_UNIT (reft),
4150 					    step))
4151 				    {
4152 				      DR_OFFSET (newdr) = ssize_int (0);
4153 				      DR_STEP (newdr) = step;
4154 				      DR_OFFSET_ALIGNMENT (newdr)
4155 					= BIGGEST_ALIGNMENT;
4156 				      DR_STEP_ALIGNMENT (newdr)
4157 					= highest_pow2_factor (step);
4158 				      dr = newdr;
4159 				      simd_lane_access = true;
4160 				    }
4161 				}
4162 			    }
4163 			}
4164 		    }
4165 		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
4166 		    {
4167 		      dr = newdr;
4168 		      if (maybe_gather)
4169 			gatherscatter = GATHER;
4170 		      else
4171 			gatherscatter = SCATTER;
4172 		    }
4173 		}
4174 	      if (gatherscatter == SG_NONE && !simd_lane_access)
4175 		free_data_ref (newdr);
4176 	    }
4177 
4178 	  if (gatherscatter == SG_NONE && !simd_lane_access)
4179 	    {
4180 	      if (dump_enabled_p ())
4181 		{
4182 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183                                    "not vectorized: data ref analysis "
4184                                    "failed ");
4185 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4186 		}
4187 
4188 	      if (is_a <bb_vec_info> (vinfo))
4189 		break;
4190 
4191 	      return false;
4192 	    }
4193         }
4194 
4195       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4196         {
4197           if (dump_enabled_p ())
4198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199                              "not vectorized: base addr of dr is a "
4200                              "constant\n");
4201 
4202           if (is_a <bb_vec_info> (vinfo))
4203 	    break;
4204 
4205 	  if (gatherscatter != SG_NONE || simd_lane_access)
4206 	    free_data_ref (dr);
4207 	  return false;
4208         }
4209 
4210       if (TREE_THIS_VOLATILE (DR_REF (dr)))
4211         {
4212           if (dump_enabled_p ())
4213             {
4214               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4215                                "not vectorized: volatile type ");
4216               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4217             }
4218 
4219           if (is_a <bb_vec_info> (vinfo))
4220 	    break;
4221 
4222           return false;
4223         }
4224 
4225       if (stmt_can_throw_internal (stmt))
4226         {
4227           if (dump_enabled_p ())
4228             {
4229               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4230                                "not vectorized: statement can throw an "
4231                                "exception ");
4232               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4233             }
4234 
4235           if (is_a <bb_vec_info> (vinfo))
4236 	    break;
4237 
4238 	  if (gatherscatter != SG_NONE || simd_lane_access)
4239 	    free_data_ref (dr);
4240           return false;
4241         }
4242 
4243       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4244 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4245 	{
4246           if (dump_enabled_p ())
4247             {
4248               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4249                                "not vectorized: statement is bitfield "
4250                                "access ");
4251               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4252             }
4253 
4254           if (is_a <bb_vec_info> (vinfo))
4255 	    break;
4256 
4257 	  if (gatherscatter != SG_NONE || simd_lane_access)
4258 	    free_data_ref (dr);
4259           return false;
4260 	}
4261 
4262       base = unshare_expr (DR_BASE_ADDRESS (dr));
4263       offset = unshare_expr (DR_OFFSET (dr));
4264       init = unshare_expr (DR_INIT (dr));
4265 
4266       if (is_gimple_call (stmt)
4267 	  && (!gimple_call_internal_p (stmt)
4268 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4269 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4270 	{
4271 	  if (dump_enabled_p ())
4272 	    {
4273 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4274 	                       "not vectorized: dr in a call ");
4275 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4276 	    }
4277 
4278 	  if (is_a <bb_vec_info> (vinfo))
4279 	    break;
4280 
4281 	  if (gatherscatter != SG_NONE || simd_lane_access)
4282 	    free_data_ref (dr);
4283 	  return false;
4284 	}
4285 
4286       /* Update DR field in stmt_vec_info struct.  */
4287 
4288       /* If the dataref is in an inner-loop of the loop that is considered
4289 	 for vectorization, we also want to analyze the access relative to
4290 	 the outer-loop (DR contains information only relative to the
4291 	 inner-most enclosing loop).  We do that by building a reference to the
4292 	 first location accessed by the inner-loop, and analyze it relative to
4293 	 the outer-loop.  */
4294       if (loop && nested_in_vect_loop_p (loop, stmt))
4295 	{
4296 	  /* Build a reference to the first location accessed by the
4297 	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4298 	     this address must be invariant in the inner loop, so we
4299 	     can consider it as being used in the outer loop.  */
4300 	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4301 					  init, offset);
4302 	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4303 	  tree init_ref = build_fold_indirect_ref (init_addr);
4304 
4305 	  if (dump_enabled_p ())
4306 	    {
4307 	      dump_printf_loc (MSG_NOTE, vect_location,
4308                                "analyze in outer loop: ");
4309 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4310 	      dump_printf (MSG_NOTE, "\n");
4311 	    }
4312 
4313 	  if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4314 				     init_ref, loop))
4315 	    /* dr_analyze_innermost already explained the failure.  */
4316 	    return false;
4317 
4318           if (dump_enabled_p ())
4319 	    {
4320 	      dump_printf_loc (MSG_NOTE, vect_location,
4321                                "\touter base_address: ");
4322 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4323                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4324 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4325 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4326                                  STMT_VINFO_DR_OFFSET (stmt_info));
4327 	      dump_printf (MSG_NOTE,
4328                            "\n\touter constant offset from base address: ");
4329 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4330                                  STMT_VINFO_DR_INIT (stmt_info));
4331 	      dump_printf (MSG_NOTE, "\n\touter step: ");
4332 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4333                                  STMT_VINFO_DR_STEP (stmt_info));
4334 	      dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4335 			   STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4336 	      dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4337 			   STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4338 	      dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4339 			   STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4340 	      dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4341 			   STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4342 	    }
4343 	}
4344 
4345       if (STMT_VINFO_DATA_REF (stmt_info))
4346         {
4347           if (dump_enabled_p ())
4348             {
4349               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4350                                "not vectorized: more than one data ref "
4351                                "in stmt: ");
4352               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4353             }
4354 
4355           if (is_a <bb_vec_info> (vinfo))
4356 	    break;
4357 
4358 	  if (gatherscatter != SG_NONE || simd_lane_access)
4359 	    free_data_ref (dr);
4360           return false;
4361         }
4362 
4363       STMT_VINFO_DATA_REF (stmt_info) = dr;
4364       if (simd_lane_access)
4365 	{
4366 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4367 	  free_data_ref (datarefs[i]);
4368 	  datarefs[i] = dr;
4369 	}
4370 
4371       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4372 	  && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4373 	  && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4374 	{
4375           if (dump_enabled_p ())
4376             {
4377               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4378                                "not vectorized: base object not addressable "
4379 			       "for stmt: ");
4380               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4381             }
4382           if (is_a <bb_vec_info> (vinfo))
4383 	    {
4384 	      /* In BB vectorization the ref can still participate
4385 	         in dependence analysis, we just can't vectorize it.  */
4386 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4387 	      continue;
4388 	    }
4389 	  return false;
4390 	}
4391 
4392       /* Set vectype for STMT.  */
4393       scalar_type = TREE_TYPE (DR_REF (dr));
4394       STMT_VINFO_VECTYPE (stmt_info)
4395 	= get_vectype_for_scalar_type (scalar_type);
4396       if (!STMT_VINFO_VECTYPE (stmt_info))
4397         {
4398           if (dump_enabled_p ())
4399             {
4400               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4401                                "not vectorized: no vectype for stmt: ");
4402               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4403               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4404               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4405                                  scalar_type);
4406               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4407             }
4408 
4409           if (is_a <bb_vec_info> (vinfo))
4410 	    {
4411 	      /* No vector type is fine, the ref can still participate
4412 	         in dependence analysis, we just can't vectorize it.  */
4413 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4414 	      continue;
4415 	    }
4416 
4417 	  if (gatherscatter != SG_NONE || simd_lane_access)
4418 	    {
4419 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4420 	      if (gatherscatter != SG_NONE)
4421 		free_data_ref (dr);
4422 	    }
4423 	  return false;
4424         }
4425       else
4426 	{
4427 	  if (dump_enabled_p ())
4428 	    {
4429 	      dump_printf_loc (MSG_NOTE, vect_location,
4430 			       "got vectype for stmt: ");
4431 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4432 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4433 				 STMT_VINFO_VECTYPE (stmt_info));
4434 	      dump_printf (MSG_NOTE, "\n");
4435 	    }
4436 	}
4437 
4438       /* Adjust the minimal vectorization factor according to the
4439 	 vector type.  */
4440       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4441       *min_vf = upper_bound (*min_vf, vf);
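      /* E.g. (illustration only) a loop that accesses both shorts (V8HI)
         and ints (V4SI) on a 128-bit target ends up with *MIN_VF == 8,
         since the vectorization factor must be at least the number of
         elements of every vector type used.  */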
4442 
4443       if (gatherscatter != SG_NONE)
4444 	{
4445 	  gather_scatter_info gs_info;
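          /* The target must support the gather/scatter operation itself
             and we must be able to find a vector type for its offset.  */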
4446 	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4447 					  &gs_info)
4448 	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4449 	    {
4450 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4451 	      free_data_ref (dr);
4452 	      if (dump_enabled_p ())
4453 		{
4454 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4455 				   (gatherscatter == GATHER) ?
4456 				   "not vectorized: not suitable for gather "
4457 				   "load " :
4458 				   "not vectorized: not suitable for scatter "
4459 				   "store ");
4460 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4461 		}
4462 	      return false;
4463 	    }
4464 
4465 	  free_data_ref (datarefs[i]);
4466 	  datarefs[i] = dr;
4467 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4468 	}
4469 
4470       else if (is_a <loop_vec_info> (vinfo)
4471 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4472 	{
4473 	  if (nested_in_vect_loop_p (loop, stmt))
4474 	    {
4475 	      if (dump_enabled_p ())
4476 		{
4477 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4478                                    "not vectorized: not suitable for strided "
4479                                    "load ");
4480 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4481 		}
4482 	      return false;
4483 	    }
4484 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4485 	}
4486     }
4487 
4488   /* If we stopped the analysis at the first dataref we could not analyze
4489      when trying to vectorize a basic block, mark the rest of the datarefs
4490      as not vectorizable and truncate the vector of datarefs.  That
4491      avoids spending useless time in analyzing their dependence.  */
4492   if (i != datarefs.length ())
4493     {
4494       gcc_assert (is_a <bb_vec_info> (vinfo));
4495       for (unsigned j = i; j < datarefs.length (); ++j)
4496 	{
4497 	  data_reference_p dr = datarefs[j];
4498           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4499 	  free_data_ref (dr);
4500 	}
4501       datarefs.truncate (i);
4502     }
4503 
4504   return true;
4505 }
4506 
4507 
4508 /* Function vect_get_new_vect_var.
4509 
4510    Returns a name for a new variable.  The current naming scheme uses a
4511    prefix that depends on VAR_KIND ("vect", "stmp", "mask" or "vectp")
4512    for vectorizer generated variables, and appends "_" and NAME to the
4513    prefix if NAME is provided.  */
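/* For example (illustration only), a call such as
     vect_get_new_vect_var (vectype, vect_simple_var, "x")
   creates a temporary named "vect_x", while a NULL NAME yields a temporary
   named just "vect".  */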
4514 
4515 tree
4516 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4517 {
4518   const char *prefix;
4519   tree new_vect_var;
4520 
4521   switch (var_kind)
4522   {
4523   case vect_simple_var:
4524     prefix = "vect";
4525     break;
4526   case vect_scalar_var:
4527     prefix = "stmp";
4528     break;
4529   case vect_mask_var:
4530     prefix = "mask";
4531     break;
4532   case vect_pointer_var:
4533     prefix = "vectp";
4534     break;
4535   default:
4536     gcc_unreachable ();
4537   }
4538 
4539   if (name)
4540     {
4541       char* tmp = concat (prefix, "_", name, NULL);
4542       new_vect_var = create_tmp_reg (type, tmp);
4543       free (tmp);
4544     }
4545   else
4546     new_vect_var = create_tmp_reg (type, prefix);
4547 
4548   return new_vect_var;
4549 }
4550 
4551 /* Like vect_get_new_vect_var but return an SSA name.  */
4552 
4553 tree
4554 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4555 {
4556   const char *prefix;
4557   tree new_vect_var;
4558 
4559   switch (var_kind)
4560   {
4561   case vect_simple_var:
4562     prefix = "vect";
4563     break;
4564   case vect_scalar_var:
4565     prefix = "stmp";
4566     break;
4567   case vect_pointer_var:
4568     prefix = "vectp";
4569     break;
4570   default:
4571     gcc_unreachable ();
4572   }
4573 
4574   if (name)
4575     {
4576       char* tmp = concat (prefix, "_", name, NULL);
4577       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4578       free (tmp);
4579     }
4580   else
4581     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4582 
4583   return new_vect_var;
4584 }
4585 
4586 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4587 
4588 static void
4589 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4590 {
4591   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4592   int misalign = DR_MISALIGNMENT (dr);
4593   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4594     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4595   else
4596     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4597 			    DR_TARGET_ALIGNMENT (dr), misalign);
4598 }
4599 
4600 /* Function vect_create_addr_base_for_vector_ref.
4601 
4602    Create an expression that computes the address of the first memory location
4603    that will be accessed for a data reference.
4604 
4605    Input:
4606    STMT: The statement containing the data reference.
4607    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4608    OFFSET: Optional.  If supplied, it is added to the initial address.
4609    LOOP:    Specify relative to which loop-nest should the address be computed.
4610             For example, when the dataref is in an inner-loop nested in an
4611 	    outer-loop that is now being vectorized, LOOP can be either the
4612 	    outer-loop, or the inner-loop.  The first memory location accessed
4613 	    by the following dataref ('in' points to short):
4614 
4615 		for (i=0; i<N; i++)
4616 		   for (j=0; j<M; j++)
4617 		     s += in[i+j]
4618 
4619 	    is as follows:
4620 	    if LOOP=i_loop:	&in		(relative to i_loop)
4621 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4622    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4623 	    initial address.  Unlike OFFSET, which is number of elements to
4624 	    be added, BYTE_OFFSET is measured in bytes.
4625 
4626    Output:
4627    1. Return an SSA_NAME whose value is the address of the memory location of
4628       the first vector of the data reference.
4629    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4630       these statement(s) which define the returned SSA_NAME.
4631 
4632    FORNOW: We are only handling array accesses with step 1.  */
4633 
4634 tree
4635 vect_create_addr_base_for_vector_ref (gimple *stmt,
4636 				      gimple_seq *new_stmt_list,
4637 				      tree offset,
4638 				      tree byte_offset)
4639 {
4640   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4641   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4642   const char *base_name;
4643   tree addr_base;
4644   tree dest;
4645   gimple_seq seq = NULL;
4646   tree vect_ptr_type;
4647   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4648   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4649   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4650 
4651   tree data_ref_base = unshare_expr (drb->base_address);
4652   tree base_offset = unshare_expr (drb->offset);
4653   tree init = unshare_expr (drb->init);
4654 
4655   if (loop_vinfo)
4656     base_name = get_name (data_ref_base);
4657   else
4658     {
4659       base_offset = ssize_int (0);
4660       init = ssize_int (0);
4661       base_name = get_name (DR_REF (dr));
4662     }
4663 
4664   /* Create base_offset */
4665   base_offset = size_binop (PLUS_EXPR,
4666 			    fold_convert (sizetype, base_offset),
4667 			    fold_convert (sizetype, init));
4668 
4669   if (offset)
4670     {
4671       offset = fold_build2 (MULT_EXPR, sizetype,
4672 			    fold_convert (sizetype, offset), step);
4673       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4674 				 base_offset, offset);
4675     }
4676   if (byte_offset)
4677     {
4678       byte_offset = fold_convert (sizetype, byte_offset);
4679       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4680 				 base_offset, byte_offset);
4681     }
4682 
4683   /* base + base_offset */
4684   if (loop_vinfo)
4685     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4686   else
4687     {
4688       addr_base = build1 (ADDR_EXPR,
4689 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4690 			  unshare_expr (DR_REF (dr)));
4691     }
4692 
4693   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4694   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4695   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4696   gimple_seq_add_seq (new_stmt_list, seq);
4697 
4698   if (DR_PTR_INFO (dr)
4699       && TREE_CODE (addr_base) == SSA_NAME
4700       && !SSA_NAME_PTR_INFO (addr_base))
4701     {
4702       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
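      /* Adding OFFSET or BYTE_OFFSET may change the known alignment of
         the address, so conservatively drop the copied alignment info.  */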
4703       if (offset || byte_offset)
4704 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4705     }
4706 
4707   if (dump_enabled_p ())
4708     {
4709       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4710       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4711       dump_printf (MSG_NOTE, "\n");
4712     }
4713 
4714   return addr_base;
4715 }
4716 
4717 
4718 /* Function vect_create_data_ref_ptr.
4719 
4720    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4721    location accessed in the loop by STMT, along with the def-use update
4722    chain to appropriately advance the pointer through the loop iterations.
4723    Also set aliasing information for the pointer.  This pointer is used by
4724    the callers to this function to create a memory reference expression for
4725    vector load/store access.
4726 
4727    Input:
4728    1. STMT: a stmt that references memory. Expected to be of the form
4729          GIMPLE_ASSIGN <name, data-ref> or
4730 	 GIMPLE_ASSIGN <data-ref, name>.
4731    2. AGGR_TYPE: the type of the reference, which should be either a vector
4732         or an array.
4733    3. AT_LOOP: the loop where the vector memref is to be created.
4734    4. OFFSET (optional): an offset to be added to the initial address accessed
4735         by the data-ref in STMT.
4736    5. BSI: location where the new stmts are to be placed if there is no loop
4737    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4738         pointing to the initial address.
4739    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4740 	to the initial address accessed by the data-ref in STMT.  This is
4741 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4742 	in bytes.
4743    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4744 	to the IV during each iteration of the loop.  NULL says to move
4745 	by one copy of AGGR_TYPE up or down, depending on the step of the
4746 	data reference.
4747 
4748    Output:
4749    1. Declare a new ptr to vector_type, and have it point to the base of the
4750       data reference (initial address accessed by the data reference).
4751       For example, for vector of type V8HI, the following code is generated:
4752 
4753       v8hi *ap;
4754       ap = (v8hi *)initial_address;
4755 
4756       if OFFSET is not supplied:
4757          initial_address = &a[init];
4758       if OFFSET is supplied:
4759          initial_address = &a[init + OFFSET];
4760       if BYTE_OFFSET is supplied:
4761 	 initial_address = &a[init] + BYTE_OFFSET;
4762 
4763       Return the initial_address in INITIAL_ADDRESS.
4764 
4765    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4766       update the pointer in each iteration of the loop.
4767 
4768       Return the increment stmt that updates the pointer in PTR_INCR.
4769 
4770    3. Set INV_P to true if the access pattern of the data reference in the
4771       vectorized loop is invariant.  Set it to false otherwise.
4772 
4773    4. Return the pointer.  */
4774 
4775 tree
4776 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4777 			  tree offset, tree *initial_address,
4778 			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
4779 			  bool only_init, bool *inv_p, tree byte_offset,
4780 			  tree iv_step)
4781 {
4782   const char *base_name;
4783   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4784   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4785   struct loop *loop = NULL;
4786   bool nested_in_vect_loop = false;
4787   struct loop *containing_loop = NULL;
4788   tree aggr_ptr_type;
4789   tree aggr_ptr;
4790   tree new_temp;
4791   gimple_seq new_stmt_list = NULL;
4792   edge pe = NULL;
4793   basic_block new_bb;
4794   tree aggr_ptr_init;
4795   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4796   tree aptr;
4797   gimple_stmt_iterator incr_gsi;
4798   bool insert_after;
4799   tree indx_before_incr, indx_after_incr;
4800   gimple *incr;
4801   tree step;
4802   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4803 
4804   gcc_assert (iv_step != NULL_TREE
4805 	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4806 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4807 
4808   if (loop_vinfo)
4809     {
4810       loop = LOOP_VINFO_LOOP (loop_vinfo);
4811       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4812       containing_loop = (gimple_bb (stmt))->loop_father;
4813       pe = loop_preheader_edge (loop);
4814     }
4815   else
4816     {
4817       gcc_assert (bb_vinfo);
4818       only_init = true;
4819       *ptr_incr = NULL;
4820     }
4821 
4822   /* Check the step (evolution) of the load in LOOP, and record
4823      whether it's invariant.  */
4824   step = vect_dr_behavior (dr)->step;
4825   if (integer_zerop (step))
4826     *inv_p = true;
4827   else
4828     *inv_p = false;
4829 
4830   /* Create an expression for the first address accessed by this load
4831      in LOOP.  */
4832   base_name = get_name (DR_BASE_ADDRESS (dr));
4833 
4834   if (dump_enabled_p ())
4835     {
4836       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4837       dump_printf_loc (MSG_NOTE, vect_location,
4838                        "create %s-pointer variable to type: ",
4839 		       get_tree_code_name (TREE_CODE (aggr_type)));
4840       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4841       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4842         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4843       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4844         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4845       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4846         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4847       else
4848         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4849       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4850       dump_printf (MSG_NOTE, "\n");
4851     }
4852 
4853   /* (1) Create the new aggregate-pointer variable.
4854      Vector and array types inherit the alias set of their component
4855      type by default so we need to use a ref-all pointer if the data
4856      reference does not conflict with the created aggregated data
4857      reference because it is not addressable.  */
4858   bool need_ref_all = false;
4859   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4860 			      get_alias_set (DR_REF (dr))))
4861     need_ref_all = true;
4862   /* Likewise for any of the data references in the stmt group.  */
4863   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4864     {
4865       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4866       do
4867 	{
4868 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4869 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4870 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4871 				      get_alias_set (DR_REF (sdr))))
4872 	    {
4873 	      need_ref_all = true;
4874 	      break;
4875 	    }
4876 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4877 	}
4878       while (orig_stmt);
4879     }
4880   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4881 					       need_ref_all);
4882   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4883 
4884 
4885   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4886      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4887      def-use update cycles for the pointer: one relative to the outer-loop
4888      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4889      to the inner-loop (which is the inner-most loop containing the dataref),
4890      and this is done by step (5) below.
4891 
4892      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4893      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4894      redundant.  Steps (3),(4) create the following:
4895 
4896 	vp0 = &base_addr;
4897 	LOOP:	vp1 = phi(vp0,vp2)
4898 		...
4899 		...
4900 		vp2 = vp1 + step
4901 		goto LOOP
4902 
4903      If there is an inner-loop nested in loop, then step (5) will also be
4904      applied, and an additional update in the inner-loop will be created:
4905 
4906 	vp0 = &base_addr;
4907 	LOOP:   vp1 = phi(vp0,vp2)
4908 		...
4909         inner:     vp3 = phi(vp1,vp4)
4910 	           vp4 = vp3 + inner_step
4911 	           if () goto inner
4912 		...
4913 		vp2 = vp1 + step
4914 		if () goto LOOP   */
4915 
4916   /* (2) Calculate the initial address of the aggregate-pointer, and set
4917      the aggregate-pointer to point to it before the loop.  */
4918 
4919   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4920 
4921   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4922 						   offset, byte_offset);
4923   if (new_stmt_list)
4924     {
4925       if (pe)
4926         {
4927           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4928           gcc_assert (!new_bb);
4929         }
4930       else
4931         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4932     }
4933 
4934   *initial_address = new_temp;
4935   aggr_ptr_init = new_temp;
4936 
4937   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4938      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4939      inner-loop nested in LOOP (during outer-loop vectorization).  */
4940 
4941   /* No update in loop is required.  */
4942   if (only_init && (!loop_vinfo || at_loop == loop))
4943     aptr = aggr_ptr_init;
4944   else
4945     {
4946       if (iv_step == NULL_TREE)
4947 	{
4948 	  /* The step of the aggregate pointer is the type size.  */
4949 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
4950 	  /* One exception to the above is when the scalar step of the load in
4951 	     LOOP is zero. In this case the step here is also zero.  */
4952 	  if (*inv_p)
4953 	    iv_step = size_zero_node;
4954 	  else if (tree_int_cst_sgn (step) == -1)
4955 	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
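	  /* For example (illustration only), a V4SI aggregate type gives
	     an IV_STEP of 16 bytes here, negated when the data reference
	     is walked with a negative step.  */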
4956 	}
4957 
4958       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4959 
4960       create_iv (aggr_ptr_init,
4961 		 fold_convert (aggr_ptr_type, iv_step),
4962 		 aggr_ptr, loop, &incr_gsi, insert_after,
4963 		 &indx_before_incr, &indx_after_incr);
4964       incr = gsi_stmt (incr_gsi);
4965       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4966 
4967       /* Copy the points-to information if it exists. */
4968       if (DR_PTR_INFO (dr))
4969 	{
4970 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4971 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4972 	}
4973       if (ptr_incr)
4974 	*ptr_incr = incr;
4975 
4976       aptr = indx_before_incr;
4977     }
4978 
4979   if (!nested_in_vect_loop || only_init)
4980     return aptr;
4981 
4982 
4983   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4984      nested in LOOP, if it exists.  */
4985 
4986   gcc_assert (nested_in_vect_loop);
4987   if (!only_init)
4988     {
4989       standard_iv_increment_position (containing_loop, &incr_gsi,
4990 				      &insert_after);
4991       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4992 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4993 		 &indx_after_incr);
4994       incr = gsi_stmt (incr_gsi);
4995       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4996 
4997       /* Copy the points-to information if it exists. */
4998       if (DR_PTR_INFO (dr))
4999 	{
5000 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
5001 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
5002 	}
5003       if (ptr_incr)
5004 	*ptr_incr = incr;
5005 
5006       return indx_before_incr;
5007     }
5008   else
5009     gcc_unreachable ();
5010 }
5011 
5012 
5013 /* Function bump_vector_ptr
5014 
5015    Increment a pointer (to a vector type) by vector-size. If requested,
5016    i.e. if PTR_INCR is given, then also connect the new increment stmt
5017    to the existing def-use update-chain of the pointer, by modifying
5018    the PTR_INCR as illustrated below:
5019 
5020    The pointer def-use update-chain before this function:
5021                         DATAREF_PTR = phi (p_0, p_2)
5022                         ....
5023         PTR_INCR:       p_2 = DATAREF_PTR + step
5024 
5025    The pointer def-use update-chain after this function:
5026                         DATAREF_PTR = phi (p_0, p_2)
5027                         ....
5028                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5029                         ....
5030         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5031 
5032    Input:
5033    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5034                  in the loop.
5035    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5036 	      the loop.  The increment amount across iterations is expected
5037 	      to be vector_size.
5038    BSI - location where the new update stmt is to be placed.
5039    STMT - the original scalar memory-access stmt that is being vectorized.
5040    BUMP - optional. The offset by which to bump the pointer. If not given,
5041 	  the offset is assumed to be vector_size.
5042 
5043    Output: Return NEW_DATAREF_PTR as illustrated above.
5044 
5045 */
5046 
5047 tree
5048 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5049 		 gimple *stmt, tree bump)
5050 {
5051   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5052   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5053   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5054   tree update = TYPE_SIZE_UNIT (vectype);
5055   gassign *incr_stmt;
5056   ssa_op_iter iter;
5057   use_operand_p use_p;
5058   tree new_dataref_ptr;
5059 
5060   if (bump)
5061     update = bump;
5062 
5063   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5064     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5065   else
5066     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5067   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5068 				   dataref_ptr, update);
5069   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
5070 
5071   /* Copy the points-to information if it exists. */
5072   if (DR_PTR_INFO (dr))
5073     {
5074       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5075       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5076     }
5077 
5078   if (!ptr_incr)
5079     return new_dataref_ptr;
5080 
5081   /* Update the vector-pointer's cross-iteration increment.  */
5082   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5083     {
5084       tree use = USE_FROM_PTR (use_p);
5085 
5086       if (use == dataref_ptr)
5087         SET_USE (use_p, new_dataref_ptr);
5088       else
5089         gcc_assert (operand_equal_p (use, update, 0));
5090     }
5091 
5092   return new_dataref_ptr;
5093 }
5094 
5095 
5096 /* Copy memory reference info such as base/clique from the SRC reference
5097    to the DEST MEM_REF.  */
5098 
5099 void
5100 vect_copy_ref_info (tree dest, tree src)
5101 {
5102   if (TREE_CODE (dest) != MEM_REF)
5103     return;
5104 
5105   tree src_base = src;
5106   while (handled_component_p (src_base))
5107     src_base = TREE_OPERAND (src_base, 0);
5108   if (TREE_CODE (src_base) != MEM_REF
5109       && TREE_CODE (src_base) != TARGET_MEM_REF)
5110     return;
5111 
5112   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5113   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5114 }
5115 
5116 
5117 /* Function vect_create_destination_var.
5118 
5119    Create a new temporary of type VECTYPE.  */
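/* For example (illustration only), given a scalar destination x_5 and a
   non-boolean vector type, the new temporary is named "vect_x_5"; with a
   boolean vector type it would be "mask_x_5", and with a NULL VECTYPE
   "stmp_x_5".  */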
5120 
5121 tree
5122 vect_create_destination_var (tree scalar_dest, tree vectype)
5123 {
5124   tree vec_dest;
5125   const char *name;
5126   char *new_name;
5127   tree type;
5128   enum vect_var_kind kind;
5129 
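  /* Pick the variable kind: a mask temporary for boolean vector types,
     a plain vector temporary otherwise, and a scalar temporary when no
     VECTYPE is given.  */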
5130   kind = vectype
5131     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5132     ? vect_mask_var
5133     : vect_simple_var
5134     : vect_scalar_var;
5135   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5136 
5137   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5138 
5139   name = get_name (scalar_dest);
5140   if (name)
5141     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5142   else
5143     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5144   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5145   free (new_name);
5146 
5147   return vec_dest;
5148 }
5149 
5150 /* Function vect_grouped_store_supported.
5151 
5152    Returns TRUE if interleave high and interleave low permutations
5153    are supported, and FALSE otherwise.  */
5154 
5155 bool
5156 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5157 {
5158   machine_mode mode = TYPE_MODE (vectype);
5159 
5160   /* vect_permute_store_chain requires the group size to be equal to 3 or
5161      be a power of two.  */
5162   if (count != 3 && exact_log2 (count) == -1)
5163     {
5164       if (dump_enabled_p ())
5165 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5166 			 "the size of the group of accesses"
5167 			 " is not a power of 2 or not eqaul to 3\n");
5168       return false;
5169     }
5170 
5171   /* Check that the permutation is supported.  */
5172   if (VECTOR_MODE_P (mode))
5173     {
5174       unsigned int i;
5175       if (count == 3)
5176 	{
5177 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5178 	  unsigned int i, j;
5179 
5180 	  unsigned int nelt;
5181 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5182 	    {
5183 	      if (dump_enabled_p ())
5184 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5185 				 "cannot handle groups of 3 stores for"
5186 				 " variable-length vectors\n");
5187 	      return false;
5188 	    }
5189 
5190 	  vec_perm_builder sel (nelt, nelt, 1);
5191 	  sel.quick_grow (nelt);
5192 	  vec_perm_indices indices;
5193 	  for (j = 0; j < 3; j++)
5194 	    {
5195 	      int nelt0 = ((3 - j) * nelt) % 3;
5196 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5197 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5198 	      for (i = 0; i < nelt; i++)
5199 		{
5200 		  if (3 * i + nelt0 < nelt)
5201 		    sel[3 * i + nelt0] = j0++;
5202 		  if (3 * i + nelt1 < nelt)
5203 		    sel[3 * i + nelt1] = nelt + j1++;
5204 		  if (3 * i + nelt2 < nelt)
5205 		    sel[3 * i + nelt2] = 0;
5206 		}
5207 	      indices.new_vector (sel, 2, nelt);
5208 	      if (!can_vec_perm_const_p (mode, indices))
5209 		{
5210 		  if (dump_enabled_p ())
5211 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5212 				 "permutation op not supported by target.\n");
5213 		  return false;
5214 		}
5215 
5216 	      for (i = 0; i < nelt; i++)
5217 		{
5218 		  if (3 * i + nelt0 < nelt)
5219 		    sel[3 * i + nelt0] = 3 * i + nelt0;
5220 		  if (3 * i + nelt1 < nelt)
5221 		    sel[3 * i + nelt1] = 3 * i + nelt1;
5222 		  if (3 * i + nelt2 < nelt)
5223 		    sel[3 * i + nelt2] = nelt + j2++;
5224 		}
5225 	      indices.new_vector (sel, 2, nelt);
5226 	      if (!can_vec_perm_const_p (mode, indices))
5227 		{
5228 		  if (dump_enabled_p ())
5229 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5230 				 "permutation op not supported by target.\n");
5231 		  return false;
5232 		}
5233 	    }
5234 	  return true;
5235 	}
5236       else
5237 	{
5238 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5239 	  gcc_assert (pow2p_hwi (count));
5240 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5241 
5242 	  /* The encoding has 2 interleaved stepped patterns.  */
5243 	  vec_perm_builder sel (nelt, 2, 3);
5244 	  sel.quick_grow (6);
5245 	  for (i = 0; i < 3; i++)
5246 	    {
5247 	      sel[i * 2] = i;
5248 	      sel[i * 2 + 1] = i + nelt;
5249 	    }
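	  /* E.g. for nelt == 8 this encoding expands to the interleave-high
	     selector { 0, 8, 1, 9, 2, 10, 3, 11 }; adding nelt/2 == 4 to
	     each element below yields the interleave-low selector
	     { 4, 12, 5, 13, 6, 14, 7, 15 }.  */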
5250 	  vec_perm_indices indices (sel, 2, nelt);
5251 	  if (can_vec_perm_const_p (mode, indices))
5252 	    {
5253 	      for (i = 0; i < 6; i++)
5254 		sel[i] += exact_div (nelt, 2);
5255 	      indices.new_vector (sel, 2, nelt);
5256 	      if (can_vec_perm_const_p (mode, indices))
5257 		return true;
5258 	    }
5259 	}
5260     }
5261 
5262   if (dump_enabled_p ())
5263     dump_printf (MSG_MISSED_OPTIMIZATION,
5264 		 "permutaion op not supported by target.\n");
5265   return false;
5266 }
5267 
5268 
5269 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5270    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5271 
5272 bool
5273 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5274 			    bool masked_p)
5275 {
5276   if (masked_p)
5277     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5278 					 vec_mask_store_lanes_optab,
5279 					 vectype, count);
5280   else
5281     return vect_lanes_optab_supported_p ("vec_store_lanes",
5282 					 vec_store_lanes_optab,
5283 					 vectype, count);
5284 }
5285 
5286 
5287 /* Function vect_permute_store_chain.
5288 
5289    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5290    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5291    the data correctly for the stores.  Return the final references for stores
5292    in RESULT_CHAIN.
5293 
5294    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5295    The input is 4 vectors each containing 8 elements.  We assign a number to
5296    each element, the input sequence is:
5297 
5298    1st vec:   0  1  2  3  4  5  6  7
5299    2nd vec:   8  9 10 11 12 13 14 15
5300    3rd vec:  16 17 18 19 20 21 22 23
5301    4th vec:  24 25 26 27 28 29 30 31
5302 
5303    The output sequence should be:
5304 
5305    1st vec:  0  8 16 24  1  9 17 25
5306    2nd vec:  2 10 18 26  3 11 19 27
5307    3rd vec:  4 12 20 28  5 13 21 29
5308    4th vec:  6 14 22 30  7 15 23 31
5309 
5310    i.e., we interleave the contents of the four vectors in their order.
5311 
5312    We use interleave_high/low instructions to create such output.  The input of
5313    each interleave_high/low operation is two vectors:
5314    1st vec    2nd vec
5315    0 1 2 3    4 5 6 7
5316    the even elements of the result vector are obtained left-to-right from the
5317    high/low elements of the first vector.  The odd elements of the result are
5318    obtained left-to-right from the high/low elements of the second vector.
5319    The output of interleave_high will be:   0 4 1 5
5320    and of interleave_low:                   2 6 3 7
5321 
5322 
5323    The permutation is done in log LENGTH stages.  In each stage interleave_high
5324    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5325    where the first argument is taken from the first half of DR_CHAIN and the
5326    second argument from its second half.
5327    In our example,
5328 
5329    I1: interleave_high (1st vec, 3rd vec)
5330    I2: interleave_low (1st vec, 3rd vec)
5331    I3: interleave_high (2nd vec, 4th vec)
5332    I4: interleave_low (2nd vec, 4th vec)
5333 
5334    The output for the first stage is:
5335 
5336    I1:  0 16  1 17  2 18  3 19
5337    I2:  4 20  5 21  6 22  7 23
5338    I3:  8 24  9 25 10 26 11 27
5339    I4: 12 28 13 29 14 30 15 31
5340 
5341    The output of the second stage, i.e. the final result is:
5342 
5343    I1:  0  8 16 24  1  9 17 25
5344    I2:  2 10 18 26  3 11 19 27
5345    I3:  4 12 20 28  5 13 21 29
5346    I4:  6 14 22 30  7 15 23 31.  */
5347 
5348 void
5349 vect_permute_store_chain (vec<tree> dr_chain,
5350 			  unsigned int length,
5351 			  gimple *stmt,
5352 			  gimple_stmt_iterator *gsi,
5353 			  vec<tree> *result_chain)
5354 {
5355   tree vect1, vect2, high, low;
5356   gimple *perm_stmt;
5357   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5358   tree perm_mask_low, perm_mask_high;
5359   tree data_ref;
5360   tree perm3_mask_low, perm3_mask_high;
5361   unsigned int i, j, n, log_length = exact_log2 (length);
5362 
5363   result_chain->quick_grow (length);
5364   memcpy (result_chain->address (), dr_chain.address (),
5365 	  length * sizeof (tree));
5366 
5367   if (length == 3)
5368     {
5369       /* vect_grouped_store_supported ensures that this is constant.  */
5370       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5371       unsigned int j0 = 0, j1 = 0, j2 = 0;
5372 
5373       vec_perm_builder sel (nelt, nelt, 1);
5374       sel.quick_grow (nelt);
5375       vec_perm_indices indices;
5376       for (j = 0; j < 3; j++)
5377         {
5378 	  int nelt0 = ((3 - j) * nelt) % 3;
5379 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5380 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5381 
5382 	  for (i = 0; i < nelt; i++)
5383 	    {
5384 	      if (3 * i + nelt0 < nelt)
5385 		sel[3 * i + nelt0] = j0++;
5386 	      if (3 * i + nelt1 < nelt)
5387 		sel[3 * i + nelt1] = nelt + j1++;
5388 	      if (3 * i + nelt2 < nelt)
5389 		sel[3 * i + nelt2] = 0;
5390 	    }
5391 	  indices.new_vector (sel, 2, nelt);
5392 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5393 
5394 	  for (i = 0; i < nelt; i++)
5395 	    {
5396 	      if (3 * i + nelt0 < nelt)
5397 		sel[3 * i + nelt0] = 3 * i + nelt0;
5398 	      if (3 * i + nelt1 < nelt)
5399 		sel[3 * i + nelt1] = 3 * i + nelt1;
5400 	      if (3 * i + nelt2 < nelt)
5401 		sel[3 * i + nelt2] = nelt + j2++;
5402 	    }
5403 	  indices.new_vector (sel, 2, nelt);
5404 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5405 
5406 	  vect1 = dr_chain[0];
5407 	  vect2 = dr_chain[1];
5408 
5409 	  /* Create interleaving stmt:
5410 	     low = VEC_PERM_EXPR <vect1, vect2,
5411 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5412 				   j + 2, nelt + j + 2, *, ...}>  */
5413 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5414 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5415 					   vect2, perm3_mask_low);
5416 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5417 
5418 	  vect1 = data_ref;
5419 	  vect2 = dr_chain[2];
5420 	  /* Create interleaving stmt:
5421 	     low = VEC_PERM_EXPR <vect1, vect2,
5422 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5423 				   6, 7, nelt + j + 2, ...}>  */
5424 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5425 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5426 					   vect2, perm3_mask_high);
5427 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5428 	  (*result_chain)[j] = data_ref;
5429 	}
5430     }
5431   else
5432     {
5433       /* If length is not equal to 3 then only power of 2 is supported.  */
5434       gcc_assert (pow2p_hwi (length));
5435 
5436       /* The encoding has 2 interleaved stepped patterns.  */
5437       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5438       vec_perm_builder sel (nelt, 2, 3);
5439       sel.quick_grow (6);
5440       for (i = 0; i < 3; i++)
5441 	{
5442 	  sel[i * 2] = i;
5443 	  sel[i * 2 + 1] = i + nelt;
5444 	}
5445 	vec_perm_indices indices (sel, 2, nelt);
5446 	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5447 
5448 	for (i = 0; i < 6; i++)
5449 	  sel[i] += exact_div (nelt, 2);
5450 	indices.new_vector (sel, 2, nelt);
5451 	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5452 
5453 	for (i = 0, n = log_length; i < n; i++)
5454 	  {
5455 	    for (j = 0; j < length/2; j++)
5456 	      {
5457 		vect1 = dr_chain[j];
5458 		vect2 = dr_chain[j+length/2];
5459 
5460 		/* Create interleaving stmt:
5461 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5462 							...}>  */
5463 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5464 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5465 						 vect2, perm_mask_high);
5466 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5467 		(*result_chain)[2*j] = high;
5468 
5469 		/* Create interleaving stmt:
5470 		   low = VEC_PERM_EXPR <vect1, vect2,
5471 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5472 					 ...}>  */
5473 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5474 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5475 						 vect2, perm_mask_low);
5476 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5477 		(*result_chain)[2*j+1] = low;
5478 	      }
5479 	    memcpy (dr_chain.address (), result_chain->address (),
5480 		    length * sizeof (tree));
5481 	  }
5482     }
5483 }
5484 
5485 /* Function vect_setup_realignment
5486 
5487    This function is called when vectorizing an unaligned load using
5488    the dr_explicit_realign[_optimized] scheme.
5489    This function generates the following code at the loop prolog:
5490 
5491       p = initial_addr;
5492    x  msq_init = *(floor(p));   # prolog load
5493       realignment_token = call target_builtin;
5494     loop:
5495    x  msq = phi (msq_init, ---)
5496 
5497    The stmts marked with x are generated only for the case of
5498    dr_explicit_realign_optimized.
5499 
5500    The code above sets up a new (vector) pointer, pointing to the first
5501    location accessed by STMT, and a "floor-aligned" load using that pointer.
5502    It also generates code to compute the "realignment-token" (if the relevant
5503    target hook was defined), and creates a phi-node at the loop-header bb
5504    whose arguments are the result of the prolog-load (created by this
5505    function) and the result of a load that takes place in the loop (to be
5506    created by the caller to this function).
5507 
5508    For the case of dr_explicit_realign_optimized:
5509    The caller to this function uses the phi-result (msq) to create the
5510    realignment code inside the loop, and sets up the missing phi argument,
5511    as follows:
5512     loop:
5513       msq = phi (msq_init, lsq)
5514       lsq = *(floor(p'));        # load in loop
5515       result = realign_load (msq, lsq, realignment_token);
5516 
5517    For the case of dr_explicit_realign:
5518     loop:
5519       msq = *(floor(p)); 	# load in loop
5520       p' = p + (VS-1);
5521       lsq = *(floor(p'));	# load in loop
5522       result = realign_load (msq, lsq, realignment_token);
5523 
5524    Input:
5525    STMT - (scalar) load stmt to be vectorized. This load accesses
5526           a memory location that may be unaligned.
5527    BSI - place where new code is to be inserted.
5528    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5529 			      is used.
5530 
5531    Output:
5532    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5533                        target hook, if defined.
5534    Return value - the result of the loop-header phi node.  */
5535 
5536 tree
5537 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5538                         tree *realignment_token,
5539 			enum dr_alignment_support alignment_support_scheme,
5540 			tree init_addr,
5541 			struct loop **at_loop)
5542 {
5543   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5544   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5545   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5546   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5547   struct loop *loop = NULL;
5548   edge pe = NULL;
5549   tree scalar_dest = gimple_assign_lhs (stmt);
5550   tree vec_dest;
5551   gimple *inc;
5552   tree ptr;
5553   tree data_ref;
5554   basic_block new_bb;
5555   tree msq_init = NULL_TREE;
5556   tree new_temp;
5557   gphi *phi_stmt;
5558   tree msq = NULL_TREE;
5559   gimple_seq stmts = NULL;
5560   bool inv_p;
5561   bool compute_in_loop = false;
5562   bool nested_in_vect_loop = false;
5563   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5564   struct loop *loop_for_initial_load = NULL;
5565 
5566   if (loop_vinfo)
5567     {
5568       loop = LOOP_VINFO_LOOP (loop_vinfo);
5569       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5570     }
5571 
5572   gcc_assert (alignment_support_scheme == dr_explicit_realign
5573 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5574 
5575   /* We need to generate three things:
5576      1. the misalignment computation
5577      2. the extra vector load (for the optimized realignment scheme).
5578      3. the phi node for the two vectors from which the realignment is
5579       done (for the optimized realignment scheme).  */
5580 
5581   /* 1. Determine where to generate the misalignment computation.
5582 
5583      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5584      calculation will be generated by this function, outside the loop (in the
5585      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5586      caller, inside the loop.
5587 
5588      Background: If the misalignment remains fixed throughout the iterations of
5589      the loop, then both realignment schemes are applicable, and also the
5590      misalignment computation can be done outside LOOP.  This is because we are
5591      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5592      are a multiple of VS (the Vector Size), and therefore the misalignment in
5593      different vectorized LOOP iterations is always the same.
5594      The problem arises only if the memory access is in an inner-loop nested
5595      inside LOOP, which is now being vectorized using outer-loop vectorization.
5596      This is the only case when the misalignment of the memory access may not
5597      remain fixed throughout the iterations of the inner-loop (as explained in
5598      detail in vect_supportable_dr_alignment).  In this case, not only is the
5599      optimized realignment scheme not applicable, but also the misalignment
5600      computation (and generation of the realignment token that is passed to
5601      REALIGN_LOAD) have to be done inside the loop.
5602 
5603      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5604      or not, which in turn determines if the misalignment is computed inside
5605      the inner-loop, or outside LOOP.  */
5606 
5607   if (init_addr != NULL_TREE || !loop_vinfo)
5608     {
5609       compute_in_loop = true;
5610       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5611     }
5612 
5613 
5614   /* 2. Determine where to generate the extra vector load.
5615 
5616      For the optimized realignment scheme, instead of generating two vector
5617      loads in each iteration, we generate a single extra vector load in the
5618      preheader of the loop, and in each iteration reuse the result of the
5619      vector load from the previous iteration.  In case the memory access is in
5620      an inner-loop nested inside LOOP, which is now being vectorized using
5621      outer-loop vectorization, we need to determine whether this initial vector
5622      load should be generated at the preheader of the inner-loop, or can be
5623      generated at the preheader of LOOP.  If the memory access has no evolution
5624      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5625      to be generated inside LOOP (in the preheader of the inner-loop).  */
5626 
5627   if (nested_in_vect_loop)
5628     {
5629       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5630       bool invariant_in_outerloop =
5631             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5632       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5633     }
5634   else
5635     loop_for_initial_load = loop;
5636   if (at_loop)
5637     *at_loop = loop_for_initial_load;
5638 
5639   if (loop_for_initial_load)
5640     pe = loop_preheader_edge (loop_for_initial_load);
5641 
5642   /* 3. For the case of the optimized realignment, create the first vector
5643       load at the loop preheader.  */
5644 
5645   if (alignment_support_scheme == dr_explicit_realign_optimized)
5646     {
5647       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5648       gassign *new_stmt;
5649 
5650       gcc_assert (!compute_in_loop);
5651       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5652       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5653 				      NULL_TREE, &init_addr, NULL, &inc,
5654 				      true, &inv_p);
5655       if (TREE_CODE (ptr) == SSA_NAME)
5656 	new_temp = copy_ssa_name (ptr);
5657       else
5658 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5659       unsigned int align = DR_TARGET_ALIGNMENT (dr);
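      /* Compute floor (ptr), i.e. mask away the low bits of the address:
         new_temp = ptr & -align.  */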
5660       new_stmt = gimple_build_assign
5661 		   (new_temp, BIT_AND_EXPR, ptr,
5662 		    build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5663       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5664       gcc_assert (!new_bb);
5665       data_ref
5666 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5667 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5668       vect_copy_ref_info (data_ref, DR_REF (dr));
5669       new_stmt = gimple_build_assign (vec_dest, data_ref);
5670       new_temp = make_ssa_name (vec_dest, new_stmt);
5671       gimple_assign_set_lhs (new_stmt, new_temp);
5672       if (pe)
5673         {
5674           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5675           gcc_assert (!new_bb);
5676         }
5677       else
5678          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5679 
5680       msq_init = gimple_assign_lhs (new_stmt);
5681     }
5682 
5683   /* 4. Create realignment token using a target builtin, if available.
5684       It is done either inside the containing loop, or before LOOP (as
5685       determined above).  */
5686 
5687   if (targetm.vectorize.builtin_mask_for_load)
5688     {
5689       gcall *new_stmt;
5690       tree builtin_decl;
5691 
5692       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5693       if (!init_addr)
5694 	{
5695 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5696 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5697 							    NULL_TREE);
5698           if (loop)
5699             {
5700    	      pe = loop_preheader_edge (loop);
5701 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5702 	      gcc_assert (!new_bb);
5703             }
5704           else
5705              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5706 	}
5707 
5708       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5709       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5710       vec_dest =
5711 	vect_create_destination_var (scalar_dest,
5712 				     gimple_call_return_type (new_stmt));
5713       new_temp = make_ssa_name (vec_dest, new_stmt);
5714       gimple_call_set_lhs (new_stmt, new_temp);
5715 
5716       if (compute_in_loop)
5717 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5718       else
5719 	{
5720 	  /* Generate the misalignment computation outside LOOP.  */
5721 	  pe = loop_preheader_edge (loop);
5722 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5723 	  gcc_assert (!new_bb);
5724 	}
5725 
5726       *realignment_token = gimple_call_lhs (new_stmt);
5727 
5728       /* The result of the CALL_EXPR to this builtin is determined from
5729          the value of the parameter and no global variables are touched
5730          which makes the builtin a "const" function.  Requiring the
5731          builtin to have the "const" attribute makes it unnecessary
5732          to call mark_call_clobbered.  */
5733       gcc_assert (TREE_READONLY (builtin_decl));
5734     }
5735 
5736   if (alignment_support_scheme == dr_explicit_realign)
5737     return msq;
5738 
5739   gcc_assert (!compute_in_loop);
5740   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5741 
5742 
5743   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5744 
5745   pe = loop_preheader_edge (containing_loop);
5746   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5747   msq = make_ssa_name (vec_dest);
5748   phi_stmt = create_phi_node (msq, containing_loop->header);
5749   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5750 
5751   return msq;
5752 }
5753 
5754 
5755 /* Function vect_grouped_load_supported.
5756 
5757    COUNT is the size of the load group (the number of statements plus the
5758    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5759    only one statement, with a gap of COUNT - 1.
5760 
5761    Returns true if a suitable permute exists.  */
5762 
5763 bool
5764 vect_grouped_load_supported (tree vectype, bool single_element_p,
5765 			     unsigned HOST_WIDE_INT count)
5766 {
5767   machine_mode mode = TYPE_MODE (vectype);
5768 
5769   /* If this is single-element interleaving with an element distance
5770      that leaves unused vector loads around punt - we at least create
5771      very sub-optimal code in that case (and blow up memory,
5772      see PR65518).  */
5773   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5774     {
5775       if (dump_enabled_p ())
5776 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5777 			 "single-element interleaving not supported "
5778 			 "for not adjacent vector loads\n");
5779       return false;
5780     }
5781 
5782   /* vect_permute_load_chain requires the group size to be equal to 3 or
5783      be a power of two.  */
5784   if (count != 3 && exact_log2 (count) == -1)
5785     {
5786       if (dump_enabled_p ())
5787 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 			 "the size of the group of accesses"
5789 			 " is not a power of 2 or not equal to 3\n");
5790       return false;
5791     }
5792 
5793   /* Check that the permutation is supported.  */
5794   if (VECTOR_MODE_P (mode))
5795     {
5796       unsigned int i, j;
5797       if (count == 3)
5798 	{
5799 	  unsigned int nelt;
5800 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5801 	    {
5802 	      if (dump_enabled_p ())
5803 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804 				 "cannot handle groups of 3 loads for"
5805 				 " variable-length vectors\n");
5806 	      return false;
5807 	    }
5808 
5809 	  vec_perm_builder sel (nelt, nelt, 1);
5810 	  sel.quick_grow (nelt);
5811 	  vec_perm_indices indices;
5812 	  unsigned int k;
5813 	  for (k = 0; k < 3; k++)
5814 	    {
5815 	      for (i = 0; i < nelt; i++)
5816 		if (3 * i + k < 2 * nelt)
5817 		  sel[i] = 3 * i + k;
5818 		else
5819 		  sel[i] = 0;
5820 	      indices.new_vector (sel, 2, nelt);
5821 	      if (!can_vec_perm_const_p (mode, indices))
5822 		{
5823 		  if (dump_enabled_p ())
5824 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5825 				     "shuffle of 3 loads is not supported by"
5826 				     " target\n");
5827 		  return false;
5828 		}
5829 	      for (i = 0, j = 0; i < nelt; i++)
5830 		if (3 * i + k < 2 * nelt)
5831 		  sel[i] = i;
5832 		else
5833 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5834 	      indices.new_vector (sel, 2, nelt);
5835 	      if (!can_vec_perm_const_p (mode, indices))
5836 		{
5837 		  if (dump_enabled_p ())
5838 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5839 				     "shuffle of 3 loads is not supported by"
5840 				     " target\n");
5841 		  return false;
5842 		}
5843 	    }
5844 	  return true;
5845 	}
5846       else
5847 	{
5848 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
5849 	  gcc_assert (pow2p_hwi (count));
5850 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5851 
5852 	  /* The encoding has a single stepped pattern.  */
5853 	  vec_perm_builder sel (nelt, 1, 3);
5854 	  sel.quick_grow (3);
5855 	  for (i = 0; i < 3; i++)
5856 	    sel[i] = i * 2;
5857 	  vec_perm_indices indices (sel, 2, nelt);
5858 	  if (can_vec_perm_const_p (mode, indices))
5859 	    {
5860 	      for (i = 0; i < 3; i++)
5861 		sel[i] = i * 2 + 1;
5862 	      indices.new_vector (sel, 2, nelt);
5863 	      if (can_vec_perm_const_p (mode, indices))
5864 		return true;
5865 	    }
5866         }
5867     }
5868 
5869   if (dump_enabled_p ())
5870     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5871 		     "extract even/odd not supported by target\n");
5872   return false;
5873 }
5874 
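/* The two mask families checked above for COUNT == 3 are easier to see on a
   concrete example.  The following is a standalone, illustrative sketch only
   (none of its names exist in GCC; NELT 8 is an assumption): it rebuilds the
   "low" and "high" index vectors with the same formulas as above and checks
   on plain arrays that applying them de-interleaves three stride-3 loaded
   vectors, which is what vect_permute_load_chain later emits as
   VEC_PERM_EXPRs.  Guarded out so it does not affect this translation unit.  */
#if 0
#include <stdio.h>

#define NELT 8

/* Emulate VEC_PERM_EXPR <a, b, sel> on plain arrays.  */
static void
perm (const int *a, const int *b, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int in[3][NELT], out[3][NELT], low[NELT];
  for (unsigned v = 0; v < 3; v++)
    for (unsigned i = 0; i < NELT; i++)
      in[v][i] = v * NELT + i;		/* Elements 0..23.  */

  for (unsigned k = 0; k < 3; k++)
    {
      unsigned sel_low[NELT], sel_high[NELT], j = 0;
      for (unsigned i = 0; i < NELT; i++)
	sel_low[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      for (unsigned i = 0; i < NELT; i++)
	sel_high[i] = (3 * i + k < 2 * NELT
		       ? i : NELT + ((NELT + k) % 3) + 3 * j++);

      perm (in[0], in[1], sel_low, low);	/* Gather 3*i + k from vec0/vec1.  */
      perm (low, in[2], sel_high, out[k]);	/* Fill the tail from vec2.  */

      for (unsigned i = 0; i < NELT; i++)
	printf ("%d ", out[k][i]);		/* k == 0: 0 3 6 9 12 15 18 21.  */
      printf ("\n");
    }
  return 0;
}
#endif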
5875 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5876    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5877 
5878 bool
5879 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5880 			   bool masked_p)
5881 {
5882   if (masked_p)
5883     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5884 					 vec_mask_load_lanes_optab,
5885 					 vectype, count);
5886   else
5887     return vect_lanes_optab_supported_p ("vec_load_lanes",
5888 					 vec_load_lanes_optab,
5889 					 vectype, count);
5890 }
5891 
5892 /* Function vect_permute_load_chain.
5893 
5894    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5895    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5896    the input data correctly.  Return the final references for loads in
5897    RESULT_CHAIN.
5898 
5899    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5900    The input is 4 vectors each containing 8 elements. We assign a number to each
5901    element; the input sequence is:
5902 
5903    1st vec:   0  1  2  3  4  5  6  7
5904    2nd vec:   8  9 10 11 12 13 14 15
5905    3rd vec:  16 17 18 19 20 21 22 23
5906    4th vec:  24 25 26 27 28 29 30 31
5907 
5908    The output sequence should be:
5909 
5910    1st vec:  0 4  8 12 16 20 24 28
5911    2nd vec:  1 5  9 13 17 21 25 29
5912    3rd vec:  2 6 10 14 18 22 26 30
5913    4th vec:  3 7 11 15 19 23 27 31
5914 
5915    i.e., the first output vector should contain the first elements of each
5916    interleaving group, etc.
5917 
5918    We use extract_even/odd instructions to create such output.  The input of
5919    each extract_even/odd operation is two vectors
5920    1st vec    2nd vec
5921    0 1 2 3    4 5 6 7
5922 
5923    and the output is the vector of extracted even/odd elements.  The output of
5924    extract_even will be:   0 2 4 6
5925    and of extract_odd:     1 3 5 7
5926 
5927 
5928    The permutation is done in log2 (LENGTH) stages.  In each stage extract_even
5929    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5930    their order.  In our example,
5931 
5932    E1: extract_even (1st vec, 2nd vec)
5933    E2: extract_odd (1st vec, 2nd vec)
5934    E3: extract_even (3rd vec, 4th vec)
5935    E4: extract_odd (3rd vec, 4th vec)
5936 
5937    The output for the first stage will be:
5938 
5939    E1:  0  2  4  6  8 10 12 14
5940    E2:  1  3  5  7  9 11 13 15
5941    E3: 16 18 20 22 24 26 28 30
5942    E4: 17 19 21 23 25 27 29 31
5943 
5944    In order to proceed and create the correct sequence for the next stage (or
5945    for the correct output, if the second stage is the last one, as in our
5946    example), we first put the output of extract_even operation and then the
5947    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5948    The input for the second stage is:
5949 
5950    1st vec (E1):  0  2  4  6  8 10 12 14
5951    2nd vec (E3): 16 18 20 22 24 26 28 30
5952    3rd vec (E2):  1  3  5  7  9 11 13 15
5953    4th vec (E4): 17 19 21 23 25 27 29 31
5954 
5955    The output of the second stage:
5956 
5957    E1: 0 4  8 12 16 20 24 28
5958    E2: 2 6 10 14 18 22 26 30
5959    E3: 1 5  9 13 17 21 25 29
5960    E4: 3 7 11 15 19 23 27 31
5961 
5962    And RESULT_CHAIN after reordering:
5963 
5964    1st vec (E1):  0 4  8 12 16 20 24 28
5965    2nd vec (E3):  1 5  9 13 17 21 25 29
5966    3rd vec (E2):  2 6 10 14 18 22 26 30
5967    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5968 
5969 static void
5970 vect_permute_load_chain (vec<tree> dr_chain,
5971 			 unsigned int length,
5972 			 gimple *stmt,
5973 			 gimple_stmt_iterator *gsi,
5974 			 vec<tree> *result_chain)
5975 {
5976   tree data_ref, first_vect, second_vect;
5977   tree perm_mask_even, perm_mask_odd;
5978   tree perm3_mask_low, perm3_mask_high;
5979   gimple *perm_stmt;
5980   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5981   unsigned int i, j, log_length = exact_log2 (length);
5982 
5983   result_chain->quick_grow (length);
5984   memcpy (result_chain->address (), dr_chain.address (),
5985 	  length * sizeof (tree));
5986 
5987   if (length == 3)
5988     {
5989       /* vect_grouped_load_supported ensures that this is constant.  */
5990       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5991       unsigned int k;
5992 
5993       vec_perm_builder sel (nelt, nelt, 1);
5994       sel.quick_grow (nelt);
5995       vec_perm_indices indices;
5996       for (k = 0; k < 3; k++)
5997 	{
5998 	  for (i = 0; i < nelt; i++)
5999 	    if (3 * i + k < 2 * nelt)
6000 	      sel[i] = 3 * i + k;
6001 	    else
6002 	      sel[i] = 0;
6003 	  indices.new_vector (sel, 2, nelt);
6004 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6005 
6006 	  for (i = 0, j = 0; i < nelt; i++)
6007 	    if (3 * i + k < 2 * nelt)
6008 	      sel[i] = i;
6009 	    else
6010 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6011 	  indices.new_vector (sel, 2, nelt);
6012 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6013 
6014 	  first_vect = dr_chain[0];
6015 	  second_vect = dr_chain[1];
6016 
6017 	  /* Create interleaving stmt (low part of):
6018 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6019 							     ...}>  */
6020 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6021 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6022 					   second_vect, perm3_mask_low);
6023 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6024 
6025 	  /* Create interleaving stmt (high part of):
6026 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6027 							      ...}>  */
6028 	  first_vect = data_ref;
6029 	  second_vect = dr_chain[2];
6030 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6031 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6032 					   second_vect, perm3_mask_high);
6033 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6034 	  (*result_chain)[k] = data_ref;
6035 	}
6036     }
6037   else
6038     {
6039       /* If length is not equal to 3 then only a power of 2 is supported.  */
6040       gcc_assert (pow2p_hwi (length));
6041 
6042       /* The encoding has a single stepped pattern.  */
6043       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6044       vec_perm_builder sel (nelt, 1, 3);
6045       sel.quick_grow (3);
6046       for (i = 0; i < 3; ++i)
6047 	sel[i] = i * 2;
6048       vec_perm_indices indices (sel, 2, nelt);
6049       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6050 
6051       for (i = 0; i < 3; ++i)
6052 	sel[i] = i * 2 + 1;
6053       indices.new_vector (sel, 2, nelt);
6054       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6055 
6056       for (i = 0; i < log_length; i++)
6057 	{
6058 	  for (j = 0; j < length; j += 2)
6059 	    {
6060 	      first_vect = dr_chain[j];
6061 	      second_vect = dr_chain[j+1];
6062 
6063 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6064 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6065 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6066 					       first_vect, second_vect,
6067 					       perm_mask_even);
6068 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6069 	      (*result_chain)[j/2] = data_ref;
6070 
6071 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6072 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6073 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6074 					       first_vect, second_vect,
6075 					       perm_mask_odd);
6076 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6077 	      (*result_chain)[j/2+length/2] = data_ref;
6078 	    }
6079 	  memcpy (dr_chain.address (), result_chain->address (),
6080 		  length * sizeof (tree));
6081 	}
6082     }
6083 }
6084 
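/* A standalone, illustrative sketch of the reordering just described (not
   part of GCC; VF, NVEC and the helper below are made-up names): it performs
   the log2 (LENGTH) extract-even/odd stages on plain int arrays and
   reproduces the 4-vector / 8-element example from the comment above,
   printing 0 4 8 ... 28, then 1 5 9 ... 29, and so on.  Guarded out so it
   does not affect this translation unit.  */
#if 0
#include <stdio.h>
#include <string.h>

#define VF 8			/* Elements per vector.  */
#define NVEC 4			/* LENGTH; must be a power of 2 here.  */

/* extract_even (ODD == 0) or extract_odd (ODD == 1) of the concatenation
   of A and B.  */
static void
extract (const int *a, const int *b, int odd, int *out)
{
  for (int i = 0; i < VF; i++)
    {
      int idx = 2 * i + odd;
      out[i] = idx < VF ? a[idx] : b[idx - VF];
    }
}

int
main (void)
{
  int chain[NVEC][VF], result[NVEC][VF];
  for (int v = 0; v < NVEC; v++)
    for (int i = 0; i < VF; i++)
      chain[v][i] = v * VF + i;			/* Elements 0..31.  */

  for (int stage = 0; stage < 2; stage++)	/* log2 (NVEC) == 2 stages.  */
    {
      for (int j = 0; j < NVEC; j += 2)
	{
	  extract (chain[j], chain[j + 1], 0, result[j / 2]);
	  extract (chain[j], chain[j + 1], 1, result[j / 2 + NVEC / 2]);
	}
      /* Like the memcpy back into DR_CHAIN above.  */
      memcpy (chain, result, sizeof chain);
    }

  for (int v = 0; v < NVEC; v++)
    {
      for (int i = 0; i < VF; i++)
	printf ("%d ", chain[v][i]);
      printf ("\n");
    }
  return 0;
}
#endif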
6085 /* Function vect_shift_permute_load_chain.
6086 
6087    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6088    sequence of stmts to reorder the input data accordingly.
6089    Return the final references for loads in RESULT_CHAIN.
6090    Return true if successful, false otherwise.
6091 
6092    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6093    The input is 3 vectors each containing 8 elements.  We assign a
6094    number to each element; the input sequence is:
6095 
6096    1st vec:   0  1  2  3  4  5  6  7
6097    2nd vec:   8  9 10 11 12 13 14 15
6098    3rd vec:  16 17 18 19 20 21 22 23
6099 
6100    The output sequence should be:
6101 
6102    1st vec:  0 3 6  9 12 15 18 21
6103    2nd vec:  1 4 7 10 13 16 19 22
6104    3rd vec:  2 5 8 11 14 17 20 23
6105 
6106    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6107 
6108    First we shuffle all 3 vectors to get correct elements order:
6109 
6110    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6111    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6112    3rd vec:  (16 19 22) (17 20 23) (18 21)
6113 
6114    Next we unite and shift vector 3 times:
6115 
6116    1st step:
6117      shift right by 6 the concatenation of:
6118      "1st vec" and  "2nd vec"
6119        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6120      "2nd vec" and  "3rd vec"
6121        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6122      "3rd vec" and  "1st vec"
6123        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6124 			     | New vectors                   |
6125 
6126      So that now new vectors are:
6127 
6128      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6129      2nd vec:  (10 13) (16 19 22) (17 20 23)
6130      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6131 
6132    2nd step:
6133      shift right by 5 the concatenation of:
6134      "1st vec" and  "3rd vec"
6135        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6136      "2nd vec" and  "1st vec"
6137        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6138      "3rd vec" and  "2nd vec"
6139        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6140 			  | New vectors                   |
6141 
6142      So that now new vectors are:
6143 
6144      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6145      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6146      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6147 
6148    3rd step:
6149      shift right by 5 the concatenation of:
6150      "1st vec" and  "1st vec"
6151        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6152      shift right by 3 the concatenation of:
6153      "2nd vec" and  "2nd vec"
6154                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6155 			  | New vectors                   |
6156 
6157      So that now all vectors are READY:
6158      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6159      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6160      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6161 
6162    This algorithm is faster than the one in vect_permute_load_chain if:
6163      1.  "shift of a concatenation" is faster than general permutation.
6164 	 This is usually so.
6165      2.  The TARGET machine can't execute vector instructions in parallel.
6166 	 This is because each step of the algorithm depends on the previous
6167 	 one.  The algorithm in vect_permute_load_chain is much more parallel.
6168 
6169    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6170 */
6171 
6172 static bool
6173 vect_shift_permute_load_chain (vec<tree> dr_chain,
6174 			       unsigned int length,
6175 			       gimple *stmt,
6176 			       gimple_stmt_iterator *gsi,
6177 			       vec<tree> *result_chain)
6178 {
6179   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6180   tree perm2_mask1, perm2_mask2, perm3_mask;
6181   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6182   gimple *perm_stmt;
6183 
6184   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6185   unsigned int i;
6186   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6187   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6188 
6189   unsigned HOST_WIDE_INT nelt, vf;
6190   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6191       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6192     /* Not supported for variable-length vectors.  */
6193     return false;
6194 
6195   vec_perm_builder sel (nelt, nelt, 1);
6196   sel.quick_grow (nelt);
6197 
6198   result_chain->quick_grow (length);
6199   memcpy (result_chain->address (), dr_chain.address (),
6200 	  length * sizeof (tree));
6201 
6202   if (pow2p_hwi (length) && vf > 4)
6203     {
6204       unsigned int j, log_length = exact_log2 (length);
6205       for (i = 0; i < nelt / 2; ++i)
6206 	sel[i] = i * 2;
6207       for (i = 0; i < nelt / 2; ++i)
6208 	sel[nelt / 2 + i] = i * 2 + 1;
6209       vec_perm_indices indices (sel, 2, nelt);
6210       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6211 	{
6212 	  if (dump_enabled_p ())
6213 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6214 			     "shuffle of 2 fields structure is not \
6215 			      supported by target\n");
6216 	  return false;
6217 	}
6218       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6219 
6220       for (i = 0; i < nelt / 2; ++i)
6221 	sel[i] = i * 2 + 1;
6222       for (i = 0; i < nelt / 2; ++i)
6223 	sel[nelt / 2 + i] = i * 2;
6224       indices.new_vector (sel, 2, nelt);
6225       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6226 	{
6227 	  if (dump_enabled_p ())
6228 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 			     "shuffle of 2 fields structure is not \
6230 			      supported by target\n");
6231 	  return false;
6232 	}
6233       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6234 
6235       /* Generating permutation constant to shift all elements.
6236 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6237       for (i = 0; i < nelt; i++)
6238 	sel[i] = nelt / 2 + i;
6239       indices.new_vector (sel, 2, nelt);
6240       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6241 	{
6242 	  if (dump_enabled_p ())
6243 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6244 			     "shift permutation is not supported by target\n");
6245 	  return false;
6246 	}
6247       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6248 
6249       /* Generating permutation constant to select vector from 2.
6250 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6251       for (i = 0; i < nelt / 2; i++)
6252 	sel[i] = i;
6253       for (i = nelt / 2; i < nelt; i++)
6254 	sel[i] = nelt + i;
6255       indices.new_vector (sel, 2, nelt);
6256       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6257 	{
6258 	  if (dump_enabled_p ())
6259 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260 			     "select is not supported by target\n");
6261 	  return false;
6262 	}
6263       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6264 
6265       for (i = 0; i < log_length; i++)
6266 	{
6267 	  for (j = 0; j < length; j += 2)
6268 	    {
6269 	      first_vect = dr_chain[j];
6270 	      second_vect = dr_chain[j + 1];
6271 
6272 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6273 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6274 					       first_vect, first_vect,
6275 					       perm2_mask1);
6276 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6277 	      vect[0] = data_ref;
6278 
6279 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6280 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6281 					       second_vect, second_vect,
6282 					       perm2_mask2);
6283 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6284 	      vect[1] = data_ref;
6285 
6286 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6287 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6288 					       vect[0], vect[1], shift1_mask);
6289 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6290 	      (*result_chain)[j/2 + length/2] = data_ref;
6291 
6292 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6293 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6294 					       vect[0], vect[1], select_mask);
6295 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6296 	      (*result_chain)[j/2] = data_ref;
6297 	    }
6298 	  memcpy (dr_chain.address (), result_chain->address (),
6299 		  length * sizeof (tree));
6300 	}
6301       return true;
6302     }
6303   if (length == 3 && vf > 2)
6304     {
6305       unsigned int k = 0, l = 0;
6306 
6307       /* Generating permutation constant to get all elements in the right order.
6308 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6309       for (i = 0; i < nelt; i++)
6310 	{
6311 	  if (3 * k + (l % 3) >= nelt)
6312 	    {
6313 	      k = 0;
6314 	      l += (3 - (nelt % 3));
6315 	    }
6316 	  sel[i] = 3 * k + (l % 3);
6317 	  k++;
6318 	}
6319       vec_perm_indices indices (sel, 2, nelt);
6320       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6321 	{
6322 	  if (dump_enabled_p ())
6323 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6324 			     "shuffle of 3 fields structure is not \
6325 			      supported by target\n");
6326 	  return false;
6327 	}
6328       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6329 
6330       /* Generating permutation constant to shift all elements.
6331 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6332       for (i = 0; i < nelt; i++)
6333 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6334       indices.new_vector (sel, 2, nelt);
6335       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6336 	{
6337 	  if (dump_enabled_p ())
6338 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6339 			     "shift permutation is not supported by target\n");
6340 	  return false;
6341 	}
6342       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6343 
6344       /* Generating permutation constant to shift all elements.
6345 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6346       for (i = 0; i < nelt; i++)
6347 	sel[i] = 2 * (nelt / 3) + 1 + i;
6348       indices.new_vector (sel, 2, nelt);
6349       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6350 	{
6351 	  if (dump_enabled_p ())
6352 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6353 			     "shift permutation is not supported by target\n");
6354 	  return false;
6355 	}
6356       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6357 
6358       /* Generating permutation constant to shift all elements.
6359 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6360       for (i = 0; i < nelt; i++)
6361 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6362       indices.new_vector (sel, 2, nelt);
6363       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6364 	{
6365 	  if (dump_enabled_p ())
6366 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6367 			     "shift permutation is not supported by target\n");
6368 	  return false;
6369 	}
6370       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6371 
6372       /* Generating permutation constant to shift all elements.
6373 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6374       for (i = 0; i < nelt; i++)
6375 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6376       indices.new_vector (sel, 2, nelt);
6377       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6378 	{
6379 	  if (dump_enabled_p ())
6380 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6381 			     "shift permutation is not supported by target\n");
6382 	  return false;
6383 	}
6384       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6385 
6386       for (k = 0; k < 3; k++)
6387 	{
6388 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6389 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6390 					   dr_chain[k], dr_chain[k],
6391 					   perm3_mask);
6392 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6393 	  vect[k] = data_ref;
6394 	}
6395 
6396       for (k = 0; k < 3; k++)
6397 	{
6398 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6399 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6400 					   vect[k % 3], vect[(k + 1) % 3],
6401 					   shift1_mask);
6402 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6403 	  vect_shift[k] = data_ref;
6404 	}
6405 
6406       for (k = 0; k < 3; k++)
6407 	{
6408 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6409 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6410 					   vect_shift[(4 - k) % 3],
6411 					   vect_shift[(3 - k) % 3],
6412 					   shift2_mask);
6413 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6414 	  vect[k] = data_ref;
6415 	}
6416 
6417       (*result_chain)[3 - (nelt % 3)] = vect[2];
6418 
6419       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6420       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6421 				       vect[0], shift3_mask);
6422       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6423       (*result_chain)[nelt % 3] = data_ref;
6424 
6425       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6426       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6427 				       vect[1], shift4_mask);
6428       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6429       (*result_chain)[0] = data_ref;
6430       return true;
6431     }
6432   return false;
6433 }
6434 
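/* A standalone, illustrative sketch of the LENGTH == 3 path above (not GCC
   code; NELT 8 and all names here are assumptions): it rebuilds the shuffle
   and shift masks with the same formulas and runs the whole pipeline on
   plain arrays, reproducing the worked example from the function comment
   (elements 0..23 come out de-interleaved by stride 3).  Guarded out so it
   does not affect this translation unit.  */
#if 0
#include <stdio.h>

#define NELT 8

/* Emulate VEC_PERM_EXPR <a, b, sel>.  */
static void
perm (const int *a, const int *b, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  unsigned perm3[NELT], shift1[NELT], shift2[NELT], shift3[NELT], shift4[NELT];
  unsigned i, k = 0, l = 0;

  /* {0 3 6 1 4 7 2 5}: gather each residue class inside one vector.  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      perm3[i] = 3 * k + (l % 3);
      k++;
    }
  for (i = 0; i < NELT; i++)
    {
      shift1[i] = 2 * (NELT / 3) + (NELT % 3) + i;	  /* {6..13}  */
      shift2[i] = 2 * (NELT / 3) + 1 + i;		  /* {5..12}  */
      shift3[i] = NELT / 3 + (NELT % 3) / 2 + i;	  /* {3..10}  */
      shift4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;	  /* {5..12}  */
    }

  int dr[3][NELT], vect[3][NELT], vs[3][NELT], res[3][NELT];
  for (k = 0; k < 3; k++)
    for (i = 0; i < NELT; i++)
      dr[k][i] = k * NELT + i;				  /* Elements 0..23.  */

  for (k = 0; k < 3; k++)
    perm (dr[k], dr[k], perm3, vect[k]);		  /* Shuffle.  */
  for (k = 0; k < 3; k++)
    perm (vect[k % 3], vect[(k + 1) % 3], shift1, vs[k]); /* 1st shift.  */
  for (k = 0; k < 3; k++)
    perm (vs[(4 - k) % 3], vs[(3 - k) % 3], shift2, vect[k]); /* 2nd shift.  */

  for (i = 0; i < NELT; i++)
    res[3 - NELT % 3][i] = vect[2][i];
  perm (vect[0], vect[0], shift3, res[NELT % 3]);	  /* 3rd shift.  */
  perm (vect[1], vect[1], shift4, res[0]);		  /* 4th shift.  */

  for (k = 0; k < 3; k++)
    {
      for (i = 0; i < NELT; i++)
	printf ("%d ", res[k][i]);	/* 0 3 6..21 / 1 4 7..22 / 2 5 8..23  */
      printf ("\n");
    }
  return 0;
}
#endif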
6435 /* Function vect_transform_grouped_load.
6436 
6437    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6438    to perform their permutation and ascribe the resulting vectorized statements
6439    to the scalar statements.
6440 */
6441 
6442 void
6443 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6444 			     gimple_stmt_iterator *gsi)
6445 {
6446   machine_mode mode;
6447   vec<tree> result_chain = vNULL;
6448 
6449   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6450      RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6451      permuted vectors, which are ready for vector computation.  */
6452   result_chain.create (size);
6453 
6454   /* If the reassociation width for the vector type is 2 or greater, the target
6455      machine can execute 2 or more vector instructions in parallel.  Otherwise
6456      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6457   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6458   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6459       || pow2p_hwi (size)
6460       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6461 					 gsi, &result_chain))
6462     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6463   vect_record_grouped_load_vectors (stmt, result_chain);
6464   result_chain.release ();
6465 }
6466 
6467 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6468    generated as part of the vectorization of STMT.  Assign the statement
6469    for each vector to the associated scalar statement.  */
6470 
6471 void
6472 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6473 {
6474   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6475   gimple *next_stmt, *new_stmt;
6476   unsigned int i, gap_count;
6477   tree tmp_data_ref;
6478 
6479   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6480      Since we scan the chain starting from its first node, their order
6481      corresponds to the order of data-refs in RESULT_CHAIN.  */
6482   next_stmt = first_stmt;
6483   gap_count = 1;
6484   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6485     {
6486       if (!next_stmt)
6487 	break;
6488 
6489       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6490        code elimination pass later.  No need to check for the first stmt in
6491        the group, since it always exists.
6492        GROUP_GAP is the number of steps in elements from the previous
6493        access (if there is no gap GROUP_GAP is 1).  We skip loads that
6494        correspond to the gaps.  */
6495       if (next_stmt != first_stmt
6496           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6497       {
6498         gap_count++;
6499         continue;
6500       }
6501 
6502       while (next_stmt)
6503         {
6504 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6505 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6506 	     copies, and we put the new vector statement in the first available
6507 	     RELATED_STMT.  */
6508 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6509 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6510 	  else
6511             {
6512               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6513                 {
6514 		  gimple *prev_stmt =
6515 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6516 		  gimple *rel_stmt =
6517 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6518 	          while (rel_stmt)
6519 		    {
6520 		      prev_stmt = rel_stmt;
6521 		      rel_stmt =
6522                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6523 		    }
6524 
6525   	          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6526                     new_stmt;
6527                 }
6528             }
6529 
6530 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6531 	  gap_count = 1;
6532 	  /* If NEXT_STMT accesses the same DR as the previous statement,
6533 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
6534 	     get the next data-ref from RESULT_CHAIN.  */
6535 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6536 	    break;
6537         }
6538     }
6539 }
6540 
6541 /* Function vect_can_force_dr_alignment_p.
6542 
6543    Returns whether the alignment of a DECL can be forced to be aligned
6544    on an ALIGNMENT-bit boundary.  */
6545 
6546 bool
6547 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6548 {
6549   if (!VAR_P (decl))
6550     return false;
6551 
6552   if (decl_in_symtab_p (decl)
6553       && !symtab_node::get (decl)->can_increase_alignment_p ())
6554     return false;
6555 
6556   if (TREE_STATIC (decl))
6557     return (alignment <= MAX_OFILE_ALIGNMENT);
6558   else
6559     return (alignment <= MAX_STACK_ALIGNMENT);
6560 }
6561 
6562 
6563 /* Return whether the data reference DR is supported with respect to its
6564    alignment.
6565    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6566    if it is aligned, i.e., check if it is possible to vectorize it with different
6567    alignment.  */
6568 
6569 enum dr_alignment_support
6570 vect_supportable_dr_alignment (struct data_reference *dr,
6571                                bool check_aligned_accesses)
6572 {
6573   gimple *stmt = DR_STMT (dr);
6574   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6575   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6576   machine_mode mode = TYPE_MODE (vectype);
6577   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6578   struct loop *vect_loop = NULL;
6579   bool nested_in_vect_loop = false;
6580 
6581   if (aligned_access_p (dr) && !check_aligned_accesses)
6582     return dr_aligned;
6583 
6584   /* For now assume all conditional loads/stores support unaligned
6585      access without any special code.  */
6586   if (is_gimple_call (stmt)
6587       && gimple_call_internal_p (stmt)
6588       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6589 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6590     return dr_unaligned_supported;
6591 
6592   if (loop_vinfo)
6593     {
6594       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6595       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6596     }
6597 
6598   /* Possibly unaligned access.  */
6599 
6600   /* We can choose between using the implicit realignment scheme (generating
6601      a misaligned_move stmt) and the explicit realignment scheme (generating
6602      aligned loads with a REALIGN_LOAD).  There are two variants to the
6603      explicit realignment scheme: optimized, and unoptimized.
6604      We can optimize the realignment only if the step between consecutive
6605      vector loads is equal to the vector size.  Since the vector memory
6606      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6607      is guaranteed that the misalignment amount remains the same throughout the
6608      execution of the vectorized loop.  Therefore, we can create the
6609      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6610      at the loop preheader.
6611 
6612      However, in the case of outer-loop vectorization, when vectorizing a
6613      memory access in the inner-loop nested within the LOOP that is now being
6614      vectorized, while it is guaranteed that the misalignment of the
6615      vectorized memory access will remain the same in different outer-loop
6616      iterations, it is *not* guaranteed that it will remain the same throughout
6617      the execution of the inner-loop.  This is because the inner-loop advances
6618      with the original scalar step (and not in steps of VS).  If the inner-loop
6619      step happens to be a multiple of VS, then the misalignment remains fixed
6620      and we can use the optimized realignment scheme.  For example:
6621 
6622       for (i=0; i<N; i++)
6623         for (j=0; j<M; j++)
6624           s += a[i+j];
6625 
6626      When vectorizing the i-loop in the above example, the step between
6627      consecutive vector loads is 1, and so the misalignment does not remain
6628      fixed across the execution of the inner-loop, and the realignment cannot
6629      be optimized (as illustrated in the following pseudo vectorized loop):
6630 
6631       for (i=0; i<N; i+=4)
6632         for (j=0; j<M; j++){
6633           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6634                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6635                          // (assuming that we start from an aligned address).
6636           }
6637 
6638      We therefore have to use the unoptimized realignment scheme:
6639 
6640       for (i=0; i<N; i+=4)
6641           for (j=k; j<M; j+=4)
6642           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6643                            // that the misalignment of the initial address is
6644                            // 0).
6645 
6646      The loop can then be vectorized as follows:
6647 
6648       for (k=0; k<4; k++){
6649         rt = get_realignment_token (&vp[k]);
6650         for (i=0; i<N; i+=4){
6651           v1 = vp[i+k];
6652           for (j=k; j<M; j+=4){
6653             v2 = vp[i+j+VS-1];
6654             va = REALIGN_LOAD <v1,v2,rt>;
6655             vs += va;
6656             v1 = v2;
6657           }
6658         }
6659     } */
6660 
6661   if (DR_IS_READ (dr))
6662     {
6663       bool is_packed = false;
6664       tree type = (TREE_TYPE (DR_REF (dr)));
6665 
6666       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6667 	  && (!targetm.vectorize.builtin_mask_for_load
6668 	      || targetm.vectorize.builtin_mask_for_load ()))
6669 	{
6670 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6671 
6672 	  /* If we are doing SLP then the accesses need not have the
6673 	     same alignment, instead it depends on the SLP group size.  */
6674 	  if (loop_vinfo
6675 	      && STMT_SLP_TYPE (stmt_info)
6676 	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6677 			      * GROUP_SIZE (vinfo_for_stmt
6678 					    (GROUP_FIRST_ELEMENT (stmt_info))),
6679 			      TYPE_VECTOR_SUBPARTS (vectype)))
6680 	    ;
6681 	  else if (!loop_vinfo
6682 		   || (nested_in_vect_loop
6683 		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6684 				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6685 	    return dr_explicit_realign;
6686 	  else
6687 	    return dr_explicit_realign_optimized;
6688 	}
6689       if (!known_alignment_for_access_p (dr))
6690 	is_packed = not_size_aligned (DR_REF (dr));
6691 
6692       if (targetm.vectorize.support_vector_misalignment
6693 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6694 	/* Can't software pipeline the loads, but can at least do them.  */
6695 	return dr_unaligned_supported;
6696     }
6697   else
6698     {
6699       bool is_packed = false;
6700       tree type = (TREE_TYPE (DR_REF (dr)));
6701 
6702       if (!known_alignment_for_access_p (dr))
6703 	is_packed = not_size_aligned (DR_REF (dr));
6704 
6705      if (targetm.vectorize.support_vector_misalignment
6706 	   (mode, type, DR_MISALIGNMENT (dr), is_packed))
6707        return dr_unaligned_supported;
6708     }
6709 
6710   /* Unsupported.  */
6711   return dr_unaligned_unsupported;
6712 }
6713
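/* A standalone, illustrative scalar model of the optimized explicit
   realignment scheme described in the comment of vect_supportable_dr_alignment
   above (not GCC code; VS, N and all names here are assumptions): two aligned
   "vector" loads per iteration are combined by realign_load () using the
   element offset captured once before the loop, so the loop body never
   performs a misaligned access, mirroring the REALIGN_LOAD pseudo code.
   Guarded out so it does not affect this translation unit.  */
#if 0
#include <stdio.h>

#define VS 4			/* Elements per "vector".  */
#define N 32			/* Number of elements summed.  */

/* REALIGN_LOAD <v1, v2, rt>: take the VS elements starting at element
   offset RT within the concatenation of V1 and V2.  */
static void
realign_load (const int *v1, const int *v2, unsigned rt, int *out)
{
  for (unsigned i = 0; i < VS; i++)
    out[i] = rt + i < VS ? v1[rt + i] : v2[rt + i - VS];
}

int
main (void)
{
  static int a[N + VS];
  for (unsigned i = 0; i < N + VS; i++)
    a[i] = i;

  unsigned start = 3;			/* Misaligned by 3 elements.  */
  unsigned rt = start % VS;		/* "Realignment token", computed once.  */
  const int *p = a + (start - rt);	/* First aligned address at or below.  */

  int v1[VS], v2[VS], va[VS];
  for (unsigned i = 0; i < VS; i++)
    v1[i] = p[i];			/* First aligned load, done in the preheader.  */

  long sum = 0;
  for (unsigned i = 0; i < N; i += VS)
    {
      for (unsigned j = 0; j < VS; j++)
	v2[j] = p[i + VS + j];		/* Next aligned load.  */
      realign_load (v1, v2, rt, va);	/* Misaligned vector a[start + i ..].  */
      for (unsigned j = 0; j < VS; j++)
	sum += va[j];
      for (unsigned j = 0; j < VS; j++)
	v1[j] = v2[j];			/* Software pipelining: reuse the load.  */
    }
  printf ("%ld\n", sum);		/* Sum of a[3 .. 34].  */
  return 0;
}
#endif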