1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 #include "cgraph.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs.  */
58 #include "expr.h"
59 #include "optabs.h"
60 
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
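
/* Illustrative note (hypothetical target/mode numbers): for a VECTYPE with
   mode V4SImode (128 bits) and COUNT == 3 the required array mode is a
   384-bit integer mode.  If the target does not report such arrays via
   TARGET_ARRAY_MODE_SUPPORTED_P, mode_for_size is queried with a size limit
   and will typically return BLKmode; even with a suitable array mode the
   corresponding vec_load_lanes / vec_store_lanes pattern must be provided
   (as NEON-style targets do for their ld3/st3 instructions) for this
   predicate to return true.  */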
63 
64 static bool
65 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
66 			      tree vectype, unsigned HOST_WIDE_INT count)
67 {
68   enum machine_mode mode, array_mode;
69   bool limit_p;
70 
71   mode = TYPE_MODE (vectype);
72   limit_p = !targetm.array_mode_supported_p (mode, count);
73   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
74 			      MODE_INT, limit_p);
75 
76   if (array_mode == BLKmode)
77     {
78       if (dump_enabled_p ())
79 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
80                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
81                          GET_MODE_NAME (mode), count);
82       return false;
83     }
84 
85   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86     {
87       if (dump_enabled_p ())
88 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
89                          "cannot use %s<%s><%s>\n", name,
90                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
91       return false;
92     }
93 
94   if (dump_enabled_p ())
95     dump_printf_loc (MSG_NOTE, vect_location,
96                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
97                      GET_MODE_NAME (mode));
98 
99   return true;
100 }
101 
102 
103 /* Return the smallest scalar part of STMT.
104    This is used to determine the vectype of the stmt.  We generally set the
105    vectype according to the type of the result (lhs).  For stmts whose
106    result-type is different than the type of the arguments (e.g., demotion,
107    promotion), vectype will be reset appropriately (later).  Note that we have
108    to visit the smallest datatype in this function, because that determines the
109    VF.  If the smallest datatype in the loop is present only as the rhs of a
110    promotion operation - we'd miss it.
111    Such a case, where a variable of this datatype does not appear in the lhs
112    anywhere in the loop, can only occur if it's an invariant: e.g.:
113    'int_x = (int) short_inv', which we'd expect to have been optimized away by
114    invariant motion.  However, we cannot rely on invariant motion to always
115    take invariants out of the loop, and so in the case of promotion we also
116    have to check the rhs.
117    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
118    types.  */
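
/* Illustrative example (hypothetical loop body):

       short_a = *p_short;
       int_b   = (int) short_a;
       *q_int  = int_b + int_c;

   The results that matter for the vectype are of type 'int', but the
   smallest scalar type involved is 'short' (the rhs of the promotion), so
   with 128-bit vectors the VF is based on 8 shorts per vector rather than
   4 ints.  */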
119 
120 tree
121 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
122                                HOST_WIDE_INT *rhs_size_unit)
123 {
124   tree scalar_type = gimple_expr_type (stmt);
125   HOST_WIDE_INT lhs, rhs;
126 
127   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
128 
129   if (is_gimple_assign (stmt)
130       && (gimple_assign_cast_p (stmt)
131           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
134     {
135       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
136 
137       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138       if (rhs < lhs)
139         scalar_type = rhs_type;
140     }
141 
142   *lhs_size_unit = lhs;
143   *rhs_size_unit = rhs;
144   return scalar_type;
145 }
146 
147 
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149    tested at run-time.  Return TRUE if DDR was successfully inserted.
150    Return false if versioning is not supported.  */
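
/* Sketch of what marking a DDR here eventually enables (the actual
   condition is built later, by vect_create_cond_for_alias_checks):

       if (addr_A + segment_len_A <= addr_B
           || addr_B + segment_len_B <= addr_A)
         ... vectorized loop ...
       else
         ... scalar loop ...

   where the (purely illustrative) segment lengths cover the bytes the
   vector loop will access through each data reference.  */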
151 
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
154 {
155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
156 
157   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158     return false;
159 
160   if (dump_enabled_p ())
161     {
162       dump_printf_loc (MSG_NOTE, vect_location,
163                        "mark for run-time aliasing test between ");
164       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
165       dump_printf (MSG_NOTE,  " and ");
166       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
167       dump_printf (MSG_NOTE, "\n");
168     }
169 
170   if (optimize_loop_nest_for_size_p (loop))
171     {
172       if (dump_enabled_p ())
173 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
174                          "versioning not supported when optimizing"
175                          " for size.\n");
176       return false;
177     }
178 
179   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
180   if (loop->inner)
181     {
182       if (dump_enabled_p ())
183 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
184                          "versioning not yet supported for outer-loops.\n");
185       return false;
186     }
187 
188   /* FORNOW: We don't support creating runtime alias tests for non-constant
189      step.  */
190   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
191       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
192     {
193       if (dump_enabled_p ())
194 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
195                          "versioning not yet supported for non-constant "
196                          "step\n");
197       return false;
198     }
199 
200   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201   return true;
202 }
203 
204 
205 /* Function vect_analyze_data_ref_dependence.
206 
207    Return TRUE if there (might) exist a dependence between a memory-reference
208    DRA and a memory-reference DRB that prohibits vectorization.  Return FALSE
209    if the dependence can instead be checked at run time by versioning for
210    alias.  Adjust *MAX_VF according to the data dependence.  */
211 
212 static bool
213 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
214                                   loop_vec_info loop_vinfo, int *max_vf)
215 {
216   unsigned int i;
217   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
218   struct data_reference *dra = DDR_A (ddr);
219   struct data_reference *drb = DDR_B (ddr);
220   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
221   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
222   lambda_vector dist_v;
223   unsigned int loop_depth;
224 
225   /* In loop analysis all data references should be vectorizable.  */
226   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
227       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
228     gcc_unreachable ();
229 
230   /* Independent data accesses.  */
231   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
232     return false;
233 
234   if (dra == drb
235       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
236     return false;
237 
238   /* Even if we have an anti-dependence then, as the vectorized loop covers at
239      least two scalar iterations, there is always also a true dependence.
240      As the vectorizer does not re-order loads and stores we can ignore
241      the anti-dependence if TBAA can disambiguate both DRs similar to the
242      case with known negative distance anti-dependences (positive
243      distance anti-dependences would violate TBAA constraints).  */
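
  /* Purely illustrative case: with 'int *p' and 'float *q' that cannot be
     proven distinct by the dependence analysis,

        ... = *q;    <- read,  alias set of 'float'
        *p  = ...;   <- write, alias set of 'int'

     the alias sets do not conflict, so under TBAA the references cannot
     overlap and the anti-dependence can safely be ignored.  */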
244   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
245        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
246       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
247 				 get_alias_set (DR_REF (drb))))
248     return false;
249 
250   /* Unknown data dependence.  */
251   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252     {
253       /* If user asserted safelen consecutive iterations can be
254 	 executed concurrently, assume independence.  */
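
      /* E.g. a loop annotated with '#pragma omp simd safelen(8)' promises
	 that up to 8 consecutive iterations may execute concurrently, so
	 an unprovable dependence can be ignored as long as the
	 vectorization factor does not exceed 8.  */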
255       if (loop->safelen >= 2)
256 	{
257 	  if (loop->safelen < *max_vf)
258 	    *max_vf = loop->safelen;
259 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 	  return false;
261 	}
262 
263       if (STMT_VINFO_GATHER_P (stmtinfo_a)
264 	  || STMT_VINFO_GATHER_P (stmtinfo_b))
265 	{
266 	  if (dump_enabled_p ())
267 	    {
268 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
269 			       "versioning for alias not supported for: "
270 			       "can't determine dependence between ");
271 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
272 				 DR_REF (dra));
273 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
274 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
275 				 DR_REF (drb));
276 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 	    }
278 	  return true;
279 	}
280 
281       if (dump_enabled_p ())
282 	{
283 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
284 			   "versioning for alias required: "
285 			   "can't determine dependence between ");
286 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 			     DR_REF (dra));
288 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
289 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 			     DR_REF (drb));
291 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
292 	}
293 
294       /* Add to list of ddrs that need to be tested at run-time.  */
295       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
296     }
297 
298   /* Known data dependence.  */
299   if (DDR_NUM_DIST_VECTS (ddr) == 0)
300     {
301       /* If user asserted safelen consecutive iterations can be
302 	 executed concurrently, assume independence.  */
303       if (loop->safelen >= 2)
304 	{
305 	  if (loop->safelen < *max_vf)
306 	    *max_vf = loop->safelen;
307 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
308 	  return false;
309 	}
310 
311       if (STMT_VINFO_GATHER_P (stmtinfo_a)
312 	  || STMT_VINFO_GATHER_P (stmtinfo_b))
313 	{
314 	  if (dump_enabled_p ())
315 	    {
316 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 			       "versioning for alias not supported for: "
318 			       "bad dist vector for ");
319 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 				 DR_REF (dra));
321 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 				 DR_REF (drb));
324 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 	    }
326 	  return true;
327 	}
328 
329       if (dump_enabled_p ())
330         {
331           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
332                            "versioning for alias required: "
333                            "bad dist vector for ");
334           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
335           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
336           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
337           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338         }
339       /* Add to list of ddrs that need to be tested at run-time.  */
340       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
341     }
342 
343   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
344   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
345     {
346       int dist = dist_v[loop_depth];
347 
348       if (dump_enabled_p ())
349 	dump_printf_loc (MSG_NOTE, vect_location,
350                          "dependence distance  = %d.\n", dist);
351 
352       if (dist == 0)
353 	{
354 	  if (dump_enabled_p ())
355 	    {
356 	      dump_printf_loc (MSG_NOTE, vect_location,
357 	                       "dependence distance == 0 between ");
358 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
359 	      dump_printf (MSG_NOTE, " and ");
360 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
361 	      dump_printf (MSG_NOTE, "\n");
362 	    }
363 
364 	  /* When we perform grouped accesses and perform implicit CSE
365 	     by detecting equal accesses and doing disambiguation with
366 	     runtime alias tests like for
367 	        .. = a[i];
368 		.. = a[i+1];
369 		a[i] = ..;
370 		a[i+1] = ..;
371 		*p = ..;
372 		.. = a[i];
373 		.. = a[i+1];
374 	     where we will end up loading { a[i], a[i+1] } once, make
375 	     sure that inserting group loads before the first load and
376 	     stores after the last store will do the right thing.
377 	     Similar for groups like
378 	        a[i] = ...;
379 		... = a[i];
380 		a[i+1] = ...;
381 	     where loads from the group interleave with the store.  */
382 	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
383 	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 	    {
385 	      gimple earlier_stmt;
386 	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
387 	      if (DR_IS_WRITE
388 		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 		{
390 		  if (dump_enabled_p ())
391 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 				     "READ_WRITE dependence in interleaving."
393 				     "\n");
394 		  return true;
395 		}
396 	    }
397 
398 	  continue;
399 	}
400 
401       if (dist > 0 && DDR_REVERSED_P (ddr))
402 	{
403 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
404 	     reversed (to make distance vector positive), and the actual
405 	     distance is negative.  */
406 	  if (dump_enabled_p ())
407 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 	                     "dependence distance negative.\n");
409 	  /* Record a negative dependence distance to later limit the
410 	     amount of stmt copying / unrolling we can perform.
411 	     Only need to handle read-after-write dependence.  */
412 	  if (DR_IS_READ (drb)
413 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
414 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
415 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
416 	  continue;
417 	}
418 
419       if (abs (dist) >= 2
420 	  && abs (dist) < *max_vf)
421 	{
422 	  /* The dependence distance requires reduction of the maximal
423 	     vectorization factor.  */
424 	  *max_vf = abs (dist);
425 	  if (dump_enabled_p ())
426 	    dump_printf_loc (MSG_NOTE, vect_location,
427 	                     "adjusting maximal vectorization factor to %i\n",
428 	                     *max_vf);
429 	}
430 
431       if (abs (dist) >= *max_vf)
432 	{
433 	  /* Dependence distance does not create dependence, as far as
434 	     vectorization is concerned, in this case.  */
435 	  if (dump_enabled_p ())
436 	    dump_printf_loc (MSG_NOTE, vect_location,
437 	                     "dependence distance >= VF.\n");
438 	  continue;
439 	}
440 
441       if (dump_enabled_p ())
442 	{
443 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 	               "not vectorized, possible dependence "
445 	               "between data-refs ");
446 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
447 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
448 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
449 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
450 	}
451 
452       return true;
453     }
454 
455   return false;
456 }
457 
458 /* Function vect_analyze_data_ref_dependences.
459 
460    Examine all the data references in the loop, and make sure there do not
461    exist any data dependences between them.  Set *MAX_VF according to
462    the maximum vectorization factor the data dependences allow.  */
463 
464 bool
465 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 {
467   unsigned int i;
468   struct data_dependence_relation *ddr;
469 
470   if (dump_enabled_p ())
471     dump_printf_loc (MSG_NOTE, vect_location,
472                      "=== vect_analyze_data_ref_dependences ===\n");
473 
474   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
475   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
476 				&LOOP_VINFO_DDRS (loop_vinfo),
477 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
478     return false;
479 
480   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
481     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
482       return false;
483 
484   return true;
485 }
486 
487 
488 /* Function vect_slp_analyze_data_ref_dependence.
489 
490    Return TRUE if there (might) exist a dependence between a memory-reference
491    DRA and a memory-reference DRB that prohibits basic-block (SLP)
492    vectorization.  Unlike the loop variant, no run-time alias versioning is
493    used here and there is no MAX_VF to adjust.  */
494 
495 static bool
496 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
497 {
498   struct data_reference *dra = DDR_A (ddr);
499   struct data_reference *drb = DDR_B (ddr);
500 
501   /* We need to check dependences of statements marked as unvectorizable
502      as well; they can still prohibit vectorization.  */
503 
504   /* Independent data accesses.  */
505   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
506     return false;
507 
508   if (dra == drb)
509     return false;
510 
511   /* Read-read is OK.  */
512   if (DR_IS_READ (dra) && DR_IS_READ (drb))
513     return false;
514 
515   /* If dra and drb are part of the same interleaving chain consider
516      them independent.  */
517   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
518       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
519 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
520     return false;
521 
522   /* Unknown data dependence.  */
523   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
524     {
525       if  (dump_enabled_p ())
526 	{
527 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528 			   "can't determine dependence between ");
529 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
530 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
531 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
532 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
533 	}
534     }
535   else if (dump_enabled_p ())
536     {
537       dump_printf_loc (MSG_NOTE, vect_location,
538 		       "determined dependence between ");
539       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
540       dump_printf (MSG_NOTE, " and ");
541       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
542       dump_printf (MSG_NOTE,  "\n");
543     }
544 
545   /* We do not vectorize basic blocks with write-write dependencies.  */
546   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
547     return true;
548 
549   /* If we have a read-write dependence, check that the load is before the
550      store.  When we vectorize basic blocks, the vector load can only be placed
551      before the corresponding scalar load, and the vector store only after its
552      corresponding scalar store.  So the order of the accesses is preserved in
553      case the load is before the store.  */
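
  /* Illustrative basic-block fragment (hypothetical):

	x = a[0];     <- earlier scalar load
	...
	a[0] = y;     <- later scalar store

     The vector load replacing the load group is emitted no later than the
     first scalar load, and the vector store no earlier than the last scalar
     store, so the load still observes the value from before the store.  */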
554   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
555   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
556     {
557       /* That only holds for load-store pairs taking part in vectorization.  */
558       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
559 	  && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
560 	return false;
561     }
562 
563   return true;
564 }
565 
566 
567 /* Function vect_slp_analyze_data_ref_dependences.
568 
569    Examine all the data references in the basic-block, and make sure there
570    do not exist any data dependences between them.  Return FALSE as soon as
571    a dependence that prohibits SLP vectorization is found.  */
572 
573 bool
574 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
575 {
576   struct data_dependence_relation *ddr;
577   unsigned int i;
578 
579   if (dump_enabled_p ())
580     dump_printf_loc (MSG_NOTE, vect_location,
581                      "=== vect_slp_analyze_data_ref_dependences ===\n");
582 
583   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
584 				&BB_VINFO_DDRS (bb_vinfo),
585 				vNULL, true))
586     return false;
587 
588   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
589     if (vect_slp_analyze_data_ref_dependence (ddr))
590       return false;
591 
592   return true;
593 }
594 
595 
596 /* Function vect_compute_data_ref_alignment
597 
598    Compute the misalignment of the data reference DR.
599 
600    Output:
601    1. If during the misalignment computation it is found that the data reference
602       cannot be vectorized then false is returned.
603    2. DR_MISALIGNMENT (DR) is defined.
604 
605    FOR NOW: No analysis is actually performed. Misalignment is calculated
606    only for trivial cases. TODO.  */
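
/* Small worked example (assuming a 16-byte vector alignment requirement and
   a 16-byte aligned base): for a reference 'a[i+1]' with 4-byte elements,
   DR_INIT is 4 and DR_MISALIGNMENT becomes 4; for 'a[i]' it would be 0,
   i.e. the access is considered aligned.  */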
607 
608 static bool
609 vect_compute_data_ref_alignment (struct data_reference *dr)
610 {
611   gimple stmt = DR_STMT (dr);
612   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
613   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
614   struct loop *loop = NULL;
615   tree ref = DR_REF (dr);
616   tree vectype;
617   tree base, base_addr;
618   bool base_aligned;
619   tree misalign;
620   tree aligned_to, alignment;
621 
622   if (dump_enabled_p ())
623     dump_printf_loc (MSG_NOTE, vect_location,
624                      "vect_compute_data_ref_alignment:\n");
625 
626   if (loop_vinfo)
627     loop = LOOP_VINFO_LOOP (loop_vinfo);
628 
629   /* Initialize misalignment to unknown.  */
630   SET_DR_MISALIGNMENT (dr, -1);
631 
632   /* Strided loads perform only component accesses, misalignment information
633      is irrelevant for them.  */
634   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
635     return true;
636 
637   misalign = DR_INIT (dr);
638   aligned_to = DR_ALIGNED_TO (dr);
639   base_addr = DR_BASE_ADDRESS (dr);
640   vectype = STMT_VINFO_VECTYPE (stmt_info);
641 
642   /* In case the dataref is in an inner-loop of the loop that is being
643      vectorized (LOOP), we use the base and misalignment information
644      relative to the outer-loop (LOOP).  This is ok only if the misalignment
645      stays the same throughout the execution of the inner-loop, which is why
646      we have to check that the stride of the dataref in the inner-loop is a
647      multiple of the vector size.  */
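
  /* For example (hypothetical layout, 16-byte vectors): with
     'double a[n][8]', an outer loop over 'i' being vectorized and an inner
     loop over 'j' accessing a[j][i], the inner-loop step is the row size,
     8 * 8 = 64 bytes.  Since 64 % 16 == 0 the misalignment of a[j][i] is
     the same for every 'j', and the outer-loop-relative information can be
     used.  */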
648   if (loop && nested_in_vect_loop_p (loop, stmt))
649     {
650       tree step = DR_STEP (dr);
651       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
652 
653       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
654         {
655           if (dump_enabled_p ())
656             dump_printf_loc (MSG_NOTE, vect_location,
657                              "inner step divides the vector-size.\n");
658 	  misalign = STMT_VINFO_DR_INIT (stmt_info);
659 	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
660 	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
661         }
662       else
663 	{
664 	  if (dump_enabled_p ())
665 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 	                     "inner step doesn't divide the vector-size.\n");
667 	  misalign = NULL_TREE;
668 	}
669     }
670 
671   /* Similarly, if we're doing basic-block vectorization, we can only use
672      base and misalignment information relative to an innermost loop if the
673      misalignment stays the same throughout the execution of the loop.
674      As above, this is the case if the stride of the dataref is a multiple of
675      the vector size.  */
676   if (!loop)
677     {
678       tree step = DR_STEP (dr);
679       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
680 
681       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
682 	{
683 	  if (dump_enabled_p ())
684 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
685 	                     "SLP: step doesn't divide the vector-size.\n");
686 	  misalign = NULL_TREE;
687 	}
688     }
689 
690   base = build_fold_indirect_ref (base_addr);
691   alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
692 
693   if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
694       || !misalign)
695     {
696       if (dump_enabled_p ())
697 	{
698 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
699 	                   "Unknown alignment for access: ");
700 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
701 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
702 	}
703       return true;
704     }
705 
706   if ((DECL_P (base)
707        && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
708 				alignment) >= 0)
709       || (TREE_CODE (base_addr) == SSA_NAME
710 	  && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
711 						      TREE_TYPE (base_addr)))),
712 				   alignment) >= 0)
713       || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
714     base_aligned = true;
715   else
716     base_aligned = false;
717 
718   if (!base_aligned)
719     {
720       /* Do not change the alignment of global variables here if
721 	 flag_section_anchors is enabled as we already generated
722 	 RTL for other functions.  Most global variables should
723 	 have been aligned during the IPA increase_alignment pass.  */
724       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
725 	  || (TREE_STATIC (base) && flag_section_anchors))
726 	{
727 	  if (dump_enabled_p ())
728 	    {
729 	      dump_printf_loc (MSG_NOTE, vect_location,
730 	                       "can't force alignment of ref: ");
731 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
732 	      dump_printf (MSG_NOTE, "\n");
733 	    }
734 	  return true;
735 	}
736 
737       /* Force the alignment of the decl.
738 	 NOTE: This is the only change to the code we make during
739 	 the analysis phase, before deciding to vectorize the loop.  */
740       if (dump_enabled_p ())
741         {
742           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
743           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
744           dump_printf (MSG_NOTE, "\n");
745         }
746 
747       ((dataref_aux *)dr->aux)->base_decl = base;
748       ((dataref_aux *)dr->aux)->base_misaligned = true;
749     }
750 
751   /* If this is a backward running DR then first access in the larger
752      vectype actually is N-1 elements before the address in the DR.
753      Adjust misalign accordingly.  */
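
  /* Sketch with illustrative numbers: for a V4SI access with
     DR_STEP == -4, OFFSET below is (4 - 1) * -4 = -12 bytes, i.e. the
     vector also touches the 3 elements before the recorded address;
     adding it (PLUS, as the step is negative) makes MISALIGN describe the
     lowest address actually accessed.  */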
754   if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
755     {
756       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
757       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
758 	 otherwise we wouldn't be here.  */
759       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
760       /* PLUS because DR_STEP was negative.  */
761       misalign = size_binop (PLUS_EXPR, misalign, offset);
762     }
763 
764   /* Modulo alignment.  */
765   misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
766 
767   if (!tree_fits_uhwi_p (misalign))
768     {
769       /* Negative or overflowed misalignment value.  */
770       if (dump_enabled_p ())
771 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 	                 "unexpected misalign value\n");
773       return false;
774     }
775 
776   SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
777 
778   if (dump_enabled_p ())
779     {
780       dump_printf_loc (MSG_NOTE, vect_location,
781                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
782       dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
783       dump_printf (MSG_NOTE, "\n");
784     }
785 
786   return true;
787 }
788 
789 
790 /* Function vect_compute_data_refs_alignment
791 
792    Compute the misalignment of data references in the loop.
793    Return FALSE if a data reference is found that cannot be vectorized.  */
794 
795 static bool
796 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
797                                   bb_vec_info bb_vinfo)
798 {
799   vec<data_reference_p> datarefs;
800   struct data_reference *dr;
801   unsigned int i;
802 
803   if (loop_vinfo)
804     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
805   else
806     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
807 
808   FOR_EACH_VEC_ELT (datarefs, i, dr)
809     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
810         && !vect_compute_data_ref_alignment (dr))
811       {
812         if (bb_vinfo)
813           {
814             /* Mark unsupported statement as unvectorizable.  */
815             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
816             continue;
817           }
818         else
819           return false;
820       }
821 
822   return true;
823 }
824 
825 
826 /* Function vect_update_misalignment_for_peel
827 
828    DR - the data reference whose misalignment is to be adjusted.
829    DR_PEEL - the data reference whose misalignment is being made
830              zero in the vector loop by the peel.
831    NPEEL - the number of iterations in the peel loop if the misalignment
832            of DR_PEEL is known at compile time.  */
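
/* Worked example (illustrative numbers): with 4-byte elements, 16-byte
   vector alignment, DR_MISALIGNMENT (DR) == 8 and NPEEL == 3 for a forward
   (positive step) access, the updated misalignment is (8 + 3 * 4) & 15 == 4.
   DR_PEEL itself becomes aligned, since the peel count was chosen for it.  */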
833 
834 static void
835 vect_update_misalignment_for_peel (struct data_reference *dr,
836                                    struct data_reference *dr_peel, int npeel)
837 {
838   unsigned int i;
839   vec<dr_p> same_align_drs;
840   struct data_reference *current_dr;
841   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
842   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
843   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
844   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
845 
846  /* For interleaved data accesses the step in the loop must be multiplied by
847      the size of the interleaving group.  */
848   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
849     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
850   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
851     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
852 
853   /* It can be assumed that the data refs with the same alignment as dr_peel
854      are aligned in the vector loop.  */
855   same_align_drs
856     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
857   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
858     {
859       if (current_dr != dr)
860         continue;
861       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
862                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
863       SET_DR_MISALIGNMENT (dr, 0);
864       return;
865     }
866 
867   if (known_alignment_for_access_p (dr)
868       && known_alignment_for_access_p (dr_peel))
869     {
870       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
871       int misal = DR_MISALIGNMENT (dr);
872       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
873       misal += negative ? -npeel * dr_size : npeel * dr_size;
874       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
875       SET_DR_MISALIGNMENT (dr, misal);
876       return;
877     }
878 
879   if (dump_enabled_p ())
880     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
881   SET_DR_MISALIGNMENT (dr, -1);
882 }
883 
884 
885 /* Function vect_verify_datarefs_alignment
886 
887    Return TRUE if all data references in the loop can be
888    handled with respect to alignment.  */
889 
890 bool
891 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
892 {
893   vec<data_reference_p> datarefs;
894   struct data_reference *dr;
895   enum dr_alignment_support supportable_dr_alignment;
896   unsigned int i;
897 
898   if (loop_vinfo)
899     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
900   else
901     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
902 
903   FOR_EACH_VEC_ELT (datarefs, i, dr)
904     {
905       gimple stmt = DR_STMT (dr);
906       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
907 
908       if (!STMT_VINFO_RELEVANT_P (stmt_info))
909 	continue;
910 
911       /* For interleaving, only the alignment of the first access matters.
912          Skip statements marked as not vectorizable.  */
913       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
914            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
915           || !STMT_VINFO_VECTORIZABLE (stmt_info))
916         continue;
917 
918       /* Strided loads perform only component accesses, alignment is
919 	 irrelevant for them.  */
920       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
921 	continue;
922 
923       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
924       if (!supportable_dr_alignment)
925         {
926           if (dump_enabled_p ())
927             {
928               if (DR_IS_READ (dr))
929                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930                                  "not vectorized: unsupported unaligned load.");
931               else
932                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
933                                  "not vectorized: unsupported unaligned "
934                                  "store.");
935 
936               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
937                                  DR_REF (dr));
938               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
939             }
940           return false;
941         }
942       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
943         dump_printf_loc (MSG_NOTE, vect_location,
944                          "Vectorizing an unaligned access.\n");
945     }
946   return true;
947 }
948 
949 /* Given a memory reference EXP, return whether its alignment is less
950    than its size.  */
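
/* Illustrative case: a 4-byte 'int' member of a struct declared with
   __attribute__((packed)) may only be guaranteed 1-byte (8-bit) alignment
   by get_object_alignment, which is less than its 32-bit size, so such a
   reference is treated as potentially packed.  */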
951 
952 static bool
953 not_size_aligned (tree exp)
954 {
955   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
956     return true;
957 
958   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
959 	  > get_object_alignment (exp));
960 }
961 
962 /* Function vector_alignment_reachable_p
963 
964    Return true if vector alignment for DR is reachable by peeling
965    a few loop iterations.  Return false otherwise.  */
966 
967 static bool
968 vector_alignment_reachable_p (struct data_reference *dr)
969 {
970   gimple stmt = DR_STMT (dr);
971   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
972   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
973 
974   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
975     {
976       /* For interleaved accesses we peel only if the number of iterations
977 	 in the prologue loop (VF - misalignment, counted in elements) is a
978 	 multiple of the interleaving group size.  */
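
      /* E.g. (hypothetical numbers): with 8 elements per vector and a
	 misalignment of 2 elements, the prologue would peel 8 - 2 = 6
	 scalar iterations.  A group size of 3 is fine (6 % 3 == 0), but a
	 group size of 4 would be split by the peel, so peeling is rejected
	 below.  */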
979       int elem_size, mis_in_elements;
980       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
981 
982       /* FORNOW: handle only known alignment.  */
983       if (!known_alignment_for_access_p (dr))
984 	return false;
985 
986       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
987       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
988 
989       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
990 	return false;
991     }
992 
993   /* If the misalignment is known at compile time then allow peeling
994      only if natural alignment is reachable through peeling.  */
995   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
996     {
997       HOST_WIDE_INT elmsize =
998 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
999       if (dump_enabled_p ())
1000 	{
1001 	  dump_printf_loc (MSG_NOTE, vect_location,
1002 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1003 	  dump_printf (MSG_NOTE,
1004 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1005 	}
1006       if (DR_MISALIGNMENT (dr) % elmsize)
1007 	{
1008 	  if (dump_enabled_p ())
1009 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 	                     "data size does not divide the misalignment.\n");
1011 	  return false;
1012 	}
1013     }
1014 
1015   if (!known_alignment_for_access_p (dr))
1016     {
1017       tree type = TREE_TYPE (DR_REF (dr));
1018       bool is_packed = not_size_aligned (DR_REF (dr));
1019       if (dump_enabled_p ())
1020 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 	                 "Unknown misalignment, is_packed = %d\n",is_packed);
1022       if ((TYPE_USER_ALIGN (type) && !is_packed)
1023 	  || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1024 	return true;
1025       else
1026 	return false;
1027     }
1028 
1029   return true;
1030 }
1031 
1032 
1033 /* Calculate the cost of the memory access represented by DR.  */
1034 
1035 static void
1036 vect_get_data_access_cost (struct data_reference *dr,
1037                            unsigned int *inside_cost,
1038                            unsigned int *outside_cost,
1039 			   stmt_vector_for_cost *body_cost_vec)
1040 {
1041   gimple stmt = DR_STMT (dr);
1042   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1043   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1044   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1045   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1046   int ncopies = vf / nunits;
1047 
1048   if (DR_IS_READ (dr))
1049     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1050 			NULL, body_cost_vec, false);
1051   else
1052     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1053 
1054   if (dump_enabled_p ())
1055     dump_printf_loc (MSG_NOTE, vect_location,
1056                      "vect_get_data_access_cost: inside_cost = %d, "
1057                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1058 }
1059 
1060 
1061 /* Insert DR into peeling hash table with NPEEL as key.  */
1062 
1063 static void
1064 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1065                           int npeel)
1066 {
1067   struct _vect_peel_info elem, *slot;
1068   _vect_peel_info **new_slot;
1069   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1070 
1071   elem.npeel = npeel;
1072   slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1073   if (slot)
1074     slot->count++;
1075   else
1076     {
1077       slot = XNEW (struct _vect_peel_info);
1078       slot->npeel = npeel;
1079       slot->dr = dr;
1080       slot->count = 1;
1081       new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1082       *new_slot = slot;
1083     }
1084 
1085   if (!supportable_dr_alignment
1086       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1087     slot->count += VECT_MAX_COST;
1088 }
1089 
1090 
1091 /* Traverse peeling hash table to find peeling option that aligns maximum
1092    number of data accesses.  */
1093 
1094 int
1095 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1096 				     _vect_peel_extended_info *max)
1097 {
1098   vect_peel_info elem = *slot;
1099 
1100   if (elem->count > max->peel_info.count
1101       || (elem->count == max->peel_info.count
1102           && max->peel_info.npeel > elem->npeel))
1103     {
1104       max->peel_info.npeel = elem->npeel;
1105       max->peel_info.count = elem->count;
1106       max->peel_info.dr = elem->dr;
1107     }
1108 
1109   return 1;
1110 }
1111 
1112 
1113 /* Traverse peeling hash table and calculate cost for each peeling option.
1114    Find the one with the lowest cost.  */
1115 
1116 int
1117 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1118 				   _vect_peel_extended_info *min)
1119 {
1120   vect_peel_info elem = *slot;
1121   int save_misalignment, dummy;
1122   unsigned int inside_cost = 0, outside_cost = 0, i;
1123   gimple stmt = DR_STMT (elem->dr);
1124   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1125   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1126   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1127   struct data_reference *dr;
1128   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1129   int single_iter_cost;
1130 
1131   prologue_cost_vec.create (2);
1132   body_cost_vec.create (2);
1133   epilogue_cost_vec.create (2);
1134 
1135   FOR_EACH_VEC_ELT (datarefs, i, dr)
1136     {
1137       stmt = DR_STMT (dr);
1138       stmt_info = vinfo_for_stmt (stmt);
1139       /* For interleaving, only the alignment of the first access
1140          matters.  */
1141       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1142           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1143         continue;
1144 
1145       save_misalignment = DR_MISALIGNMENT (dr);
1146       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1147       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1148 				 &body_cost_vec);
1149       SET_DR_MISALIGNMENT (dr, save_misalignment);
1150     }
1151 
1152   single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1153   outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1154 					       &dummy, single_iter_cost,
1155 					       &prologue_cost_vec,
1156 					       &epilogue_cost_vec);
1157 
1158   /* Prologue and epilogue costs are added to the target model later.
1159      These costs depend only on the scalar iteration cost, the
1160      number of peeling iterations finally chosen, and the number of
1161      misaligned statements.  So discard the information found here.  */
1162   prologue_cost_vec.release ();
1163   epilogue_cost_vec.release ();
1164 
1165   if (inside_cost < min->inside_cost
1166       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1167     {
1168       min->inside_cost = inside_cost;
1169       min->outside_cost = outside_cost;
1170       min->body_cost_vec.release ();
1171       min->body_cost_vec = body_cost_vec;
1172       min->peel_info.dr = elem->dr;
1173       min->peel_info.npeel = elem->npeel;
1174     }
1175   else
1176     body_cost_vec.release ();
1177 
1178   return 1;
1179 }
1180 
1181 
1182 /* Choose best peeling option by traversing peeling hash table and either
1183    choosing an option with the lowest cost (if cost model is enabled) or the
1184    option that aligns as many accesses as possible.  */
1185 
1186 static struct data_reference *
1187 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1188                                        unsigned int *npeel,
1189 				       stmt_vector_for_cost *body_cost_vec)
1190 {
1191    struct _vect_peel_extended_info res;
1192 
1193    res.peel_info.dr = NULL;
1194    res.body_cost_vec = stmt_vector_for_cost ();
1195 
1196    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1197      {
1198        res.inside_cost = INT_MAX;
1199        res.outside_cost = INT_MAX;
1200        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1201            .traverse <_vect_peel_extended_info *,
1202                       vect_peeling_hash_get_lowest_cost> (&res);
1203      }
1204    else
1205      {
1206        res.peel_info.count = 0;
1207        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1208            .traverse <_vect_peel_extended_info *,
1209                       vect_peeling_hash_get_most_frequent> (&res);
1210      }
1211 
1212    *npeel = res.peel_info.npeel;
1213    *body_cost_vec = res.body_cost_vec;
1214    return res.peel_info.dr;
1215 }
1216 
1217 
1218 /* Function vect_enhance_data_refs_alignment
1219 
1220    This pass will use loop versioning and loop peeling in order to enhance
1221    the alignment of data references in the loop.
1222 
1223    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1224    original loop is to be vectorized.  Any other loops that are created by
1225    the transformations performed in this pass are not supposed to be
1226    vectorized.  This restriction will be relaxed.
1227 
1228    This pass will require a cost model to guide it whether to apply peeling
1229    or versioning or a combination of the two.  For example, the scheme that
1230    Intel uses when given a loop with several memory accesses is as follows:
1231    choose one memory access ('p') whose alignment you want to force by doing
1232    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1233    other accesses are not necessarily aligned, or (2) use loop versioning to
1234    generate one loop in which all accesses are aligned, and another loop in
1235    which only 'p' is necessarily aligned.
1236 
1237    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1238    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1239    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1240 
1241    Devising a cost model is the most critical aspect of this work.  It will
1242    guide us on which access to peel for, whether to use loop versioning, how
1243    many versions to create, etc.  The cost model will probably consist of
1244    generic considerations as well as target specific considerations (on
1245    powerpc for example, misaligned stores are more painful than misaligned
1246    loads).
1247 
1248    Here are the general steps involved in alignment enhancements:
1249 
1250      -- original loop, before alignment analysis:
1251 	for (i=0; i<N; i++){
1252 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1253 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1254 	}
1255 
1256      -- After vect_compute_data_refs_alignment:
1257 	for (i=0; i<N; i++){
1258 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1259 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1260 	}
1261 
1262      -- Possibility 1: we do loop versioning:
1263      if (p is aligned) {
1264 	for (i=0; i<N; i++){	# loop 1A
1265 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1266 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1267 	}
1268      }
1269      else {
1270 	for (i=0; i<N; i++){	# loop 1B
1271 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1272 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1273 	}
1274      }
1275 
1276      -- Possibility 2: we do loop peeling:
1277      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1278 	x = q[i];
1279 	p[i] = y;
1280      }
1281      for (i = 3; i < N; i++){	# loop 2A
1282 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1283 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1284      }
1285 
1286      -- Possibility 3: combination of loop peeling and versioning:
1287      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1288 	x = q[i];
1289 	p[i] = y;
1290      }
1291      if (p is aligned) {
1292 	for (i = 3; i<N; i++){	# loop 3A
1293 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1294 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1295 	}
1296      }
1297      else {
1298 	for (i = 3; i<N; i++){	# loop 3B
1299 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1300 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1301 	}
1302      }
1303 
1304      These loops are later passed to loop_transform to be vectorized.  The
1305      vectorizer will use the alignment information to guide the transformation
1306      (whether to generate regular loads/stores, or with special handling for
1307      misalignment).  */
1308 
1309 bool
1310 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1311 {
1312   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1313   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1314   enum dr_alignment_support supportable_dr_alignment;
1315   struct data_reference *dr0 = NULL, *first_store = NULL;
1316   struct data_reference *dr;
1317   unsigned int i, j;
1318   bool do_peeling = false;
1319   bool do_versioning = false;
1320   bool stat;
1321   gimple stmt;
1322   stmt_vec_info stmt_info;
1323   unsigned int npeel = 0;
1324   bool all_misalignments_unknown = true;
1325   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1326   unsigned possible_npeel_number = 1;
1327   tree vectype;
1328   unsigned int nelements, mis, same_align_drs_max = 0;
1329   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1330 
1331   if (dump_enabled_p ())
1332     dump_printf_loc (MSG_NOTE, vect_location,
1333                      "=== vect_enhance_data_refs_alignment ===\n");
1334 
1335   /* While cost model enhancements are expected in the future, the high level
1336      view of the code at this time is as follows:
1337 
1338      A) If there is a misaligned access then see if peeling to align
1339         this access can make all data references satisfy
1340         vect_supportable_dr_alignment.  If so, update data structures
1341         as needed and return true.
1342 
1343      B) If peeling wasn't possible and there is a data reference with an
1344         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1345         then see if loop versioning checks can be used to make all data
1346         references satisfy vect_supportable_dr_alignment.  If so, update
1347         data structures as needed and return true.
1348 
1349      C) If neither peeling nor versioning were successful then return false if
1350         any data reference does not satisfy vect_supportable_dr_alignment.
1351 
1352      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1353 
1354      Note, Possibility 3 above (which is peeling and versioning together) is not
1355      being done at this time.  */
1356 
1357   /* (1) Peeling to force alignment.  */
1358 
1359   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1360      Considerations:
1361      + How many accesses will become aligned due to the peeling
1362      - How many accesses will become unaligned due to the peeling,
1363        and the cost of misaligned accesses.
1364      - The cost of peeling (the extra runtime checks, the increase
1365        in code size).  */
1366 
1367   FOR_EACH_VEC_ELT (datarefs, i, dr)
1368     {
1369       stmt = DR_STMT (dr);
1370       stmt_info = vinfo_for_stmt (stmt);
1371 
1372       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1373 	continue;
1374 
1375       /* For interleaving, only the alignment of the first access
1376          matters.  */
1377       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1378           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1379         continue;
1380 
1381       /* For invariant accesses there is nothing to enhance.  */
1382       if (integer_zerop (DR_STEP (dr)))
1383 	continue;
1384 
1385       /* Strided loads perform only component accesses, alignment is
1386 	 irrelevant for them.  */
1387       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1388 	continue;
1389 
1390       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1391       do_peeling = vector_alignment_reachable_p (dr);
1392       if (do_peeling)
1393         {
1394           if (known_alignment_for_access_p (dr))
1395             {
1396               unsigned int npeel_tmp;
1397 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1398 						    size_zero_node) < 0;
1399 
1400               /* Save info about DR in the hash table.  */
1401               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1402                 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1403 
1404               vectype = STMT_VINFO_VECTYPE (stmt_info);
1405               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1406               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1407                                                 TREE_TYPE (DR_REF (dr))));
1408               npeel_tmp = (negative
1409 			   ? (mis - nelements) : (nelements - mis))
1410 		  & (nelements - 1);
1411 
1412               /* For multiple types, it is possible that the bigger type access
1413                  will have more than one peeling option.  E.g., a loop with two
1414                  types: one of size (vector size / 4), and the other one of
1415                  size (vector size / 8).  The vectorization factor will be 8.
1416                  If both accesses are misaligned by 3, the first one needs one
1417                  scalar iteration to be aligned, and the second one needs 5.
1418                  But the first one will also be aligned by peeling 5 scalar
1419                  iterations, and in that case both accesses will be aligned.
1420                  Hence, except for the immediate peeling amount, we also want
1421                  to try adding a full vector size, as long as we don't exceed
1422                  the vectorization factor.
1423                  We do this automatically for the cost model, since we
1424                  calculate the cost for every peeling option.  */
1425               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1426                 possible_npeel_number = vf / nelements;
1427 
1428               /* Handle the aligned case. We may decide to align some other
1429                  access, making DR unaligned.  */
1430               if (DR_MISALIGNMENT (dr) == 0)
1431                 {
1432                   npeel_tmp = 0;
1433                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1434                     possible_npeel_number++;
1435                 }
1436 
1437               for (j = 0; j < possible_npeel_number; j++)
1438                 {
1439                   gcc_assert (npeel_tmp <= vf);
1440                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1441                   npeel_tmp += nelements;
1442                 }
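              /* Continuing the example from the comment above (assumed
                 values): with NELEMENTS = 4 for the bigger type, VF = 8 and
                 NPEEL_TMP = (4 - 3) & 3 = 1, POSSIBLE_NPEEL_NUMBER is
                 8 / 4 = 2, so the peeling amounts 1 and 5 are both recorded
                 for the cost model to choose from.  */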
1443 
1444               all_misalignments_unknown = false;
1445               /* Data-ref that was chosen for the case that all the
1446                  misalignments are unknown is not relevant anymore, since we
1447                  have a data-ref with known alignment.  */
1448               dr0 = NULL;
1449             }
1450           else
1451             {
1452               /* If we don't know any misalignment values, we prefer
1453                  peeling for the data-ref that has the maximum number of
1454                  data-refs with the same alignment, unless the target
1455                  prefers to align stores over loads.  */
1456               if (all_misalignments_unknown)
1457                 {
1458 		  unsigned same_align_drs
1459 		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1460                   if (!dr0
1461 		      || same_align_drs_max < same_align_drs)
1462                     {
1463                       same_align_drs_max = same_align_drs;
1464                       dr0 = dr;
1465                     }
1466 		  /* For data-refs with the same number of related
1467 		     accesses prefer the one where the misalign
1468 		     computation will be invariant in the outermost loop.  */
1469 		  else if (same_align_drs_max == same_align_drs)
1470 		    {
1471 		      struct loop *ivloop0, *ivloop;
1472 		      ivloop0 = outermost_invariant_loop_for_expr
1473 			  (loop, DR_BASE_ADDRESS (dr0));
1474 		      ivloop = outermost_invariant_loop_for_expr
1475 			  (loop, DR_BASE_ADDRESS (dr));
1476 		      if ((ivloop && !ivloop0)
1477 			  || (ivloop && ivloop0
1478 			      && flow_loop_nested_p (ivloop, ivloop0)))
1479 			dr0 = dr;
1480 		    }
1481 
1482                   if (!first_store && DR_IS_WRITE (dr))
1483                     first_store = dr;
1484                 }
1485 
1486               /* If there are both known and unknown misaligned accesses in the
1487                  loop, we choose the peeling amount according to the known
1488                  accesses.  */
1489               if (!supportable_dr_alignment)
1490                 {
1491                   dr0 = dr;
1492                   if (!first_store && DR_IS_WRITE (dr))
1493                     first_store = dr;
1494                 }
1495             }
1496         }
1497       else
1498         {
1499           if (!aligned_access_p (dr))
1500             {
1501               if (dump_enabled_p ())
1502                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503                                  "vector alignment may not be reachable\n");
1504               break;
1505             }
1506         }
1507     }
1508 
1509   /* Check if we can possibly peel the loop.  */
1510   if (!vect_can_advance_ivs_p (loop_vinfo)
1511       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1512     do_peeling = false;
1513 
1514   if (do_peeling && all_misalignments_unknown
1515       && vect_supportable_dr_alignment (dr0, false))
1516     {
1517 
1518       /* Check whether the target prefers to align stores over loads, i.e.,
1519          whether misaligned stores are more expensive than misaligned loads
1520          (taking drs with the same alignment into account).  */
1521       if (first_store && DR_IS_READ (dr0))
1522         {
1523           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1524           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1525           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1526           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1527 	  stmt_vector_for_cost dummy;
1528 	  dummy.create (2);
1529 
1530           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1531 				     &dummy);
1532           vect_get_data_access_cost (first_store, &store_inside_cost,
1533 				     &store_outside_cost, &dummy);
1534 
1535 	  dummy.release ();
1536 
1537           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1538              aligning the load DR0).  */
1539           load_inside_penalty = store_inside_cost;
1540           load_outside_penalty = store_outside_cost;
1541           for (i = 0;
1542 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1543 			  DR_STMT (first_store))).iterate (i, &dr);
1544                i++)
1545             if (DR_IS_READ (dr))
1546               {
1547                 load_inside_penalty += load_inside_cost;
1548                 load_outside_penalty += load_outside_cost;
1549               }
1550             else
1551               {
1552                 load_inside_penalty += store_inside_cost;
1553                 load_outside_penalty += store_outside_cost;
1554               }
1555 
1556           /* Calculate the penalty for leaving DR0 unaligned (by
1557              aligning the FIRST_STORE).  */
1558           store_inside_penalty = load_inside_cost;
1559           store_outside_penalty = load_outside_cost;
1560           for (i = 0;
1561 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1562 		      DR_STMT (dr0))).iterate (i, &dr);
1563                i++)
1564             if (DR_IS_READ (dr))
1565               {
1566                 store_inside_penalty += load_inside_cost;
1567                 store_outside_penalty += load_outside_cost;
1568               }
1569             else
1570               {
1571                 store_inside_penalty += store_inside_cost;
1572                 store_outside_penalty += store_outside_cost;
1573               }
1574 
1575           if (load_inside_penalty > store_inside_penalty
1576               || (load_inside_penalty == store_inside_penalty
1577                   && load_outside_penalty > store_outside_penalty))
1578             dr0 = first_store;
1579         }
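      /* A minimal numeric sketch with assumed costs: if a misaligned store
	 costs 4, a misaligned load costs 2, and neither DR0 nor FIRST_STORE
	 has same-aligned references, then load_inside_penalty (4) exceeds
	 store_inside_penalty (2), so DR0 is switched to FIRST_STORE and the
	 store is the access that gets aligned.  */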
1580 
1581       /* In case there are only loads with different unknown misalignments, use
1582          peeling only if it may help to align other accesses in the loop.  */
1583       if (!first_store
1584 	  && !STMT_VINFO_SAME_ALIGN_REFS (
1585 		  vinfo_for_stmt (DR_STMT (dr0))).length ()
1586           && vect_supportable_dr_alignment (dr0, false)
1587               != dr_unaligned_supported)
1588         do_peeling = false;
1589     }
1590 
1591   if (do_peeling && !dr0)
1592     {
1593       /* Peeling is possible, but there is no data access that is supported
1594          only when aligned.  So we try to choose the best possible peeling.  */
1595 
1596       /* We should get here only if there are drs with known misalignment.  */
1597       gcc_assert (!all_misalignments_unknown);
1598 
1599       /* Choose the best peeling from the hash table.  */
1600       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1601 						   &body_cost_vec);
1602       if (!dr0 || !npeel)
1603         do_peeling = false;
1604     }
1605 
1606   if (do_peeling)
1607     {
1608       stmt = DR_STMT (dr0);
1609       stmt_info = vinfo_for_stmt (stmt);
1610       vectype = STMT_VINFO_VECTYPE (stmt_info);
1611       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1612 
1613       if (known_alignment_for_access_p (dr0))
1614         {
1615 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
1616 						size_zero_node) < 0;
1617           if (!npeel)
1618             {
1619               /* Since it's known at compile time, compute the number of
1620                  iterations in the peeled loop (the peeling factor) for use in
1621                  updating DR_MISALIGNMENT values.  The peeling factor is the
1622                  vectorization factor minus the misalignment as an element
1623                  count.  */
1624               mis = DR_MISALIGNMENT (dr0);
1625               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1626               npeel = ((negative ? mis - nelements : nelements - mis)
1627 		       & (nelements - 1));
1628             }
1629 
1630 	  /* For interleaved data access every iteration accesses all the
1631 	     members of the group, therefore we divide the number of iterations
1632 	     by the group size.  */
1633 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1634 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1635 	    npeel /= GROUP_SIZE (stmt_info);
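	  /* E.g., with assumed values NPEEL = 4 and GROUP_SIZE = 2: every
	     scalar iteration touches both group members, so only 4 / 2 = 2
	     scalar iterations need to be peeled.  */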
1636 
1637           if (dump_enabled_p ())
1638             dump_printf_loc (MSG_NOTE, vect_location,
1639                              "Try peeling by %d\n", npeel);
1640         }
1641 
1642       /* Ensure that all data refs can be vectorized after the peel.  */
1643       FOR_EACH_VEC_ELT (datarefs, i, dr)
1644         {
1645           int save_misalignment;
1646 
1647 	  if (dr == dr0)
1648 	    continue;
1649 
1650 	  stmt = DR_STMT (dr);
1651 	  stmt_info = vinfo_for_stmt (stmt);
1652 	  /* For interleaving, only the alignment of the first access
1653             matters.  */
1654 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1655 	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1656 	    continue;
1657 
1658 	  /* Strided loads perform only component accesses, alignment is
1659 	     irrelevant for them.  */
1660 	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1661 	    continue;
1662 
1663 	  save_misalignment = DR_MISALIGNMENT (dr);
1664 	  vect_update_misalignment_for_peel (dr, dr0, npeel);
1665 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1666 	  SET_DR_MISALIGNMENT (dr, save_misalignment);
1667 
1668 	  if (!supportable_dr_alignment)
1669 	    {
1670 	      do_peeling = false;
1671 	      break;
1672 	    }
1673 	}
1674 
1675       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1676         {
1677           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1678           if (!stat)
1679             do_peeling = false;
1680           else
1681 	    {
1682 	      body_cost_vec.release ();
1683 	      return stat;
1684 	    }
1685         }
1686 
1687       if (do_peeling)
1688         {
1689           unsigned max_allowed_peel
1690             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1691           if (max_allowed_peel != (unsigned)-1)
1692             {
1693               unsigned max_peel = npeel;
1694               if (max_peel == 0)
1695                 {
1696                   gimple dr_stmt = DR_STMT (dr0);
1697                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1698                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1699                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1700                 }
1701               if (max_peel > max_allowed_peel)
1702                 {
1703                   do_peeling = false;
1704                   if (dump_enabled_p ())
1705                     dump_printf_loc (MSG_NOTE, vect_location,
1706                         "Disable peeling, max peels reached: %d\n", max_peel);
1707                 }
1708             }
1709         }
1710 
1711       if (do_peeling)
1712         {
1713 	  stmt_info_for_cost *si;
1714 	  void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1715 
1716           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1717              If the misalignment of DR_i is identical to that of dr0 then set
1718              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1719              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1720              by the peeling factor times the element size of DR_i (MOD the
1721              vectorization factor times the size).  Otherwise, the
1722              misalignment of DR_i must be set to unknown.  */
1723 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
1724 	    if (dr != dr0)
1725 	      vect_update_misalignment_for_peel (dr, dr0, npeel);
1726 
1727           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1728           if (npeel)
1729             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1730           else
1731             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1732 	      = DR_MISALIGNMENT (dr0);
1733 	  SET_DR_MISALIGNMENT (dr0, 0);
1734 	  if (dump_enabled_p ())
1735             {
1736               dump_printf_loc (MSG_NOTE, vect_location,
1737                                "Alignment of access forced using peeling.\n");
1738               dump_printf_loc (MSG_NOTE, vect_location,
1739                                "Peeling for alignment will be applied.\n");
1740             }
1741 	  /* We've delayed passing the inside-loop peeling costs to the
1742 	     target cost model until we were sure peeling would happen.
1743 	     Do so now.  */
1744 	  if (body_cost_vec.exists ())
1745 	    {
1746 	      FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1747 		{
1748 		  struct _stmt_vec_info *stmt_info
1749 		    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1750 		  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1751 					si->misalign, vect_body);
1752 		}
1753 	      body_cost_vec.release ();
1754 	    }
1755 
1756 	  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1757 	  gcc_assert (stat);
1758           return stat;
1759         }
1760     }
1761 
1762   body_cost_vec.release ();
1763 
1764   /* (2) Versioning to force alignment.  */
1765 
1766   /* Try versioning if:
1767      1) optimize loop for speed
1768      2) there is at least one unsupported misaligned data ref with an unknown
1769         misalignment, and
1770      3) all misaligned data refs with a known misalignment are supported, and
1771      4) the number of runtime alignment checks is within reason.  */
1772 
1773   do_versioning =
1774 	optimize_loop_nest_for_speed_p (loop)
1775 	&& (!loop->inner); /* FORNOW */
1776 
1777   if (do_versioning)
1778     {
1779       FOR_EACH_VEC_ELT (datarefs, i, dr)
1780         {
1781 	  stmt = DR_STMT (dr);
1782 	  stmt_info = vinfo_for_stmt (stmt);
1783 
1784 	  /* For interleaving, only the alignment of the first access
1785 	     matters.  */
1786 	  if (aligned_access_p (dr)
1787 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1788 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1789 	    continue;
1790 
1791 	  /* Strided loads perform only component accesses, alignment is
1792 	     irrelevant for them.  */
1793 	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1794 	    continue;
1795 
1796 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1797 
1798           if (!supportable_dr_alignment)
1799             {
1800               gimple stmt;
1801               int mask;
1802               tree vectype;
1803 
1804               if (known_alignment_for_access_p (dr)
1805                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1806                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1807                 {
1808                   do_versioning = false;
1809                   break;
1810                 }
1811 
1812               stmt = DR_STMT (dr);
1813               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1814               gcc_assert (vectype);
1815 
1816               /* The rightmost bits of an aligned address must be zeros.
1817                  Construct the mask needed for this test.  For example,
1818                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1819                  mask must be 15 = 0xf. */
1820               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
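              /* E.g., assuming a 16-byte vector mode, MASK is 15 and the
                 runtime check emitted for a recorded address ADDR is
                 roughly of the form (ADDR & 15) == 0.  */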
1821 
1822               /* FORNOW: use the same mask to test all potentially unaligned
1823                  references in the loop.  The vectorizer currently supports
1824                  a single vector size, see the reference to
1825                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1826                  vectorization factor is computed.  */
1827               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1828                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1829               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1830               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1831 		      DR_STMT (dr));
1832             }
1833         }
1834 
1835       /* Versioning requires at least one misaligned data reference.  */
1836       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1837         do_versioning = false;
1838       else if (!do_versioning)
1839         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1840     }
1841 
1842   if (do_versioning)
1843     {
1844       vec<gimple> may_misalign_stmts
1845         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1846       gimple stmt;
1847 
1848       /* It can now be assumed that the data references in the statements
1849          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1850          of the loop being vectorized.  */
1851       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1852         {
1853           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1854           dr = STMT_VINFO_DATA_REF (stmt_info);
1855 	  SET_DR_MISALIGNMENT (dr, 0);
1856 	  if (dump_enabled_p ())
1857             dump_printf_loc (MSG_NOTE, vect_location,
1858                              "Alignment of access forced using versioning.\n");
1859         }
1860 
1861       if (dump_enabled_p ())
1862         dump_printf_loc (MSG_NOTE, vect_location,
1863                          "Versioning for alignment will be applied.\n");
1864 
1865       /* Peeling and versioning can't be done together at this time.  */
1866       gcc_assert (! (do_peeling && do_versioning));
1867 
1868       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1869       gcc_assert (stat);
1870       return stat;
1871     }
1872 
1873   /* This point is reached if neither peeling nor versioning is being done.  */
1874   gcc_assert (! (do_peeling || do_versioning));
1875 
1876   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1877   return stat;
1878 }
1879 
1880 
1881 /* Function vect_find_same_alignment_drs.
1882 
1883    Update group and alignment relations according to the chosen
1884    vectorization factor.  */
1885 
1886 static void
1887 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1888 			      loop_vec_info loop_vinfo)
1889 {
1890   unsigned int i;
1891   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1892   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1893   struct data_reference *dra = DDR_A (ddr);
1894   struct data_reference *drb = DDR_B (ddr);
1895   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1896   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1897   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1898   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1899   lambda_vector dist_v;
1900   unsigned int loop_depth;
1901 
1902   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1903     return;
1904 
1905   if (dra == drb)
1906     return;
1907 
1908   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1909     return;
1910 
1911   /* Loop-based vectorization and known data dependence.  */
1912   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1913     return;
1914 
1915   /* Data-dependence analysis reports a distance vector of zero
1916      for data-references that overlap only in the first iteration
1917      but have different sign step (see PR45764).
1918      So as a sanity check require equal DR_STEP.  */
1919   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1920     return;
1921 
1922   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1923   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1924     {
1925       int dist = dist_v[loop_depth];
1926 
1927       if (dump_enabled_p ())
1928 	dump_printf_loc (MSG_NOTE, vect_location,
1929 	                 "dependence distance  = %d.\n", dist);
1930 
1931       /* Same loop iteration.  */
1932       if (dist == 0
1933 	  || (dist % vectorization_factor == 0 && dra_size == drb_size))
1934 	{
1935 	  /* Two references with distance zero have the same alignment.  */
1936 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1937 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
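	  /* E.g., assuming a unit-stride access with 4-byte elements and
	     VF = 4: a distance of 4 iterations means the two addresses
	     differ by 16 bytes, exactly one vector, so both references have
	     the same misalignment with respect to the vector alignment.  */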
1938 	  if (dump_enabled_p ())
1939 	    {
1940 	      dump_printf_loc (MSG_NOTE, vect_location,
1941 	                       "accesses have the same alignment.\n");
1942 	      dump_printf (MSG_NOTE,
1943 	                   "dependence distance modulo vf == 0 between ");
1944 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1945 	      dump_printf (MSG_NOTE,  " and ");
1946 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1947 	      dump_printf (MSG_NOTE, "\n");
1948 	    }
1949 	}
1950     }
1951 }
1952 
1953 
1954 /* Function vect_analyze_data_refs_alignment
1955 
1956    Analyze the alignment of the data-references in the loop.
1957    Return FALSE if a data reference is found that cannot be vectorized.  */
1958 
1959 bool
1960 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1961                                   bb_vec_info bb_vinfo)
1962 {
1963   if (dump_enabled_p ())
1964     dump_printf_loc (MSG_NOTE, vect_location,
1965                      "=== vect_analyze_data_refs_alignment ===\n");
1966 
1967   /* Mark groups of data references with same alignment using
1968      data dependence information.  */
1969   if (loop_vinfo)
1970     {
1971       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1972       struct data_dependence_relation *ddr;
1973       unsigned int i;
1974 
1975       FOR_EACH_VEC_ELT (ddrs, i, ddr)
1976 	vect_find_same_alignment_drs (ddr, loop_vinfo);
1977     }
1978 
1979   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1980     {
1981       if (dump_enabled_p ())
1982 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1983 	                 "not vectorized: can't calculate alignment "
1984 	                 "for data ref.\n");
1985       return false;
1986     }
1987 
1988   return true;
1989 }
1990 
1991 
1992 /* Analyze groups of accesses: check that DR belongs to a group of
1993    accesses of legal size, step, etc.  Detect gaps, single element
1994    interleaving, and other special cases. Set grouped access info.
1995    Collect groups of strided stores for further use in SLP analysis.  */
1996 
1997 static bool
1998 vect_analyze_group_access (struct data_reference *dr)
1999 {
2000   tree step = DR_STEP (dr);
2001   tree scalar_type = TREE_TYPE (DR_REF (dr));
2002   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2003   gimple stmt = DR_STMT (dr);
2004   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2005   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2006   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2007   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2008   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2009   bool slp_impossible = false;
2010   struct loop *loop = NULL;
2011 
2012   if (loop_vinfo)
2013     loop = LOOP_VINFO_LOOP (loop_vinfo);
2014 
2015   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2016      size of the interleaving group (including gaps).  */
2017   groupsize = absu_hwi (dr_step) / type_size;
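  /* E.g., assuming 4-byte elements and DR_STEP = 32 bytes, GROUPSIZE is
     32 / 4 = 8: the group spans eight element slots per iteration, whether
     or not every slot is actually accessed.  */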
2018 
2019   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2020   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2021     {
2022       /* Check if this DR is a part of interleaving, and is a single
2023 	 element of the group that is accessed in the loop.  */
2024 
2025       /* Gaps are supported only for loads. STEP must be a multiple of the type
2026 	 size.  The size of the group must be a power of 2.  */
2027       if (DR_IS_READ (dr)
2028 	  && (dr_step % type_size) == 0
2029 	  && groupsize > 0
2030 	  && exact_log2 (groupsize) != -1)
2031 	{
2032 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2033 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2034 	  if (dump_enabled_p ())
2035 	    {
2036 	      dump_printf_loc (MSG_NOTE, vect_location,
2037 	                       "Detected single element interleaving ");
2038 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2039 	      dump_printf (MSG_NOTE, " step ");
2040 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2041 	      dump_printf (MSG_NOTE, "\n");
2042 	    }
2043 
2044 	  if (loop_vinfo)
2045 	    {
2046 	      if (dump_enabled_p ())
2047 		dump_printf_loc (MSG_NOTE, vect_location,
2048 		                 "Data access with gaps requires scalar "
2049 		                 "epilogue loop\n");
2050               if (loop->inner)
2051                 {
2052                   if (dump_enabled_p ())
2053                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2054                                      "Peeling for outer loop is not"
2055                                      " supported\n");
2056                   return false;
2057                 }
2058 
2059               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2060 	    }
2061 
2062 	  return true;
2063 	}
2064 
2065       if (dump_enabled_p ())
2066         {
2067  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2068 	                   "not consecutive access ");
2069 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2070 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2071         }
2072 
2073       if (bb_vinfo)
2074         {
2075           /* Mark the statement as unvectorizable.  */
2076           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2077           return true;
2078         }
2079 
2080       return false;
2081     }
2082 
2083   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2084     {
2085       /* First stmt in the interleaving chain. Check the chain.  */
2086       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2087       struct data_reference *data_ref = dr;
2088       unsigned int count = 1;
2089       tree prev_init = DR_INIT (data_ref);
2090       gimple prev = stmt;
2091       HOST_WIDE_INT diff, gaps = 0;
2092       unsigned HOST_WIDE_INT count_in_bytes;
2093 
2094       while (next)
2095         {
2096           /* Skip same data-refs.  In case two or more stmts share a
2097              data-ref (supported only for loads), we vectorize only the first
2098              stmt, and the rest get their vectorized loads from the first
2099              one.  */
2100           if (!tree_int_cst_compare (DR_INIT (data_ref),
2101                                      DR_INIT (STMT_VINFO_DATA_REF (
2102 						   vinfo_for_stmt (next)))))
2103             {
2104               if (DR_IS_WRITE (data_ref))
2105                 {
2106                   if (dump_enabled_p ())
2107                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108                                      "Two store stmts share the same dr.\n");
2109                   return false;
2110                 }
2111 
2112               /* For load use the same data-ref load.  */
2113               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2114 
2115               prev = next;
2116               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2117               continue;
2118             }
2119 
2120           prev = next;
2121           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2122 
2123 	  /* All group members have the same STEP by construction.  */
2124 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2125 
2126           /* Check that the distance between two accesses is equal to the type
2127              size. Otherwise, we have gaps.  */
2128           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2129                   - TREE_INT_CST_LOW (prev_init)) / type_size;
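          /* E.g., assuming 4-byte elements with DR_INITs of 0 and 8 for
             consecutive group members, DIFF = (8 - 0) / 4 = 2, i.e. there
             is a one-element gap between the two accesses.  */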
2130 	  if (diff != 1)
2131 	    {
2132 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2133 	      slp_impossible = true;
2134 	      if (DR_IS_WRITE (data_ref))
2135 		{
2136                   if (dump_enabled_p ())
2137                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138                                      "interleaved store with gaps\n");
2139 		  return false;
2140 		}
2141 
2142               gaps += diff - 1;
2143 	    }
2144 
2145 	  last_accessed_element += diff;
2146 
2147           /* Store the gap from the previous member of the group. If there is no
2148              gap in the access, GROUP_GAP is always 1.  */
2149           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2150 
2151           prev_init = DR_INIT (data_ref);
2152           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2153           /* Count the number of data-refs in the chain.  */
2154           count++;
2155         }
2156 
2157       /* COUNT is the number of accesses found, we multiply it by the size of
2158          the type to get COUNT_IN_BYTES.  */
2159       count_in_bytes = type_size * count;
2160 
2161       /* Check that the size of the interleaving (including gaps) is not
2162          greater than STEP.  */
2163       if (dr_step != 0
2164 	  && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2165         {
2166           if (dump_enabled_p ())
2167             {
2168               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2169                                "interleaving size is greater than step for ");
2170               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2171                                  DR_REF (dr));
2172               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2173             }
2174           return false;
2175         }
2176 
2177       /* Check that the size of the interleaving is equal to STEP for stores,
2178          i.e., that there are no gaps.  */
2179       if (dr_step != 0
2180 	  && absu_hwi (dr_step) != count_in_bytes)
2181         {
2182           if (DR_IS_READ (dr))
2183             {
2184               slp_impossible = true;
2185               /* There is a gap after the last load in the group. This gap is a
2186                  difference between the groupsize and the number of elements.
2187 		 When there is no gap, this difference should be 0.  */
2188               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
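              /* E.g., assuming GROUPSIZE = 4 but only COUNT = 3 loads in
                 the chain, GROUP_GAP of the first statement becomes 1: one
                 trailing element of each group is never read.  */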
2189             }
2190           else
2191             {
2192               if (dump_enabled_p ())
2193                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2194                                  "interleaved store with gaps\n");
2195               return false;
2196             }
2197         }
2198 
2199       /* Check that STEP is a multiple of type size.  */
2200       if (dr_step != 0
2201 	  && (dr_step % type_size) != 0)
2202         {
2203           if (dump_enabled_p ())
2204             {
2205               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2206                                "step is not a multiple of type size: step ");
2207               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2208               dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2209               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2210                                  TYPE_SIZE_UNIT (scalar_type));
2211               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2212             }
2213           return false;
2214         }
2215 
2216       if (groupsize == 0)
2217         groupsize = count;
2218 
2219       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2220       if (dump_enabled_p ())
2221         dump_printf_loc (MSG_NOTE, vect_location,
2222                          "Detected interleaving of size %d\n", (int)groupsize);
2223 
2224       /* SLP: create an SLP data structure for every interleaving group of
2225 	 stores for further analysis in vect_analyze_slp.  */
2226       if (DR_IS_WRITE (dr) && !slp_impossible)
2227         {
2228           if (loop_vinfo)
2229             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2230           if (bb_vinfo)
2231             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2232         }
2233 
2234       /* There is a gap in the end of the group.  */
2235       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2236 	{
2237 	  if (dump_enabled_p ())
2238 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 	                     "Data access with gaps requires scalar "
2240 	                     "epilogue loop\n");
2241           if (loop->inner)
2242             {
2243               if (dump_enabled_p ())
2244                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245                                  "Peeling for outer loop is not supported\n");
2246               return false;
2247             }
2248 
2249           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2250 	}
2251     }
2252 
2253   return true;
2254 }
2255 
2256 
2257 /* Analyze the access pattern of the data-reference DR.
2258    In case of non-consecutive accesses call vect_analyze_group_access() to
2259    analyze groups of accesses.  */
2260 
2261 static bool
2262 vect_analyze_data_ref_access (struct data_reference *dr)
2263 {
2264   tree step = DR_STEP (dr);
2265   tree scalar_type = TREE_TYPE (DR_REF (dr));
2266   gimple stmt = DR_STMT (dr);
2267   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2268   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2269   struct loop *loop = NULL;
2270 
2271   if (loop_vinfo)
2272     loop = LOOP_VINFO_LOOP (loop_vinfo);
2273 
2274   if (loop_vinfo && !step)
2275     {
2276       if (dump_enabled_p ())
2277 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278 	                 "bad data-ref access in loop\n");
2279       return false;
2280     }
2281 
2282   /* Allow invariant loads in loops that are not nested.  */
2283   if (loop_vinfo && integer_zerop (step))
2284     {
2285       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2286       if (nested_in_vect_loop_p (loop, stmt))
2287 	{
2288 	  if (dump_enabled_p ())
2289 	    dump_printf_loc (MSG_NOTE, vect_location,
2290 			     "zero step in inner loop of nest\n");
2291 	  return false;
2292 	}
2293       return DR_IS_READ (dr);
2294     }
2295 
2296   if (loop && nested_in_vect_loop_p (loop, stmt))
2297     {
2298       /* Interleaved accesses are not yet supported within outer-loop
2299         vectorization for references in the inner-loop.  */
2300       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2301 
2302       /* For the rest of the analysis we use the outer-loop step.  */
2303       step = STMT_VINFO_DR_STEP (stmt_info);
2304       if (integer_zerop (step))
2305 	{
2306 	  if (dump_enabled_p ())
2307 	    dump_printf_loc (MSG_NOTE, vect_location,
2308 	                     "zero step in outer loop.\n");
2309 	  if (DR_IS_READ (dr))
2310   	    return true;
2311 	  else
2312 	    return false;
2313 	}
2314     }
2315 
2316   /* Consecutive?  */
2317   if (TREE_CODE (step) == INTEGER_CST)
2318     {
2319       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2320       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2321 	  || (dr_step < 0
2322 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2323 	{
2324 	  /* Mark that it is not interleaving.  */
2325 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2326 	  return true;
2327 	}
2328     }
2329 
2330   if (loop && nested_in_vect_loop_p (loop, stmt))
2331     {
2332       if (dump_enabled_p ())
2333 	dump_printf_loc (MSG_NOTE, vect_location,
2334 	                 "grouped access in outer loop.\n");
2335       return false;
2336     }
2337 
2338   /* Assume this is a DR handled by the non-constant strided load case.  */
2339   if (TREE_CODE (step) != INTEGER_CST)
2340     return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2341 
2342   /* Not consecutive access - check if it's a part of interleaving group.  */
2343   return vect_analyze_group_access (dr);
2344 }
2345 
2346 
2347 
2348 /*  A helper function used in the comparator function to sort data
2349     references.  T1 and T2 are the two trees to be compared.
2350     The function returns -1, 0, or 1.  */
2351 
2352 static int
2353 compare_tree (tree t1, tree t2)
2354 {
2355   int i, cmp;
2356   enum tree_code code;
2357   char tclass;
2358 
2359   if (t1 == t2)
2360     return 0;
2361   if (t1 == NULL)
2362     return -1;
2363   if (t2 == NULL)
2364     return 1;
2365 
2366 
2367   if (TREE_CODE (t1) != TREE_CODE (t2))
2368     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2369 
2370   code = TREE_CODE (t1);
2371   switch (code)
2372     {
2373     /* For const values, we can just use hash values for comparisons.  */
2374     case INTEGER_CST:
2375     case REAL_CST:
2376     case FIXED_CST:
2377     case STRING_CST:
2378     case COMPLEX_CST:
2379     case VECTOR_CST:
2380       {
2381 	hashval_t h1 = iterative_hash_expr (t1, 0);
2382 	hashval_t h2 = iterative_hash_expr (t2, 0);
2383 	if (h1 != h2)
2384 	  return h1 < h2 ? -1 : 1;
2385 	break;
2386       }
2387 
2388     case SSA_NAME:
2389       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2390       if (cmp != 0)
2391 	return cmp;
2392 
2393       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2394 	return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2395       break;
2396 
2397     default:
2398       tclass = TREE_CODE_CLASS (code);
2399 
2400       /* For var-decl, we could compare their UIDs.  */
2401       if (tclass == tcc_declaration)
2402 	{
2403 	  if (DECL_UID (t1) != DECL_UID (t2))
2404 	    return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2405 	  break;
2406 	}
2407 
2408       /* For expressions with operands, compare their operands recursively.  */
2409       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2410 	{
2411 	  cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2412 	  if (cmp != 0)
2413 	    return cmp;
2414 	}
2415     }
2416 
2417   return 0;
2418 }
2419 
2420 
2421 /* Compare two data-references DRA and DRB so that they can be sorted
2422    into chunks suitable for grouping.  */
2423 
2424 static int
2425 dr_group_sort_cmp (const void *dra_, const void *drb_)
2426 {
2427   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2428   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2429   int cmp;
2430 
2431   /* Stabilize sort.  */
2432   if (dra == drb)
2433     return 0;
2434 
2435   /* Ordering of DRs according to base.  */
2436   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2437     {
2438       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2439       if (cmp != 0)
2440         return cmp;
2441     }
2442 
2443   /* And according to DR_OFFSET.  */
2444   if (!dr_equal_offsets_p (dra, drb))
2445     {
2446       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2447       if (cmp != 0)
2448         return cmp;
2449     }
2450 
2451   /* Put reads before writes.  */
2452   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2453     return DR_IS_READ (dra) ? -1 : 1;
2454 
2455   /* Then sort by access size.  */
2456   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2457 			TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2458     {
2459       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2460                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2461       if (cmp != 0)
2462         return cmp;
2463     }
2464 
2465   /* And by step.  */
2466   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2467     {
2468       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2469       if (cmp != 0)
2470         return cmp;
2471     }
2472 
2473   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2474   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2475   if (cmp == 0)
2476     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2477   return cmp;
2478 }
2479 
2480 /* Function vect_analyze_data_ref_accesses.
2481 
2482    Analyze the access pattern of all the data references in the loop.
2483 
2484    FORNOW: the only access pattern that is considered vectorizable is a
2485 	   simple step 1 (consecutive) access.
2486 
2487    FORNOW: handle only arrays and pointer accesses.  */
2488 
2489 bool
2490 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2491 {
2492   unsigned int i;
2493   vec<data_reference_p> datarefs;
2494   struct data_reference *dr;
2495 
2496   if (dump_enabled_p ())
2497     dump_printf_loc (MSG_NOTE, vect_location,
2498                      "=== vect_analyze_data_ref_accesses ===\n");
2499 
2500   if (loop_vinfo)
2501     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2502   else
2503     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2504 
2505   if (datarefs.is_empty ())
2506     return true;
2507 
2508   /* Sort the array of datarefs to make building the interleaving chains
2509      linear.  Don't modify the original vector's order; it is needed for
2510      determining what dependencies are reversed.  */
2511   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2512   qsort (datarefs_copy.address (), datarefs_copy.length (),
2513 	 sizeof (data_reference_p), dr_group_sort_cmp);
2514 
2515   /* Build the interleaving chains.  */
2516   for (i = 0; i < datarefs_copy.length () - 1;)
2517     {
2518       data_reference_p dra = datarefs_copy[i];
2519       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2520       stmt_vec_info lastinfo = NULL;
2521       for (i = i + 1; i < datarefs_copy.length (); ++i)
2522 	{
2523 	  data_reference_p drb = datarefs_copy[i];
2524 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2525 
2526 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2527 	     accesses, same accesses) can lead to a group being artificially
2528 	     split here as we don't just skip over those.  If it really
2529 	     matters we can push those to a worklist and re-iterate
2530 	     over them.  Then we can just skip ahead to the next DR here.  */
2531 
2532 	  /* Check that the data-refs have same first location (except init)
2533 	     and they are both either store or load (not load and store,
2534 	     not masked loads or stores).  */
2535 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2536 	      || !operand_equal_p (DR_BASE_ADDRESS (dra),
2537 				   DR_BASE_ADDRESS (drb), 0)
2538 	      || !dr_equal_offsets_p (dra, drb)
2539 	      || !gimple_assign_single_p (DR_STMT (dra))
2540 	      || !gimple_assign_single_p (DR_STMT (drb)))
2541 	    break;
2542 
2543 	  /* Check that the data-refs have the same constant size and step.  */
2544 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2545 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2546 	  if (!tree_fits_uhwi_p (sza)
2547 	      || !tree_fits_uhwi_p (szb)
2548 	      || !tree_int_cst_equal (sza, szb)
2549 	      || !tree_fits_shwi_p (DR_STEP (dra))
2550 	      || !tree_fits_shwi_p (DR_STEP (drb))
2551 	      || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2552 	    break;
2553 
2554 	  /* Do not place the same access in the interleaving chain twice.  */
2555 	  if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2556 	    break;
2557 
2558 	  /* Check the types are compatible.
2559 	     ???  We don't distinguish this during sorting.  */
2560 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2561 				   TREE_TYPE (DR_REF (drb))))
2562 	    break;
2563 
2564 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2565 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2566 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2567 	  gcc_assert (init_a < init_b);
2568 
2569 	  /* If init_b == init_a + the size of the type * k, we have an
2570 	     interleaving, and DRA is accessed before DRB.  */
2571 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2572 	  if ((init_b - init_a) % type_size_a != 0)
2573 	    break;
2574 
2575 	  /* Require the step (if not zero) to be greater than the difference
2576 	     between the data-refs' inits; this splits groups into suitable sizes.  */
2577 	  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2578 	  if (step != 0 && step <= (init_b - init_a))
2579 	    break;
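	  /* E.g., assuming 4-byte accesses with DR_STEP = 16: inits 0, 4, 8
	     and 12 can stay in one chain, while an init of 16 fails the
	     check above (16 <= 16 - 0) and starts a new group.  */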
2580 
2581 	  if (dump_enabled_p ())
2582 	    {
2583 	      dump_printf_loc (MSG_NOTE, vect_location,
2584 			       "Detected interleaving ");
2585 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2586 	      dump_printf (MSG_NOTE,  " and ");
2587 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2588 	      dump_printf (MSG_NOTE, "\n");
2589 	    }
2590 
2591 	  /* Link the found element into the group list.  */
2592 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2593 	    {
2594 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2595 	      lastinfo = stmtinfo_a;
2596 	    }
2597 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2598 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2599 	  lastinfo = stmtinfo_b;
2600 	}
2601     }
2602 
2603   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2604     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2605         && !vect_analyze_data_ref_access (dr))
2606       {
2607 	if (dump_enabled_p ())
2608 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2609 	                   "not vectorized: complicated access pattern.\n");
2610 
2611         if (bb_vinfo)
2612           {
2613             /* Mark the statement as not vectorizable.  */
2614             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2615             continue;
2616           }
2617         else
2618 	  {
2619 	    datarefs_copy.release ();
2620 	    return false;
2621 	  }
2622       }
2623 
2624   datarefs_copy.release ();
2625   return true;
2626 }
2627 
2628 
2629 /* Operator == between two dr_with_seg_len objects.
2630 
2631    This equality operator is used to make sure two data refs
2632    are the same so that we will consider combining the
2633    aliasing checks of the two pairs of data dependent data
2634    refs.  */
2635 
2636 static bool
2637 operator == (const dr_with_seg_len& d1,
2638 	     const dr_with_seg_len& d2)
2639 {
2640   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2641 			  DR_BASE_ADDRESS (d2.dr), 0)
2642 	   && compare_tree (d1.offset, d2.offset) == 0
2643 	   && compare_tree (d1.seg_len, d2.seg_len) == 0;
2644 }
2645 
2646 /* Function comp_dr_with_seg_len_pair.
2647 
2648    Comparison function for sorting objects of dr_with_seg_len_pair_t
2649    so that we can combine aliasing checks in one scan.  */
2650 
2651 static int
2652 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2653 {
2654   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2655   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2656 
2657   const dr_with_seg_len &p11 = p1->first,
2658 			&p12 = p1->second,
2659 			&p21 = p2->first,
2660 			&p22 = p2->second;
2661 
2662   /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2663      if a and c have the same base address and step, and b and d have the same
2664      address and step.  Therefore, if either a and c or b and d differ in address
2665      or step, we don't care about the order of those two pairs after sorting.  */
2666   int comp_res;
2667 
2668   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2669 				DR_BASE_ADDRESS (p21.dr))) != 0)
2670     return comp_res;
2671   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2672 				DR_BASE_ADDRESS (p22.dr))) != 0)
2673     return comp_res;
2674   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2675     return comp_res;
2676   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2677     return comp_res;
2678   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2679     return comp_res;
2680   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2681     return comp_res;
2682 
2683   return 0;
2684 }
2685 
2686 template <class T> static void
2687 swap (T& a, T& b)
2688 {
2689   T c (a);
2690   a = b;
2691   b = c;
2692 }
2693 
2694 /* Function vect_vfa_segment_size.
2695 
2696    Create an expression that computes the size of segment
2697    that will be accessed for a data reference.  The function takes into
2698    account that realignment loads may access one more vector.
2699 
2700    Input:
2701      DR: The data reference.
2702      LENGTH_FACTOR: segment length to consider.
2703 
2704    Return an expression whose value is the size of segment which will be
2705    accessed by DR.  */
2706 
2707 static tree
2708 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2709 {
2710   tree segment_length;
2711 
2712   if (integer_zerop (DR_STEP (dr)))
2713     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2714   else
2715     segment_length = size_binop (MULT_EXPR,
2716 				 fold_convert (sizetype, DR_STEP (dr)),
2717 				 fold_convert (sizetype, length_factor));
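  /* E.g., assuming DR_STEP = 4 bytes and LENGTH_FACTOR = VF = 4, the
     segment length is 16 bytes; for a zero step it is just the size of
     the referenced type.  */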
2718 
2719   if (vect_supportable_dr_alignment (dr, false)
2720 	== dr_explicit_realign_optimized)
2721     {
2722       tree vector_size = TYPE_SIZE_UNIT
2723 			  (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2724 
2725       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2726     }
2727   return segment_length;
2728 }
2729 
2730 /* Function vect_prune_runtime_alias_test_list.
2731 
2732    Prune a list of ddrs to be tested at run-time by versioning for alias.
2733    Merge several alias checks into one if possible.
2734    Return FALSE if the resulting list of ddrs is longer than allowed by
2735    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2736 
2737 bool
2738 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2739 {
2740   vec<ddr_p> may_alias_ddrs =
2741     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2742   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2743     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2744   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2745   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2746 
2747   ddr_p ddr;
2748   unsigned int i;
2749   tree length_factor;
2750 
2751   if (dump_enabled_p ())
2752     dump_printf_loc (MSG_NOTE, vect_location,
2753                      "=== vect_prune_runtime_alias_test_list ===\n");
2754 
2755   if (may_alias_ddrs.is_empty ())
2756     return true;
2757 
2758   /* Basically, for each pair of dependent data refs store_ptr_0
2759      and load_ptr_0, we create an expression:
2760 
2761      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2762      || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2763 
2764      for aliasing checks.  However, in some cases we can decrease
2765      the number of checks by combining two checks into one.  For
2766      example, suppose we have another pair of data refs store_ptr_0
2767      and load_ptr_1, and if the following condition is satisfied:
2768 
2769      load_ptr_0 < load_ptr_1  &&
2770      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2771 
2772      (this condition means that, in each iteration of the vectorized loop,
2773      the memory accessed through store_ptr_0 cannot lie between the memory
2774      accessed through load_ptr_0 and load_ptr_1.)
2775 
2776      we then can use only the following expression to finish the
2777      aliasing checks between store_ptr_0 & load_ptr_0 and
2778      store_ptr_0 & load_ptr_1:
2779 
2780      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2781      || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2782 
2783      Note that we only consider the case in which load_ptr_0 and load_ptr_1
2784      have the same base address.  */
2785 
2786   comp_alias_ddrs.create (may_alias_ddrs.length ());
2787 
2788   /* First, we collect all data ref pairs for aliasing checks.  */
2789   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2790     {
2791       struct data_reference *dr_a, *dr_b;
2792       gimple dr_group_first_a, dr_group_first_b;
2793       tree segment_length_a, segment_length_b;
2794       gimple stmt_a, stmt_b;
2795 
2796       dr_a = DDR_A (ddr);
2797       stmt_a = DR_STMT (DDR_A (ddr));
2798       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2799       if (dr_group_first_a)
2800 	{
2801 	  stmt_a = dr_group_first_a;
2802 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2803 	}
2804 
2805       dr_b = DDR_B (ddr);
2806       stmt_b = DR_STMT (DDR_B (ddr));
2807       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2808       if (dr_group_first_b)
2809 	{
2810 	  stmt_b = dr_group_first_b;
2811 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2812 	}
2813 
2814       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2815 	length_factor = scalar_loop_iters;
2816       else
2817 	length_factor = size_int (vect_factor);
2818       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2819       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2820 
2821       dr_with_seg_len_pair_t dr_with_seg_len_pair
2822 	  (dr_with_seg_len (dr_a, segment_length_a),
2823 	   dr_with_seg_len (dr_b, segment_length_b));
2824 
2825       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2826 	swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2827 
2828       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2829     }
2830 
2831   /* Second, we sort the collected data ref pairs so that we can scan
2832      them once to combine all possible aliasing checks.  */
2833   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2834 
2835   /* Third, we scan the sorted dr pairs and check if we can combine
2836      alias checks of two neighbouring dr pairs.  */
2837   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2838     {
2839       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2840       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2841 		      *dr_b1 = &comp_alias_ddrs[i-1].second,
2842 		      *dr_a2 = &comp_alias_ddrs[i].first,
2843 		      *dr_b2 = &comp_alias_ddrs[i].second;
2844 
2845       /* Remove duplicate data ref pairs.  */
2846       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2847 	{
2848 	  if (dump_enabled_p ())
2849 	    {
2850 	      dump_printf_loc (MSG_NOTE, vect_location,
2851 			       "found equal ranges ");
2852 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2853 				 DR_REF (dr_a1->dr));
2854 	      dump_printf (MSG_NOTE,  ", ");
2855 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2856 				 DR_REF (dr_b1->dr));
2857 	      dump_printf (MSG_NOTE,  " and ");
2858 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2859 				 DR_REF (dr_a2->dr));
2860 	      dump_printf (MSG_NOTE,  ", ");
2861 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2862 				 DR_REF (dr_b2->dr));
2863 	      dump_printf (MSG_NOTE, "\n");
2864 	    }
2865 
2866 	  comp_alias_ddrs.ordered_remove (i--);
2867 	  continue;
2868 	}
2869 
2870       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2871 	{
2872 	  /* We consider the case in which DR_B1 and DR_B2 are the same memref,
2873 	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
2874 	  if (*dr_a1 == *dr_a2)
2875 	    {
2876 	      swap (dr_a1, dr_b1);
2877 	      swap (dr_a2, dr_b2);
2878 	    }
2879 
2880 	  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2881 				DR_BASE_ADDRESS (dr_a2->dr),
2882 				0)
2883 	      || !tree_fits_shwi_p (dr_a1->offset)
2884 	      || !tree_fits_shwi_p (dr_a2->offset))
2885 	    continue;
2886 
2887 	  HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2888 				- tree_to_shwi (dr_a1->offset));
2889 
2890 
2891 	  /* Now we check if the following condition is satisfied:
2892 
2893 	     DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2894 
2895 	     where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2896 	     SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
2897 	     have to make a conservative estimate.  We can get the minimum value
2898 	     of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2899 	     then either of the following two conditions can guarantee the
2900 	     one above:
2901 
2902 	     1: DIFF <= MIN_SEG_LEN_B
2903 	     2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2904 
2905 	     */
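	  /* For example (hypothetical numbers, in bytes): with DIFF = 16 and
	     MIN_SEG_LEN_B = 16, condition 1 (16 <= 16) holds; with DIFF = 32,
	     SEGMENT_LENGTH_A = 32 and MIN_SEG_LEN_B = 16, condition 2
	     (32 - 32 < 16) holds instead.  In both cases the two checks are
	     merged below by extending DR_A1's segment.  */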
2906 
2907 	  HOST_WIDE_INT
2908 	  min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2909 			     TREE_INT_CST_LOW (dr_b1->seg_len) :
2910 			     vect_factor;
2911 
2912 	  if (diff <= min_seg_len_b
2913 	      || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2914 		  && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2915 		     min_seg_len_b))
2916 	    {
2917 	      if (dump_enabled_p ())
2918 		{
2919 		  dump_printf_loc (MSG_NOTE, vect_location,
2920 				   "merging ranges for ");
2921 		  dump_generic_expr (MSG_NOTE, TDF_SLIM,
2922 				     DR_REF (dr_a1->dr));
2923 		  dump_printf (MSG_NOTE,  ", ");
2924 		  dump_generic_expr (MSG_NOTE, TDF_SLIM,
2925 				     DR_REF (dr_b1->dr));
2926 		  dump_printf (MSG_NOTE,  " and ");
2927 		  dump_generic_expr (MSG_NOTE, TDF_SLIM,
2928 				     DR_REF (dr_a2->dr));
2929 		  dump_printf (MSG_NOTE,  ", ");
2930 		  dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 				     DR_REF (dr_b2->dr));
2932 		  dump_printf (MSG_NOTE, "\n");
2933 		}
2934 
2935 	      dr_a1->seg_len = size_binop (PLUS_EXPR,
2936 					   dr_a2->seg_len, size_int (diff));
2937 	      comp_alias_ddrs.ordered_remove (i--);
2938 	    }
2939 	}
2940     }
2941 
2942   dump_printf_loc (MSG_NOTE, vect_location,
2943 		   "improved number of alias checks from %d to %d\n",
2944 		   may_alias_ddrs.length (), comp_alias_ddrs.length ());
2945   if ((int) comp_alias_ddrs.length () >
2946       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2947     return false;
2948 
2949   return true;
2950 }
2951 
2952 /* Check whether a non-affine read in stmt is suitable for gather load
2953    and if so, return a builtin decl for that operation.  */
2954 
2955 tree
2956 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2957 		   tree *offp, int *scalep)
2958 {
2959   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2960   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2961   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2962   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2963   tree offtype = NULL_TREE;
2964   tree decl, base, off;
2965   enum machine_mode pmode;
2966   int punsignedp, pvolatilep;
2967 
2968   base = DR_REF (dr);
2969   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2970      see if we can use the def stmt of the address.  */
2971   if (is_gimple_call (stmt)
2972       && gimple_call_internal_p (stmt)
2973       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2974 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2975       && TREE_CODE (base) == MEM_REF
2976       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2977       && integer_zerop (TREE_OPERAND (base, 1))
2978       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2979     {
2980       gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2981       if (is_gimple_assign (def_stmt)
2982 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2983 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2984     }
2985 
2986   /* The gather builtins need address of the form
2987      loop_invariant + vector * {1, 2, 4, 8}
2988      or
2989      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2990      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2991      of loop invariants/SSA_NAMEs defined in the loop, with casts,
2992      multiplications and additions in it.  To get a vector, we need
2993      a single SSA_NAME that will be defined in the loop and will
2994      contain everything that is not loop invariant and that can be
2995      vectorized.  The following code attempts to find such a preexisting
2996      SSA_NAME OFF and put the loop invariants into a tree BASE
2997      that can be gimplified before the loop.  */
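  /* For example (an informal sketch): for a read such as p[idx[i]] inside
     the loop, with p loop invariant and idx[i] reloaded every iteration,
     the intent is to end up with BASE = (sizetype) p, OFF = the SSA_NAME
     holding the (possibly sign-extended) idx[i] value, and SCALE
     = sizeof (*p), matching the forms above.  */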
2998   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2999 			      &pmode, &punsignedp, &pvolatilep, false);
3000   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3001 
3002   if (TREE_CODE (base) == MEM_REF)
3003     {
3004       if (!integer_zerop (TREE_OPERAND (base, 1)))
3005 	{
3006 	  if (off == NULL_TREE)
3007 	    {
3008 	      double_int moff = mem_ref_offset (base);
3009 	      off = double_int_to_tree (sizetype, moff);
3010 	    }
3011 	  else
3012 	    off = size_binop (PLUS_EXPR, off,
3013 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3014 	}
3015       base = TREE_OPERAND (base, 0);
3016     }
3017   else
3018     base = build_fold_addr_expr (base);
3019 
3020   if (off == NULL_TREE)
3021     off = size_zero_node;
3022 
3023   /* If BASE is not loop invariant, then if OFF is 0 we start with just
3024      the constant offset in the loop invariant BASE and continue with BASE
3025      as OFF; otherwise we give up.
3026      We could handle that case by gimplifying the addition of BASE + OFF
3027      into some SSA_NAME and using that as OFF, but for now we punt.  */
3028   if (!expr_invariant_in_loop_p (loop, base))
3029     {
3030       if (!integer_zerop (off))
3031 	return NULL_TREE;
3032       off = base;
3033       base = size_int (pbitpos / BITS_PER_UNIT);
3034     }
3035   /* Otherwise put base + constant offset into the loop invariant BASE
3036      and continue with OFF.  */
3037   else
3038     {
3039       base = fold_convert (sizetype, base);
3040       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3041     }
3042 
3043   /* OFF at this point may be either a SSA_NAME or some tree expression
3044      from get_inner_reference.  Try to peel off loop invariants from it
3045      into BASE as long as possible.  */
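  /* A sketch with hypothetical SSA names: if OFF is _3, defined as
     _3 = _2 * 4, and _2 = _1 + n with n loop invariant, then the MULT_EXPR
     case below records SCALE = 4 and continues with _2, and the PLUS_EXPR
     case then folds n * 4 into BASE and continues with _1.  */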
3046   STRIP_NOPS (off);
3047   while (offtype == NULL_TREE)
3048     {
3049       enum tree_code code;
3050       tree op0, op1, add = NULL_TREE;
3051 
3052       if (TREE_CODE (off) == SSA_NAME)
3053 	{
3054 	  gimple def_stmt = SSA_NAME_DEF_STMT (off);
3055 
3056 	  if (expr_invariant_in_loop_p (loop, off))
3057 	    return NULL_TREE;
3058 
3059 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3060 	    break;
3061 
3062 	  op0 = gimple_assign_rhs1 (def_stmt);
3063 	  code = gimple_assign_rhs_code (def_stmt);
3064 	  op1 = gimple_assign_rhs2 (def_stmt);
3065 	}
3066       else
3067 	{
3068 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3069 	    return NULL_TREE;
3070 	  code = TREE_CODE (off);
3071 	  extract_ops_from_tree (off, &code, &op0, &op1);
3072 	}
3073       switch (code)
3074 	{
3075 	case POINTER_PLUS_EXPR:
3076 	case PLUS_EXPR:
3077 	  if (expr_invariant_in_loop_p (loop, op0))
3078 	    {
3079 	      add = op0;
3080 	      off = op1;
3081 	    do_add:
3082 	      add = fold_convert (sizetype, add);
3083 	      if (scale != 1)
3084 		add = size_binop (MULT_EXPR, add, size_int (scale));
3085 	      base = size_binop (PLUS_EXPR, base, add);
3086 	      continue;
3087 	    }
3088 	  if (expr_invariant_in_loop_p (loop, op1))
3089 	    {
3090 	      add = op1;
3091 	      off = op0;
3092 	      goto do_add;
3093 	    }
3094 	  break;
3095 	case MINUS_EXPR:
3096 	  if (expr_invariant_in_loop_p (loop, op1))
3097 	    {
3098 	      add = fold_convert (sizetype, op1);
3099 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3100 	      off = op0;
3101 	      goto do_add;
3102 	    }
3103 	  break;
3104 	case MULT_EXPR:
3105 	  if (scale == 1 && tree_fits_shwi_p (op1))
3106 	    {
3107 	      scale = tree_to_shwi (op1);
3108 	      off = op0;
3109 	      continue;
3110 	    }
3111 	  break;
3112 	case SSA_NAME:
3113 	  off = op0;
3114 	  continue;
3115 	CASE_CONVERT:
3116 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3117 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3118 	    break;
3119 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3120 	      == TYPE_PRECISION (TREE_TYPE (off)))
3121 	    {
3122 	      off = op0;
3123 	      continue;
3124 	    }
3125 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3126 	      < TYPE_PRECISION (TREE_TYPE (off)))
3127 	    {
3128 	      off = op0;
3129 	      offtype = TREE_TYPE (off);
3130 	      STRIP_NOPS (off);
3131 	      continue;
3132 	    }
3133 	  break;
3134 	default:
3135 	  break;
3136 	}
3137       break;
3138     }
3139 
3140   /* If at the end OFF still isn't a SSA_NAME or isn't
3141      defined in the loop, punt.  */
3142   if (TREE_CODE (off) != SSA_NAME
3143       || expr_invariant_in_loop_p (loop, off))
3144     return NULL_TREE;
3145 
3146   if (offtype == NULL_TREE)
3147     offtype = TREE_TYPE (off);
3148 
3149   decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3150 					   offtype, scale);
3151   if (decl == NULL_TREE)
3152     return NULL_TREE;
3153 
3154   if (basep)
3155     *basep = base;
3156   if (offp)
3157     *offp = off;
3158   if (scalep)
3159     *scalep = scale;
3160   return decl;
3161 }
3162 
3163 /* Function vect_analyze_data_refs.
3164 
3165    Find all the data references in the loop or basic block.
3166 
3167    The general structure of the analysis of data refs in the vectorizer is as
3168    follows:
3169    1- vect_analyze_data_refs(loop/bb): call
3170       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3171       in the loop/bb and their dependences.
3172    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3173    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3174    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3175 
3176 */
3177 
3178 bool
3179 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3180 			bb_vec_info bb_vinfo,
3181 			int *min_vf, unsigned *n_stmts)
3182 {
3183   struct loop *loop = NULL;
3184   basic_block bb = NULL;
3185   unsigned int i;
3186   vec<data_reference_p> datarefs;
3187   struct data_reference *dr;
3188   tree scalar_type;
3189 
3190   if (dump_enabled_p ())
3191     dump_printf_loc (MSG_NOTE, vect_location,
3192                      "=== vect_analyze_data_refs ===\n");
3193 
3194   if (loop_vinfo)
3195     {
3196       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3197 
3198       loop = LOOP_VINFO_LOOP (loop_vinfo);
3199       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3200       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3201 	{
3202 	  if (dump_enabled_p ())
3203 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3204 	                     "not vectorized: loop contains function calls"
3205 	                     " or data references that cannot be analyzed\n");
3206 	  return false;
3207 	}
3208 
3209       for (i = 0; i < loop->num_nodes; i++)
3210 	{
3211 	  gimple_stmt_iterator gsi;
3212 
3213 	  for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3214 	    {
3215 	      gimple stmt = gsi_stmt (gsi);
3216 	      if (is_gimple_debug (stmt))
3217 		continue;
3218 	      ++*n_stmts;
3219 	      if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3220 		{
3221 		  if (is_gimple_call (stmt) && loop->safelen)
3222 		    {
3223 		      tree fndecl = gimple_call_fndecl (stmt), op;
3224 		      if (fndecl != NULL_TREE)
3225 			{
3226 			  struct cgraph_node *node = cgraph_get_node (fndecl);
3227 			  if (node != NULL && node->simd_clones != NULL)
3228 			    {
3229 			      unsigned int j, n = gimple_call_num_args (stmt);
3230 			      for (j = 0; j < n; j++)
3231 				{
3232 				  op = gimple_call_arg (stmt, j);
3233 				  if (DECL_P (op)
3234 				      || (REFERENCE_CLASS_P (op)
3235 					  && get_base_address (op)))
3236 				    break;
3237 				}
3238 			      op = gimple_call_lhs (stmt);
3239 			      /* Ignore #pragma omp declare simd functions
3240 				 if they don't have data references in the
3241 				 call stmt itself.  */
3242 			      if (j == n
3243 				  && !(op
3244 				       && (DECL_P (op)
3245 					   || (REFERENCE_CLASS_P (op)
3246 					       && get_base_address (op)))))
3247 				continue;
3248 			    }
3249 			}
3250 		    }
3251 		  LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3252 		  if (dump_enabled_p ())
3253 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3254 				     "not vectorized: loop contains function "
3255 				     "calls or data references that cannot "
3256 				     "be analyzed\n");
3257 		  return false;
3258 		}
3259 	    }
3260 	}
3261 
3262       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3263     }
3264   else
3265     {
3266       gimple_stmt_iterator gsi;
3267 
3268       bb = BB_VINFO_BB (bb_vinfo);
3269       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3270 	{
3271 	  gimple stmt = gsi_stmt (gsi);
3272 	  if (is_gimple_debug (stmt))
3273 	    continue;
3274 	  ++*n_stmts;
3275 	  if (!find_data_references_in_stmt (NULL, stmt,
3276 					     &BB_VINFO_DATAREFS (bb_vinfo)))
3277 	    {
3278 	      /* Mark the rest of the basic-block as unvectorizable.  */
3279 	      for (; !gsi_end_p (gsi); gsi_next (&gsi))
3280 		{
3281 		  stmt = gsi_stmt (gsi);
3282 		  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3283 		}
3284 	      break;
3285 	    }
3286 	}
3287 
3288       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3289     }
3290 
3291   /* Go through the data-refs, check that the analysis succeeded.  Update
3292      pointer from stmt_vec_info struct to DR and vectype.  */
3293 
3294   FOR_EACH_VEC_ELT (datarefs, i, dr)
3295     {
3296       gimple stmt;
3297       stmt_vec_info stmt_info;
3298       tree base, offset, init;
3299       bool gather = false;
3300       bool simd_lane_access = false;
3301       int vf;
3302 
3303 again:
3304       if (!dr || !DR_REF (dr))
3305         {
3306           if (dump_enabled_p ())
3307 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 	                     "not vectorized: unhandled data-ref\n");
3309           return false;
3310         }
3311 
3312       stmt = DR_STMT (dr);
3313       stmt_info = vinfo_for_stmt (stmt);
3314 
3315       /* Discard clobbers from the dataref vector.  We will remove
3316          clobber stmts during vectorization.  */
3317       if (gimple_clobber_p (stmt))
3318 	{
3319 	  free_data_ref (dr);
3320 	  if (i == datarefs.length () - 1)
3321 	    {
3322 	      datarefs.pop ();
3323 	      break;
3324 	    }
3325 	  datarefs.ordered_remove (i);
3326 	  dr = datarefs[i];
3327 	  goto again;
3328 	}
3329 
3330       /* Check that analysis of the data-ref succeeded.  */
3331       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3332 	  || !DR_STEP (dr))
3333         {
3334 	  bool maybe_gather
3335 	    = DR_IS_READ (dr)
3336 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3337 	      && targetm.vectorize.builtin_gather != NULL;
3338 	  bool maybe_simd_lane_access
3339 	    = loop_vinfo && loop->simduid;
3340 
3341 	  /* If the target supports vector gather loads, or if this might be
3342 	     a SIMD lane access, see if we can use them instead.  */
3343 	  if (loop_vinfo
3344 	      && (maybe_gather || maybe_simd_lane_access)
3345 	      && !nested_in_vect_loop_p (loop, stmt))
3346 	    {
3347 	      struct data_reference *newdr
3348 		= create_data_ref (NULL, loop_containing_stmt (stmt),
3349 				   DR_REF (dr), stmt, true);
3350 	      gcc_assert (newdr != NULL && DR_REF (newdr));
3351 	      if (DR_BASE_ADDRESS (newdr)
3352 		  && DR_OFFSET (newdr)
3353 		  && DR_INIT (newdr)
3354 		  && DR_STEP (newdr)
3355 		  && integer_zerop (DR_STEP (newdr)))
3356 		{
3357 		  if (maybe_simd_lane_access)
3358 		    {
3359 		      tree off = DR_OFFSET (newdr);
3360 		      STRIP_NOPS (off);
3361 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3362 			  && TREE_CODE (off) == MULT_EXPR
3363 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3364 			{
3365 			  tree step = TREE_OPERAND (off, 1);
3366 			  off = TREE_OPERAND (off, 0);
3367 			  STRIP_NOPS (off);
3368 			  if (CONVERT_EXPR_P (off)
3369 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3370 									  0)))
3371 				 < TYPE_PRECISION (TREE_TYPE (off)))
3372 			    off = TREE_OPERAND (off, 0);
3373 			  if (TREE_CODE (off) == SSA_NAME)
3374 			    {
3375 			      gimple def = SSA_NAME_DEF_STMT (off);
3376 			      tree reft = TREE_TYPE (DR_REF (newdr));
3377 			      if (is_gimple_call (def)
3378 				  && gimple_call_internal_p (def)
3379 				  && (gimple_call_internal_fn (def)
3380 				      == IFN_GOMP_SIMD_LANE))
3381 				{
3382 				  tree arg = gimple_call_arg (def, 0);
3383 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
3384 				  arg = SSA_NAME_VAR (arg);
3385 				  if (arg == loop->simduid
3386 				      /* For now.  */
3387 				      && tree_int_cst_equal
3388 					   (TYPE_SIZE_UNIT (reft),
3389 					    step))
3390 				    {
3391 				      DR_OFFSET (newdr) = ssize_int (0);
3392 				      DR_STEP (newdr) = step;
3393 				      DR_ALIGNED_TO (newdr)
3394 					= size_int (BIGGEST_ALIGNMENT);
3395 				      dr = newdr;
3396 				      simd_lane_access = true;
3397 				    }
3398 				}
3399 			    }
3400 			}
3401 		    }
3402 		  if (!simd_lane_access && maybe_gather)
3403 		    {
3404 		      dr = newdr;
3405 		      gather = true;
3406 		    }
3407 		}
3408 	      if (!gather && !simd_lane_access)
3409 		free_data_ref (newdr);
3410 	    }
3411 
3412 	  if (!gather && !simd_lane_access)
3413 	    {
3414 	      if (dump_enabled_p ())
3415 		{
3416 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3417                                    "not vectorized: data ref analysis "
3418                                    "failed ");
3419 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3420                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3421 		}
3422 
3423 	      if (bb_vinfo)
3424 		break;
3425 
3426 	      return false;
3427 	    }
3428         }
3429 
3430       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3431         {
3432           if (dump_enabled_p ())
3433             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3434                              "not vectorized: base addr of dr is a "
3435                              "constant\n");
3436 
3437           if (bb_vinfo)
3438 	    break;
3439 
3440 	  if (gather || simd_lane_access)
3441 	    free_data_ref (dr);
3442 	  return false;
3443         }
3444 
3445       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3446         {
3447           if (dump_enabled_p ())
3448             {
3449               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3450                                "not vectorized: volatile type ");
3451               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3452               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3453             }
3454 
3455           if (bb_vinfo)
3456 	    break;
3457 
3458           return false;
3459         }
3460 
3461       if (stmt_can_throw_internal (stmt))
3462         {
3463           if (dump_enabled_p ())
3464             {
3465               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3466                                "not vectorized: statement can throw an "
3467                                "exception ");
3468               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3469               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3470             }
3471 
3472           if (bb_vinfo)
3473 	    break;
3474 
3475 	  if (gather || simd_lane_access)
3476 	    free_data_ref (dr);
3477           return false;
3478         }
3479 
3480       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3481 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3482 	{
3483           if (dump_enabled_p ())
3484             {
3485               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3486                                "not vectorized: statement is bitfield "
3487                                "access ");
3488               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3489               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3490             }
3491 
3492           if (bb_vinfo)
3493 	    break;
3494 
3495 	  if (gather || simd_lane_access)
3496 	    free_data_ref (dr);
3497           return false;
3498 	}
3499 
3500       base = unshare_expr (DR_BASE_ADDRESS (dr));
3501       offset = unshare_expr (DR_OFFSET (dr));
3502       init = unshare_expr (DR_INIT (dr));
3503 
3504       if (is_gimple_call (stmt)
3505 	  && (!gimple_call_internal_p (stmt)
3506 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3507 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3508 	{
3509 	  if (dump_enabled_p ())
3510 	    {
3511 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3512 	                       "not vectorized: dr in a call ");
3513 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3514 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3515 	    }
3516 
3517 	  if (bb_vinfo)
3518 	    break;
3519 
3520 	  if (gather || simd_lane_access)
3521 	    free_data_ref (dr);
3522 	  return false;
3523 	}
3524 
3525       /* Update DR field in stmt_vec_info struct.  */
3526 
3527       /* If the dataref is in an inner-loop of the loop that is considered
3528 	 for vectorization, we also want to analyze the access relative to
3529 	 the outer-loop (DR contains information only relative to the
3530 	 inner-most enclosing loop).  We do that by building a reference to the
3531 	 first location accessed by the inner-loop, and analyze it relative to
3532 	 the outer-loop.  */
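      /* For instance (informal), for the access in[i+j] described in the
	 comment of vect_create_addr_base_for_vector_ref below: the inner
	 j-loop DR has a step of one 'short'; here we additionally compute
	 the base, offset, init and step of &in[i] with respect to the
	 outer i-loop and record them in the STMT_VINFO_DR_* fields.  */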
3533       if (loop && nested_in_vect_loop_p (loop, stmt))
3534 	{
3535 	  tree outer_step, outer_base, outer_init;
3536 	  HOST_WIDE_INT pbitsize, pbitpos;
3537 	  tree poffset;
3538 	  enum machine_mode pmode;
3539 	  int punsignedp, pvolatilep;
3540 	  affine_iv base_iv, offset_iv;
3541 	  tree dinit;
3542 
3543 	  /* Build a reference to the first location accessed by the
3544 	     inner-loop: *(BASE+INIT).  (The first location is actually
3545 	     BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3546           tree inner_base = build_fold_indirect_ref
3547                                 (fold_build_pointer_plus (base, init));
3548 
3549 	  if (dump_enabled_p ())
3550 	    {
3551 	      dump_printf_loc (MSG_NOTE, vect_location,
3552                                "analyze in outer-loop: ");
3553 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3554 	      dump_printf (MSG_NOTE, "\n");
3555 	    }
3556 
3557 	  outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3558 		          &poffset, &pmode, &punsignedp, &pvolatilep, false);
3559 	  gcc_assert (outer_base != NULL_TREE);
3560 
3561 	  if (pbitpos % BITS_PER_UNIT != 0)
3562 	    {
3563 	      if (dump_enabled_p ())
3564 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3565                                  "failed: bit offset alignment.\n");
3566 	      return false;
3567 	    }
3568 
3569 	  outer_base = build_fold_addr_expr (outer_base);
3570 	  if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3571                           &base_iv, false))
3572 	    {
3573 	      if (dump_enabled_p ())
3574 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3575                                  "failed: evolution of base is not affine.\n");
3576 	      return false;
3577 	    }
3578 
3579 	  if (offset)
3580 	    {
3581 	      if (poffset)
3582 		poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3583                                        poffset);
3584 	      else
3585 		poffset = offset;
3586 	    }
3587 
3588 	  if (!poffset)
3589 	    {
3590 	      offset_iv.base = ssize_int (0);
3591 	      offset_iv.step = ssize_int (0);
3592 	    }
3593 	  else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3594                                &offset_iv, false))
3595 	    {
3596 	      if (dump_enabled_p ())
3597 	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598                                  "evolution of offset is not affine.\n");
3599 	      return false;
3600 	    }
3601 
3602 	  outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3603 	  split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3604 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3605 	  split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3606 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3607 
3608 	  outer_step = size_binop (PLUS_EXPR,
3609 				fold_convert (ssizetype, base_iv.step),
3610 				fold_convert (ssizetype, offset_iv.step));
3611 
3612 	  STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3613 	  /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3614 	  STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3615 	  STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3616 	  STMT_VINFO_DR_OFFSET (stmt_info) =
3617 				fold_convert (ssizetype, offset_iv.base);
3618 	  STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3619 				size_int (highest_pow2_factor (offset_iv.base));
3620 
3621           if (dump_enabled_p ())
3622 	    {
3623 	      dump_printf_loc (MSG_NOTE, vect_location,
3624                                "\touter base_address: ");
3625 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3626                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3627 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3628 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3629                                  STMT_VINFO_DR_OFFSET (stmt_info));
3630 	      dump_printf (MSG_NOTE,
3631                            "\n\touter constant offset from base address: ");
3632 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3633                                  STMT_VINFO_DR_INIT (stmt_info));
3634 	      dump_printf (MSG_NOTE, "\n\touter step: ");
3635 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3636                                  STMT_VINFO_DR_STEP (stmt_info));
3637 	      dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3638 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3639                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3640 	      dump_printf (MSG_NOTE, "\n");
3641 	    }
3642 	}
3643 
3644       if (STMT_VINFO_DATA_REF (stmt_info))
3645         {
3646           if (dump_enabled_p ())
3647             {
3648               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649                                "not vectorized: more than one data ref "
3650                                "in stmt: ");
3651               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3652               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3653             }
3654 
3655           if (bb_vinfo)
3656 	    break;
3657 
3658 	  if (gather || simd_lane_access)
3659 	    free_data_ref (dr);
3660           return false;
3661         }
3662 
3663       STMT_VINFO_DATA_REF (stmt_info) = dr;
3664       if (simd_lane_access)
3665 	{
3666 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3667 	  free_data_ref (datarefs[i]);
3668 	  datarefs[i] = dr;
3669 	}
3670 
3671       /* Set vectype for STMT.  */
3672       scalar_type = TREE_TYPE (DR_REF (dr));
3673       STMT_VINFO_VECTYPE (stmt_info)
3674 	= get_vectype_for_scalar_type (scalar_type);
3675       if (!STMT_VINFO_VECTYPE (stmt_info))
3676         {
3677           if (dump_enabled_p ())
3678             {
3679               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680                                "not vectorized: no vectype for stmt: ");
3681               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3682               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3683               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3684                                  scalar_type);
3685               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3686             }
3687 
3688           if (bb_vinfo)
3689 	    break;
3690 
3691 	  if (gather || simd_lane_access)
3692 	    {
3693 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3694 	      if (gather)
3695 		free_data_ref (dr);
3696 	    }
3697 	  return false;
3698         }
3699       else
3700 	{
3701 	  if (dump_enabled_p ())
3702 	    {
3703 	      dump_printf_loc (MSG_NOTE, vect_location,
3704 			       "got vectype for stmt: ");
3705 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3706 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3707 				 STMT_VINFO_VECTYPE (stmt_info));
3708 	      dump_printf (MSG_NOTE, "\n");
3709 	    }
3710 	}
3711 
3712       /* Adjust the minimal vectorization factor according to the
3713 	 vector type.  */
3714       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3715       if (vf > *min_vf)
3716 	*min_vf = vf;
3717 
3718       if (gather)
3719 	{
3720 	  tree off;
3721 
3722 	  gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3723 	  if (gather
3724 	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3725 	    gather = false;
3726 	  if (!gather)
3727 	    {
3728 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3729 	      free_data_ref (dr);
3730 	      if (dump_enabled_p ())
3731 		{
3732 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3733                                    "not vectorized: not suitable for gather "
3734                                    "load ");
3735 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3736                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3737 		}
3738 	      return false;
3739 	    }
3740 
3741 	  datarefs[i] = dr;
3742 	  STMT_VINFO_GATHER_P (stmt_info) = true;
3743 	}
3744       else if (loop_vinfo
3745 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3746 	{
3747 	  if (nested_in_vect_loop_p (loop, stmt)
3748 	      || !DR_IS_READ (dr))
3749 	    {
3750 	      if (dump_enabled_p ())
3751 		{
3752 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3753                                    "not vectorized: not suitable for strided "
3754                                    "load ");
3755 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3756                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3757 		}
3758 	      return false;
3759 	    }
3760 	  STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3761 	}
3762     }
3763 
3764   /* If we stopped the analysis at the first dataref we could not analyze
3765      while trying to vectorize a basic block, mark the rest of the datarefs
3766      as not vectorizable and truncate the vector of datarefs.  That
3767      avoids wasting time analyzing their dependences.  */
3768   if (i != datarefs.length ())
3769     {
3770       gcc_assert (bb_vinfo != NULL);
3771       for (unsigned j = i; j < datarefs.length (); ++j)
3772 	{
3773 	  data_reference_p dr = datarefs[j];
3774           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3775 	  free_data_ref (dr);
3776 	}
3777       datarefs.truncate (i);
3778     }
3779 
3780   return true;
3781 }
3782 
3783 
3784 /* Function vect_get_new_vect_var.
3785 
3786    Returns a new variable.  The current naming scheme prepends a prefix
3787    selected by VAR_KIND ("vect_", "stmp_" or "vectp_") to NAME when NAME
3788    is provided, and otherwise uses just the prefix as the name of the
3789    vectorizer-generated variable.  */
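
/* For instance (illustrative), vect_get_new_vect_var (ptr_type,
   vect_pointer_var, "a") creates a temporary whose name in dumps looks
   like "vectp_a" followed by a uniquifying numeric suffix.  */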
3790 
3791 tree
3792 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3793 {
3794   const char *prefix;
3795   tree new_vect_var;
3796 
3797   switch (var_kind)
3798   {
3799   case vect_simple_var:
3800     prefix = "vect";
3801     break;
3802   case vect_scalar_var:
3803     prefix = "stmp";
3804     break;
3805   case vect_pointer_var:
3806     prefix = "vectp";
3807     break;
3808   default:
3809     gcc_unreachable ();
3810   }
3811 
3812   if (name)
3813     {
3814       char* tmp = concat (prefix, "_", name, NULL);
3815       new_vect_var = create_tmp_reg (type, tmp);
3816       free (tmp);
3817     }
3818   else
3819     new_vect_var = create_tmp_reg (type, prefix);
3820 
3821   return new_vect_var;
3822 }
3823 
3824 
3825 /* Function vect_create_addr_base_for_vector_ref.
3826 
3827    Create an expression that computes the address of the first memory location
3828    that will be accessed for a data reference.
3829 
3830    Input:
3831    STMT: The statement containing the data reference.
3832    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3833    OFFSET: Optional.  If supplied, it is added to the initial address.
3834    LOOP:    Specify relative to which loop-nest should the address be computed.
3835             For example, when the dataref is in an inner-loop nested in an
3836 	    outer-loop that is now being vectorized, LOOP can be either the
3837 	    outer-loop, or the inner-loop.  The first memory location accessed
3838 	    by the following dataref ('in' points to short):
3839 
3840 		for (i=0; i<N; i++)
3841 		   for (j=0; j<M; j++)
3842 		     s += in[i+j]
3843 
3844 	    is as follows:
3845 	    if LOOP=i_loop:	&in		(relative to i_loop)
3846 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
3847    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3848 	    initial address.  Unlike OFFSET, which is the number of elements to
3849 	    be added, BYTE_OFFSET is measured in bytes.
3850 
3851    Output:
3852    1. Return an SSA_NAME whose value is the address of the memory location of
3853       the first vector of the data reference.
3854    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3855       these statement(s) which define the returned SSA_NAME.
3856 
3857    FORNOW: We are only handling array accesses with step 1.  */
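
/* As an illustration (hypothetical names): for a dataref a[i] with OFFSET
   and BYTE_OFFSET omitted, the statements appended to NEW_STMT_LIST amount
   to something like

       vectp_a.0 = (vector(4) float *) &a;

   i.e. DR_BASE_ADDRESS plus the constant DR_INIT/DR_OFFSET parts, cast to
   a pointer to the statement's vector type and forced into an SSA name.  */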
3858 
3859 tree
3860 vect_create_addr_base_for_vector_ref (gimple stmt,
3861 				      gimple_seq *new_stmt_list,
3862 				      tree offset,
3863 				      struct loop *loop,
3864 				      tree byte_offset)
3865 {
3866   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3867   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3868   tree data_ref_base;
3869   const char *base_name;
3870   tree addr_base;
3871   tree dest;
3872   gimple_seq seq = NULL;
3873   tree base_offset;
3874   tree init;
3875   tree vect_ptr_type;
3876   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3877   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3878 
3879   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3880     {
3881       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3882 
3883       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3884 
3885       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3886       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3887       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3888     }
3889   else
3890     {
3891       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3892       base_offset = unshare_expr (DR_OFFSET (dr));
3893       init = unshare_expr (DR_INIT (dr));
3894     }
3895 
3896   if (loop_vinfo)
3897     base_name = get_name (data_ref_base);
3898   else
3899     {
3900       base_offset = ssize_int (0);
3901       init = ssize_int (0);
3902       base_name = get_name (DR_REF (dr));
3903     }
3904 
3905   /* Create base_offset */
3906   base_offset = size_binop (PLUS_EXPR,
3907 			    fold_convert (sizetype, base_offset),
3908 			    fold_convert (sizetype, init));
3909 
3910   if (offset)
3911     {
3912       offset = fold_build2 (MULT_EXPR, sizetype,
3913 			    fold_convert (sizetype, offset), step);
3914       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3915 				 base_offset, offset);
3916     }
3917   if (byte_offset)
3918     {
3919       byte_offset = fold_convert (sizetype, byte_offset);
3920       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3921 				 base_offset, byte_offset);
3922     }
3923 
3924   /* base + base_offset */
3925   if (loop_vinfo)
3926     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3927   else
3928     {
3929       addr_base = build1 (ADDR_EXPR,
3930 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
3931 			  unshare_expr (DR_REF (dr)));
3932     }
3933 
3934   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3935   addr_base = fold_convert (vect_ptr_type, addr_base);
3936   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3937   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3938   gimple_seq_add_seq (new_stmt_list, seq);
3939 
3940   if (DR_PTR_INFO (dr)
3941       && TREE_CODE (addr_base) == SSA_NAME)
3942     {
3943       duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3944       unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3945       int misalign = DR_MISALIGNMENT (dr);
3946       if (offset || byte_offset || (misalign == -1))
3947 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3948       else
3949 	set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
3950     }
3951 
3952   if (dump_enabled_p ())
3953     {
3954       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3955       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3956       dump_printf (MSG_NOTE, "\n");
3957     }
3958 
3959   return addr_base;
3960 }
3961 
3962 
3963 /* Function vect_create_data_ref_ptr.
3964 
3965    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3966    location accessed in the loop by STMT, along with the def-use update
3967    chain to appropriately advance the pointer through the loop iterations.
3968    Also set aliasing information for the pointer.  This pointer is used by
3969    the callers to this function to create a memory reference expression for
3970    vector load/store access.
3971 
3972    Input:
3973    1. STMT: a stmt that references memory. Expected to be of the form
3974          GIMPLE_ASSIGN <name, data-ref> or
3975 	 GIMPLE_ASSIGN <data-ref, name>.
3976    2. AGGR_TYPE: the type of the reference, which should be either a vector
3977         or an array.
3978    3. AT_LOOP: the loop where the vector memref is to be created.
3979    4. OFFSET (optional): an offset to be added to the initial address accessed
3980         by the data-ref in STMT.
3981    5. GSI: location where the new stmts are to be placed if there is no loop
3982    6. ONLY_INIT: indicates whether ap is to be updated in the loop, or is to
3983         remain pointing to the initial address.
3984    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3985 	to the initial address accessed by the data-ref in STMT.  This is
3986 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3987 	in bytes.
3988 
3989    Output:
3990    1. Declare a new ptr to vector_type, and have it point to the base of the
3991       data reference (initial address accessed by the data reference).
3992       For example, for vector of type V8HI, the following code is generated:
3993 
3994       v8hi *ap;
3995       ap = (v8hi *)initial_address;
3996 
3997       if OFFSET is not supplied:
3998          initial_address = &a[init];
3999       if OFFSET is supplied:
4000          initial_address = &a[init + OFFSET];
4001       if BYTE_OFFSET is supplied:
4002 	 initial_address = &a[init] + BYTE_OFFSET;
4003 
4004       Return the initial_address in INITIAL_ADDRESS.
4005 
4006    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4007       update the pointer in each iteration of the loop.
4008 
4009       Return the increment stmt that updates the pointer in PTR_INCR.
4010 
4011    3. Set INV_P to true if the access pattern of the data reference in the
4012       vectorized loop is invariant.  Set it to false otherwise.
4013 
4014    4. Return the pointer.  */
4015 
4016 tree
4017 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4018 			  tree offset, tree *initial_address,
4019 			  gimple_stmt_iterator *gsi, gimple *ptr_incr,
4020 			  bool only_init, bool *inv_p, tree byte_offset)
4021 {
4022   const char *base_name;
4023   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4024   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4025   struct loop *loop = NULL;
4026   bool nested_in_vect_loop = false;
4027   struct loop *containing_loop = NULL;
4028   tree aggr_ptr_type;
4029   tree aggr_ptr;
4030   tree new_temp;
4031   gimple vec_stmt;
4032   gimple_seq new_stmt_list = NULL;
4033   edge pe = NULL;
4034   basic_block new_bb;
4035   tree aggr_ptr_init;
4036   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4037   tree aptr;
4038   gimple_stmt_iterator incr_gsi;
4039   bool insert_after;
4040   tree indx_before_incr, indx_after_incr;
4041   gimple incr;
4042   tree step;
4043   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4044 
4045   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4046 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4047 
4048   if (loop_vinfo)
4049     {
4050       loop = LOOP_VINFO_LOOP (loop_vinfo);
4051       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4052       containing_loop = (gimple_bb (stmt))->loop_father;
4053       pe = loop_preheader_edge (loop);
4054     }
4055   else
4056     {
4057       gcc_assert (bb_vinfo);
4058       only_init = true;
4059       *ptr_incr = NULL;
4060     }
4061 
4062   /* Check the step (evolution) of the load in LOOP, and record
4063      whether it's invariant.  */
4064   if (nested_in_vect_loop)
4065     step = STMT_VINFO_DR_STEP (stmt_info);
4066   else
4067     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4068 
4069   if (integer_zerop (step))
4070     *inv_p = true;
4071   else
4072     *inv_p = false;
4073 
4074   /* Create an expression for the first address accessed by this load
4075      in LOOP.  */
4076   base_name = get_name (DR_BASE_ADDRESS (dr));
4077 
4078   if (dump_enabled_p ())
4079     {
4080       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4081       dump_printf_loc (MSG_NOTE, vect_location,
4082                        "create %s-pointer variable to type: ",
4083 		       get_tree_code_name (TREE_CODE (aggr_type)));
4084       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4085       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4086         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4087       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4088         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4089       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4090         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4091       else
4092         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4093       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4094       dump_printf (MSG_NOTE, "\n");
4095     }
4096 
4097   /* (1) Create the new aggregate-pointer variable.
4098      Vector and array types inherit the alias set of their component
4099      type by default so we need to use a ref-all pointer if the data
4100      reference does not conflict with the created aggregated data
4101      reference because it is not addressable.  */
4102   bool need_ref_all = false;
4103   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4104 			      get_alias_set (DR_REF (dr))))
4105     need_ref_all = true;
4106   /* Likewise for any of the data references in the stmt group.  */
4107   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4108     {
4109       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4110       do
4111 	{
4112 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4113 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4114 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4115 				      get_alias_set (DR_REF (sdr))))
4116 	    {
4117 	      need_ref_all = true;
4118 	      break;
4119 	    }
4120 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4121 	}
4122       while (orig_stmt);
4123     }
4124   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4125 					       need_ref_all);
4126   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4127 
4128 
4129   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4130      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4131      def-use update cycles for the pointer: one relative to the outer-loop
4132      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4133      to the inner-loop (which is the inner-most loop containing the dataref),
4134      and this is done by step (5) below.
4135 
4136      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4137      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4138      redundant.  Steps (3),(4) create the following:
4139 
4140 	vp0 = &base_addr;
4141 	LOOP:	vp1 = phi(vp0,vp2)
4142 		...
4143 		...
4144 		vp2 = vp1 + step
4145 		goto LOOP
4146 
4147      If there is an inner-loop nested in loop, then step (5) will also be
4148      applied, and an additional update in the inner-loop will be created:
4149 
4150 	vp0 = &base_addr;
4151 	LOOP:   vp1 = phi(vp0,vp2)
4152 		...
4153         inner:     vp3 = phi(vp1,vp4)
4154 	           vp4 = vp3 + inner_step
4155 	           if () goto inner
4156 		...
4157 		vp2 = vp1 + step
4158 		if () goto LOOP   */
4159 
4160   /* (2) Calculate the initial address of the aggregate-pointer, and set
4161      the aggregate-pointer to point to it before the loop.  */
4162 
4163   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4164 
4165   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4166 						   offset, loop, byte_offset);
4167   if (new_stmt_list)
4168     {
4169       if (pe)
4170         {
4171           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4172           gcc_assert (!new_bb);
4173         }
4174       else
4175         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4176     }
4177 
4178   *initial_address = new_temp;
4179 
4180   /* Create: p = (aggr_type *) initial_base  */
4181   if (TREE_CODE (new_temp) != SSA_NAME
4182       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4183     {
4184       vec_stmt = gimple_build_assign (aggr_ptr,
4185 				      fold_convert (aggr_ptr_type, new_temp));
4186       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4187       /* Copy the points-to information if it exists. */
4188       if (DR_PTR_INFO (dr))
4189 	duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4190       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4191       if (pe)
4192 	{
4193 	  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4194 	  gcc_assert (!new_bb);
4195 	}
4196       else
4197 	gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4198     }
4199   else
4200     aggr_ptr_init = new_temp;
4201 
4202   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4203      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4204      inner-loop nested in LOOP (during outer-loop vectorization).  */
4205 
4206   /* No update in loop is required.  */
4207   if (only_init && (!loop_vinfo || at_loop == loop))
4208     aptr = aggr_ptr_init;
4209   else
4210     {
4211       /* The step of the aggregate pointer is the type size.  */
4212       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4213       /* One exception to the above is when the scalar step of the load in
4214 	 LOOP is zero. In this case the step here is also zero.  */
4215       if (*inv_p)
4216 	iv_step = size_zero_node;
4217       else if (tree_int_cst_sgn (step) == -1)
4218 	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4219 
4220       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4221 
4222       create_iv (aggr_ptr_init,
4223 		 fold_convert (aggr_ptr_type, iv_step),
4224 		 aggr_ptr, loop, &incr_gsi, insert_after,
4225 		 &indx_before_incr, &indx_after_incr);
4226       incr = gsi_stmt (incr_gsi);
4227       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4228 
4229       /* Copy the points-to information if it exists. */
4230       if (DR_PTR_INFO (dr))
4231 	{
4232 	  duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4233 	  duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4234 	}
4235       if (ptr_incr)
4236 	*ptr_incr = incr;
4237 
4238       aptr = indx_before_incr;
4239     }
4240 
4241   if (!nested_in_vect_loop || only_init)
4242     return aptr;
4243 
4244 
4245   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4246      nested in LOOP, if it exists.  */
4247 
4248   gcc_assert (nested_in_vect_loop);
4249   if (!only_init)
4250     {
4251       standard_iv_increment_position (containing_loop, &incr_gsi,
4252 				      &insert_after);
4253       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4254 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4255 		 &indx_after_incr);
4256       incr = gsi_stmt (incr_gsi);
4257       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4258 
4259       /* Copy the points-to information if it exists. */
4260       if (DR_PTR_INFO (dr))
4261 	{
4262 	  duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4263 	  duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4264 	}
4265       if (ptr_incr)
4266 	*ptr_incr = incr;
4267 
4268       return indx_before_incr;
4269     }
4270   else
4271     gcc_unreachable ();
4272 }
4273 
4274 
4275 /* Function bump_vector_ptr
4276 
4277    Increment a pointer (to a vector type) by vector-size. If requested,
4278    i.e. if PTR-INCR is given, then also connect the new increment stmt
4279    to the existing def-use update-chain of the pointer, by modifying
4280    the PTR_INCR as illustrated below:
4281 
4282    The pointer def-use update-chain before this function:
4283                         DATAREF_PTR = phi (p_0, p_2)
4284                         ....
4285         PTR_INCR:       p_2 = DATAREF_PTR + step
4286 
4287    The pointer def-use update-chain after this function:
4288                         DATAREF_PTR = phi (p_0, p_2)
4289                         ....
4290                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4291                         ....
4292         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4293 
4294    Input:
4295    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4296                  in the loop.
4297    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4298 	      the loop.  The increment amount across iterations is expected
4299 	      to be vector_size.
4300    GSI - location where the new update stmt is to be placed.
4301    STMT - the original scalar memory-access stmt that is being vectorized.
4302    BUMP - optional. The offset by which to bump the pointer. If not given,
4303 	  the offset is assumed to be vector_size.
4304 
4305    Output: Return NEW_DATAREF_PTR as illustrated above.
4306 
4307 */
4308 
4309 tree
4310 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4311 		 gimple stmt, tree bump)
4312 {
4313   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4314   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4315   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4316   tree update = TYPE_SIZE_UNIT (vectype);
4317   gimple incr_stmt;
4318   ssa_op_iter iter;
4319   use_operand_p use_p;
4320   tree new_dataref_ptr;
4321 
4322   if (bump)
4323     update = bump;
4324 
4325   new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4326   incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4327 					    dataref_ptr, update);
4328   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4329 
4330   /* Copy the points-to information if it exists. */
4331   if (DR_PTR_INFO (dr))
4332     {
4333       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4334       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4335     }
4336 
4337   if (!ptr_incr)
4338     return new_dataref_ptr;
4339 
4340   /* Update the vector-pointer's cross-iteration increment.  */
4341   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4342     {
4343       tree use = USE_FROM_PTR (use_p);
4344 
4345       if (use == dataref_ptr)
4346         SET_USE (use_p, new_dataref_ptr);
4347       else
4348         gcc_assert (tree_int_cst_compare (use, update) == 0);
4349     }
4350 
4351   return new_dataref_ptr;
4352 }
4353 
4354 
4355 /* Function vect_create_destination_var.
4356 
4357    Create a new temporary of type VECTYPE.  */
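
/* As an illustration (hypothetical names): given a scalar SSA name x_5 and
   a V4SF VECTYPE this returns a fresh "vect_x_5" temporary of vector type;
   with VECTYPE == NULL_TREE a scalar "stmp_x_5" temporary is created
   instead.  */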
4358 
4359 tree
4360 vect_create_destination_var (tree scalar_dest, tree vectype)
4361 {
4362   tree vec_dest;
4363   const char *name;
4364   char *new_name;
4365   tree type;
4366   enum vect_var_kind kind;
4367 
4368   kind = vectype ? vect_simple_var : vect_scalar_var;
4369   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4370 
4371   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4372 
4373   name = get_name (scalar_dest);
4374   if (name)
4375     asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4376   else
4377     asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4378   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4379   free (new_name);
4380 
4381   return vec_dest;
4382 }
4383 
4384 /* Function vect_grouped_store_supported.
4385 
4386    Returns TRUE if interleave high and interleave low permutations
4387    are supported, and FALSE otherwise.  */
4388 
4389 bool
4390 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4391 {
4392   enum machine_mode mode = TYPE_MODE (vectype);
4393 
4394   /* vect_permute_store_chain requires the group size to be a power of two.  */
4395   if (exact_log2 (count) == -1)
4396     {
4397       if (dump_enabled_p ())
4398 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4399                          "the size of the group of accesses"
4400                          " is not a power of 2\n");
4401       return false;
4402     }
4403 
4404   /* Check that the permutation is supported.  */
4405   if (VECTOR_MODE_P (mode))
4406     {
4407       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4408       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4409       for (i = 0; i < nelt / 2; i++)
4410 	{
4411 	  sel[i * 2] = i;
4412 	  sel[i * 2 + 1] = i + nelt;
4413 	}
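      /* For example, for nelt == 4 the loop above builds the
	 interleave-high selector {0, 4, 1, 5}; adding nelt / 2 below
	 yields the interleave-low selector {2, 6, 3, 7}.  */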
4414       if (can_vec_perm_p (mode, false, sel))
4415 	{
4416 	  for (i = 0; i < nelt; i++)
4417 	    sel[i] += nelt / 2;
4418 	  if (can_vec_perm_p (mode, false, sel))
4419 	    return true;
4420 	}
4421     }
4422 
4423   if (dump_enabled_p ())
4424     dump_printf (MSG_MISSED_OPTIMIZATION,
4425                  "interleave op not supported by target.\n");
4426   return false;
4427 }
4428 
4429 
4430 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4431    type VECTYPE.  */
4432 
4433 bool
4434 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4435 {
4436   return vect_lanes_optab_supported_p ("vec_store_lanes",
4437 				       vec_store_lanes_optab,
4438 				       vectype, count);
4439 }
4440 
4441 
4442 /* Function vect_permute_store_chain.
4443 
4444    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4445    a power of 2, generate interleave_high/low stmts to reorder the data
4446    correctly for the stores.  Return the final references for stores in
4447    RESULT_CHAIN.
4448 
4449    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4450    The input is 4 vectors each containing 8 elements.  We assign a number to
4451    each element; the input sequence is:
4452 
4453    1st vec:   0  1  2  3  4  5  6  7
4454    2nd vec:   8  9 10 11 12 13 14 15
4455    3rd vec:  16 17 18 19 20 21 22 23
4456    4th vec:  24 25 26 27 28 29 30 31
4457 
4458    The output sequence should be:
4459 
4460    1st vec:  0  8 16 24  1  9 17 25
4461    2nd vec:  2 10 18 26  3 11 19 27
4462    3rd vec:  4 12 20 28  5 13 21 29
4463    4th vec:  6 14 22 30  7 15 23 31
4464 
4465    i.e., we interleave the contents of the four vectors in their order.
4466 
4467    We use interleave_high/low instructions to create such output.  The input of
4468    each interleave_high/low operation is two vectors:
4469    1st vec    2nd vec
4470    0 1 2 3    4 5 6 7
4471    the even elements of the result vector are obtained left-to-right from the
4472    high/low elements of the first vector.  The odd elements of the result are
4473    obtained left-to-right from the high/low elements of the second vector.
4474    The output of interleave_high will be:   0 4 1 5
4475    and of interleave_low:                   2 6 3 7
4476 
4477 
4478    The permutation is done in log2 (LENGTH) stages.  In each stage interleave_high
4479    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4480    where the first argument is taken from the first half of DR_CHAIN and the
4481    second argument from its second half.
4482    In our example,
4483 
4484    I1: interleave_high (1st vec, 3rd vec)
4485    I2: interleave_low (1st vec, 3rd vec)
4486    I3: interleave_high (2nd vec, 4th vec)
4487    I4: interleave_low (2nd vec, 4th vec)
4488 
4489    The output for the first stage is:
4490 
4491    I1:  0 16  1 17  2 18  3 19
4492    I2:  4 20  5 21  6 22  7 23
4493    I3:  8 24  9 25 10 26 11 27
4494    I4: 12 28 13 29 14 30 15 31
4495 
4496    The output of the second stage, i.e. the final result is:
4497 
4498    I1:  0  8 16 24  1  9 17 25
4499    I2:  2 10 18 26  3 11 19 27
4500    I3:  4 12 20 28  5 13 21 29
4501    I4:  6 14 22 30  7 15 23 31.  */
4502 
4503 void
4504 vect_permute_store_chain (vec<tree> dr_chain,
4505 			  unsigned int length,
4506 			  gimple stmt,
4507 			  gimple_stmt_iterator *gsi,
4508 			  vec<tree> *result_chain)
4509 {
4510   tree vect1, vect2, high, low;
4511   gimple perm_stmt;
4512   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4513   tree perm_mask_low, perm_mask_high;
4514   unsigned int i, n;
4515   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4516   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4517 
4518   result_chain->quick_grow (length);
4519   memcpy (result_chain->address (), dr_chain.address (),
4520 	  length * sizeof (tree));
4521 
4522   for (i = 0, n = nelt / 2; i < n; i++)
4523     {
4524       sel[i * 2] = i;
4525       sel[i * 2 + 1] = i + nelt;
4526     }
4527   perm_mask_high = vect_gen_perm_mask (vectype, sel);
4528   gcc_assert (perm_mask_high != NULL);
4529 
4530   for (i = 0; i < nelt; i++)
4531     sel[i] += nelt / 2;
4532   perm_mask_low = vect_gen_perm_mask (vectype, sel);
4533   gcc_assert (perm_mask_low != NULL);
4534 
4535   for (i = 0, n = exact_log2 (length); i < n; i++)
4536     {
4537       for (j = 0; j < length/2; j++)
4538 	{
4539 	  vect1 = dr_chain[j];
4540 	  vect2 = dr_chain[j+length/2];
4541 
4542 	  /* Create interleaving stmt:
4543 	     high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
4544 	  high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4545 	  perm_stmt
4546 	    = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4547 					    vect1, vect2, perm_mask_high);
4548 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4549 	  (*result_chain)[2*j] = high;
4550 
4551 	  /* Create interleaving stmt:
4552 	     low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4553 						 nelt*3/2+1, ...}>  */
4554 	  low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4555 	  perm_stmt
4556 	    = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4557 					    vect1, vect2, perm_mask_low);
4558 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4559 	  (*result_chain)[2*j+1] = low;
4560 	}
4561       memcpy (dr_chain.address (), result_chain->address (),
4562 	      length * sizeof (tree));
4563     }
4564 }
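
/* Illustrative aside (not GCC code; kept under "#if 0"): a standalone
   scalar model of the log2 (LENGTH) interleave stages performed above,
   operating on element numbers instead of SSA names.  With LEN = 4 and
   NELT = 8 it reproduces the final sequences documented in the comment
   before the function (0 8 16 24 1 9 17 25, and so on).  */
#if 0
#include <stdio.h>
#include <string.h>

#define LEN  4			/* vectors in the chain (a power of 2) */
#define NELT 8			/* elements per vector */

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  unsigned int i, j, k, stage;

  /* Number the input elements 0..31 as in the comment above.  */
  for (j = 0; j < LEN; j++)
    for (k = 0; k < NELT; k++)
      chain[j][k] = j * NELT + k;

  /* log2 (LEN) stages of interleave_high/low, pairing vector j with
     vector j + LEN/2, just as the VEC_PERM_EXPRs above do.  */
  for (stage = 0; (1u << stage) < LEN; stage++)
    {
      for (j = 0; j < LEN / 2; j++)
	{
	  const int *v1 = chain[j], *v2 = chain[j + LEN / 2];
	  for (i = 0; i < NELT / 2; i++)
	    {
	      result[2 * j][2 * i] = v1[i];			/* interleave_high */
	      result[2 * j][2 * i + 1] = v2[i];
	      result[2 * j + 1][2 * i] = v1[i + NELT / 2];	/* interleave_low */
	      result[2 * j + 1][2 * i + 1] = v2[i + NELT / 2];
	    }
	}
      memcpy (chain, result, sizeof chain);
    }

  /* Prints 0 8 16 24 1 9 17 25 / 2 10 18 26 ... as documented.  */
  for (j = 0; j < LEN; j++)
    {
      for (k = 0; k < NELT; k++)
	printf (" %2d", chain[j][k]);
      printf ("\n");
    }
  return 0;
}
#endif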
4565 
4566 /* Function vect_setup_realignment
4567 
4568    This function is called when vectorizing an unaligned load using
4569    the dr_explicit_realign[_optimized] scheme.
4570    This function generates the following code at the loop prolog:
4571 
4572       p = initial_addr;
4573    x  msq_init = *(floor(p));   # prolog load
4574       realignment_token = call target_builtin;
4575     loop:
4576    x  msq = phi (msq_init, ---)
4577 
4578    The stmts marked with x are generated only for the case of
4579    dr_explicit_realign_optimized.
4580 
4581    The code above sets up a new (vector) pointer, pointing to the first
4582    location accessed by STMT, and a "floor-aligned" load using that pointer.
4583    It also generates code to compute the "realignment-token" (if the relevant
4584    target hook was defined), and creates a phi-node at the loop-header bb
4585    whose arguments are the result of the prolog-load (created by this
4586    function) and the result of a load that takes place in the loop (to be
4587    created by the caller to this function).
4588 
4589    For the case of dr_explicit_realign_optimized:
4590    The caller to this function uses the phi-result (msq) to create the
4591    realignment code inside the loop, and sets up the missing phi argument,
4592    as follows:
4593     loop:
4594       msq = phi (msq_init, lsq)
4595       lsq = *(floor(p'));        # load in loop
4596       result = realign_load (msq, lsq, realignment_token);
4597 
4598    For the case of dr_explicit_realign:
4599     loop:
4600       msq = *(floor(p)); 	# load in loop
4601       p' = p + (VS-1);
4602       lsq = *(floor(p'));	# load in loop
4603       result = realign_load (msq, lsq, realignment_token);
4604 
4605    Input:
4606    STMT - (scalar) load stmt to be vectorized. This load accesses
4607           a memory location that may be unaligned.
4608    BSI - place where new code is to be inserted.
4609    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4610 			      is used.
4611 
4612    Output:
4613    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4614                        target hook, if defined.
4615    Return value - the result of the loop-header phi node.  */
4616 
4617 tree
4618 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4619                         tree *realignment_token,
4620 			enum dr_alignment_support alignment_support_scheme,
4621 			tree init_addr,
4622 			struct loop **at_loop)
4623 {
4624   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4625   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4626   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4627   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4628   struct loop *loop = NULL;
4629   edge pe = NULL;
4630   tree scalar_dest = gimple_assign_lhs (stmt);
4631   tree vec_dest;
4632   gimple inc;
4633   tree ptr;
4634   tree data_ref;
4635   gimple new_stmt;
4636   basic_block new_bb;
4637   tree msq_init = NULL_TREE;
4638   tree new_temp;
4639   gimple phi_stmt;
4640   tree msq = NULL_TREE;
4641   gimple_seq stmts = NULL;
4642   bool inv_p;
4643   bool compute_in_loop = false;
4644   bool nested_in_vect_loop = false;
4645   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4646   struct loop *loop_for_initial_load = NULL;
4647 
4648   if (loop_vinfo)
4649     {
4650       loop = LOOP_VINFO_LOOP (loop_vinfo);
4651       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4652     }
4653 
4654   gcc_assert (alignment_support_scheme == dr_explicit_realign
4655 	      || alignment_support_scheme == dr_explicit_realign_optimized);
4656 
4657   /* We need to generate three things:
4658      1. the misalignment computation
4659      2. the extra vector load (for the optimized realignment scheme).
4660      3. the phi node for the two vectors from which the realignment is
4661       done (for the optimized realignment scheme).  */
4662 
4663   /* 1. Determine where to generate the misalignment computation.
4664 
4665      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4666      calculation will be generated by this function, outside the loop (in the
4667      preheader).  Otherwise, INIT_ADDR has already been computed for us by the
4668      caller, inside the loop.
4669 
4670      Background: If the misalignment remains fixed throughout the iterations of
4671      the loop, then both realignment schemes are applicable, and also the
4672      misalignment computation can be done outside LOOP.  This is because we are
4673      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4674      are a multiple of VS (the Vector Size), and therefore the misalignment in
4675      different vectorized LOOP iterations is always the same.
4676      The problem arises only if the memory access is in an inner-loop nested
4677      inside LOOP, which is now being vectorized using outer-loop vectorization.
4678      This is the only case when the misalignment of the memory access may not
4679      remain fixed throughout the iterations of the inner-loop (as explained in
4680      detail in vect_supportable_dr_alignment).  In this case, not only is the
4681      optimized realignment scheme not applicable, but also the misalignment
4682      computation (and generation of the realignment token that is passed to
4683      REALIGN_LOAD) has to be done inside the loop.
4684 
4685      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4686      or not, which in turn determines if the misalignment is computed inside
4687      the inner-loop, or outside LOOP.  */
4688 
4689   if (init_addr != NULL_TREE || !loop_vinfo)
4690     {
4691       compute_in_loop = true;
4692       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4693     }
4694 
4695 
4696   /* 2. Determine where to generate the extra vector load.
4697 
4698      For the optimized realignment scheme, instead of generating two vector
4699      loads in each iteration, we generate a single extra vector load in the
4700      preheader of the loop, and in each iteration reuse the result of the
4701      vector load from the previous iteration.  In case the memory access is in
4702      an inner-loop nested inside LOOP, which is now being vectorized using
4703      outer-loop vectorization, we need to determine whether this initial vector
4704      load should be generated at the preheader of the inner-loop, or can be
4705      generated at the preheader of LOOP.  If the memory access has no evolution
4706      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4707      to be generated inside LOOP (in the preheader of the inner-loop).  */
4708 
4709   if (nested_in_vect_loop)
4710     {
4711       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4712       bool invariant_in_outerloop =
4713             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4714       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4715     }
4716   else
4717     loop_for_initial_load = loop;
4718   if (at_loop)
4719     *at_loop = loop_for_initial_load;
4720 
4721   if (loop_for_initial_load)
4722     pe = loop_preheader_edge (loop_for_initial_load);
4723 
4724   /* 3. For the case of the optimized realignment, create the first vector
4725       load at the loop preheader.  */
4726 
4727   if (alignment_support_scheme == dr_explicit_realign_optimized)
4728     {
4729       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4730 
4731       gcc_assert (!compute_in_loop);
4732       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4733       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4734 				      NULL_TREE, &init_addr, NULL, &inc,
4735 				      true, &inv_p);
4736       new_temp = copy_ssa_name (ptr, NULL);
4737       new_stmt = gimple_build_assign_with_ops
4738 		   (BIT_AND_EXPR, new_temp, ptr,
4739 		    build_int_cst (TREE_TYPE (ptr),
4740 				   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4741       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4742       gcc_assert (!new_bb);
4743       data_ref
4744 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4745 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4746       new_stmt = gimple_build_assign (vec_dest, data_ref);
4747       new_temp = make_ssa_name (vec_dest, new_stmt);
4748       gimple_assign_set_lhs (new_stmt, new_temp);
4749       if (pe)
4750         {
4751           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4752           gcc_assert (!new_bb);
4753         }
4754       else
4755          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4756 
4757       msq_init = gimple_assign_lhs (new_stmt);
4758     }
4759 
4760   /* 4. Create realignment token using a target builtin, if available.
4761       It is done either inside the containing loop, or before LOOP (as
4762       determined above).  */
4763 
4764   if (targetm.vectorize.builtin_mask_for_load)
4765     {
4766       tree builtin_decl;
4767 
4768       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4769       if (!init_addr)
4770 	{
4771 	  /* Generate the INIT_ADDR computation outside LOOP.  */
4772 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4773 							NULL_TREE, loop);
4774           if (loop)
4775             {
4776    	      pe = loop_preheader_edge (loop);
4777 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4778 	      gcc_assert (!new_bb);
4779             }
4780           else
4781              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4782 	}
4783 
4784       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4785       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4786       vec_dest =
4787 	vect_create_destination_var (scalar_dest,
4788 				     gimple_call_return_type (new_stmt));
4789       new_temp = make_ssa_name (vec_dest, new_stmt);
4790       gimple_call_set_lhs (new_stmt, new_temp);
4791 
4792       if (compute_in_loop)
4793 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4794       else
4795 	{
4796 	  /* Generate the misalignment computation outside LOOP.  */
4797 	  pe = loop_preheader_edge (loop);
4798 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4799 	  gcc_assert (!new_bb);
4800 	}
4801 
4802       *realignment_token = gimple_call_lhs (new_stmt);
4803 
4804       /* The result of the CALL_EXPR to this builtin is determined from
4805          the value of the parameter and no global variables are touched
4806          which makes the builtin a "const" function.  Requiring the
4807          builtin to have the "const" attribute makes it unnecessary
4808          to call mark_call_clobbered.  */
4809       gcc_assert (TREE_READONLY (builtin_decl));
4810     }
4811 
4812   if (alignment_support_scheme == dr_explicit_realign)
4813     return msq;
4814 
4815   gcc_assert (!compute_in_loop);
4816   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4817 
4818 
4819   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4820 
4821   pe = loop_preheader_edge (containing_loop);
4822   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4823   msq = make_ssa_name (vec_dest, NULL);
4824   phi_stmt = create_phi_node (msq, containing_loop->header);
4825   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4826 
4827   return msq;
4828 }
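
/* Illustrative aside (not GCC code; kept under "#if 0"): a standalone
   scalar model of what the code generated above computes at run time for
   the dr_explicit_realign_optimized scheme: a floor-aligned prolog load,
   one floor-aligned load per iteration, and a REALIGN_LOAD that combines
   the previous and current aligned loads.  The real REALIGN_LOAD is a
   target permute driven by the realignment token; this sketch models it
   as picking VS consecutive elements from the concatenation of the two
   aligned "vectors", with a made-up element misalignment OFF.  */
#if 0
#include <stdio.h>

#define VS 4			/* elements per "vector" in this model */

static void
realign_load (const int *msq, const int *lsq, unsigned int off, int *out)
{
  unsigned int i;
  for (i = 0; i < VS; i++)
    out[i] = (off + i < VS) ? msq[off + i] : lsq[off + i - VS];
}

int
main (void)
{
  int a[16];
  unsigned int i, off = 1;	/* access starts one element past an
				   aligned boundary */
  const int *msq;

  for (i = 0; i < 16; i++)
    a[i] = i;

  /* Prolog: msq_init = *(floor (p)).  */
  msq = &a[0];

  /* Loop: lsq = *(floor (p')); result = realign_load (msq, lsq, rt).  */
  for (i = 0; i + VS + off <= 16; i += VS)
    {
      const int *lsq = &a[i + VS];
      int res[VS];
      unsigned int k;

      realign_load (msq, lsq, off, res);
      for (k = 0; k < VS; k++)
	printf (" %d", res[k]);	/* 1 2 3 4, then 5 6 7 8, ...  */
      printf ("\n");
      msq = lsq;		/* reuse the last aligned load */
    }
  return 0;
}
#endif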
4829 
4830 
4831 /* Function vect_grouped_load_supported.
4832 
4833    Returns TRUE if extract-even and extract-odd permutations are supported,
4834    and FALSE otherwise.  */
4835 
4836 bool
4837 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4838 {
4839   enum machine_mode mode = TYPE_MODE (vectype);
4840 
4841   /* vect_permute_load_chain requires the group size to be a power of two.  */
4842   if (exact_log2 (count) == -1)
4843     {
4844       if (dump_enabled_p ())
4845 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4846                          "the size of the group of accesses"
4847                          " is not a power of 2\n");
4848       return false;
4849     }
4850 
4851   /* Check that the permutation is supported.  */
4852   if (VECTOR_MODE_P (mode))
4853     {
4854       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4855       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4856 
4857       for (i = 0; i < nelt; i++)
4858 	sel[i] = i * 2;
4859       if (can_vec_perm_p (mode, false, sel))
4860 	{
4861 	  for (i = 0; i < nelt; i++)
4862 	    sel[i] = i * 2 + 1;
4863 	  if (can_vec_perm_p (mode, false, sel))
4864 	    return true;
4865 	}
4866     }
4867 
4868   if (dump_enabled_p ())
4869     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4870                      "extract even/odd not supported by target\n");
4871   return false;
4872 }
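
/* Illustrative aside (not GCC code; kept under "#if 0"): for a
   hypothetical 4-lane vector, the two selectors checked above are
   {0, 2, 4, 6} (extract even) and {1, 3, 5, 7} (extract odd), both
   indexing the concatenation of the two input vectors.  Applied to
   {10, 11, 12, 13} and {20, 21, 22, 23} they yield {10, 12, 20, 22}
   and {11, 13, 21, 23}.  */
#if 0
#include <stdio.h>

#define NELT 4

int
main (void)
{
  int v1[NELT] = { 10, 11, 12, 13 }, v2[NELT] = { 20, 21, 22, 23 };
  int cat[2 * NELT], even[NELT], odd[NELT];
  unsigned int i;

  /* Concatenate the two input vectors, as VEC_PERM_EXPR conceptually
     does with its first two operands.  */
  for (i = 0; i < NELT; i++)
    {
      cat[i] = v1[i];
      cat[i + NELT] = v2[i];
    }

  /* Apply the extract-even and extract-odd selectors.  */
  for (i = 0; i < NELT; i++)
    {
      even[i] = cat[i * 2];
      odd[i] = cat[i * 2 + 1];
    }

  for (i = 0; i < NELT; i++)
    printf (" %d", even[i]);	/* 10 12 20 22 */
  printf ("\n");
  for (i = 0; i < NELT; i++)
    printf (" %d", odd[i]);	/* 11 13 21 23 */
  printf ("\n");
  return 0;
}
#endif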
4873 
4874 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4875    type VECTYPE.  */
4876 
4877 bool
4878 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4879 {
4880   return vect_lanes_optab_supported_p ("vec_load_lanes",
4881 				       vec_load_lanes_optab,
4882 				       vectype, count);
4883 }
4884 
4885 /* Function vect_permute_load_chain.
4886 
4887    Given a chain of interleaved loads in DR_CHAIN, whose length LENGTH must be
4888    a power of 2, generate extract_even/odd stmts to reorder the input data
4889    correctly.  Return the final references for loads in RESULT_CHAIN.
4890 
4891    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4892    The input is 4 vectors each containing 8 elements. We assign a number to each
4893    element; the input sequence is:
4894 
4895    1st vec:   0  1  2  3  4  5  6  7
4896    2nd vec:   8  9 10 11 12 13 14 15
4897    3rd vec:  16 17 18 19 20 21 22 23
4898    4th vec:  24 25 26 27 28 29 30 31
4899 
4900    The output sequence should be:
4901 
4902    1st vec:  0 4  8 12 16 20 24 28
4903    2nd vec:  1 5  9 13 17 21 25 29
4904    3rd vec:  2 6 10 14 18 22 26 30
4905    4th vec:  3 7 11 15 19 23 27 31
4906 
4907    i.e., the first output vector should contain the first elements of each
4908    interleaving group, etc.
4909 
4910    We use extract_even/odd instructions to create such output.  The input of
4911    each extract_even/odd operation is two vectors
4912    1st vec    2nd vec
4913    0 1 2 3    4 5 6 7
4914 
4915    and the output is the vector of extracted even/odd elements.  The output of
4916    extract_even will be:   0 2 4 6
4917    and of extract_odd:     1 3 5 7
4918 
4919 
4920    The permutation is done in log2 (LENGTH) stages.  In each stage extract_even
4921    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4922    their order.  In our example,
4923 
4924    E1: extract_even (1st vec, 2nd vec)
4925    E2: extract_odd (1st vec, 2nd vec)
4926    E3: extract_even (3rd vec, 4th vec)
4927    E4: extract_odd (3rd vec, 4th vec)
4928 
4929    The output for the first stage will be:
4930 
4931    E1:  0  2  4  6  8 10 12 14
4932    E2:  1  3  5  7  9 11 13 15
4933    E3: 16 18 20 22 24 26 28 30
4934    E4: 17 19 21 23 25 27 29 31
4935 
4936    In order to proceed and create the correct sequence for the next stage (or
4937    for the correct output, if the second stage is the last one, as in our
4938    example), we first put the output of the extract_even operation and then the
4939    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4940    The input for the second stage is:
4941 
4942    1st vec (E1):  0  2  4  6  8 10 12 14
4943    2nd vec (E3): 16 18 20 22 24 26 28 30
4944    3rd vec (E2):  1  3  5  7  9 11 13 15
4945    4th vec (E4): 17 19 21 23 25 27 29 31
4946 
4947    The output of the second stage:
4948 
4949    E1: 0 4  8 12 16 20 24 28
4950    E2: 2 6 10 14 18 22 26 30
4951    E3: 1 5  9 13 17 21 25 29
4952    E4: 3 7 11 15 19 23 27 31
4953 
4954    And RESULT_CHAIN after reordering:
4955 
4956    1st vec (E1):  0 4  8 12 16 20 24 28
4957    2nd vec (E3):  1 5  9 13 17 21 25 29
4958    3rd vec (E2):  2 6 10 14 18 22 26 30
4959    4th vec (E4):  3 7 11 15 19 23 27 31.  */
4960 
4961 static void
4962 vect_permute_load_chain (vec<tree> dr_chain,
4963 			 unsigned int length,
4964 			 gimple stmt,
4965 			 gimple_stmt_iterator *gsi,
4966 			 vec<tree> *result_chain)
4967 {
4968   tree data_ref, first_vect, second_vect;
4969   tree perm_mask_even, perm_mask_odd;
4970   gimple perm_stmt;
4971   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4972   unsigned int i, j, log_length = exact_log2 (length);
4973   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4974   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4975 
4976   result_chain->quick_grow (length);
4977   memcpy (result_chain->address (), dr_chain.address (),
4978 	  length * sizeof (tree));
4979 
4980   for (i = 0; i < nelt; ++i)
4981     sel[i] = i * 2;
4982   perm_mask_even = vect_gen_perm_mask (vectype, sel);
4983   gcc_assert (perm_mask_even != NULL);
4984 
4985   for (i = 0; i < nelt; ++i)
4986     sel[i] = i * 2 + 1;
4987   perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4988   gcc_assert (perm_mask_odd != NULL);
4989 
4990   for (i = 0; i < log_length; i++)
4991     {
4992       for (j = 0; j < length; j += 2)
4993 	{
4994 	  first_vect = dr_chain[j];
4995 	  second_vect = dr_chain[j+1];
4996 
4997 	  /* data_ref = permute_even (first_data_ref, second_data_ref);  */
4998 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4999 	  perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5000 						    first_vect, second_vect,
5001 						    perm_mask_even);
5002 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5003 	  (*result_chain)[j/2] = data_ref;
5004 
5005 	  /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5006 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5007 	  perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5008 						    first_vect, second_vect,
5009 						    perm_mask_odd);
5010 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5011 	  (*result_chain)[j/2+length/2] = data_ref;
5012 	}
5013       memcpy (dr_chain.address (), result_chain->address (),
5014 	      length * sizeof (tree));
5015     }
5016 }
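
/* Illustrative aside (not GCC code; kept under "#if 0"): a standalone
   scalar model of the log2 (LENGTH) extract_even/odd stages performed
   above, with even results going to the first half of RESULT_CHAIN and
   odd results to the second half.  With LEN = 4 and NELT = 8 it
   reproduces the final sequences documented in the comment before the
   function (0 4 8 12 16 20 24 28, and so on).  */
#if 0
#include <stdio.h>
#include <string.h>

#define LEN  4			/* vectors in the chain (a power of 2) */
#define NELT 8			/* elements per vector */

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  unsigned int j, k, stage;

  /* Number the input elements 0..31 as in the comment above.  */
  for (j = 0; j < LEN; j++)
    for (k = 0; k < NELT; k++)
      chain[j][k] = j * NELT + k;

  for (stage = 0; (1u << stage) < LEN; stage++)
    {
      for (j = 0; j < LEN; j += 2)
	{
	  const int *first = chain[j], *second = chain[j + 1];
	  for (k = 0; k < NELT; k++)
	    {
	      unsigned int even = 2 * k, odd = 2 * k + 1;

	      /* extract_even goes to the first half of the result,
		 extract_odd to the second half.  */
	      result[j / 2][k]
		= even < NELT ? first[even] : second[even - NELT];
	      result[j / 2 + LEN / 2][k]
		= odd < NELT ? first[odd] : second[odd - NELT];
	    }
	}
      memcpy (chain, result, sizeof chain);
    }

  /* Prints 0 4 8 12 ... / 1 5 9 13 ... as documented.  */
  for (j = 0; j < LEN; j++)
    {
      for (k = 0; k < NELT; k++)
	printf (" %2d", chain[j][k]);
      printf ("\n");
    }
  return 0;
}
#endif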
5017 
5018 
5019 /* Function vect_transform_grouped_load.
5020 
5021    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5022    to perform their permutation and ascribe the resulting vectorized statements to
5023    the scalar statements.
5024 */
5025 
5026 void
5027 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5028 			     gimple_stmt_iterator *gsi)
5029 {
5030   vec<tree> result_chain = vNULL;
5031 
5032   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5033      RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5034      permuted vectors, which are ready for vector computation.  */
5035   result_chain.create (size);
5036   vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5037   vect_record_grouped_load_vectors (stmt, result_chain);
5038   result_chain.release ();
5039 }
5040 
5041 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5042    generated as part of the vectorization of STMT.  Assign the statement
5043    for each vector to the associated scalar statement.  */
5044 
5045 void
5046 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5047 {
5048   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5049   gimple next_stmt, new_stmt;
5050   unsigned int i, gap_count;
5051   tree tmp_data_ref;
5052 
5053   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5054      Since we scan the chain starting from its first node, their order
5055      corresponds to the order of data-refs in RESULT_CHAIN.  */
5056   next_stmt = first_stmt;
5057   gap_count = 1;
5058   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5059     {
5060       if (!next_stmt)
5061 	break;
5062 
5063       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5064        code elimination pass later.  No need to check for the first stmt in
5065        the group, since it always exists.
5066        GROUP_GAP is the number of steps in elements from the previous
5067        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5068        correspond to the gaps.  */
5069       if (next_stmt != first_stmt
5070           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5071       {
5072         gap_count++;
5073         continue;
5074       }
5075 
5076       while (next_stmt)
5077         {
5078 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5079 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5080 	     copies, and we put the new vector statement in the first available
5081 	     RELATED_STMT.  */
5082 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5083 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5084 	  else
5085             {
5086               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5087                 {
5088  	          gimple prev_stmt =
5089 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5090 	          gimple rel_stmt =
5091 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5092 	          while (rel_stmt)
5093 		    {
5094 		      prev_stmt = rel_stmt;
5095 		      rel_stmt =
5096                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5097 		    }
5098 
5099   	          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5100                     new_stmt;
5101                 }
5102             }
5103 
5104 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5105 	  gap_count = 1;
5106 	  /* If NEXT_STMT accesses the same DR as the previous statement,
5107 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
5108 	     get the next data-ref from RESULT_CHAIN.  */
5109 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5110 	    break;
5111         }
5112     }
5113 }
5114 
5115 /* Function vect_can_force_dr_alignment_p.
5116 
5117    Returns whether the alignment of a DECL can be forced to be aligned
5118    on an ALIGNMENT-bit boundary.  */
5119 
5120 bool
5121 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5122 {
5123   if (TREE_CODE (decl) != VAR_DECL)
5124     return false;
5125 
5126   /* We cannot change alignment of common or external symbols as another
5127      translation unit may contain a definition with lower alignment.
5128      The rules of common symbol linking mean that the definition
5129      will override the common symbol.  The same is true for constant
5130      pool entries which may be shared and are not properly merged
5131      by LTO.  */
5132   if (DECL_EXTERNAL (decl)
5133       || DECL_COMMON (decl)
5134       || DECL_IN_CONSTANT_POOL (decl))
5135     return false;
5136 
5137   if (TREE_ASM_WRITTEN (decl))
5138     return false;
5139 
5140   /* Do not override the alignment as specified by the ABI when the used
5141      attribute is set.  */
5142   if (DECL_PRESERVE_P (decl))
5143     return false;
5144 
5145   /* Do not override explicit alignment set by the user when an explicit
5146      section name is also used.  This is a common idiom used by many
5147      software projects.  */
5148   if (DECL_SECTION_NAME (decl) != NULL_TREE
5149       && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5150     return false;
5151 
5152   if (TREE_STATIC (decl))
5153     return (alignment <= MAX_OFILE_ALIGNMENT);
5154   else
5155     return (alignment <= MAX_STACK_ALIGNMENT);
5156 }
5157 
5158 
5159 /* Return whether the data reference DR is supported with respect to its
5160    alignment.
5161    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5162    if it is aligned, i.e., check if it is possible to vectorize it with different
5163    alignment.  */
5164 
5165 enum dr_alignment_support
5166 vect_supportable_dr_alignment (struct data_reference *dr,
5167                                bool check_aligned_accesses)
5168 {
5169   gimple stmt = DR_STMT (dr);
5170   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5171   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5172   enum machine_mode mode = TYPE_MODE (vectype);
5173   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5174   struct loop *vect_loop = NULL;
5175   bool nested_in_vect_loop = false;
5176 
5177   if (aligned_access_p (dr) && !check_aligned_accesses)
5178     return dr_aligned;
5179 
5180   /* For now assume all conditional loads/stores support unaligned
5181      access without any special code.  */
5182   if (is_gimple_call (stmt)
5183       && gimple_call_internal_p (stmt)
5184       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5185 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5186     return dr_unaligned_supported;
5187 
5188   if (loop_vinfo)
5189     {
5190       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5191       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5192     }
5193 
5194   /* Possibly unaligned access.  */
5195 
5196   /* We can choose between using the implicit realignment scheme (generating
5197      a misaligned_move stmt) and the explicit realignment scheme (generating
5198      aligned loads with a REALIGN_LOAD).  There are two variants to the
5199      explicit realignment scheme: optimized, and unoptimized.
5200      We can optimize the realignment only if the step between consecutive
5201      vector loads is equal to the vector size.  Since the vector memory
5202      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5203      is guaranteed that the misalignment amount remains the same throughout the
5204      execution of the vectorized loop.  Therefore, we can create the
5205      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5206      at the loop preheader.
5207 
5208      However, in the case of outer-loop vectorization, when vectorizing a
5209      memory access in the inner-loop nested within the LOOP that is now being
5210      vectorized, while it is guaranteed that the misalignment of the
5211      vectorized memory access will remain the same in different outer-loop
5212      iterations, it is *not* guaranteed that it will remain the same throughout
5213      the execution of the inner-loop.  This is because the inner-loop advances
5214      with the original scalar step (and not in steps of VS).  If the inner-loop
5215      step happens to be a multiple of VS, then the misalignment remains fixed
5216      and we can use the optimized realignment scheme.  For example:
5217 
5218       for (i=0; i<N; i++)
5219         for (j=0; j<M; j++)
5220           s += a[i+j];
5221 
5222      When vectorizing the i-loop in the above example, the step between
5223      consecutive vector loads is 1, and so the misalignment does not remain
5224      fixed across the execution of the inner-loop, and the realignment cannot
5225      be optimized (as illustrated in the following pseudo vectorized loop):
5226 
5227       for (i=0; i<N; i+=4)
5228         for (j=0; j<M; j++){
5229           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5230                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5231                          // (assuming that we start from an aligned address).
5232           }
5233 
5234      We therefore have to use the unoptimized realignment scheme:
5235 
5236       for (i=0; i<N; i+=4)
5237           for (j=k; j<M; j+=4)
5238           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5239                            // that the misalignment of the initial address is
5240                            // 0).
5241 
5242      The loop can then be vectorized as follows:
5243 
5244       for (k=0; k<4; k++){
5245         rt = get_realignment_token (&vp[k]);
5246         for (i=0; i<N; i+=4){
5247           v1 = vp[i+k];
5248           for (j=k; j<M; j+=4){
5249             v2 = vp[i+j+VS-1];
5250             va = REALIGN_LOAD <v1,v2,rt>;
5251             vs += va;
5252             v1 = v2;
5253           }
5254         }
5255     } */
5256 
5257   if (DR_IS_READ (dr))
5258     {
5259       bool is_packed = false;
5260       tree type = (TREE_TYPE (DR_REF (dr)));
5261 
5262       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5263 	  && (!targetm.vectorize.builtin_mask_for_load
5264 	      || targetm.vectorize.builtin_mask_for_load ()))
5265 	{
5266 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5267 	  if ((nested_in_vect_loop
5268 	       && (TREE_INT_CST_LOW (DR_STEP (dr))
5269 	 	   != GET_MODE_SIZE (TYPE_MODE (vectype))))
5270               || !loop_vinfo)
5271 	    return dr_explicit_realign;
5272 	  else
5273 	    return dr_explicit_realign_optimized;
5274 	}
5275       if (!known_alignment_for_access_p (dr))
5276 	is_packed = not_size_aligned (DR_REF (dr));
5277 
5278       if ((TYPE_USER_ALIGN (type) && !is_packed)
5279 	  || targetm.vectorize.
5280 	       support_vector_misalignment (mode, type,
5281 					    DR_MISALIGNMENT (dr), is_packed))
5282 	/* Can't software pipeline the loads, but can at least do them.  */
5283 	return dr_unaligned_supported;
5284     }
5285   else
5286     {
5287       bool is_packed = false;
5288       tree type = (TREE_TYPE (DR_REF (dr)));
5289 
5290       if (!known_alignment_for_access_p (dr))
5291 	is_packed = not_size_aligned (DR_REF (dr));
5292 
5293      if ((TYPE_USER_ALIGN (type) && !is_packed)
5294 	 || targetm.vectorize.
5295 	      support_vector_misalignment (mode, type,
5296 					   DR_MISALIGNMENT (dr), is_packed))
5297        return dr_unaligned_supported;
5298     }
5299 
5300   /* Unsupported.  */
5301   return dr_unaligned_unsupported;
5302 }
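
/* Illustrative aside (not GCC code; kept under "#if 0"): with made-up
   numbers (a four-element vector and a scalar inner-loop step of one
   element) the element misalignment of the inner-loop access cycles
   0, 1, 2, 3, which is why the optimized realignment scheme is only
   usable when the inner-loop step is a multiple of the vector size.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned int vs = 4;		/* vector size in elements (hypothetical) */
  unsigned int step = 1;	/* inner-loop step in elements */
  unsigned int j;

  /* Misalignment (in elements) of the j-th inner-loop access, assuming
     the initial address is aligned; it stays fixed across j only when
     STEP is a multiple of VS.  */
  for (j = 0; j < 8; j++)
    printf ("j=%u  misalign=%u\n", j, (j * step) % vs);
  return 0;
}
#endif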
5303