1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60
61 static bool
vect_lanes_optab_supported_p(const char * name,convert_optab optab,tree vectype,unsigned HOST_WIDE_INT count)62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
81 }
82 }
83
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 {
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
91 }
92
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
97
98 return true;
99 }
100
101
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
118
119 tree
vect_get_smallest_scalar_type(gimple * stmt,HOST_WIDE_INT * lhs_size_unit,HOST_WIDE_INT * rhs_size_unit)120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
140 {
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
142
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
146 }
147
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
151 }
152
153
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
157
158 static bool
vect_mark_for_runtime_alias_test(ddr_p ddr,loop_vec_info loop_vinfo)159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
160 {
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
162
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
165
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
169
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
172 }
173
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
175
176 static void
vect_check_nonzero_value(loop_vec_info loop_vinfo,tree value)177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
178 {
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
183
184 if (dump_enabled_p ())
185 {
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
189 }
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
191 }
192
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
196
197 static bool
vect_preserves_scalar_order_p(gimple * stmt_a,gimple * stmt_b)198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
199 {
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
202
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
207
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the last scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Compute that position and check whether the resulting order matches
212 the current one. */
213 gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
214 if (last_a)
215 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
216 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
217 last_a = get_later_stmt (last_a, s);
218 else
219 last_a = stmt_a;
220 gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
221 if (last_b)
222 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
223 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
224 last_b = get_later_stmt (last_b, s);
225 else
226 last_b = stmt_b;
227 return ((get_later_stmt (last_a, last_b) == last_a)
228 == (get_later_stmt (stmt_a, stmt_b) == stmt_a));
229 }
230
231 /* A subroutine of vect_analyze_data_ref_dependence. Handle
232 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
233 distances. These distances are conservatively correct but they don't
234 reflect a guaranteed dependence.
235
236 Return true if this function does all the work necessary to avoid
237 an alias or false if the caller should use the dependence distances
238 to limit the vectorization factor in the usual way. LOOP_DEPTH is
239 the depth of the loop described by LOOP_VINFO and the other arguments
240 are as for vect_analyze_data_ref_dependence. */
241
242 static bool
vect_analyze_possibly_independent_ddr(data_dependence_relation * ddr,loop_vec_info loop_vinfo,int loop_depth,unsigned int * max_vf)243 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
244 loop_vec_info loop_vinfo,
245 int loop_depth, unsigned int *max_vf)
246 {
247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
248 lambda_vector dist_v;
249 unsigned int i;
250 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
251 {
252 int dist = dist_v[loop_depth];
253 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
254 {
255 /* If the user asserted safelen >= DIST consecutive iterations
256 can be executed concurrently, assume independence.
257
258 ??? An alternative would be to add the alias check even
259 in this case, and vectorize the fallback loop with the
260 maximum VF set to safelen. However, if the user has
261 explicitly given a length, it's less likely that that
262 would be a win. */
263 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
264 {
265 if ((unsigned int) loop->safelen < *max_vf)
266 *max_vf = loop->safelen;
267 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
268 continue;
269 }
270
271 /* For dependence distances of 2 or more, we have the option
272 of limiting VF or checking for an alias at runtime.
273 Prefer to check at runtime if we can, to avoid limiting
274 the VF unnecessarily when the bases are in fact independent.
275
276 Note that the alias checks will be removed if the VF ends up
277 being small enough. */
278 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
279 }
280 }
281 return true;
282 }
283
284
285 /* Function vect_analyze_data_ref_dependence.
286
287 Return TRUE if there (might) exist a dependence between a memory-reference
288 DRA and a memory-reference DRB. When versioning for alias may check a
289 dependence at run-time, return FALSE. Adjust *MAX_VF according to
290 the data dependence. */
291
292 static bool
vect_analyze_data_ref_dependence(struct data_dependence_relation * ddr,loop_vec_info loop_vinfo,unsigned int * max_vf)293 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 unsigned int *max_vf)
296 {
297 unsigned int i;
298 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
299 struct data_reference *dra = DDR_A (ddr);
300 struct data_reference *drb = DDR_B (ddr);
301 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
302 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
303 lambda_vector dist_v;
304 unsigned int loop_depth;
305
306 /* In loop analysis all data references should be vectorizable. */
307 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
308 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
309 gcc_unreachable ();
310
311 /* Independent data accesses. */
312 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
313 return false;
314
315 if (dra == drb
316 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
317 return false;
318
319 /* We do not have to consider dependences between accesses that belong
320 to the same group, unless the stride could be smaller than the
321 group size. */
322 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
323 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
324 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
325 return false;
326
327 /* Even if we have an anti-dependence then, as the vectorized loop covers at
328 least two scalar iterations, there is always also a true dependence.
329 As the vectorizer does not re-order loads and stores we can ignore
330 the anti-dependence if TBAA can disambiguate both DRs similar to the
331 case with known negative distance anti-dependences (positive
332 distance anti-dependences would violate TBAA constraints). */
333 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
334 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
335 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
336 get_alias_set (DR_REF (drb))))
337 return false;
338
339 /* Unknown data dependence. */
340 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
341 {
342 /* If user asserted safelen consecutive iterations can be
343 executed concurrently, assume independence. */
344 if (loop->safelen >= 2)
345 {
346 if ((unsigned int) loop->safelen < *max_vf)
347 *max_vf = loop->safelen;
348 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
349 return false;
350 }
351
352 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
353 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
354 {
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "versioning for alias not supported for: "
359 "can't determine dependence between ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (dra));
362 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
363 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
364 DR_REF (drb));
365 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
366 }
367 return true;
368 }
369
370 if (dump_enabled_p ())
371 {
372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
373 "versioning for alias required: "
374 "can't determine dependence between ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (dra));
377 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
378 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
379 DR_REF (drb));
380 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
381 }
382
383 /* Add to list of ddrs that need to be tested at run-time. */
384 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
385 }
386
387 /* Known data dependence. */
388 if (DDR_NUM_DIST_VECTS (ddr) == 0)
389 {
390 /* If user asserted safelen consecutive iterations can be
391 executed concurrently, assume independence. */
392 if (loop->safelen >= 2)
393 {
394 if ((unsigned int) loop->safelen < *max_vf)
395 *max_vf = loop->safelen;
396 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
397 return false;
398 }
399
400 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
401 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
402 {
403 if (dump_enabled_p ())
404 {
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "versioning for alias not supported for: "
407 "bad dist vector for ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (dra));
410 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
412 DR_REF (drb));
413 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 }
415 return true;
416 }
417
418 if (dump_enabled_p ())
419 {
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "versioning for alias required: "
422 "bad dist vector for ");
423 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
424 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
425 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
426 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
427 }
428 /* Add to list of ddrs that need to be tested at run-time. */
429 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
430 }
431
432 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
433
434 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
435 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
436 loop_depth, max_vf))
437 return false;
438
439 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
440 {
441 int dist = dist_v[loop_depth];
442
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "dependence distance = %d.\n", dist);
446
447 if (dist == 0)
448 {
449 if (dump_enabled_p ())
450 {
451 dump_printf_loc (MSG_NOTE, vect_location,
452 "dependence distance == 0 between ");
453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
454 dump_printf (MSG_NOTE, " and ");
455 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
456 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
457 }
458
459 /* When we perform grouped accesses and perform implicit CSE
460 by detecting equal accesses and doing disambiguation with
461 runtime alias tests like for
462 .. = a[i];
463 .. = a[i+1];
464 a[i] = ..;
465 a[i+1] = ..;
466 *p = ..;
467 .. = a[i];
468 .. = a[i+1];
469 where we will end up loading { a[i], a[i+1] } once, make
470 sure that inserting group loads before the first load and
471 stores after the last store will do the right thing.
472 Similar for groups like
473 a[i] = ...;
474 ... = a[i];
475 a[i+1] = ...;
476 where loads from the group interleave with the store. */
477 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
478 {
479 if (dump_enabled_p ())
480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
481 "READ_WRITE dependence in interleaving.\n");
482 return true;
483 }
484
485 if (loop->safelen < 2)
486 {
487 tree indicator = dr_zero_step_indicator (dra);
488 if (TREE_CODE (indicator) != INTEGER_CST)
489 vect_check_nonzero_value (loop_vinfo, indicator);
490 else if (integer_zerop (indicator))
491 {
492 if (dump_enabled_p ())
493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 "access also has a zero step\n");
495 return true;
496 }
497 }
498 continue;
499 }
500
501 if (dist > 0 && DDR_REVERSED_P (ddr))
502 {
503 /* If DDR_REVERSED_P the order of the data-refs in DDR was
504 reversed (to make distance vector positive), and the actual
505 distance is negative. */
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
508 "dependence distance negative.\n");
509 /* Record a negative dependence distance to later limit the
510 amount of stmt copying / unrolling we can perform.
511 Only need to handle read-after-write dependence. */
512 if (DR_IS_READ (drb)
513 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
514 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
515 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
516 continue;
517 }
518
519 unsigned int abs_dist = abs (dist);
520 if (abs_dist >= 2 && abs_dist < *max_vf)
521 {
522 /* The dependence distance requires reduction of the maximal
523 vectorization factor. */
524 *max_vf = abs (dist);
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location,
527 "adjusting maximal vectorization factor to %i\n",
528 *max_vf);
529 }
530
531 if (abs_dist >= *max_vf)
532 {
533 /* Dependence distance does not create dependence, as far as
534 vectorization is concerned, in this case. */
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE, vect_location,
537 "dependence distance >= VF.\n");
538 continue;
539 }
540
541 if (dump_enabled_p ())
542 {
543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
544 "not vectorized, possible dependence "
545 "between data-refs ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
547 dump_printf (MSG_NOTE, " and ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
549 dump_printf (MSG_NOTE, "\n");
550 }
551
552 return true;
553 }
554
555 return false;
556 }
557
558 /* Function vect_analyze_data_ref_dependences.
559
560 Examine all the data references in the loop, and make sure there do not
561 exist any data dependences between them. Set *MAX_VF according to
562 the maximum vectorization factor the data dependences allow. */
563
564 bool
vect_analyze_data_ref_dependences(loop_vec_info loop_vinfo,unsigned int * max_vf)565 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
566 unsigned int *max_vf)
567 {
568 unsigned int i;
569 struct data_dependence_relation *ddr;
570
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "=== vect_analyze_data_ref_dependences ===\n");
574
575 LOOP_VINFO_DDRS (loop_vinfo)
576 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
577 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
578 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
579 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
580 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
581 &LOOP_VINFO_DDRS (loop_vinfo),
582 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
583 return false;
584
585 /* For epilogues we either have no aliases or alias versioning
586 was applied to original loop. Therefore we may just get max_vf
587 using VF of original loop. */
588 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
589 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
590 else
591 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
592 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
593 return false;
594
595 return true;
596 }
597
598
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for basic-block (SLP) vectorization.
   Unlike the loop variant there is no run-time alias versioning and no
   vectorization factor to adjust: any possible dependence is a failure.  */
605
606 static bool
vect_slp_analyze_data_ref_dependence(struct data_dependence_relation * ddr)607 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
608 {
609 struct data_reference *dra = DDR_A (ddr);
610 struct data_reference *drb = DDR_B (ddr);
611
612 /* We need to check dependences of statements marked as unvectorizable
613 as well, they still can prohibit vectorization. */
614
615 /* Independent data accesses. */
616 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
617 return false;
618
619 if (dra == drb)
620 return false;
621
622 /* Read-read is OK. */
623 if (DR_IS_READ (dra) && DR_IS_READ (drb))
624 return false;
625
626 /* If dra and drb are part of the same interleaving chain consider
627 them independent. */
628 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
629 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
630 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
631 return false;
632
633 /* Unknown data dependence. */
634 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
635 {
636 if (dump_enabled_p ())
637 {
638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
639 "can't determine dependence between ");
640 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
641 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
643 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
644 }
645 }
646 else if (dump_enabled_p ())
647 {
648 dump_printf_loc (MSG_NOTE, vect_location,
649 "determined dependence between ");
650 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
651 dump_printf (MSG_NOTE, " and ");
652 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
653 dump_printf (MSG_NOTE, "\n");
654 }
655
656 return true;
657 }
658
659
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  LAST_STORE, when nonnull, is the statement
   those stores will have been sunk to.  Return true if all statements
   of NODE can be sunk to the position of the group's last statement.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      /* Walk every statement between ACCESS and the sink position,
	 checking each one for a dependence with ACCESS.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Statements without a virtual use do not touch memory, and
	     when ACCESS is a read other reads (no vdef) cannot
	     conflict with it either.  */
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ??? Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      /* At the sink position, check ACCESS against every store
	         of the instance.  */
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}
730
731
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance and make sure
   there do not exist any data dependences that would prevent sinking
   the scalar loads and stores to their vectorized positions.  */
737
738 bool
vect_slp_analyze_instance_dependence(slp_instance instance)739 vect_slp_analyze_instance_dependence (slp_instance instance)
740 {
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "=== vect_slp_analyze_instance_dependence ===\n");
744
745 /* The stores of this instance are at the root of the SLP tree. */
746 slp_tree store = SLP_INSTANCE_TREE (instance);
747 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
748 store = NULL;
749
750 /* Verify we can sink stores to the vectorized stmt insert location. */
751 gimple *last_store = NULL;
752 if (store)
753 {
754 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
755 return false;
756
757 /* Mark stores in this instance and remember the last one. */
758 last_store = vect_find_last_scalar_stmt_in_slp (store);
759 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
760 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
761 }
762
763 bool res = true;
764
765 /* Verify we can sink loads to the vectorized stmt insert location,
766 special-casing stores of this instance. */
767 slp_tree load;
768 unsigned int i;
769 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
770 if (! vect_slp_analyze_node_dependences (instance, load,
771 store
772 ? SLP_TREE_SCALAR_STMTS (store)
773 : vNULL, last_store))
774 {
775 res = false;
776 break;
777 }
778
779 /* Unset the visited flag. */
780 if (store)
781 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
782 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
783
784 return res;
785 }
786
787 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
788 the statement that contains DRB, which is useful for recording in the
789 dump file. */
790
791 static void
vect_record_base_alignment(vec_info * vinfo,gimple * stmt,innermost_loop_behavior * drb)792 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
793 innermost_loop_behavior *drb)
794 {
795 bool existed;
796 innermost_loop_behavior *&entry
797 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
798 if (!existed || entry->base_alignment < drb->base_alignment)
799 {
800 entry = drb;
801 if (dump_enabled_p ())
802 {
803 dump_printf_loc (MSG_NOTE, vect_location,
804 "recording new base alignment for ");
805 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
806 dump_printf (MSG_NOTE, "\n");
807 dump_printf_loc (MSG_NOTE, vect_location,
808 " alignment: %d\n", drb->base_alignment);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " misalignment: %d\n", drb->base_misalignment);
811 dump_printf_loc (MSG_NOTE, vect_location,
812 " based on: ");
813 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
814 }
815 }
816 }
817
818 /* If the region we're going to vectorize is reached, all unconditional
819 data references occur at least once. We can therefore pool the base
820 alignment guarantees from each unconditional reference. Do this by
821 going through all the data references in VINFO and checking whether
822 the containing statement makes the reference unconditionally. If so,
823 record the alignment of the base address in VINFO so that it can be
824 used for all other references with the same base. */
825
826 void
vect_record_base_alignments(vec_info * vinfo)827 vect_record_base_alignments (vec_info *vinfo)
828 {
829 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
830 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
831 data_reference *dr;
832 unsigned int i;
833 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
834 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
835 {
836 gimple *stmt = DR_STMT (dr);
837 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
838
839 /* If DR is nested in the loop that is being vectorized, we can also
840 record the alignment of the base wrt the outer loop. */
841 if (loop && nested_in_vect_loop_p (loop, stmt))
842 {
843 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
844 vect_record_base_alignment
845 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
846 }
847 }
848 }
849
850 /* Return the target alignment for the vectorized form of DR. */
851
852 static unsigned int
vect_calculate_target_alignment(struct data_reference * dr)853 vect_calculate_target_alignment (struct data_reference *dr)
854 {
855 gimple *stmt = DR_STMT (dr);
856 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
857 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
858 return targetm.vectorize.preferred_vector_alignment (vectype);
859 }
860
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   (Note: every return statement currently visible in this function returns
   true; "can't tell" cases are recorded as DR_MISALIGNMENT_UNKNOWN rather
   than by failing.)

   FOR NOW: No analysis is actually performed. Misalignment is calculated
   only for trivial cases. TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  /* Base-address alignments pooled across the whole function being
     vectorized; may improve on what we can prove for DR alone.  */
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  /* Target-preferred alignment, converted from bits to bytes.  */
  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      /* Leave the misalignment as DR_MISALIGNMENT_UNKNOWN.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      /* The base is not sufficiently aligned; see whether we can force
	 the alignment of the underlying decl instead.  */
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
	  || !vect_can_force_dr_alignment_p (base,
					     vector_alignment * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
			   &const_misalignment))
    {
      /* The misalignment is a non-constant poly_int; record it as
	 unknown.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Non-constant misalignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
1043
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
	     zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
				   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
	continue;
      /* DR is in DR_PEEL's same-alignment set, so a peel that aligns
	 DR_PEEL aligns DR as well.  */
      gcc_assert (!known_alignment_for_access_p (dr)
		  || !known_alignment_for_access_p (dr_peel)
		  || (DR_MISALIGNMENT (dr) / dr_size
		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      /* Advance the known misalignment by the NPEEL scalar accesses the
	 peel loop executes (backwards if the step is negative), modulo
	 the target alignment.  */
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
1107
1108
1109 /* Function verify_data_ref_alignment
1110
1111 Return TRUE if DR can be handled with respect to alignment. */
1112
1113 static bool
verify_data_ref_alignment(data_reference_p dr)1114 verify_data_ref_alignment (data_reference_p dr)
1115 {
1116 enum dr_alignment_support supportable_dr_alignment
1117 = vect_supportable_dr_alignment (dr, false);
1118 if (!supportable_dr_alignment)
1119 {
1120 if (dump_enabled_p ())
1121 {
1122 if (DR_IS_READ (dr))
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "not vectorized: unsupported unaligned load.");
1125 else
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "not vectorized: unsupported unaligned "
1128 "store.");
1129
1130 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1131 DR_REF (dr));
1132 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1133 }
1134 return false;
1135 }
1136
1137 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1138 dump_printf_loc (MSG_NOTE, vect_location,
1139 "Vectorizing an unaligned access.\n");
1140
1141 return true;
1142 }
1143
1144 /* Function vect_verify_datarefs_alignment
1145
1146 Return TRUE if all data references in the loop can be
1147 handled with respect to alignment. */
1148
1149 bool
vect_verify_datarefs_alignment(loop_vec_info vinfo)1150 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1151 {
1152 vec<data_reference_p> datarefs = vinfo->datarefs;
1153 struct data_reference *dr;
1154 unsigned int i;
1155
1156 FOR_EACH_VEC_ELT (datarefs, i, dr)
1157 {
1158 gimple *stmt = DR_STMT (dr);
1159 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1160
1161 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1162 continue;
1163
1164 /* For interleaving, only the alignment of the first access matters. */
1165 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1166 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1167 continue;
1168
1169 /* Strided accesses perform only component accesses, alignment is
1170 irrelevant for them. */
1171 if (STMT_VINFO_STRIDED_P (stmt_info)
1172 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1173 continue;
1174
1175 if (! verify_data_ref_alignment (dr))
1176 return false;
1177 }
1178
1179 return true;
1180 }
1181
1182 /* Given an memory reference EXP return whether its alignment is less
1183 than its size. */
1184
1185 static bool
not_size_aligned(tree exp)1186 not_size_aligned (tree exp)
1187 {
1188 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1189 return true;
1190
1191 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1192 > get_object_alignment (exp));
1193 }
1194
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      /* Convert the byte misalignment into a count of scalar elements.  */
      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
		       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      /* Peeling removes whole scalar elements, so a misalignment that is
	 not a multiple of the element size can never be peeled away.  */
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      /* Unknown misalignment: let the target decide whether the type's
	 natural alignment is good enough to make peeling worthwhile.  */
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, %snaturally aligned\n",
			 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
1262
1263
1264 /* Calculate the cost of the memory access represented by DR. */
1265
1266 static void
vect_get_data_access_cost(struct data_reference * dr,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec)1267 vect_get_data_access_cost (struct data_reference *dr,
1268 unsigned int *inside_cost,
1269 unsigned int *outside_cost,
1270 stmt_vector_for_cost *body_cost_vec)
1271 {
1272 gimple *stmt = DR_STMT (dr);
1273 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1274 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1275 int ncopies;
1276
1277 if (PURE_SLP_STMT (stmt_info))
1278 ncopies = 1;
1279 else
1280 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1281
1282 if (DR_IS_READ (dr))
1283 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1284 NULL, body_cost_vec, false);
1285 else
1286 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1287
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_NOTE, vect_location,
1290 "vect_get_data_access_cost: inside_cost = %d, "
1291 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1292 }
1293
1294
/* One candidate peeling option: peel NPEEL scalar iterations so that
   DR becomes aligned; COUNT accumulates how attractive the option is.  */
typedef struct _vect_peel_info
{
  struct data_reference *dr;	/* The data reference this option aligns.  */
  int npeel;			/* Number of scalar iterations to peel.  */
  unsigned int count;		/* Number of accesses favoring this option.  */
} *vect_peel_info;

/* A peeling option extended with the costs computed for it, used when
   choosing the best option under the cost model.  */
typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;	/* Cost inside the vectorized loop body.  */
  unsigned int outside_cost;	/* Prologue/epilogue cost.  */
} *vect_peel_extended_info;
1308
1309
/* Peeling hashtable helpers.  */

/* Hash traits for _vect_peel_info entries.  Entries are keyed on the
   peel amount (npeel — see hash/equal below) and freed with free ()
   per free_ptr_hash.  */
struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};
1317
/* Hash a peeling option: the peel amount itself serves as the hash.  */

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}
1323
1324 inline bool
equal(const _vect_peel_info * a,const _vect_peel_info * b)1325 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1326 {
1327 return (a->npeel == b->npeel);
1328 }
1329
1330
1331 /* Insert DR into peeling hash table with NPEEL as key. */
1332
1333 static void
vect_peeling_hash_insert(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo,struct data_reference * dr,int npeel)1334 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1335 loop_vec_info loop_vinfo, struct data_reference *dr,
1336 int npeel)
1337 {
1338 struct _vect_peel_info elem, *slot;
1339 _vect_peel_info **new_slot;
1340 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1341
1342 elem.npeel = npeel;
1343 slot = peeling_htab->find (&elem);
1344 if (slot)
1345 slot->count++;
1346 else
1347 {
1348 slot = XNEW (struct _vect_peel_info);
1349 slot->npeel = npeel;
1350 slot->dr = dr;
1351 slot->count = 1;
1352 new_slot = peeling_htab->find_slot (slot, INSERT);
1353 *new_slot = slot;
1354 }
1355
1356 if (!supportable_dr_alignment
1357 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1358 slot->count += VECT_MAX_COST;
1359 }
1360
1361
1362 /* Traverse peeling hash table to find peeling option that aligns maximum
1363 number of data accesses. */
1364
1365 int
vect_peeling_hash_get_most_frequent(_vect_peel_info ** slot,_vect_peel_extended_info * max)1366 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1367 _vect_peel_extended_info *max)
1368 {
1369 vect_peel_info elem = *slot;
1370
1371 if (elem->count > max->peel_info.count
1372 || (elem->count == max->peel_info.count
1373 && max->peel_info.npeel > elem->npeel))
1374 {
1375 max->peel_info.npeel = elem->npeel;
1376 max->peel_info.count = elem->count;
1377 max->peel_info.dr = elem->dr;
1378 }
1379
1380 return 1;
1381 }
1382
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  Costs are accumulated via
   vect_get_data_access_cost into *INSIDE_COST, *OUTSIDE_COST and
   BODY_COST_VEC.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
				struct data_reference *dr0,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      /* Temporarily install the misalignment DR would have after peeling
	 NPEEL iterations, cost the access, then restore the original.  */
      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
	;
      else if (unknown_misalignment && dr == dr0)
	SET_DR_MISALIGNMENT (dr, 0);
      else
	vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
				 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
1431
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  Called on every slot; the cheapest
   candidate so far is accumulated in *MIN.  Always returns 1 so the
   traversal visits every slot.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  /* Cost of all data accesses assuming this peeling is applied.  */
  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
				  elem->dr, &inside_cost, &outside_cost,
				  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  /* Add the cost of executing the peel (prologue) loop itself.  */
  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  /* Inside (loop-body) cost dominates; outside cost breaks ties.  */
  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
1483
1484
1485 /* Choose best peeling option by traversing peeling hash table and either
1486 choosing an option with the lowest cost (if cost model is enabled) or the
1487 option that aligns as many accesses as possible. */
1488
1489 static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo)1490 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1491 loop_vec_info loop_vinfo)
1492 {
1493 struct _vect_peel_extended_info res;
1494
1495 res.peel_info.dr = NULL;
1496
1497 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1498 {
1499 res.inside_cost = INT_MAX;
1500 res.outside_cost = INT_MAX;
1501 peeling_htab->traverse <_vect_peel_extended_info *,
1502 vect_peeling_hash_get_lowest_cost> (&res);
1503 }
1504 else
1505 {
1506 res.peel_info.count = 0;
1507 peeling_htab->traverse <_vect_peel_extended_info *,
1508 vect_peeling_hash_get_most_frequent> (&res);
1509 res.inside_cost = 0;
1510 res.outside_cost = 0;
1511 }
1512
1513 return res;
1514 }
1515
1516 /* Return true if the new peeling NPEEL is supported. */
1517
1518 static bool
vect_peeling_supportable(loop_vec_info loop_vinfo,struct data_reference * dr0,unsigned npeel)1519 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1520 unsigned npeel)
1521 {
1522 unsigned i;
1523 struct data_reference *dr = NULL;
1524 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1525 gimple *stmt;
1526 stmt_vec_info stmt_info;
1527 enum dr_alignment_support supportable_dr_alignment;
1528
1529 /* Ensure that all data refs can be vectorized after the peel. */
1530 FOR_EACH_VEC_ELT (datarefs, i, dr)
1531 {
1532 int save_misalignment;
1533
1534 if (dr == dr0)
1535 continue;
1536
1537 stmt = DR_STMT (dr);
1538 stmt_info = vinfo_for_stmt (stmt);
1539 /* For interleaving, only the alignment of the first access
1540 matters. */
1541 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1542 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1543 continue;
1544
1545 /* Strided accesses perform only component accesses, alignment is
1546 irrelevant for them. */
1547 if (STMT_VINFO_STRIDED_P (stmt_info)
1548 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1549 continue;
1550
1551 save_misalignment = DR_MISALIGNMENT (dr);
1552 vect_update_misalignment_for_peel (dr, dr0, npeel);
1553 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1554 SET_DR_MISALIGNMENT (dr, save_misalignment);
1555
1556 if (!supportable_dr_alignment)
1557 return false;
1558 }
1559
1560 return true;
1561 }
1562
1563 /* Function vect_enhance_data_refs_alignment
1564
1565 This pass will use loop versioning and loop peeling in order to enhance
1566 the alignment of data references in the loop.
1567
1568 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1569 original loop is to be vectorized. Any other loops that are created by
1570 the transformations performed in this pass - are not supposed to be
1571 vectorized. This restriction will be relaxed.
1572
1573 This pass will require a cost model to guide it whether to apply peeling
1574 or versioning or a combination of the two. For example, the scheme that
1575 intel uses when given a loop with several memory accesses, is as follows:
1576 choose one memory access ('p') which alignment you want to force by doing
1577 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1578 other accesses are not necessarily aligned, or (2) use loop versioning to
1579 generate one loop in which all accesses are aligned, and another loop in
1580 which only 'p' is necessarily aligned.
1581
1582 ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1584 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1585
1586 Devising a cost model is the most critical aspect of this work. It will
1587 guide us on which access to peel for, whether to use loop versioning, how
1588 many versions to create, etc. The cost model will probably consist of
1589 generic considerations as well as target specific considerations (on
1590 powerpc for example, misaligned stores are more painful than misaligned
1591 loads).
1592
1593 Here are the general steps involved in alignment enhancements:
1594
1595 -- original loop, before alignment analysis:
1596 for (i=0; i<N; i++){
1597 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1598 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1599 }
1600
1601 -- After vect_compute_data_refs_alignment:
1602 for (i=0; i<N; i++){
1603 x = q[i]; # DR_MISALIGNMENT(q) = 3
1604 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1605 }
1606
1607 -- Possibility 1: we do loop versioning:
1608 if (p is aligned) {
1609 for (i=0; i<N; i++){ # loop 1A
1610 x = q[i]; # DR_MISALIGNMENT(q) = 3
1611 p[i] = y; # DR_MISALIGNMENT(p) = 0
1612 }
1613 }
1614 else {
1615 for (i=0; i<N; i++){ # loop 1B
1616 x = q[i]; # DR_MISALIGNMENT(q) = 3
1617 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1618 }
1619 }
1620
1621 -- Possibility 2: we do loop peeling:
1622 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1623 x = q[i];
1624 p[i] = y;
1625 }
1626 for (i = 3; i < N; i++){ # loop 2A
1627 x = q[i]; # DR_MISALIGNMENT(q) = 0
1628 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1629 }
1630
1631 -- Possibility 3: combination of loop peeling and versioning:
1632 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1633 x = q[i];
1634 p[i] = y;
1635 }
1636 if (p is aligned) {
1637 for (i = 3; i<N; i++){ # loop 3A
1638 x = q[i]; # DR_MISALIGNMENT(q) = 0
1639 p[i] = y; # DR_MISALIGNMENT(p) = 0
1640 }
1641 }
1642 else {
1643 for (i = 3; i<N; i++){ # loop 3B
1644 x = q[i]; # DR_MISALIGNMENT(q) = 0
1645 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1646 }
1647 }
1648
1649 These loops are later passed to loop_transform to be vectorized. The
1650 vectorizer will use the alignment information to guide the transformation
1651 (whether to generate regular loads/stores, or with special handling for
1652 misalignment). */
1653
1654 bool
vect_enhance_data_refs_alignment(loop_vec_info loop_vinfo)1655 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1656 {
1657 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1658 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1659 enum dr_alignment_support supportable_dr_alignment;
1660 struct data_reference *dr0 = NULL, *first_store = NULL;
1661 struct data_reference *dr;
1662 unsigned int i, j;
1663 bool do_peeling = false;
1664 bool do_versioning = false;
1665 bool stat;
1666 gimple *stmt;
1667 stmt_vec_info stmt_info;
1668 unsigned int npeel = 0;
1669 bool one_misalignment_known = false;
1670 bool one_misalignment_unknown = false;
1671 bool one_dr_unsupportable = false;
1672 struct data_reference *unsupportable_dr = NULL;
1673 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1674 unsigned possible_npeel_number = 1;
1675 tree vectype;
1676 unsigned int mis, same_align_drs_max = 0;
1677 hash_table<peel_info_hasher> peeling_htab (1);
1678
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "=== vect_enhance_data_refs_alignment ===\n");
1682
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1689
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1694
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1700
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1703
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1708
1709 /* (1) Peeling to force alignment. */
1710
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1718
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1720 {
1721 stmt = DR_STMT (dr);
1722 stmt_info = vinfo_for_stmt (stmt);
1723
1724 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 continue;
1726
1727 /* For interleaving, only the alignment of the first access
1728 matters. */
1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1731 continue;
1732
1733 /* For invariant accesses there is nothing to enhance. */
1734 if (integer_zerop (DR_STEP (dr)))
1735 continue;
1736
1737 /* Strided accesses perform only component accesses, alignment is
1738 irrelevant for them. */
1739 if (STMT_VINFO_STRIDED_P (stmt_info)
1740 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1741 continue;
1742
1743 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1744 do_peeling = vector_alignment_reachable_p (dr);
1745 if (do_peeling)
1746 {
1747 if (known_alignment_for_access_p (dr))
1748 {
1749 unsigned int npeel_tmp = 0;
1750 bool negative = tree_int_cst_compare (DR_STEP (dr),
1751 size_zero_node) < 0;
1752
1753 vectype = STMT_VINFO_VECTYPE (stmt_info);
1754 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1755 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1756 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1757 if (DR_MISALIGNMENT (dr) != 0)
1758 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1759
1760 /* For multiple types, it is possible that the bigger type access
1761 will have more than one peeling option. E.g., a loop with two
1762 types: one of size (vector size / 4), and the other one of
1763 size (vector size / 8). Vectorization factor will 8. If both
1764 accesses are misaligned by 3, the first one needs one scalar
1765 iteration to be aligned, and the second one needs 5. But the
1766 first one will be aligned also by peeling 5 scalar
1767 iterations, and in that case both accesses will be aligned.
1768 Hence, except for the immediate peeling amount, we also want
1769 to try to add full vector size, while we don't exceed
1770 vectorization factor.
1771 We do this automatically for cost model, since we calculate
1772 cost for every peeling option. */
1773 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1774 {
1775 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1776 ? vf * GROUP_SIZE (stmt_info) : vf);
1777 possible_npeel_number
1778 = vect_get_num_vectors (nscalars, vectype);
1779
1780 /* NPEEL_TMP is 0 when there is no misalignment, but also
1781 allow peeling NELEMENTS. */
1782 if (DR_MISALIGNMENT (dr) == 0)
1783 possible_npeel_number++;
1784 }
1785
1786 /* Save info about DR in the hash table. Also include peeling
1787 amounts according to the explanation above. */
1788 for (j = 0; j < possible_npeel_number; j++)
1789 {
1790 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1791 dr, npeel_tmp);
1792 npeel_tmp += target_align / dr_size;
1793 }
1794
1795 one_misalignment_known = true;
1796 }
1797 else
1798 {
1799 /* If we don't know any misalignment values, we prefer
1800 peeling for data-ref that has the maximum number of data-refs
1801 with the same alignment, unless the target prefers to align
1802 stores over load. */
1803 unsigned same_align_drs
1804 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1805 if (!dr0
1806 || same_align_drs_max < same_align_drs)
1807 {
1808 same_align_drs_max = same_align_drs;
1809 dr0 = dr;
1810 }
1811 /* For data-refs with the same number of related
1812 accesses prefer the one where the misalign
1813 computation will be invariant in the outermost loop. */
1814 else if (same_align_drs_max == same_align_drs)
1815 {
1816 struct loop *ivloop0, *ivloop;
1817 ivloop0 = outermost_invariant_loop_for_expr
1818 (loop, DR_BASE_ADDRESS (dr0));
1819 ivloop = outermost_invariant_loop_for_expr
1820 (loop, DR_BASE_ADDRESS (dr));
1821 if ((ivloop && !ivloop0)
1822 || (ivloop && ivloop0
1823 && flow_loop_nested_p (ivloop, ivloop0)))
1824 dr0 = dr;
1825 }
1826
1827 one_misalignment_unknown = true;
1828
1829 /* Check for data refs with unsupportable alignment that
1830 can be peeled. */
1831 if (!supportable_dr_alignment)
1832 {
1833 one_dr_unsupportable = true;
1834 unsupportable_dr = dr;
1835 }
1836
1837 if (!first_store && DR_IS_WRITE (dr))
1838 first_store = dr;
1839 }
1840 }
1841 else
1842 {
1843 if (!aligned_access_p (dr))
1844 {
1845 if (dump_enabled_p ())
1846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1847 "vector alignment may not be reachable\n");
1848 break;
1849 }
1850 }
1851 }
1852
1853 /* Check if we can possibly peel the loop. */
1854 if (!vect_can_advance_ivs_p (loop_vinfo)
1855 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1856 || loop->inner)
1857 do_peeling = false;
1858
1859 struct _vect_peel_extended_info peel_for_known_alignment;
1860 struct _vect_peel_extended_info peel_for_unknown_alignment;
1861 struct _vect_peel_extended_info best_peel;
1862
1863 peel_for_unknown_alignment.inside_cost = INT_MAX;
1864 peel_for_unknown_alignment.outside_cost = INT_MAX;
1865 peel_for_unknown_alignment.peel_info.count = 0;
1866
1867 if (do_peeling
1868 && one_misalignment_unknown)
1869 {
1870 /* Check if the target requires to prefer stores over loads, i.e., if
1871 misaligned stores are more expensive than misaligned loads (taking
1872 drs with same alignment into account). */
1873 unsigned int load_inside_cost = 0;
1874 unsigned int load_outside_cost = 0;
1875 unsigned int store_inside_cost = 0;
1876 unsigned int store_outside_cost = 0;
1877 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1878
1879 stmt_vector_for_cost dummy;
1880 dummy.create (2);
1881 vect_get_peeling_costs_all_drs (datarefs, dr0,
1882 &load_inside_cost,
1883 &load_outside_cost,
1884 &dummy, estimated_npeels, true);
1885 dummy.release ();
1886
1887 if (first_store)
1888 {
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (datarefs, first_store,
1891 &store_inside_cost,
1892 &store_outside_cost,
1893 &dummy, estimated_npeels, true);
1894 dummy.release ();
1895 }
1896 else
1897 {
1898 store_inside_cost = INT_MAX;
1899 store_outside_cost = INT_MAX;
1900 }
1901
1902 if (load_inside_cost > store_inside_cost
1903 || (load_inside_cost == store_inside_cost
1904 && load_outside_cost > store_outside_cost))
1905 {
1906 dr0 = first_store;
1907 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1908 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1909 }
1910 else
1911 {
1912 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1913 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1914 }
1915
1916 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1917 prologue_cost_vec.create (2);
1918 epilogue_cost_vec.create (2);
1919
1920 int dummy2;
1921 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1922 (loop_vinfo, estimated_npeels, &dummy2,
1923 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1924 &prologue_cost_vec, &epilogue_cost_vec);
1925
1926 prologue_cost_vec.release ();
1927 epilogue_cost_vec.release ();
1928
1929 peel_for_unknown_alignment.peel_info.count = 1
1930 + STMT_VINFO_SAME_ALIGN_REFS
1931 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1932 }
1933
1934 peel_for_unknown_alignment.peel_info.npeel = 0;
1935 peel_for_unknown_alignment.peel_info.dr = dr0;
1936
1937 best_peel = peel_for_unknown_alignment;
1938
1939 peel_for_known_alignment.inside_cost = INT_MAX;
1940 peel_for_known_alignment.outside_cost = INT_MAX;
1941 peel_for_known_alignment.peel_info.count = 0;
1942 peel_for_known_alignment.peel_info.dr = NULL;
1943
1944 if (do_peeling && one_misalignment_known)
1945 {
1946 /* Peeling is possible, but there is no data access that is not supported
1947 unless aligned. So we try to choose the best possible peeling from
1948 the hash table. */
1949 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1950 (&peeling_htab, loop_vinfo);
1951 }
1952
1953 /* Compare costs of peeling for known and unknown alignment. */
1954 if (peel_for_known_alignment.peel_info.dr != NULL
1955 && peel_for_unknown_alignment.inside_cost
1956 >= peel_for_known_alignment.inside_cost)
1957 {
1958 best_peel = peel_for_known_alignment;
1959
1960 /* If the best peeling for known alignment has NPEEL == 0, perform no
1961 peeling at all except if there is an unsupportable dr that we can
1962 align. */
1963 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1964 do_peeling = false;
1965 }
1966
1967 /* If there is an unsupportable data ref, prefer this over all choices so far
1968 since we'd have to discard a chosen peeling except when it accidentally
1969 aligned the unsupportable data ref. */
1970 if (one_dr_unsupportable)
1971 dr0 = unsupportable_dr;
1972 else if (do_peeling)
1973 {
1974 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1975 TODO: Use nopeel_outside_cost or get rid of it? */
1976 unsigned nopeel_inside_cost = 0;
1977 unsigned nopeel_outside_cost = 0;
1978
1979 stmt_vector_for_cost dummy;
1980 dummy.create (2);
1981 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1982 &nopeel_outside_cost, &dummy, 0, false);
1983 dummy.release ();
1984
1985 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1986 costs will be recorded. */
1987 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1988 prologue_cost_vec.create (2);
1989 epilogue_cost_vec.create (2);
1990
1991 int dummy2;
1992 nopeel_outside_cost += vect_get_known_peeling_cost
1993 (loop_vinfo, 0, &dummy2,
1994 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1995 &prologue_cost_vec, &epilogue_cost_vec);
1996
1997 prologue_cost_vec.release ();
1998 epilogue_cost_vec.release ();
1999
2000 npeel = best_peel.peel_info.npeel;
2001 dr0 = best_peel.peel_info.dr;
2002
2003 /* If no peeling is not more expensive than the best peeling we
2004 have so far, don't perform any peeling. */
2005 if (nopeel_inside_cost <= best_peel.inside_cost)
2006 do_peeling = false;
2007 }
2008
2009 if (do_peeling)
2010 {
2011 stmt = DR_STMT (dr0);
2012 stmt_info = vinfo_for_stmt (stmt);
2013 vectype = STMT_VINFO_VECTYPE (stmt_info);
2014
2015 if (known_alignment_for_access_p (dr0))
2016 {
2017 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2018 size_zero_node) < 0;
2019 if (!npeel)
2020 {
2021 /* Since it's known at compile time, compute the number of
2022 iterations in the peeled loop (the peeling factor) for use in
2023 updating DR_MISALIGNMENT values. The peeling factor is the
2024 vectorization factor minus the misalignment as an element
2025 count. */
2026 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2027 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2028 npeel = ((mis & (target_align - 1))
2029 / vect_get_scalar_dr_size (dr0));
2030 }
2031
2032 /* For interleaved data access every iteration accesses all the
2033 members of the group, therefore we divide the number of iterations
2034 by the group size. */
2035 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037 npeel /= GROUP_SIZE (stmt_info);
2038
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Try peeling by %d\n", npeel);
2042 }
2043
2044 /* Ensure that all datarefs can be vectorized after the peel. */
2045 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2046 do_peeling = false;
2047
2048 /* Check if all datarefs are supportable and log. */
2049 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2050 {
2051 stat = vect_verify_datarefs_alignment (loop_vinfo);
2052 if (!stat)
2053 do_peeling = false;
2054 else
2055 return stat;
2056 }
2057
2058 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2059 if (do_peeling)
2060 {
2061 unsigned max_allowed_peel
2062 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2063 if (max_allowed_peel != (unsigned)-1)
2064 {
2065 unsigned max_peel = npeel;
2066 if (max_peel == 0)
2067 {
2068 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2069 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2070 }
2071 if (max_peel > max_allowed_peel)
2072 {
2073 do_peeling = false;
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_NOTE, vect_location,
2076 "Disable peeling, max peels reached: %d\n", max_peel);
2077 }
2078 }
2079 }
2080
2081 /* Cost model #2 - if peeling may result in a remaining loop not
2082 iterating enough to be vectorized then do not peel. Since this
2083 is a cost heuristic rather than a correctness decision, use the
2084 most likely runtime value for variable vectorization factors. */
2085 if (do_peeling
2086 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2087 {
2088 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2089 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2090 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2091 < assumed_vf + max_peel)
2092 do_peeling = false;
2093 }
2094
2095 if (do_peeling)
2096 {
2097 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2098 If the misalignment of DR_i is identical to that of dr0 then set
2099 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2100 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2101 by the peeling factor times the element size of DR_i (MOD the
2102 vectorization factor times the size). Otherwise, the
2103 misalignment of DR_i must be set to unknown. */
2104 FOR_EACH_VEC_ELT (datarefs, i, dr)
2105 if (dr != dr0)
2106 {
2107 /* Strided accesses perform only component accesses, alignment
2108 is irrelevant for them. */
2109 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2110 if (STMT_VINFO_STRIDED_P (stmt_info)
2111 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2112 continue;
2113
2114 vect_update_misalignment_for_peel (dr, dr0, npeel);
2115 }
2116
2117 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2118 if (npeel)
2119 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2120 else
2121 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2122 = DR_MISALIGNMENT (dr0);
2123 SET_DR_MISALIGNMENT (dr0, 0);
2124 if (dump_enabled_p ())
2125 {
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "Alignment of access forced using peeling.\n");
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "Peeling for alignment will be applied.\n");
2130 }
2131
2132 /* The inside-loop cost will be accounted for in vectorizable_load
2133 and vectorizable_store correctly with adjusted alignments.
2134 Drop the body_cst_vec on the floor here. */
2135 stat = vect_verify_datarefs_alignment (loop_vinfo);
2136 gcc_assert (stat);
2137 return stat;
2138 }
2139 }
2140
2141 /* (2) Versioning to force alignment. */
2142
2143 /* Try versioning if:
2144 1) optimize loop for speed
2145 2) there is at least one unsupported misaligned data ref with an unknown
2146 misalignment, and
2147 3) all misaligned data refs with a known misalignment are supported, and
2148 4) the number of runtime alignment checks is within reason. */
2149
2150 do_versioning =
2151 optimize_loop_nest_for_speed_p (loop)
2152 && (!loop->inner); /* FORNOW */
2153
2154 if (do_versioning)
2155 {
2156 FOR_EACH_VEC_ELT (datarefs, i, dr)
2157 {
2158 stmt = DR_STMT (dr);
2159 stmt_info = vinfo_for_stmt (stmt);
2160
2161 /* For interleaving, only the alignment of the first access
2162 matters. */
2163 if (aligned_access_p (dr)
2164 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2165 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2166 continue;
2167
2168 if (STMT_VINFO_STRIDED_P (stmt_info))
2169 {
2170 /* Strided loads perform only component accesses, alignment is
2171 irrelevant for them. */
2172 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2173 continue;
2174 do_versioning = false;
2175 break;
2176 }
2177
2178 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2179
2180 if (!supportable_dr_alignment)
2181 {
2182 gimple *stmt;
2183 int mask;
2184 tree vectype;
2185
2186 if (known_alignment_for_access_p (dr)
2187 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2188 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2189 {
2190 do_versioning = false;
2191 break;
2192 }
2193
2194 stmt = DR_STMT (dr);
2195 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2196 gcc_assert (vectype);
2197
2198 /* At present we don't support versioning for alignment
2199 with variable VF, since there's no guarantee that the
2200 VF is a power of two. We could relax this if we added
2201 a way of enforcing a power-of-two size. */
2202 unsigned HOST_WIDE_INT size;
2203 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2204 {
2205 do_versioning = false;
2206 break;
2207 }
2208
2209 /* The rightmost bits of an aligned address must be zeros.
2210 Construct the mask needed for this test. For example,
2211 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2212 mask must be 15 = 0xf. */
2213 mask = size - 1;
2214
2215 /* FORNOW: use the same mask to test all potentially unaligned
2216 references in the loop. The vectorizer currently supports
2217 a single vector size, see the reference to
2218 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2219 vectorization factor is computed. */
2220 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2221 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2222 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2223 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2224 DR_STMT (dr));
2225 }
2226 }
2227
2228 /* Versioning requires at least one misaligned data reference. */
2229 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2230 do_versioning = false;
2231 else if (!do_versioning)
2232 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2233 }
2234
2235 if (do_versioning)
2236 {
2237 vec<gimple *> may_misalign_stmts
2238 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2239 gimple *stmt;
2240
2241 /* It can now be assumed that the data references in the statements
2242 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2243 of the loop being vectorized. */
2244 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2245 {
2246 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2247 dr = STMT_VINFO_DATA_REF (stmt_info);
2248 SET_DR_MISALIGNMENT (dr, 0);
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "Alignment of access forced using versioning.\n");
2252 }
2253
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "Versioning for alignment will be applied.\n");
2257
2258 /* Peeling and versioning can't be done together at this time. */
2259 gcc_assert (! (do_peeling && do_versioning));
2260
2261 stat = vect_verify_datarefs_alignment (loop_vinfo);
2262 gcc_assert (stat);
2263 return stat;
2264 }
2265
2266 /* This point is reached if neither peeling nor versioning is being done. */
2267 gcc_assert (! (do_peeling || do_versioning));
2268
2269 stat = vect_verify_datarefs_alignment (loop_vinfo);
2270 return stat;
2271 }
2272
2273
2274 /* Function vect_find_same_alignment_drs.
2275
2276 Update group and alignment relations according to the chosen
2277 vectorization factor. */
2278
2279 static void
vect_find_same_alignment_drs(struct data_dependence_relation * ddr)2280 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2281 {
2282 struct data_reference *dra = DDR_A (ddr);
2283 struct data_reference *drb = DDR_B (ddr);
2284 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2285 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2286
2287 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2288 return;
2289
2290 if (dra == drb)
2291 return;
2292
2293 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2294 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2295 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2296 return;
2297
2298 /* Two references with distance zero have the same alignment. */
2299 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2300 - wi::to_poly_offset (DR_INIT (drb)));
2301 if (maybe_ne (diff, 0))
2302 {
2303 /* Get the wider of the two alignments. */
2304 unsigned int align_a = (vect_calculate_target_alignment (dra)
2305 / BITS_PER_UNIT);
2306 unsigned int align_b = (vect_calculate_target_alignment (drb)
2307 / BITS_PER_UNIT);
2308 unsigned int max_align = MAX (align_a, align_b);
2309
2310 /* Require the gap to be a multiple of the larger vector alignment. */
2311 if (!multiple_p (diff, max_align))
2312 return;
2313 }
2314
2315 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2316 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2317 if (dump_enabled_p ())
2318 {
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "accesses have the same alignment: ");
2321 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2322 dump_printf (MSG_NOTE, " and ");
2323 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2324 dump_printf (MSG_NOTE, "\n");
2325 }
2326 }
2327
2328
2329 /* Function vect_analyze_data_refs_alignment
2330
2331 Analyze the alignment of the data-references in the loop.
2332 Return FALSE if a data reference is found that cannot be vectorized. */
2333
2334 bool
vect_analyze_data_refs_alignment(loop_vec_info vinfo)2335 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_NOTE, vect_location,
2339 "=== vect_analyze_data_refs_alignment ===\n");
2340
2341 /* Mark groups of data references with same alignment using
2342 data dependence information. */
2343 vec<ddr_p> ddrs = vinfo->ddrs;
2344 struct data_dependence_relation *ddr;
2345 unsigned int i;
2346
2347 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2348 vect_find_same_alignment_drs (ddr);
2349
2350 vec<data_reference_p> datarefs = vinfo->datarefs;
2351 struct data_reference *dr;
2352
2353 vect_record_base_alignments (vinfo);
2354 FOR_EACH_VEC_ELT (datarefs, i, dr)
2355 {
2356 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2357 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2358 && !vect_compute_data_ref_alignment (dr))
2359 {
2360 /* Strided accesses perform only component accesses, misalignment
2361 information is irrelevant for them. */
2362 if (STMT_VINFO_STRIDED_P (stmt_info)
2363 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2364 continue;
2365
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 "not vectorized: can't calculate alignment "
2369 "for data ref.\n");
2370
2371 return false;
2372 }
2373 }
2374
2375 return true;
2376 }
2377
2378
2379 /* Analyze alignment of DRs of stmts in NODE. */
2380
2381 static bool
vect_slp_analyze_and_verify_node_alignment(slp_tree node)2382 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2383 {
2384 /* We vectorize from the first scalar stmt in the node unless
2385 the node is permuted in which case we start from the first
2386 element in the group. */
2387 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2388 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2389 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2390 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2391
2392 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2393 if (! vect_compute_data_ref_alignment (dr)
2394 /* For creating the data-ref pointer we need alignment of the
2395 first element anyway. */
2396 || (dr != first_dr
2397 && ! vect_compute_data_ref_alignment (first_dr))
2398 || ! verify_data_ref_alignment (dr))
2399 {
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2402 "not vectorized: bad data alignment in basic "
2403 "block.\n");
2404 return false;
2405 }
2406
2407 return true;
2408 }
2409
2410 /* Function vect_slp_analyze_instance_alignment
2411
2412 Analyze the alignment of the data-references in the SLP instance.
2413 Return FALSE if a data reference is found that cannot be vectorized. */
2414
2415 bool
vect_slp_analyze_and_verify_instance_alignment(slp_instance instance)2416 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2417 {
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location,
2420 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2421
2422 slp_tree node;
2423 unsigned i;
2424 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2425 if (! vect_slp_analyze_and_verify_node_alignment (node))
2426 return false;
2427
2428 node = SLP_INSTANCE_TREE (instance);
2429 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2430 && ! vect_slp_analyze_and_verify_node_alignment
2431 (SLP_INSTANCE_TREE (instance)))
2432 return false;
2433
2434 return true;
2435 }
2436
2437
2438 /* Analyze groups of accesses: check that DR belongs to a group of
2439 accesses of legal size, step, etc. Detect gaps, single element
2440 interleaving, and other special cases. Set grouped access info.
2441 Collect groups of strided stores for further use in SLP analysis.
2442 Worker for vect_analyze_group_access. */
2443
2444 static bool
vect_analyze_group_access_1(struct data_reference * dr)2445 vect_analyze_group_access_1 (struct data_reference *dr)
2446 {
2447 tree step = DR_STEP (dr);
2448 tree scalar_type = TREE_TYPE (DR_REF (dr));
2449 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2450 gimple *stmt = DR_STMT (dr);
2451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2453 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2454 HOST_WIDE_INT dr_step = -1;
2455 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2456 bool slp_impossible = false;
2457
2458 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2459 size of the interleaving group (including gaps). */
2460 if (tree_fits_shwi_p (step))
2461 {
2462 dr_step = tree_to_shwi (step);
2463 /* Check that STEP is a multiple of type size. Otherwise there is
2464 a non-element-sized gap at the end of the group which we
2465 cannot represent in GROUP_GAP or GROUP_SIZE.
2466 ??? As we can handle non-constant step fine here we should
2467 simply remove uses of GROUP_GAP between the last and first
2468 element and instead rely on DR_STEP. GROUP_SIZE then would
2469 simply not include that gap. */
2470 if ((dr_step % type_size) != 0)
2471 {
2472 if (dump_enabled_p ())
2473 {
2474 dump_printf_loc (MSG_NOTE, vect_location,
2475 "Step ");
2476 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2477 dump_printf (MSG_NOTE,
2478 " is not a multiple of the element size for ");
2479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2480 dump_printf (MSG_NOTE, "\n");
2481 }
2482 return false;
2483 }
2484 groupsize = absu_hwi (dr_step) / type_size;
2485 }
2486 else
2487 groupsize = 0;
2488
2489 /* Not consecutive access is possible only if it is a part of interleaving. */
2490 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2491 {
2492 /* Check if it this DR is a part of interleaving, and is a single
2493 element of the group that is accessed in the loop. */
2494
2495 /* Gaps are supported only for loads. STEP must be a multiple of the type
2496 size. */
2497 if (DR_IS_READ (dr)
2498 && (dr_step % type_size) == 0
2499 && groupsize > 0)
2500 {
2501 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2502 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2503 GROUP_GAP (stmt_info) = groupsize - 1;
2504 if (dump_enabled_p ())
2505 {
2506 dump_printf_loc (MSG_NOTE, vect_location,
2507 "Detected single element interleaving ");
2508 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2509 dump_printf (MSG_NOTE, " step ");
2510 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2511 dump_printf (MSG_NOTE, "\n");
2512 }
2513
2514 return true;
2515 }
2516
2517 if (dump_enabled_p ())
2518 {
2519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2520 "not consecutive access ");
2521 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2522 }
2523
2524 if (bb_vinfo)
2525 {
2526 /* Mark the statement as unvectorizable. */
2527 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2528 return true;
2529 }
2530
2531 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2532 STMT_VINFO_STRIDED_P (stmt_info) = true;
2533 return true;
2534 }
2535
2536 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2537 {
2538 /* First stmt in the interleaving chain. Check the chain. */
2539 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2540 struct data_reference *data_ref = dr;
2541 unsigned int count = 1;
2542 tree prev_init = DR_INIT (data_ref);
2543 gimple *prev = stmt;
2544 HOST_WIDE_INT diff, gaps = 0;
2545
2546 /* By construction, all group members have INTEGER_CST DR_INITs. */
2547 while (next)
2548 {
2549 /* Skip same data-refs. In case that two or more stmts share
2550 data-ref (supported only for loads), we vectorize only the first
2551 stmt, and the rest get their vectorized loads from the first
2552 one. */
2553 if (!tree_int_cst_compare (DR_INIT (data_ref),
2554 DR_INIT (STMT_VINFO_DATA_REF (
2555 vinfo_for_stmt (next)))))
2556 {
2557 if (DR_IS_WRITE (data_ref))
2558 {
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2561 "Two store stmts share the same dr.\n");
2562 return false;
2563 }
2564
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 "Two or more load stmts share the same dr.\n");
2568
2569 /* For load use the same data-ref load. */
2570 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2571
2572 prev = next;
2573 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2574 continue;
2575 }
2576
2577 prev = next;
2578 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2579
2580 /* All group members have the same STEP by construction. */
2581 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2582
2583 /* Check that the distance between two accesses is equal to the type
2584 size. Otherwise, we have gaps. */
2585 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2586 - TREE_INT_CST_LOW (prev_init)) / type_size;
2587 if (diff != 1)
2588 {
2589 /* FORNOW: SLP of accesses with gaps is not supported. */
2590 slp_impossible = true;
2591 if (DR_IS_WRITE (data_ref))
2592 {
2593 if (dump_enabled_p ())
2594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2595 "interleaved store with gaps\n");
2596 return false;
2597 }
2598
2599 gaps += diff - 1;
2600 }
2601
2602 last_accessed_element += diff;
2603
2604 /* Store the gap from the previous member of the group. If there is no
2605 gap in the access, GROUP_GAP is always 1. */
2606 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2607
2608 prev_init = DR_INIT (data_ref);
2609 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2610 /* Count the number of data-refs in the chain. */
2611 count++;
2612 }
2613
2614 if (groupsize == 0)
2615 groupsize = count + gaps;
2616
2617 /* This could be UINT_MAX but as we are generating code in a very
2618 inefficient way we have to cap earlier. See PR78699 for example. */
2619 if (groupsize > 4096)
2620 {
2621 if (dump_enabled_p ())
2622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623 "group is too large\n");
2624 return false;
2625 }
2626
2627 /* Check that the size of the interleaving is equal to count for stores,
2628 i.e., that there are no gaps. */
2629 if (groupsize != count
2630 && !DR_IS_READ (dr))
2631 {
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2634 "interleaved store with gaps\n");
2635 return false;
2636 }
2637
2638 /* If there is a gap after the last load in the group it is the
2639 difference between the groupsize and the last accessed
2640 element.
2641 When there is no gap, this difference should be 0. */
2642 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2643
2644 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2645 if (dump_enabled_p ())
2646 {
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "Detected interleaving ");
2649 if (DR_IS_READ (dr))
2650 dump_printf (MSG_NOTE, "load ");
2651 else
2652 dump_printf (MSG_NOTE, "store ");
2653 dump_printf (MSG_NOTE, "of size %u starting with ",
2654 (unsigned)groupsize);
2655 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2656 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2657 dump_printf_loc (MSG_NOTE, vect_location,
2658 "There is a gap of %u elements after the group\n",
2659 GROUP_GAP (vinfo_for_stmt (stmt)));
2660 }
2661
2662 /* SLP: create an SLP data structure for every interleaving group of
2663 stores for further analysis in vect_analyse_slp. */
2664 if (DR_IS_WRITE (dr) && !slp_impossible)
2665 {
2666 if (loop_vinfo)
2667 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2668 if (bb_vinfo)
2669 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2670 }
2671 }
2672
2673 return true;
2674 }
2675
2676 /* Analyze groups of accesses: check that DR belongs to a group of
2677 accesses of legal size, step, etc. Detect gaps, single element
2678 interleaving, and other special cases. Set grouped access info.
2679 Collect groups of strided stores for further use in SLP analysis. */
2680
2681 static bool
vect_analyze_group_access(struct data_reference * dr)2682 vect_analyze_group_access (struct data_reference *dr)
2683 {
2684 if (!vect_analyze_group_access_1 (dr))
2685 {
2686 /* Dissolve the group if present. */
2687 gimple *next;
2688 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2689 while (stmt)
2690 {
2691 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2692 next = GROUP_NEXT_ELEMENT (vinfo);
2693 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2694 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2695 stmt = next;
2696 }
2697 return false;
2698 }
2699 return true;
2700 }
2701
2702 /* Analyze the access pattern of the data-reference DR.
2703 In case of non-consecutive accesses call vect_analyze_group_access() to
2704 analyze groups of accesses. */
2705
static bool
vect_analyze_data_ref_access (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;

  /* Gather/scatter accesses are handled elsewhere and impose no
     constraints on the step.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return true;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* In loop vectorization every data-ref must have an analyzable step.  */
  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data-ref access in loop\n");
      return false;
    }

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
    {
      /* An invariant access is never grouped.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt))
	return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
	 with pragma omp simd only - it guarantees absence of
	 loop-carried dependencies between inner loop iterations.  */
      if (loop->safelen < 2)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in inner loop of nest\n");
	  return false;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      /* Interleaved accesses are not yet supported within outer-loop
	 vectorization for references in the inner-loop.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in outer loop.\n");
	  return DR_IS_READ (dr);
	}
    }

  /* Consecutive?  The access is consecutive when the constant step equals
     the scalar type size, or is minus that size for a backward access.  */
  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
	  || (dr_step < 0
	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
	{
	  /* Mark that it is not interleaving.  */
	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
	  return true;
	}
    }

  /* Grouped accesses in the inner loop of an outer-loop nest are not
     supported (see above); reject them here.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "grouped access in outer loop.\n");
      return false;
    }


  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
		|| vect_analyze_group_access (dr)));

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (dr);
}
2797
2798 /* Compare two data-references DRA and DRB to group them into chunks
2799 suitable for grouping. */
2800
2801 static int
dr_group_sort_cmp(const void * dra_,const void * drb_)2802 dr_group_sort_cmp (const void *dra_, const void *drb_)
2803 {
2804 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2805 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2806 int cmp;
2807
2808 /* Stabilize sort. */
2809 if (dra == drb)
2810 return 0;
2811
2812 /* DRs in different loops never belong to the same group. */
2813 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2814 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2815 if (loopa != loopb)
2816 return loopa->num < loopb->num ? -1 : 1;
2817
2818 /* Ordering of DRs according to base. */
2819 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2820 DR_BASE_ADDRESS (drb));
2821 if (cmp != 0)
2822 return cmp;
2823
2824 /* And according to DR_OFFSET. */
2825 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2826 if (cmp != 0)
2827 return cmp;
2828
2829 /* Put reads before writes. */
2830 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2831 return DR_IS_READ (dra) ? -1 : 1;
2832
2833 /* Then sort after access size. */
2834 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2835 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2836 if (cmp != 0)
2837 return cmp;
2838
2839 /* And after step. */
2840 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2841 if (cmp != 0)
2842 return cmp;
2843
2844 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2845 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2846 if (cmp == 0)
2847 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2848 return cmp;
2849 }
2850
2851 /* If OP is the result of a conversion, return the unconverted value,
2852 otherwise return null. */
2853
2854 static tree
strip_conversion(tree op)2855 strip_conversion (tree op)
2856 {
2857 if (TREE_CODE (op) != SSA_NAME)
2858 return NULL_TREE;
2859 gimple *stmt = SSA_NAME_DEF_STMT (op);
2860 if (!is_gimple_assign (stmt)
2861 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2862 return NULL_TREE;
2863 return gimple_assign_rhs1 (stmt);
2864 }
2865
2866 /* Return true if vectorizable_* routines can handle statements STMT1
2867 and STMT2 being in a single group. */
2868
2869 static bool
can_group_stmts_p(gimple * stmt1,gimple * stmt2)2870 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2871 {
2872 if (gimple_assign_single_p (stmt1))
2873 return gimple_assign_single_p (stmt2);
2874
2875 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2876 {
2877 /* Check for two masked loads or two masked stores. */
2878 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2879 return false;
2880 internal_fn ifn = gimple_call_internal_fn (stmt1);
2881 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2882 return false;
2883 if (ifn != gimple_call_internal_fn (stmt2))
2884 return false;
2885
2886 /* Check that the masks are the same. Cope with casts of masks,
2887 like those created by build_mask_conversion. */
2888 tree mask1 = gimple_call_arg (stmt1, 2);
2889 tree mask2 = gimple_call_arg (stmt2, 2);
2890 if (!operand_equal_p (mask1, mask2, 0))
2891 {
2892 mask1 = strip_conversion (mask1);
2893 if (!mask1)
2894 return false;
2895 mask2 = strip_conversion (mask2);
2896 if (!mask2)
2897 return false;
2898 if (!operand_equal_p (mask1, mask2, 0))
2899 return false;
2900 }
2901 return true;
2902 }
2903
2904 return false;
2905 }
2906
2907 /* Function vect_analyze_data_ref_accesses.
2908
2909 Analyze the access pattern of all the data references in the loop.
2910
2911 FORNOW: the only access pattern that is considered vectorizable is a
2912 simple step 1 (consecutive) access.
2913
2914 FORNOW: handle only arrays and pointer accesses. */
2915
bool
vect_analyze_data_ref_accesses (vec_info *vinfo)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_ref_accesses ===\n");

  if (datarefs.is_empty ())
    return true;

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<data_reference_p> datarefs_copy = datarefs.copy ();
  datarefs_copy.qsort (dr_group_sort_cmp);

  /* Build the interleaving chains.  The outer loop picks the head DRA of
     a potential group; the inner loop extends the group with following
     sorted datarefs and leaves I at the first one that did not join.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      data_reference_p dra = datarefs_copy[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
	{
	  ++i;
	  continue;
	}
      for (i = i + 1; i < datarefs_copy.length (); ++i)
	{
	  data_reference_p drb = datarefs_copy[i];
	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	    break;

	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group to be artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  Then we can just skip ahead to the next DR here.  */

	  /* DRs in a different loop should not be put into the same
	     interleaving group.  */
	  if (gimple_bb (DR_STMT (dra))->loop_father
	      != gimple_bb (DR_STMT (drb))->loop_father)
	    break;

	  /* Check that the data-refs have same first location (except init)
	     and they are both either store or load (not load and store,
	     not masked loads or stores).  */
	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
					DR_BASE_ADDRESS (drb)) != 0
	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
	      || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
	    break;

	  /* Check that the data-refs have the same constant size.  */
	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
	  if (!tree_fits_uhwi_p (sza)
	      || !tree_fits_uhwi_p (szb)
	      || !tree_int_cst_equal (sza, szb))
	    break;

	  /* Check that the data-refs have the same step.  */
	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
	    break;

	  /* Check the types are compatible.
	     ??? We don't distinguish this during sorting.  */
	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
				   TREE_TYPE (DR_REF (drb))))
	    break;

	  /* Check that the DR_INITs are compile-time constants.  */
	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
	    break;

	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
	  HOST_WIDE_INT init_prev
	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
	  gcc_assert (init_a <= init_b
		      && init_a <= init_prev
		      && init_prev <= init_b);

	  /* Do not place the same access in the interleaving chain twice.  */
	  if (init_b == init_prev)
	    {
	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
			  < gimple_uid (DR_STMT (drb)));
	      /* ??? For now we simply "drop" the later reference which is
		 otherwise the same rather than finishing off this group.
		 In the end we'd want to re-process duplicates forming
		 multiple groups from the refs, likely by just collecting
		 all candidates (including duplicates and split points
		 below) in a vector and then process them together.  */
	      continue;
	    }

	  /* If init_b == init_a + the size of the type * k, we have an
	     interleaving, and DRA is accessed before DRB.  */
	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
	  if (type_size_a == 0
	      || (init_b - init_a) % type_size_a != 0)
	    break;

	  /* If we have a store, the accesses are adjacent.  This splits
	     groups into chunks we support (we don't support vectorization
	     of stores with gaps).  */
	  if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
	    break;

	  /* If the step (if not zero or non-constant) is greater than the
	     difference between data-refs' inits this splits groups into
	     suitable sizes.  */
	  if (tree_fits_shwi_p (DR_STEP (dra)))
	    {
	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
	      if (step != 0 && step <= (init_b - init_a))
		break;
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Detected interleaving ");
	      if (DR_IS_READ (dra))
		dump_printf (MSG_NOTE, "load ");
	      else
		dump_printf (MSG_NOTE, "store ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_NOTE, "\n");
	    }

	  /* Link the found element into the group list.  */
	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
	    {
	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
	      lastinfo = stmtinfo_a;
	    }
	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
	  lastinfo = stmtinfo_b;
	}
    }

  /* Analyze each access pattern.  A failure disables just the offending
     statement for BB vectorization but fails the whole loop for loop
     vectorization.  */
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
	&& !vect_analyze_data_ref_access (dr))
      {
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: complicated access pattern.\n");

	if (is_a <bb_vec_info> (vinfo))
	  {
	    /* Mark the statement as not vectorizable.  */
	    STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	    continue;
	  }
	else
	  {
	    datarefs_copy.release ();
	    return false;
	  }
      }

  datarefs_copy.release ();
  return true;
}
3097
3098 /* Function vect_vfa_segment_size.
3099
3100 Input:
3101 DR: The data reference.
3102 LENGTH_FACTOR: segment length to consider.
3103
3104 Return a value suitable for the dr_with_seg_len::seg_len field.
3105 This is the "distance travelled" by the pointer from the first
3106 iteration in the segment to the last. Note that it does not include
3107 the size of the access; in effect it only describes the first byte. */
3108
3109 static tree
vect_vfa_segment_size(struct data_reference * dr,tree length_factor)3110 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3111 {
3112 length_factor = size_binop (MINUS_EXPR,
3113 fold_convert (sizetype, length_factor),
3114 size_one_node);
3115 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3116 length_factor);
3117 }
3118
3119 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3120 gives the worst-case number of bytes covered by the segment. */
3121
3122 static unsigned HOST_WIDE_INT
vect_vfa_access_size(data_reference * dr)3123 vect_vfa_access_size (data_reference *dr)
3124 {
3125 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3126 tree ref_type = TREE_TYPE (DR_REF (dr));
3127 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3128 unsigned HOST_WIDE_INT access_size = ref_size;
3129 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3130 {
3131 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3132 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3133 }
3134 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3135 && (vect_supportable_dr_alignment (dr, false)
3136 == dr_explicit_realign_optimized))
3137 {
3138 /* We might access a full vector's worth. */
3139 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3140 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3141 }
3142 return access_size;
3143 }
3144
3145 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3146
3147 static unsigned int
vect_vfa_align(const data_reference * dr)3148 vect_vfa_align (const data_reference *dr)
3149 {
3150 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3151 }
3152
/* Function vect_compile_time_alias.
3154
3155 Given data references A and B with equal base and offset, see whether
3156 the alias relation can be decided at compilation time. Return 1 if
3157 it can and the references alias, 0 if it can and the references do
3158 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3159 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3160 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3161
static int
vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
			 tree segment_length_a, tree segment_length_b,
			 unsigned HOST_WIDE_INT access_size_a,
			 unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12) */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      /* The segment length is negative for a backward access; take its
	 absolute value and move the range start back accordingly.  */
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a = (offset_a + access_size_a) - const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b = (offset_b + access_size_b) - const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  /* seg_len only describes the distance travelled by the first byte;
     add the access size to get the number of bytes covered.  */
  const_length_a += access_size_a;
  const_length_b += access_size_b;

  /* Definitely overlapping -> the references alias.  */
  if (ranges_known_overlap_p (offset_a, const_length_a,
			      offset_b, const_length_b))
    return 1;

  /* Definitely disjoint -> no alias.  */
  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
			       offset_b, const_length_b))
    return 0;

  /* Cannot decide at compile time.  */
  return -1;
}
3204
3205 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3206 in DDR is >= VF. */
3207
3208 static bool
dependence_distance_ge_vf(data_dependence_relation * ddr,unsigned int loop_depth,poly_uint64 vf)3209 dependence_distance_ge_vf (data_dependence_relation *ddr,
3210 unsigned int loop_depth, poly_uint64 vf)
3211 {
3212 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3213 || DDR_NUM_DIST_VECTS (ddr) == 0)
3214 return false;
3215
3216 /* If the dependence is exact, we should have limited the VF instead. */
3217 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3218
3219 unsigned int i;
3220 lambda_vector dist_v;
3221 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3222 {
3223 HOST_WIDE_INT dist = dist_v[loop_depth];
3224 if (dist != 0
3225 && !(dist > 0 && DDR_REVERSED_P (ddr))
3226 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3227 return false;
3228 }
3229
3230 if (dump_enabled_p ())
3231 {
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 "dependence distance between ");
3234 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3235 dump_printf (MSG_NOTE, " and ");
3236 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3237 dump_printf (MSG_NOTE, " is >= VF\n");
3238 }
3239
3240 return true;
3241 }
3242
3243 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3244
3245 static void
dump_lower_bound(int dump_kind,const vec_lower_bound & lower_bound)3246 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3247 {
3248 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3249 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3250 dump_printf (dump_kind, ") >= ");
3251 dump_dec (dump_kind, lower_bound.min_value);
3252 }
3253
3254 /* Record that the vectorized loop requires the vec_lower_bound described
3255 by EXPR, UNSIGNED_P and MIN_VALUE. */
3256
3257 static void
vect_check_lower_bound(loop_vec_info loop_vinfo,tree expr,bool unsigned_p,poly_uint64 min_value)3258 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3259 poly_uint64 min_value)
3260 {
3261 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3262 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3263 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3264 {
3265 unsigned_p &= lower_bounds[i].unsigned_p;
3266 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3267 if (lower_bounds[i].unsigned_p != unsigned_p
3268 || maybe_lt (lower_bounds[i].min_value, min_value))
3269 {
3270 lower_bounds[i].unsigned_p = unsigned_p;
3271 lower_bounds[i].min_value = min_value;
3272 if (dump_enabled_p ())
3273 {
3274 dump_printf_loc (MSG_NOTE, vect_location,
3275 "updating run-time check to ");
3276 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3277 dump_printf (MSG_NOTE, "\n");
3278 }
3279 }
3280 return;
3281 }
3282
3283 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3284 if (dump_enabled_p ())
3285 {
3286 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3287 dump_lower_bound (MSG_NOTE, lower_bound);
3288 dump_printf (MSG_NOTE, "\n");
3289 }
3290 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3291 }
3292
3293 /* Return true if it's unlikely that the step of the vectorized form of DR
3294 will span fewer than GAP bytes. */
3295
3296 static bool
vect_small_gap_p(loop_vec_info loop_vinfo,data_reference * dr,poly_int64 gap)3297 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3298 {
3299 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3300 HOST_WIDE_INT count
3301 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3302 if (GROUP_FIRST_ELEMENT (stmt_info))
3303 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3304 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3305 }
3306
3307 /* Return true if we know that there is no alias between DR_A and DR_B
3308 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3309 *LOWER_BOUND_OUT to this N. */
3310
static bool
vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
				poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  The base, offset and step must match exactly so that
     the two references differ only in DR_INIT.  */
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
      && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
  return true;
}
3344
3345 /* Function vect_prune_runtime_alias_test_list.
3346
3347 Prune a list of ddrs to be tested at run-time by versioning for alias.
3348 Merge several alias checks into one if possible.
   Return FALSE if resulting list of ddrs is longer than allowed by
3350 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3351
3352 bool
vect_prune_runtime_alias_test_list(loop_vec_info loop_vinfo)3353 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3354 {
3355 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3356 hash_set <tree_pair_hash> compared_objects;
3357
3358 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3359 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3360 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3361 vec<vec_object_pair> &check_unequal_addrs
3362 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3363 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3364 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3365
3366 ddr_p ddr;
3367 unsigned int i;
3368 tree length_factor;
3369
3370 if (dump_enabled_p ())
3371 dump_printf_loc (MSG_NOTE, vect_location,
3372 "=== vect_prune_runtime_alias_test_list ===\n");
3373
3374 /* Step values are irrelevant for aliasing if the number of vector
3375 iterations is equal to the number of scalar iterations (which can
3376 happen for fully-SLP loops). */
3377 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3378
3379 if (!ignore_step_p)
3380 {
3381 /* Convert the checks for nonzero steps into bound tests. */
3382 tree value;
3383 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3384 vect_check_lower_bound (loop_vinfo, value, true, 1);
3385 }
3386
3387 if (may_alias_ddrs.is_empty ())
3388 return true;
3389
3390 comp_alias_ddrs.create (may_alias_ddrs.length ());
3391
3392 unsigned int loop_depth
3393 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3394 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3395
3396 /* First, we collect all data ref pairs for aliasing checks. */
3397 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3398 {
3399 int comp_res;
3400 poly_uint64 lower_bound;
3401 struct data_reference *dr_a, *dr_b;
3402 gimple *dr_group_first_a, *dr_group_first_b;
3403 tree segment_length_a, segment_length_b;
3404 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3405 unsigned int align_a, align_b;
3406 gimple *stmt_a, *stmt_b;
3407
3408 /* Ignore the alias if the VF we chose ended up being no greater
3409 than the dependence distance. */
3410 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3411 continue;
3412
3413 if (DDR_OBJECT_A (ddr))
3414 {
3415 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3416 if (!compared_objects.add (new_pair))
3417 {
3418 if (dump_enabled_p ())
3419 {
3420 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3421 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3422 dump_printf (MSG_NOTE, " and ");
3423 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3424 dump_printf (MSG_NOTE, " have different addresses\n");
3425 }
3426 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3427 }
3428 continue;
3429 }
3430
3431 dr_a = DDR_A (ddr);
3432 stmt_a = DR_STMT (DDR_A (ddr));
3433
3434 dr_b = DDR_B (ddr);
3435 stmt_b = DR_STMT (DDR_B (ddr));
3436
3437 /* Skip the pair if inter-iteration dependencies are irrelevant
3438 and intra-iteration dependencies are guaranteed to be honored. */
3439 if (ignore_step_p
3440 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3441 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3442 {
3443 if (dump_enabled_p ())
3444 {
3445 dump_printf_loc (MSG_NOTE, vect_location,
3446 "no need for alias check between ");
3447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3448 dump_printf (MSG_NOTE, " and ");
3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3450 dump_printf (MSG_NOTE, " when VF is 1\n");
3451 }
3452 continue;
3453 }
3454
3455 /* See whether we can handle the alias using a bounds check on
3456 the step, and whether that's likely to be the best approach.
3457 (It might not be, for example, if the minimum step is much larger
3458 than the number of bytes handled by one vector iteration.) */
3459 if (!ignore_step_p
3460 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3461 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3462 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3463 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3464 {
3465 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3466 if (dump_enabled_p ())
3467 {
3468 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3469 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3470 dump_printf (MSG_NOTE, " and ");
3471 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3472 dump_printf (MSG_NOTE, " when the step ");
3473 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3474 dump_printf (MSG_NOTE, " is outside ");
3475 if (unsigned_p)
3476 dump_printf (MSG_NOTE, "[0");
3477 else
3478 {
3479 dump_printf (MSG_NOTE, "(");
3480 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3481 }
3482 dump_printf (MSG_NOTE, ", ");
3483 dump_dec (MSG_NOTE, lower_bound);
3484 dump_printf (MSG_NOTE, ")\n");
3485 }
3486 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3487 lower_bound);
3488 continue;
3489 }
3490
3491 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3492 if (dr_group_first_a)
3493 {
3494 stmt_a = dr_group_first_a;
3495 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3496 }
3497
3498 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3499 if (dr_group_first_b)
3500 {
3501 stmt_b = dr_group_first_b;
3502 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3503 }
3504
3505 if (ignore_step_p)
3506 {
3507 segment_length_a = size_zero_node;
3508 segment_length_b = size_zero_node;
3509 }
3510 else
3511 {
3512 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3513 length_factor = scalar_loop_iters;
3514 else
3515 length_factor = size_int (vect_factor);
3516 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3517 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3518 }
3519 access_size_a = vect_vfa_access_size (dr_a);
3520 access_size_b = vect_vfa_access_size (dr_b);
3521 align_a = vect_vfa_align (dr_a);
3522 align_b = vect_vfa_align (dr_b);
3523
3524 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3525 DR_BASE_ADDRESS (dr_b));
3526 if (comp_res == 0)
3527 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3528 DR_OFFSET (dr_b));
3529
3530 /* See whether the alias is known at compilation time. */
3531 if (comp_res == 0
3532 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3533 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3534 && poly_int_tree_p (segment_length_a)
3535 && poly_int_tree_p (segment_length_b))
3536 {
3537 int res = vect_compile_time_alias (dr_a, dr_b,
3538 segment_length_a,
3539 segment_length_b,
3540 access_size_a,
3541 access_size_b);
3542 if (res >= 0 && dump_enabled_p ())
3543 {
3544 dump_printf_loc (MSG_NOTE, vect_location,
3545 "can tell at compile time that ");
3546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3547 dump_printf (MSG_NOTE, " and ");
3548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3549 if (res == 0)
3550 dump_printf (MSG_NOTE, " do not alias\n");
3551 else
3552 dump_printf (MSG_NOTE, " alias\n");
3553 }
3554
3555 if (res == 0)
3556 continue;
3557
3558 if (res == 1)
3559 {
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "not vectorized: compilation time alias.\n");
3563 return false;
3564 }
3565 }
3566
3567 dr_with_seg_len_pair_t dr_with_seg_len_pair
3568 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3569 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3570
3571 /* Canonicalize pairs by sorting the two DR members. */
3572 if (comp_res > 0)
3573 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3574
3575 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3576 }
3577
3578 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3579
3580 unsigned int count = (comp_alias_ddrs.length ()
3581 + check_unequal_addrs.length ());
3582
3583 dump_printf_loc (MSG_NOTE, vect_location,
3584 "improved number of alias checks from %d to %d\n",
3585 may_alias_ddrs.length (), count);
3586 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3587 {
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "number of versioning for alias "
3591 "run-time tests exceeds %d "
3592 "(--param vect-max-version-for-alias-checks)\n",
3593 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3594 return false;
3595 }
3596
3597 return true;
3598 }
3599
3600 /* Check whether we can use an internal function for a gather load
3601 or scatter store. READ_P is true for loads and false for stores.
3602 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3603 the type of the memory elements being loaded or stored. OFFSET_BITS
3604 is the number of bits in each scalar offset and OFFSET_SIGN is the
3605 sign of the offset. SCALE is the amount by which the offset should
3606 be multiplied *after* it has been converted to address width.
3607
3608 Return true if the function is supported, storing the function
3609 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3610
3611 bool
vect_gather_scatter_fn_p(bool read_p,bool masked_p,tree vectype,tree memory_type,unsigned int offset_bits,signop offset_sign,int scale,internal_fn * ifn_out,tree * element_type_out)3612 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3613 tree memory_type, unsigned int offset_bits,
3614 signop offset_sign, int scale,
3615 internal_fn *ifn_out, tree *element_type_out)
3616 {
3617 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3618 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3619 if (offset_bits > element_bits)
3620 /* Internal functions require the offset to be the same width as
3621 the vector elements. We can extend narrower offsets, but it isn't
3622 safe to truncate wider offsets. */
3623 return false;
3624
3625 if (element_bits != memory_bits)
3626 /* For now the vector elements must be the same width as the
3627 memory elements. */
3628 return false;
3629
3630 /* Work out which function we need. */
3631 internal_fn ifn;
3632 if (read_p)
3633 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3634 else
3635 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3636
3637 /* Test whether the target supports this combination. */
3638 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3639 offset_sign, scale))
3640 return false;
3641
3642 *ifn_out = ifn;
3643 *element_type_out = TREE_TYPE (vectype);
3644 return true;
3645 }
3646
3647 /* CALL is a call to an internal gather load or scatter store function.
3648 Describe the operation in INFO. */
3649
3650 static void
vect_describe_gather_scatter_call(gcall * call,gather_scatter_info * info)3651 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3652 {
3653 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3654 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3655 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3656
3657 info->ifn = gimple_call_internal_fn (call);
3658 info->decl = NULL_TREE;
3659 info->base = gimple_call_arg (call, 0);
3660 info->offset = gimple_call_arg (call, 1);
3661 info->offset_dt = vect_unknown_def_type;
3662 info->offset_vectype = NULL_TREE;
3663 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3664 info->element_type = TREE_TYPE (vectype);
3665 info->memory_type = TREE_TYPE (DR_REF (dr));
3666 }
3667
/* Return true if a non-affine read or write in STMT is suitable for a
   gather load or scatter store.  Describe the operation in *INFO if so.  */

bool
vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
			   gather_scatter_info *info)
{
  HOST_WIDE_INT scale = 1;
  poly_int64 pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl = NULL_TREE, base, off;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree memory_type = TREE_TYPE (DR_REF (dr));
  machine_mode pmode;
  int punsignedp, reversep, pvolatilep = 0;
  internal_fn ifn;
  tree element_type;
  bool masked_p = false;

  /* See whether this is already a call to a gather/scatter internal function.
     If not, see whether it's a masked load or store.  */
  gcall *call = dyn_cast <gcall *> (stmt);
  if (call && gimple_call_internal_p (call))
    {
      ifn = gimple_call_internal_fn (stmt);
      if (internal_gather_scatter_fn_p (ifn))
	{
	  /* Nothing to analyze: the call arguments already describe
	     the operation.  */
	  vect_describe_gather_scatter_call (call, info);
	  return true;
	}
      masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
    }

  /* True if we should aim to use internal functions rather than
     built-in functions.  */
  bool use_ifn_p = (DR_IS_READ (dr)
		    ? supports_vec_gather_load_p ()
		    : supports_vec_scatter_store_p ());

  base = DR_REF (dr);
  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
     see if we can use the def stmt of the address.  */
  if (masked_p
      && TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
      && integer_zerop (TREE_OPERAND (base, 1))
      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
    {
      gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
      if (is_gimple_assign (def_stmt)
	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
    }

  /* The gather and scatter builtins need address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
  base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
			      &punsignedp, &reversep, &pvolatilep);
  gcc_assert (base && !reversep);
  poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);

  if (TREE_CODE (base) == MEM_REF)
    {
      /* Fold any constant MEM_REF offset into OFF and strip the MEM_REF.  */
      if (!integer_zerop (TREE_OPERAND (base, 1)))
	{
	  if (off == NULL_TREE)
	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
	  else
	    off = size_binop (PLUS_EXPR, off,
			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
	}
      base = TREE_OPERAND (base, 0);
    }
  else
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, either off is 0, then we start with just
     the constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and use that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
    {
      if (!integer_zerop (off))
	return false;
      off = base;
      base = size_int (pbytepos);
    }
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
  else
    {
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
    }

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.  */
  STRIP_NOPS (off);
  while (offtype == NULL_TREE)
    {
      enum tree_code code;
      tree op0, op1, add = NULL_TREE;

      if (TREE_CODE (off) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);

	  /* A loop-invariant OFF has no vector component left to
	     gather/scatter over, so punt.  */
	  if (expr_invariant_in_loop_p (loop, off))
	    return false;

	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
	    break;

	  op0 = gimple_assign_rhs1 (def_stmt);
	  code = gimple_assign_rhs_code (def_stmt);
	  op1 = gimple_assign_rhs2 (def_stmt);
	}
      else
	{
	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
	    return false;
	  code = TREE_CODE (off);
	  extract_ops_from_tree (off, &code, &op0, &op1);
	}
      switch (code)
	{
	case POINTER_PLUS_EXPR:
	case PLUS_EXPR:
	  if (expr_invariant_in_loop_p (loop, op0))
	    {
	      add = op0;
	      off = op1;
	    do_add:
	      /* Fold the invariant addend into BASE.  It must be
		 pre-multiplied by the current SCALE because the
		 remaining OFF will be scaled by it later.  */
	      add = fold_convert (sizetype, add);
	      if (scale != 1)
		add = size_binop (MULT_EXPR, add, size_int (scale));
	      base = size_binop (PLUS_EXPR, base, add);
	      continue;
	    }
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = op1;
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MINUS_EXPR:
	  /* X - INV is handled as X + (-INV) via the PLUS path.  */
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = fold_convert (sizetype, op1);
	      add = size_binop (MINUS_EXPR, size_zero_node, add);
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MULT_EXPR:
	  if (scale == 1 && tree_fits_shwi_p (op1))
	    {
	      int new_scale = tree_to_shwi (op1);
	      /* Only treat this as a scaling operation if the target
		 supports it.  */
	      if (use_ifn_p
		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
						vectype, memory_type, 1,
						TYPE_SIGN (TREE_TYPE (op0)),
						new_scale, &ifn,
						&element_type))
		break;
	      scale = new_scale;
	      off = op0;
	      continue;
	    }
	  break;
	case SSA_NAME:
	  /* A plain copy: continue with the copied value.  */
	  off = op0;
	  continue;
	CASE_CONVERT:
	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
	    break;
	  /* Same-precision conversions can simply be looked through.  */
	  if (TYPE_PRECISION (TREE_TYPE (op0))
	      == TYPE_PRECISION (TREE_TYPE (off)))
	    {
	      off = op0;
	      continue;
	    }

	  /* The internal functions need the offset to be the same width
	     as the elements of VECTYPE.  Don't include operations that
	     cast the offset from that width to a different width.  */
	  if (use_ifn_p
	      && (int_size_in_bytes (TREE_TYPE (vectype))
		  == int_size_in_bytes (TREE_TYPE (off))))
	    break;

	  if (TYPE_PRECISION (TREE_TYPE (op0))
	      < TYPE_PRECISION (TREE_TYPE (off)))
	    {
	      /* Peel a widening conversion: continue with the narrower
		 operand and remember the narrower type as OFFTYPE, since
		 narrower offsets can be extended at gather/scatter time.
		 Recording OFFTYPE also terminates the peeling loop.  */
	      off = op0;
	      offtype = TREE_TYPE (off);
	      STRIP_NOPS (off);
	      continue;
	    }
	  break;
	default:
	  break;
	}
      break;
    }

  /* If at the end OFF still isn't a SSA_NAME or isn't
     defined in the loop, punt.  */
  if (TREE_CODE (off) != SSA_NAME
      || expr_invariant_in_loop_p (loop, off))
    return false;

  if (offtype == NULL_TREE)
    offtype = TREE_TYPE (off);

  if (use_ifn_p)
    {
      if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
				     memory_type, TYPE_PRECISION (offtype),
				     TYPE_SIGN (offtype), scale, &ifn,
				     &element_type))
	return false;
    }
  else
    {
      /* Fall back to a target-specific builtin decl, if one exists.  */
      if (DR_IS_READ (dr))
	{
	  if (targetm.vectorize.builtin_gather)
	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
	}
      else
	{
	  if (targetm.vectorize.builtin_scatter)
	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
	}

      if (!decl)
	return false;

      ifn = IFN_LAST;
      element_type = TREE_TYPE (vectype);
    }

  info->ifn = ifn;
  info->decl = decl;
  info->base = base;
  info->offset = off;
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = scale;
  info->element_type = element_type;
  info->memory_type = memory_type;
  return true;
}
3944
/* Function vect_analyze_data_refs.

  Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.

   Return true on success; on failure return false (for a loop) or, for a
   basic block, stop at the first unanalyzable dataref and mark the rest
   as not vectorizable.  *MIN_VF is raised to the largest vector length
   required by any analyzed dataref.  */

bool
vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
{
  struct loop *loop = NULL;
  unsigned int i;
  struct data_reference *dr;
  tree scalar_type;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_refs ===\n");

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Go through the data-refs, check that the analysis succeeded.  Update
     pointer from stmt_vec_info struct to DR and vectype.  */

  vec<data_reference_p> datarefs = vinfo->datarefs;
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt;
      stmt_vec_info stmt_info;
      tree base, offset, init;
      enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
      bool simd_lane_access = false;
      poly_uint64 vf;

    again:
      /* Jumped back to after DR has been replaced following clobber
	 removal below; re-validate the new DR from scratch.  */
      if (!dr || !DR_REF (dr))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unhandled data-ref\n");
	  return false;
	}

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      /* Discard clobbers from the dataref vector.  We will remove
         clobber stmts during vectorization.  */
      if (gimple_clobber_p (stmt))
	{
	  free_data_ref (dr);
	  if (i == datarefs.length () - 1)
	    {
	      datarefs.pop ();
	      break;
	    }
	  datarefs.ordered_remove (i);
	  dr = datarefs[i];
	  goto again;
	}

      /* Check that analysis of the data-ref succeeded.  If it did not,
	 the access may still be usable as a gather/scatter or as a
	 GOMP SIMD lane access, checked below.  */
      if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
	  || !DR_STEP (dr))
	{
	  bool maybe_gather
	    = DR_IS_READ (dr)
	      && !TREE_THIS_VOLATILE (DR_REF (dr))
	      && (targetm.vectorize.builtin_gather != NULL
		  || supports_vec_gather_load_p ());
	  bool maybe_scatter
	    = DR_IS_WRITE (dr)
	      && !TREE_THIS_VOLATILE (DR_REF (dr))
	      && (targetm.vectorize.builtin_scatter != NULL
		  || supports_vec_scatter_store_p ());
	  bool maybe_simd_lane_access
	    = is_a <loop_vec_info> (vinfo) && loop->simduid;

	  /* If target supports vector gather loads or scatter stores, or if
	     this might be a SIMD lane access, see if they can be used.  */
	  if (is_a <loop_vec_info> (vinfo)
	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
	      && !nested_in_vect_loop_p (loop, stmt))
	    {
	      /* Re-analyze the reference relative to the innermost
		 containing loop only; that analysis may succeed where
		 the original one did not.  */
	      struct data_reference *newdr
		= create_data_ref (NULL, loop_containing_stmt (stmt),
				   DR_REF (dr), stmt, !maybe_scatter,
				   DR_IS_CONDITIONAL_IN_STMT (dr));
	      gcc_assert (newdr != NULL && DR_REF (newdr));
	      if (DR_BASE_ADDRESS (newdr)
		  && DR_OFFSET (newdr)
		  && DR_INIT (newdr)
		  && DR_STEP (newdr)
		  && integer_zerop (DR_STEP (newdr)))
		{
		  if (maybe_simd_lane_access)
		    {
		      /* Recognize offsets of the form
			 GOMP_SIMD_LANE (simduid) * elt_size and turn them
			 into a unit-stride access per SIMD lane.  */
		      tree off = DR_OFFSET (newdr);
		      STRIP_NOPS (off);
		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
			  && TREE_CODE (off) == MULT_EXPR
			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
			{
			  tree step = TREE_OPERAND (off, 1);
			  off = TREE_OPERAND (off, 0);
			  STRIP_NOPS (off);
			  if (CONVERT_EXPR_P (off)
			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
									  0)))
				 < TYPE_PRECISION (TREE_TYPE (off)))
			    off = TREE_OPERAND (off, 0);
			  if (TREE_CODE (off) == SSA_NAME)
			    {
			      gimple *def = SSA_NAME_DEF_STMT (off);
			      tree reft = TREE_TYPE (DR_REF (newdr));
			      if (is_gimple_call (def)
				  && gimple_call_internal_p (def)
				  && (gimple_call_internal_fn (def)
				      == IFN_GOMP_SIMD_LANE))
				{
				  tree arg = gimple_call_arg (def, 0);
				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
				  arg = SSA_NAME_VAR (arg);
				  if (arg == loop->simduid
				      /* For now.  */
				      && tree_int_cst_equal
					   (TYPE_SIZE_UNIT (reft),
					    step))
				    {
				      DR_OFFSET (newdr) = ssize_int (0);
				      DR_STEP (newdr) = step;
				      DR_OFFSET_ALIGNMENT (newdr)
					= BIGGEST_ALIGNMENT;
				      DR_STEP_ALIGNMENT (newdr)
					= highest_pow2_factor (step);
				      dr = newdr;
				      simd_lane_access = true;
				    }
				}
			    }
			}
		    }
		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
		    {
		      dr = newdr;
		      if (maybe_gather)
			gatherscatter = GATHER;
		      else
			gatherscatter = SCATTER;
		    }
		}
	      /* NEWDR was not used; release it.  */
	      if (gatherscatter == SG_NONE && !simd_lane_access)
		free_data_ref (newdr);
	    }

	  if (gatherscatter == SG_NONE && !simd_lane_access)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: data ref analysis "
				   "failed ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* BB vectorization just stops at the first failing ref;
		 the remainder is marked unvectorizable after the loop.  */
	      if (is_a <bb_vec_info> (vinfo))
		break;

	      return false;
	    }
	}

      if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: base addr of dr is a "
			     "constant\n");

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  /* DR was replaced above and is not owned by DATAREFS.  */
	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      if (TREE_THIS_VOLATILE (DR_REF (dr)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: volatile type ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  return false;
	}

      if (stmt_can_throw_internal (stmt))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: statement can throw an "
			       "exception ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: statement is bitfield "
			       "access ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      base = unshare_expr (DR_BASE_ADDRESS (dr));
      offset = unshare_expr (DR_OFFSET (dr));
      init = unshare_expr (DR_INIT (dr));

      /* Only IFN_MASK_LOAD/IFN_MASK_STORE calls may carry a dataref.  */
      if (is_gimple_call (stmt)
	  && (!gimple_call_internal_p (stmt)
	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
			       "not vectorized: dr in a call ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      /* Update DR field in stmt_vec_info struct.  */

      /* If the dataref is in an inner-loop of the loop that is considered
	 for vectorization, we also want to analyze the access relative to
	 the outer-loop (DR contains information only relative to the
	 inner-most enclosing loop).  We do that by building a reference to the
	 first location accessed by the inner-loop, and analyze it relative to
	 the outer-loop.  */
      if (loop && nested_in_vect_loop_p (loop, stmt))
	{
	  /* Build a reference to the first location accessed by the
	     inner loop: *(BASE + INIT + OFFSET).  By construction,
	     this address must be invariant in the inner loop, so we
	     can consider it as being used in the outer loop.  */
	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
					  init, offset);
	  tree init_addr = fold_build_pointer_plus (base, init_offset);
	  tree init_ref = build_fold_indirect_ref (init_addr);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "analyze in outer loop: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
				     init_ref, loop))
	    /* dr_analyze_innermost already explained the failure.  */
	    return false;

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "\touter base_address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_OFFSET (stmt_info));
	      dump_printf (MSG_NOTE,
			   "\n\touter constant offset from base address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_INIT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter step: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_STEP (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
			   STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
			   STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
			   STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
			   STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
	    }
	}

      if (STMT_VINFO_DATA_REF (stmt_info))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: more than one data ref "
			       "in stmt: ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      STMT_VINFO_DATA_REF (stmt_info) = dr;
      if (simd_lane_access)
	{
	  /* Replace the original dataref with the SIMD-lane variant.  */
	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
	  free_data_ref (datarefs[i]);
	  datarefs[i] = dr;
	}

      if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
	  && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
	  && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: base object not addressable "
			       "for stmt: ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  if (is_a <bb_vec_info> (vinfo))
	    {
	      /* In BB vectorization the ref can still participate
	         in dependence analysis, we just can't vectorize it.  */
	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	      continue;
	    }
	  return false;
	}

      /* Set vectype for STMT.  */
      scalar_type = TREE_TYPE (DR_REF (dr));
      STMT_VINFO_VECTYPE (stmt_info)
	= get_vectype_for_scalar_type (scalar_type);
      if (!STMT_VINFO_VECTYPE (stmt_info))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: no vectype for stmt: ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	      dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
				 scalar_type);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    {
	      /* No vector type is fine, the ref can still participate
	         in dependence analysis, we just can't vectorize it.  */
	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	      continue;
	    }

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    {
	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
	      if (gatherscatter != SG_NONE)
		free_data_ref (dr);
	    }
	  return false;
	}
      else
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "got vectype for stmt: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_VECTYPE (stmt_info));
	      dump_printf (MSG_NOTE, "\n");
	    }
	}

      /* Adjust the minimal vectorization factor according to the
	 vector type.  */
      vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
      *min_vf = upper_bound (*min_vf, vf);

      if (gatherscatter != SG_NONE)
	{
	  /* Confirm the access really matches a supported gather/scatter
	     form, then install the re-analyzed dataref.  */
	  gather_scatter_info gs_info;
	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
					  &gs_info)
	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
	    {
	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
	      free_data_ref (dr);
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   (gatherscatter == GATHER) ?
				   "not vectorized: not suitable for gather "
				   "load " :
				   "not vectorized: not suitable for scatter "
				   "store ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      return false;
	    }

	  free_data_ref (datarefs[i]);
	  datarefs[i] = dr;
	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
	}

      else if (is_a <loop_vec_info> (vinfo)
	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
	{
	  /* A variable step is only supported in the outermost loop.  */
	  if (nested_in_vect_loop_p (loop, stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: not suitable for strided "
				   "load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      return false;
	    }
	  STMT_VINFO_STRIDED_P (stmt_info) = true;
	}
    }

  /* If we stopped analysis at the first dataref we could not analyze
     when trying to vectorize a basic-block mark the rest of the datarefs
     as not vectorizable and truncate the vector of datarefs.  That
     avoids spending useless time in analyzing their dependence.  */
  if (i != datarefs.length ())
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      for (unsigned j = i; j < datarefs.length (); ++j)
	{
	  data_reference_p dr = datarefs[j];
	  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	  free_data_ref (dr);
	}
      datarefs.truncate (i);
    }

  return true;
}
4437
4438
4439 /* Function vect_get_new_vect_var.
4440
4441 Returns a name for a new variable. The current naming scheme appends the
4442 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4443 the name of vectorizer generated variables, and appends that to NAME if
4444 provided. */
4445
4446 tree
vect_get_new_vect_var(tree type,enum vect_var_kind var_kind,const char * name)4447 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4448 {
4449 const char *prefix;
4450 tree new_vect_var;
4451
4452 switch (var_kind)
4453 {
4454 case vect_simple_var:
4455 prefix = "vect";
4456 break;
4457 case vect_scalar_var:
4458 prefix = "stmp";
4459 break;
4460 case vect_mask_var:
4461 prefix = "mask";
4462 break;
4463 case vect_pointer_var:
4464 prefix = "vectp";
4465 break;
4466 default:
4467 gcc_unreachable ();
4468 }
4469
4470 if (name)
4471 {
4472 char* tmp = concat (prefix, "_", name, NULL);
4473 new_vect_var = create_tmp_reg (type, tmp);
4474 free (tmp);
4475 }
4476 else
4477 new_vect_var = create_tmp_reg (type, prefix);
4478
4479 return new_vect_var;
4480 }
4481
4482 /* Like vect_get_new_vect_var but return an SSA name. */
4483
4484 tree
vect_get_new_ssa_name(tree type,enum vect_var_kind var_kind,const char * name)4485 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4486 {
4487 const char *prefix;
4488 tree new_vect_var;
4489
4490 switch (var_kind)
4491 {
4492 case vect_simple_var:
4493 prefix = "vect";
4494 break;
4495 case vect_scalar_var:
4496 prefix = "stmp";
4497 break;
4498 case vect_pointer_var:
4499 prefix = "vectp";
4500 break;
4501 default:
4502 gcc_unreachable ();
4503 }
4504
4505 if (name)
4506 {
4507 char* tmp = concat (prefix, "_", name, NULL);
4508 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4509 free (tmp);
4510 }
4511 else
4512 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4513
4514 return new_vect_var;
4515 }
4516
4517 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4518
4519 static void
vect_duplicate_ssa_name_ptr_info(tree name,data_reference * dr)4520 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4521 {
4522 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4523 int misalign = DR_MISALIGNMENT (dr);
4524 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4525 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4526 else
4527 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4528 DR_TARGET_ALIGNMENT (dr), misalign);
4529 }
4530
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is to be added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
            For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i=0; i<N; i++)
		   for (j=0; j<M; j++)
		     s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is number of elements to
	    be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

tree
vect_create_addr_base_for_vector_ref (gimple *stmt,
				      gimple_seq *new_stmt_list,
				      tree offset,
				      tree byte_offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  const char *base_name;
  tree addr_base;
  tree dest;
  gimple_seq seq = NULL;
  tree vect_ptr_type;
  /* Size of one scalar element in bytes, used to scale OFFSET.  */
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  innermost_loop_behavior *drb = vect_dr_behavior (dr);

  tree data_ref_base = unshare_expr (drb->base_address);
  tree base_offset = unshare_expr (drb->offset);
  tree init = unshare_expr (drb->init);

  if (loop_vinfo)
    base_name = get_name (data_ref_base);
  else
    {
      /* For BB vectorization the address is simply &DR_REF (built
	 below), so the dataref's offset components are zeroed out.  */
      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));
    }

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
			    fold_convert (sizetype, base_offset),
			    fold_convert (sizetype, init));

  /* OFFSET is a number of elements; scale it by the element size.  */
  if (offset)
    {
      offset = fold_build2 (MULT_EXPR, sizetype,
			    fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, offset);
    }
  /* BYTE_OFFSET is already in bytes; add it directly.  */
  if (byte_offset)
    {
      byte_offset = fold_convert (sizetype, byte_offset);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, byte_offset);
    }

  /* base + base_offset */
  if (loop_vinfo)
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
  else
    {
      addr_base = build1 (ADDR_EXPR,
			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
			  unshare_expr (DR_REF (dr)));
    }

  /* Gimplify the address into DEST, accumulating any needed statements
     into SEQ and appending them to NEW_STMT_LIST for the caller.  */
  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      && !SSA_NAME_PTR_INFO (addr_base))
    {
      vect_duplicate_ssa_name_ptr_info (addr_base, dr);
      /* The extra offsets invalidate the dataref's alignment info.  */
      if (offset || byte_offset)
	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "created ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
      dump_printf (MSG_NOTE, "\n");
    }

  return addr_base;
}
4647
4648
4649 /* Function vect_create_data_ref_ptr.
4650
4651 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4652 location accessed in the loop by STMT, along with the def-use update
4653 chain to appropriately advance the pointer through the loop iterations.
4654 Also set aliasing information for the pointer. This pointer is used by
4655 the callers to this function to create a memory reference expression for
4656 vector load/store access.
4657
4658 Input:
4659 1. STMT: a stmt that references memory. Expected to be of the form
4660 GIMPLE_ASSIGN <name, data-ref> or
4661 GIMPLE_ASSIGN <data-ref, name>.
4662 2. AGGR_TYPE: the type of the reference, which should be either a vector
4663 or an array.
4664 3. AT_LOOP: the loop where the vector memref is to be created.
4665 4. OFFSET (optional): an offset to be added to the initial address accessed
4666 by the data-ref in STMT.
4667 5. BSI: location where the new stmts are to be placed if there is no loop
4668 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4669 pointing to the initial address.
4670 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4671 to the initial address accessed by the data-ref in STMT. This is
4672 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4673 in bytes.
4674 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4675 to the IV during each iteration of the loop. NULL says to move
4676 by one copy of AGGR_TYPE up or down, depending on the step of the
4677 data reference.
4678
4679 Output:
4680 1. Declare a new ptr to vector_type, and have it point to the base of the
4681 data reference (initial address accessed by the data reference).
4682 For example, for vector of type V8HI, the following code is generated:
4683
4684 v8hi *ap;
4685 ap = (v8hi *)initial_address;
4686
4687 if OFFSET is not supplied:
4688 initial_address = &a[init];
4689 if OFFSET is supplied:
4690 initial_address = &a[init + OFFSET];
4691 if BYTE_OFFSET is supplied:
4692 initial_address = &a[init] + BYTE_OFFSET;
4693
4694 Return the initial_address in INITIAL_ADDRESS.
4695
4696 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4697 update the pointer in each iteration of the loop.
4698
4699 Return the increment stmt that updates the pointer in PTR_INCR.
4700
4701 3. Set INV_P to true if the access pattern of the data reference in the
4702 vectorized loop is invariant. Set it to false otherwise.
4703
4704 4. Return the pointer. */
4705
tree
vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
			  tree offset, tree *initial_address,
			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
			  bool only_init, bool *inv_p, tree byte_offset,
			  tree iv_step)
{
  const char *base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = NULL;
  tree aggr_ptr_type;
  tree aggr_ptr;
  tree new_temp;
  gimple_seq new_stmt_list = NULL;
  edge pe = NULL;
  basic_block new_bb;
  tree aggr_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree aptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple *incr;
  tree step;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);

  /* An explicit IV_STEP is required unless AGGR_TYPE is an array or a
     vector, in which case the default step (one copy of AGGR_TYPE)
     computed below is meaningful.  */
  gcc_assert (iv_step != NULL_TREE
	      || TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
      containing_loop = (gimple_bb (stmt))->loop_father;
      pe = loop_preheader_edge (loop);
    }
  else
    {
      /* Basic-block vectorization: there is no loop, so the pointer is
	 never advanced.  */
      gcc_assert (bb_vinfo);
      only_init = true;
      *ptr_incr = NULL;
    }

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  step = vect_dr_behavior (dr)->step;
  if (integer_zerop (step))
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
    {
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
		       "create %s-pointer variable to type: ",
		       get_tree_code_name (TREE_CODE (aggr_type)));
      dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
      else
	dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
      dump_printf (MSG_NOTE, "\n");
    }

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
			      get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
    {
      /* Walk the whole interleaving group; one non-conflicting member
	 is enough to force a ref-all pointer.  */
      gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
      do
	{
	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
				      get_alias_set (DR_REF (sdr))))
	    {
	      need_ref_all = true;
	      break;
	    }
	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
	}
      while (orig_stmt);
    }
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
					       need_ref_all);
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);


  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done be step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
		...
		vp2 = vp1 + step
		goto LOOP

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

	vp0 = &base_addr;
	LOOP:   vp1 = phi(vp0,vp2)
		...
        inner:     vp3 = phi(vp1,vp4)
	           vp4 = vp3 + inner_step
	           if () goto inner
		...
		vp2 = vp1 + step
		if () goto LOOP   */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
						   offset, byte_offset);
  if (new_stmt_list)
    {
      if (pe)
	{
	  /* Loop case: the address computation belongs on the preheader
	     edge, and inserting it there must not split the edge.  */
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
    }

  *initial_address = new_temp;
  aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      if (iv_step == NULL_TREE)
	{
	  /* The step of the aggregate pointer is the type size.  */
	  iv_step = TYPE_SIZE_UNIT (aggr_type);
	  /* One exception to the above is when the scalar step of the load in
	     LOOP is zero. In this case the step here is also zero.  */
	  if (*inv_p)
	    iv_step = size_zero_node;
	  else if (tree_int_cst_sgn (step) == -1)
	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
	}

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
		 fold_convert (aggr_ptr_type, iv_step),
		 aggr_ptr, loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      aptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return aptr;


  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      /* The inner-loop IV steps by the scalar DR step, chained off the
	 outer-loop pointer APTR computed above.  */
      standard_iv_increment_position (containing_loop, &incr_gsi,
				      &insert_after);
      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
		 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
4942
4943
4944 /* Function bump_vector_ptr
4945
4946 Increment a pointer (to a vector type) by vector-size. If requested,
4947 i.e. if PTR-INCR is given, then also connect the new increment stmt
4948 to the existing def-use update-chain of the pointer, by modifying
4949 the PTR_INCR as illustrated below:
4950
4951 The pointer def-use update-chain before this function:
4952 DATAREF_PTR = phi (p_0, p_2)
4953 ....
4954 PTR_INCR: p_2 = DATAREF_PTR + step
4955
4956 The pointer def-use update-chain after this function:
4957 DATAREF_PTR = phi (p_0, p_2)
4958 ....
4959 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4960 ....
4961 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4962
4963 Input:
4964 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4965 in the loop.
4966 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4967 the loop. The increment amount across iterations is expected
4968 to be vector_size.
4969 BSI - location where the new update stmt is to be placed.
4970 STMT - the original scalar memory-access stmt that is being vectorized.
4971 BUMP - optional. The offset by which to bump the pointer. If not given,
4972 the offset is assumed to be vector_size.
4973
4974 Output: Return NEW_DATAREF_PTR as illustrated above.
4975
4976 */
4977
tree
bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
		 gimple *stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  /* Default bump is one vector's worth of bytes.  */
  tree update = TYPE_SIZE_UNIT (vectype);
  gassign *incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  /* NEW_DATAREF_PTR = DATAREF_PTR + UPDATE.  */
  if (TREE_CODE (dataref_ptr) == SSA_NAME)
    new_dataref_ptr = copy_ssa_name (dataref_ptr);
  else
    new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
				   dataref_ptr, update);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists. */
  if (DR_PTR_INFO (dr))
    {
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      /* The bump may not be a multiple of the known alignment.  */
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
    }

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment: PTR_INCR is
     expected to be "p_2 = DATAREF_PTR + step"; redirect its use of
     DATAREF_PTR to the freshly bumped pointer.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
	SET_USE (use_p, new_dataref_ptr);
      else
	/* The only other operand of PTR_INCR must be the step.  */
	gcc_assert (operand_equal_p (use, update, 0));
    }

  return new_dataref_ptr;
}
5025
5026
5027 /* Copy memory reference info such as base/clique from the SRC reference
5028 to the DEST MEM_REF. */
5029
5030 void
vect_copy_ref_info(tree dest,tree src)5031 vect_copy_ref_info (tree dest, tree src)
5032 {
5033 if (TREE_CODE (dest) != MEM_REF)
5034 return;
5035
5036 tree src_base = src;
5037 while (handled_component_p (src_base))
5038 src_base = TREE_OPERAND (src_base, 0);
5039 if (TREE_CODE (src_base) != MEM_REF
5040 && TREE_CODE (src_base) != TARGET_MEM_REF)
5041 return;
5042
5043 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5044 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5045 }
5046
5047
5048 /* Function vect_create_destination_var.
5049
5050 Create a new temporary of type VECTYPE. */
5051
5052 tree
vect_create_destination_var(tree scalar_dest,tree vectype)5053 vect_create_destination_var (tree scalar_dest, tree vectype)
5054 {
5055 tree vec_dest;
5056 const char *name;
5057 char *new_name;
5058 tree type;
5059 enum vect_var_kind kind;
5060
5061 kind = vectype
5062 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5063 ? vect_mask_var
5064 : vect_simple_var
5065 : vect_scalar_var;
5066 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5067
5068 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5069
5070 name = get_name (scalar_dest);
5071 if (name)
5072 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5073 else
5074 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5075 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5076 free (new_name);
5077
5078 return vec_dest;
5079 }
5080
5081 /* Function vect_grouped_store_supported.
5082
5083 Returns TRUE if interleave high and interleave low permutations
5084 are supported, and FALSE otherwise. */
5085
5086 bool
vect_grouped_store_supported(tree vectype,unsigned HOST_WIDE_INT count)5087 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5088 {
5089 machine_mode mode = TYPE_MODE (vectype);
5090
5091 /* vect_permute_store_chain requires the group size to be equal to 3 or
5092 be a power of two. */
5093 if (count != 3 && exact_log2 (count) == -1)
5094 {
5095 if (dump_enabled_p ())
5096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097 "the size of the group of accesses"
5098 " is not a power of 2 or not eqaul to 3\n");
5099 return false;
5100 }
5101
5102 /* Check that the permutation is supported. */
5103 if (VECTOR_MODE_P (mode))
5104 {
5105 unsigned int i;
5106 if (count == 3)
5107 {
5108 unsigned int j0 = 0, j1 = 0, j2 = 0;
5109 unsigned int i, j;
5110
5111 unsigned int nelt;
5112 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5113 {
5114 if (dump_enabled_p ())
5115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5116 "cannot handle groups of 3 stores for"
5117 " variable-length vectors\n");
5118 return false;
5119 }
5120
5121 vec_perm_builder sel (nelt, nelt, 1);
5122 sel.quick_grow (nelt);
5123 vec_perm_indices indices;
5124 for (j = 0; j < 3; j++)
5125 {
5126 int nelt0 = ((3 - j) * nelt) % 3;
5127 int nelt1 = ((3 - j) * nelt + 1) % 3;
5128 int nelt2 = ((3 - j) * nelt + 2) % 3;
5129 for (i = 0; i < nelt; i++)
5130 {
5131 if (3 * i + nelt0 < nelt)
5132 sel[3 * i + nelt0] = j0++;
5133 if (3 * i + nelt1 < nelt)
5134 sel[3 * i + nelt1] = nelt + j1++;
5135 if (3 * i + nelt2 < nelt)
5136 sel[3 * i + nelt2] = 0;
5137 }
5138 indices.new_vector (sel, 2, nelt);
5139 if (!can_vec_perm_const_p (mode, indices))
5140 {
5141 if (dump_enabled_p ())
5142 dump_printf (MSG_MISSED_OPTIMIZATION,
5143 "permutation op not supported by target.\n");
5144 return false;
5145 }
5146
5147 for (i = 0; i < nelt; i++)
5148 {
5149 if (3 * i + nelt0 < nelt)
5150 sel[3 * i + nelt0] = 3 * i + nelt0;
5151 if (3 * i + nelt1 < nelt)
5152 sel[3 * i + nelt1] = 3 * i + nelt1;
5153 if (3 * i + nelt2 < nelt)
5154 sel[3 * i + nelt2] = nelt + j2++;
5155 }
5156 indices.new_vector (sel, 2, nelt);
5157 if (!can_vec_perm_const_p (mode, indices))
5158 {
5159 if (dump_enabled_p ())
5160 dump_printf (MSG_MISSED_OPTIMIZATION,
5161 "permutation op not supported by target.\n");
5162 return false;
5163 }
5164 }
5165 return true;
5166 }
5167 else
5168 {
5169 /* If length is not equal to 3 then only power of 2 is supported. */
5170 gcc_assert (pow2p_hwi (count));
5171 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5172
5173 /* The encoding has 2 interleaved stepped patterns. */
5174 vec_perm_builder sel (nelt, 2, 3);
5175 sel.quick_grow (6);
5176 for (i = 0; i < 3; i++)
5177 {
5178 sel[i * 2] = i;
5179 sel[i * 2 + 1] = i + nelt;
5180 }
5181 vec_perm_indices indices (sel, 2, nelt);
5182 if (can_vec_perm_const_p (mode, indices))
5183 {
5184 for (i = 0; i < 6; i++)
5185 sel[i] += exact_div (nelt, 2);
5186 indices.new_vector (sel, 2, nelt);
5187 if (can_vec_perm_const_p (mode, indices))
5188 return true;
5189 }
5190 }
5191 }
5192
5193 if (dump_enabled_p ())
5194 dump_printf (MSG_MISSED_OPTIMIZATION,
5195 "permutaion op not supported by target.\n");
5196 return false;
5197 }
5198
5199
5200 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5201 type VECTYPE. MASKED_P says whether the masked form is needed. */
5202
5203 bool
vect_store_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5204 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5205 bool masked_p)
5206 {
5207 if (masked_p)
5208 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5209 vec_mask_store_lanes_optab,
5210 vectype, count);
5211 else
5212 return vect_lanes_optab_supported_p ("vec_store_lanes",
5213 vec_store_lanes_optab,
5214 vectype, count);
5215 }
5216
5217
5218 /* Function vect_permute_store_chain.
5219
5220 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5221 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5222 the data correctly for the stores. Return the final references for stores
5223 in RESULT_CHAIN.
5224
5225 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5226 The input is 4 vectors each containing 8 elements. We assign a number to
5227 each element, the input sequence is:
5228
5229 1st vec: 0 1 2 3 4 5 6 7
5230 2nd vec: 8 9 10 11 12 13 14 15
5231 3rd vec: 16 17 18 19 20 21 22 23
5232 4th vec: 24 25 26 27 28 29 30 31
5233
5234 The output sequence should be:
5235
5236 1st vec: 0 8 16 24 1 9 17 25
5237 2nd vec: 2 10 18 26 3 11 19 27
5238 3rd vec: 4 12 20 28 5 13 21 29
5239 4th vec: 6 14 22 30 7 15 23 31
5240
5241 i.e., we interleave the contents of the four vectors in their order.
5242
5243 We use interleave_high/low instructions to create such output. The input of
5244 each interleave_high/low operation is two vectors:
5245 1st vec 2nd vec
5246 0 1 2 3 4 5 6 7
5247 the even elements of the result vector are obtained left-to-right from the
5248 high/low elements of the first vector. The odd elements of the result are
5249 obtained left-to-right from the high/low elements of the second vector.
5250 The output of interleave_high will be: 0 4 1 5
5251 and of interleave_low: 2 6 3 7
5252
5253
5254 The permutation is done in log LENGTH stages. In each stage interleave_high
5255 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5256 where the first argument is taken from the first half of DR_CHAIN and the
5257 second argument from its second half.
5258 In our example,
5259
5260 I1: interleave_high (1st vec, 3rd vec)
5261 I2: interleave_low (1st vec, 3rd vec)
5262 I3: interleave_high (2nd vec, 4th vec)
5263 I4: interleave_low (2nd vec, 4th vec)
5264
5265 The output for the first stage is:
5266
5267 I1: 0 16 1 17 2 18 3 19
5268 I2: 4 20 5 21 6 22 7 23
5269 I3: 8 24 9 25 10 26 11 27
5270 I4: 12 28 13 29 14 30 15 31
5271
5272 The output of the second stage, i.e. the final result is:
5273
5274 I1: 0 8 16 24 1 9 17 25
5275 I2: 2 10 18 26 3 11 19 27
5276 I3: 4 12 20 28 5 13 21 29
5277 I4: 6 14 22 30 7 15 23 31. */
5278
void
vect_permute_store_chain (vec<tree> dr_chain,
			  unsigned int length,
			  gimple *stmt,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, j, n, log_length = exact_log2 (length);

  /* Start RESULT_CHAIN as a copy of DR_CHAIN; entries are overwritten
     below as permute statements are generated.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_store_supported ensures that this is constant.  */
      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      /* Running output positions for each input vector, carried across
	 the three iterations below.  */
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (j = 0; j < 3; j++)
	{
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		/* Don't-care slot; overwritten by the second permute.  */
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

      /* The low mask selects the second halves of both inputs.  */
      for (i = 0; i < 6; i++)
	sel[i] += exact_div (nelt, 2);
      indices.new_vector (sel, 2, nelt);
      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

      /* log2(LENGTH) interleaving stages; each stage pairs the first
	 half of the chain with the second half.  */
      for (i = 0, n = log_length; i < n; i++)
	{
	  for (j = 0; j < length/2; j++)
	    {
	      vect1 = dr_chain[j];
	      vect2 = dr_chain[j+length/2];

	      /* Create interleaving stmt:
		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
						      ...}>  */
	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_high);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j] = high;

	      /* Create interleaving stmt:
		 low = VEC_PERM_EXPR <vect1, vect2,
				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
				       ...}>  */
	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_low);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j+1] = low;
	    }
	  /* Feed this stage's output into the next stage.  */
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
5415
5416 /* Function vect_setup_realignment
5417
5418 This function is called when vectorizing an unaligned load using
5419 the dr_explicit_realign[_optimized] scheme.
5420 This function generates the following code at the loop prolog:
5421
5422 p = initial_addr;
5423 x msq_init = *(floor(p)); # prolog load
5424 realignment_token = call target_builtin;
5425 loop:
5426 x msq = phi (msq_init, ---)
5427
5428 The stmts marked with x are generated only for the case of
5429 dr_explicit_realign_optimized.
5430
5431 The code above sets up a new (vector) pointer, pointing to the first
5432 location accessed by STMT, and a "floor-aligned" load using that pointer.
5433 It also generates code to compute the "realignment-token" (if the relevant
5434 target hook was defined), and creates a phi-node at the loop-header bb
5435 whose arguments are the result of the prolog-load (created by this
5436 function) and the result of a load that takes place in the loop (to be
5437 created by the caller to this function).
5438
5439 For the case of dr_explicit_realign_optimized:
5440 The caller to this function uses the phi-result (msq) to create the
5441 realignment code inside the loop, and sets up the missing phi argument,
5442 as follows:
5443 loop:
5444 msq = phi (msq_init, lsq)
5445 lsq = *(floor(p')); # load in loop
5446 result = realign_load (msq, lsq, realignment_token);
5447
5448 For the case of dr_explicit_realign:
5449 loop:
5450 msq = *(floor(p)); # load in loop
5451 p' = p + (VS-1);
5452 lsq = *(floor(p')); # load in loop
5453 result = realign_load (msq, lsq, realignment_token);
5454
5455 Input:
5456 STMT - (scalar) load stmt to be vectorized. This load accesses
5457 a memory location that may be unaligned.
5458 BSI - place where new code is to be inserted.
5459 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5460 is used.
5461
5462 Output:
5463 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5464 target hook, if defined.
5465 Return value - the result of the loop-header phi node. */
5466
5467 tree
vect_setup_realignment(gimple * stmt,gimple_stmt_iterator * gsi,tree * realignment_token,enum dr_alignment_support alignment_support_scheme,tree init_addr,struct loop ** at_loop)5468 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5469 tree *realignment_token,
5470 enum dr_alignment_support alignment_support_scheme,
5471 tree init_addr,
5472 struct loop **at_loop)
5473 {
5474 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5475 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5476 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5477 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5478 struct loop *loop = NULL;
5479 edge pe = NULL;
5480 tree scalar_dest = gimple_assign_lhs (stmt);
5481 tree vec_dest;
5482 gimple *inc;
5483 tree ptr;
5484 tree data_ref;
5485 basic_block new_bb;
5486 tree msq_init = NULL_TREE;
5487 tree new_temp;
5488 gphi *phi_stmt;
5489 tree msq = NULL_TREE;
5490 gimple_seq stmts = NULL;
5491 bool inv_p;
5492 bool compute_in_loop = false;
5493 bool nested_in_vect_loop = false;
5494 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5495 struct loop *loop_for_initial_load = NULL;
5496
5497 if (loop_vinfo)
5498 {
5499 loop = LOOP_VINFO_LOOP (loop_vinfo);
5500 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5501 }
5502
5503 gcc_assert (alignment_support_scheme == dr_explicit_realign
5504 || alignment_support_scheme == dr_explicit_realign_optimized);
5505
5506 /* We need to generate three things:
5507 1. the misalignment computation
5508 2. the extra vector load (for the optimized realignment scheme).
5509 3. the phi node for the two vectors from which the realignment is
5510 done (for the optimized realignment scheme). */
5511
5512 /* 1. Determine where to generate the misalignment computation.
5513
5514 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5515 calculation will be generated by this function, outside the loop (in the
5516 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5517 caller, inside the loop.
5518
5519 Background: If the misalignment remains fixed throughout the iterations of
5520 the loop, then both realignment schemes are applicable, and also the
5521 misalignment computation can be done outside LOOP. This is because we are
5522 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5523 are a multiple of VS (the Vector Size), and therefore the misalignment in
5524 different vectorized LOOP iterations is always the same.
5525 The problem arises only if the memory access is in an inner-loop nested
5526 inside LOOP, which is now being vectorized using outer-loop vectorization.
5527 This is the only case when the misalignment of the memory access may not
5528 remain fixed throughout the iterations of the inner-loop (as explained in
5529 detail in vect_supportable_dr_alignment). In this case, not only is the
5530 optimized realignment scheme not applicable, but also the misalignment
5531 computation (and generation of the realignment token that is passed to
5532 REALIGN_LOAD) have to be done inside the loop.
5533
5534 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5535 or not, which in turn determines if the misalignment is computed inside
5536 the inner-loop, or outside LOOP. */
5537
5538 if (init_addr != NULL_TREE || !loop_vinfo)
5539 {
5540 compute_in_loop = true;
5541 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5542 }
5543
5544
5545 /* 2. Determine where to generate the extra vector load.
5546
5547 For the optimized realignment scheme, instead of generating two vector
5548 loads in each iteration, we generate a single extra vector load in the
5549 preheader of the loop, and in each iteration reuse the result of the
5550 vector load from the previous iteration. In case the memory access is in
5551 an inner-loop nested inside LOOP, which is now being vectorized using
5552 outer-loop vectorization, we need to determine whether this initial vector
5553 load should be generated at the preheader of the inner-loop, or can be
5554 generated at the preheader of LOOP. If the memory access has no evolution
5555 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5556 to be generated inside LOOP (in the preheader of the inner-loop). */
5557
5558 if (nested_in_vect_loop)
5559 {
5560 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5561 bool invariant_in_outerloop =
5562 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5563 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5564 }
5565 else
5566 loop_for_initial_load = loop;
5567 if (at_loop)
5568 *at_loop = loop_for_initial_load;
5569
5570 if (loop_for_initial_load)
5571 pe = loop_preheader_edge (loop_for_initial_load);
5572
5573 /* 3. For the case of the optimized realignment, create the first vector
5574 load at the loop preheader. */
5575
5576 if (alignment_support_scheme == dr_explicit_realign_optimized)
5577 {
5578 /* Create msq_init = *(floor(p1)) in the loop preheader */
5579 gassign *new_stmt;
5580
5581 gcc_assert (!compute_in_loop);
5582 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5583 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5584 NULL_TREE, &init_addr, NULL, &inc,
5585 true, &inv_p);
5586 if (TREE_CODE (ptr) == SSA_NAME)
5587 new_temp = copy_ssa_name (ptr);
5588 else
5589 new_temp = make_ssa_name (TREE_TYPE (ptr));
5590 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5591 new_stmt = gimple_build_assign
5592 (new_temp, BIT_AND_EXPR, ptr,
5593 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5594 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5595 gcc_assert (!new_bb);
5596 data_ref
5597 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5598 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5599 vect_copy_ref_info (data_ref, DR_REF (dr));
5600 new_stmt = gimple_build_assign (vec_dest, data_ref);
5601 new_temp = make_ssa_name (vec_dest, new_stmt);
5602 gimple_assign_set_lhs (new_stmt, new_temp);
5603 if (pe)
5604 {
5605 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5606 gcc_assert (!new_bb);
5607 }
5608 else
5609 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5610
5611 msq_init = gimple_assign_lhs (new_stmt);
5612 }
5613
5614 /* 4. Create realignment token using a target builtin, if available.
5615 It is done either inside the containing loop, or before LOOP (as
5616 determined above). */
5617
5618 if (targetm.vectorize.builtin_mask_for_load)
5619 {
5620 gcall *new_stmt;
5621 tree builtin_decl;
5622
5623 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5624 if (!init_addr)
5625 {
5626 /* Generate the INIT_ADDR computation outside LOOP. */
5627 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5628 NULL_TREE);
5629 if (loop)
5630 {
5631 pe = loop_preheader_edge (loop);
5632 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5633 gcc_assert (!new_bb);
5634 }
5635 else
5636 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5637 }
5638
5639 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5640 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5641 vec_dest =
5642 vect_create_destination_var (scalar_dest,
5643 gimple_call_return_type (new_stmt));
5644 new_temp = make_ssa_name (vec_dest, new_stmt);
5645 gimple_call_set_lhs (new_stmt, new_temp);
5646
5647 if (compute_in_loop)
5648 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5649 else
5650 {
5651 /* Generate the misalignment computation outside LOOP. */
5652 pe = loop_preheader_edge (loop);
5653 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5654 gcc_assert (!new_bb);
5655 }
5656
5657 *realignment_token = gimple_call_lhs (new_stmt);
5658
5659 /* The result of the CALL_EXPR to this builtin is determined from
5660 the value of the parameter and no global variables are touched
5661 which makes the builtin a "const" function. Requiring the
5662 builtin to have the "const" attribute makes it unnecessary
5663 to call mark_call_clobbered. */
5664 gcc_assert (TREE_READONLY (builtin_decl));
5665 }
5666
5667 if (alignment_support_scheme == dr_explicit_realign)
5668 return msq;
5669
5670 gcc_assert (!compute_in_loop);
5671 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5672
5673
5674 /* 5. Create msq = phi <msq_init, lsq> in loop */
5675
5676 pe = loop_preheader_edge (containing_loop);
5677 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5678 msq = make_ssa_name (vec_dest);
5679 phi_stmt = create_phi_node (msq, containing_loop->header);
5680 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5681
5682 return msq;
5683 }
5684
5685
5686 /* Function vect_grouped_load_supported.
5687
5688 COUNT is the size of the load group (the number of statements plus the
5689 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5690 only one statement, with a gap of COUNT - 1.
5691
5692 Returns true if a suitable permute exists. */
5693
bool
vect_grouped_load_supported (tree vectype, bool single_element_p,
			     unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory,
     see PR65518).  */
  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "single-element interleaving not supported "
			 "for not adjacent vector loads\n");
      return false;
    }

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j;
      if (count == 3)
	{
	  /* Groups of 3 need the element count at compile time because
	     the selector below is built element by element.  */
	  unsigned int nelt;
	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "cannot handle groups of 3 loads for"
				 " variable-length vectors\n");
	      return false;
	    }

	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  unsigned int k;
	  for (k = 0; k < 3; k++)
	    {
	      /* First permute: pick elements k, 3 + k, 6 + k, ... that
		 fall within the first two input vectors; leave the rest
		 as a placeholder (index 0).  */
	      for (i = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = 3 * i + k;
		else
		  sel[i] = 0;
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	      /* Second permute: keep the elements already gathered and
		 fill the placeholder slots from the third input vector.  */
	      for (i = 0, j = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = i;
		else
		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	    }
	  return true;
	}
      else
	{
	  /* If length is not equal to 3 then only power of 2 is supported.  */
	  gcc_assert (pow2p_hwi (count));
	  poly_uint64 nelt = GET_MODE_NUNITS (mode);

	  /* The encoding has a single stepped pattern, so three elements
	     are enough to describe the whole even/odd selector.  */
	  vec_perm_builder sel (nelt, 1, 3);
	  sel.quick_grow (3);
	  /* Extract even elements: { 0, 2, 4, ... }.  */
	  for (i = 0; i < 3; i++)
	    sel[i] = i * 2;
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, indices))
	    {
	      /* Extract odd elements: { 1, 3, 5, ... }.  */
	      for (i = 0; i < 3; i++)
		sel[i] = i * 2 + 1;
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, indices))
		return true;
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "extract even/odd not supported by target\n");
  return false;
}
5805
5806 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5807 type VECTYPE. MASKED_P says whether the masked form is needed. */
5808
5809 bool
vect_load_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5810 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5811 bool masked_p)
5812 {
5813 if (masked_p)
5814 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5815 vec_mask_load_lanes_optab,
5816 vectype, count);
5817 else
5818 return vect_lanes_optab_supported_p ("vec_load_lanes",
5819 vec_load_lanes_optab,
5820 vectype, count);
5821 }
5822
5823 /* Function vect_permute_load_chain.
5824
5825 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5826 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5827 the input data correctly. Return the final references for loads in
5828 RESULT_CHAIN.
5829
5830 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5831 The input is 4 vectors each containing 8 elements. We assign a number to each
5832 element, the input sequence is:
5833
5834 1st vec: 0 1 2 3 4 5 6 7
5835 2nd vec: 8 9 10 11 12 13 14 15
5836 3rd vec: 16 17 18 19 20 21 22 23
5837 4th vec: 24 25 26 27 28 29 30 31
5838
5839 The output sequence should be:
5840
5841 1st vec: 0 4 8 12 16 20 24 28
5842 2nd vec: 1 5 9 13 17 21 25 29
5843 3rd vec: 2 6 10 14 18 22 26 30
5844 4th vec: 3 7 11 15 19 23 27 31
5845
5846 i.e., the first output vector should contain the first elements of each
5847 interleaving group, etc.
5848
5849 We use extract_even/odd instructions to create such output. The input of
5850 each extract_even/odd operation is two vectors
5851 1st vec 2nd vec
5852 0 1 2 3 4 5 6 7
5853
5854 and the output is the vector of extracted even/odd elements. The output of
5855 extract_even will be: 0 2 4 6
5856 and of extract_odd: 1 3 5 7
5857
5858
5859 The permutation is done in log LENGTH stages. In each stage extract_even
5860 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5861 their order. In our example,
5862
5863 E1: extract_even (1st vec, 2nd vec)
5864 E2: extract_odd (1st vec, 2nd vec)
5865 E3: extract_even (3rd vec, 4th vec)
5866 E4: extract_odd (3rd vec, 4th vec)
5867
5868 The output for the first stage will be:
5869
5870 E1: 0 2 4 6 8 10 12 14
5871 E2: 1 3 5 7 9 11 13 15
5872 E3: 16 18 20 22 24 26 28 30
5873 E4: 17 19 21 23 25 27 29 31
5874
5875 In order to proceed and create the correct sequence for the next stage (or
5876 for the correct output, if the second stage is the last one, as in our
5877 example), we first put the output of extract_even operation and then the
5878 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5879 The input for the second stage is:
5880
5881 1st vec (E1): 0 2 4 6 8 10 12 14
5882 2nd vec (E3): 16 18 20 22 24 26 28 30
5883 3rd vec (E2): 1 3 5 7 9 11 13 15
5884 4th vec (E4): 17 19 21 23 25 27 29 31
5885
5886 The output of the second stage:
5887
5888 E1: 0 4 8 12 16 20 24 28
5889 E2: 2 6 10 14 18 22 26 30
5890 E3: 1 5 9 13 17 21 25 29
5891 E4: 3 7 11 15 19 23 27 31
5892
5893 And RESULT_CHAIN after reordering:
5894
5895 1st vec (E1): 0 4 8 12 16 20 24 28
5896 2nd vec (E3): 1 5 9 13 17 21 25 29
5897 3rd vec (E2): 2 6 10 14 18 22 26 30
5898 4th vec (E4): 3 7 11 15 19 23 27 31. */
5899
static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple *stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);

  /* Start with RESULT_CHAIN as a copy of DR_CHAIN; the power-of-2 path
     below refines it stage by stage.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      /* For each K produce the vector of elements K, 3 + K, 6 + K, ...
	 using two permutes: the first gathers what it can from vectors
	 0 and 1, the second fills the remainder from vector 2.  */
      for (k = 0; k < 3; k++)
	{
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
							    ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
							     ...}>  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      /* Even selector { 0, 2, 4, ... }.  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      /* Odd selector { 1, 3, 5, ... }.  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      /* log2 (LENGTH) stages; each stage extracts the even and odd
	 elements of every pair, evens going to the first half of
	 RESULT_CHAIN and odds to the second half (see the function
	 comment for a worked example).  */
      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  /* Feed this stage's output back in as the next stage's input.  */
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
6015
6016 /* Function vect_shift_permute_load_chain.
6017
6018 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6019 sequence of stmts to reorder the input data accordingly.
6020 Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.
6022
6023 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6024 The input is 3 vectors each containing 8 elements. We assign a
6025 number to each element, the input sequence is:
6026
6027 1st vec: 0 1 2 3 4 5 6 7
6028 2nd vec: 8 9 10 11 12 13 14 15
6029 3rd vec: 16 17 18 19 20 21 22 23
6030
6031 The output sequence should be:
6032
6033 1st vec: 0 3 6 9 12 15 18 21
6034 2nd vec: 1 4 7 10 13 16 19 22
6035 3rd vec: 2 5 8 11 14 17 20 23
6036
6037 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6038
6039 First we shuffle all 3 vectors to get correct elements order:
6040
6041 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6042 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6043 3rd vec: (16 19 22) (17 20 23) (18 21)
6044
6045 Next we unite and shift vector 3 times:
6046
6047 1st step:
6048 shift right by 6 the concatenation of:
6049 "1st vec" and "2nd vec"
6050 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6051 "2nd vec" and "3rd vec"
6052 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6053 "3rd vec" and "1st vec"
6054 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6055 | New vectors |
6056
6057 So that now new vectors are:
6058
6059 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6060 2nd vec: (10 13) (16 19 22) (17 20 23)
6061 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6062
6063 2nd step:
6064 shift right by 5 the concatenation of:
6065 "1st vec" and "3rd vec"
6066 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6067 "2nd vec" and "1st vec"
6068 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6069 "3rd vec" and "2nd vec"
6070 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6071 | New vectors |
6072
6073 So that now new vectors are:
6074
6075 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6076 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6077 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6078
6079 3rd step:
6080 shift right by 5 the concatenation of:
6081 "1st vec" and "1st vec"
6082 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6083 shift right by 3 the concatenation of:
6084 "2nd vec" and "2nd vec"
6085 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6086 | New vectors |
6087
6088 So that now all vectors are READY:
6089 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6090 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6091 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6092
6093 This algorithm is faster than one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
6095 This is usually so.
6096 2. The TARGET machine can't execute vector instructions in parallel.
6097 This is because each step of the algorithm depends on previous.
6098 The algorithm in vect_permute_load_chain is much more parallel.
6099
6100 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6101 */
6102
static bool
vect_shift_permute_load_chain (vec<tree> dr_chain,
			       unsigned int length,
			       gimple *stmt,
			       gimple_stmt_iterator *gsi,
			       vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.  */
    return false;

  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  /* Case 1: power-of-2 group size.  Each stage shuffles each input
     vector so evens come first and odds second, then combines pairs
     with a shift and a select.  */
  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      /* perm2_mask1: { 0, 2, ..., 1, 3, ... } - evens then odds.  */
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      /* perm2_mask2: { 1, 3, ..., 0, 2, ... } - odds then evens.  */
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
	sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
	sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "select is not supported by target\n");
	  return false;
	}
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j + 1];

	      /* Separate FIRST_VECT into evens-then-odds ...  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, first_vect,
					       perm2_mask1);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[0] = data_ref;

	      /* ... and SECOND_VECT into odds-then-evens.  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       second_vect, second_vect,
					       perm2_mask2);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[1] = data_ref;

	      /* Shift of the concatenation yields the combined odds ...  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], shift1_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2 + length/2] = data_ref;

	      /* ... while the select yields the combined evens.  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], select_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;
	    }
	  /* Feed this stage's output back in as the next stage's input.  */
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
      return true;
    }
  /* Case 2: group size of 3 (see the function comment for the worked
     example of the masks built below).  */
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * k + (l % 3) >= nelt)
	    {
	      k = 0;
	      l += (3 - (nelt % 3));
	    }
	  sel[i] = 3 * k + (l % 3);
	  k++;
	}
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Step 1: shuffle each of the three input vectors individually.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   dr_chain[k], dr_chain[k],
					   perm3_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      /* Step 2: first round of shifts of pairwise concatenations.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect[k % 3], vect[(k + 1) % 3],
					   shift1_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect_shift[k] = data_ref;
	}

      /* Step 3: second round of shifts.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect_shift[(4 - k) % 3],
					   vect_shift[(3 - k) % 3],
					   shift2_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      /* The third vector is already in its final form.  */
      (*result_chain)[3 - (nelt % 3)] = vect[2];

      /* Final self-shifts put the remaining two vectors in order.  */
      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
				       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
				       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  /* Neither special case applied; caller falls back to
     vect_permute_load_chain.  */
  return false;
}
6365
6366 /* Function vect_transform_grouped_load.
6367
6368 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6369 to perform their permutation and ascribe the result vectorized statements to
6370 the scalar statements.
6371 */
6372
6373 void
vect_transform_grouped_load(gimple * stmt,vec<tree> dr_chain,int size,gimple_stmt_iterator * gsi)6374 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6375 gimple_stmt_iterator *gsi)
6376 {
6377 machine_mode mode;
6378 vec<tree> result_chain = vNULL;
6379
6380 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6381 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6382 vectors, that are ready for vector computation. */
6383 result_chain.create (size);
6384
6385 /* If reassociation width for vector type is 2 or greater target machine can
6386 execute 2 or more vector instructions in parallel. Otherwise try to
6387 get chain for loads group using vect_shift_permute_load_chain. */
6388 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6389 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6390 || pow2p_hwi (size)
6391 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6392 gsi, &result_chain))
6393 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6394 vect_record_grouped_load_vectors (stmt, result_chain);
6395 result_chain.release ();
6396 }
6397
6398 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6399 generated as part of the vectorization of STMT. Assign the statement
6400 for each vector to the associated scalar statement. */
6401
void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      /* RESULT_CHAIN may be longer than the group; stop once every group
	 member has been handled.  */
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT.  */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  /* Walk to the end of the RELATED_STMT chain and append
		     NEW_STMT there.  */
		  gimple *prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple *rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}
6471
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

6477 bool
vect_can_force_dr_alignment_p(const_tree decl,unsigned int alignment)6478 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6479 {
6480 if (!VAR_P (decl))
6481 return false;
6482
6483 if (decl_in_symtab_p (decl)
6484 && !symtab_node::get (decl)->can_increase_alignment_p ())
6485 return false;
6486
6487 if (TREE_STATIC (decl))
6488 return (alignment <= MAX_OFILE_ALIGNMENT);
6489 else
6490 return (alignment <= MAX_STACK_ALIGNMENT);
6491 }
6492
6493
6494 /* Return whether the data reference DR is supported with respect to its
6495 alignment.
6496 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6497 it is aligned, i.e., check if it is possible to vectorize it with different
6498 alignment. */
6499
6500 enum dr_alignment_support
vect_supportable_dr_alignment(struct data_reference * dr,bool check_aligned_accesses)6501 vect_supportable_dr_alignment (struct data_reference *dr,
6502 bool check_aligned_accesses)
6503 {
6504 gimple *stmt = DR_STMT (dr);
6505 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6506 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6507 machine_mode mode = TYPE_MODE (vectype);
6508 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6509 struct loop *vect_loop = NULL;
6510 bool nested_in_vect_loop = false;
6511
6512 if (aligned_access_p (dr) && !check_aligned_accesses)
6513 return dr_aligned;
6514
6515 /* For now assume all conditional loads/stores support unaligned
6516 access without any special code. */
6517 if (is_gimple_call (stmt)
6518 && gimple_call_internal_p (stmt)
6519 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6520 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6521 return dr_unaligned_supported;
6522
6523 if (loop_vinfo)
6524 {
6525 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6526 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6527 }
6528
6529 /* Possibly unaligned access. */
6530
6531 /* We can choose between using the implicit realignment scheme (generating
6532 a misaligned_move stmt) and the explicit realignment scheme (generating
6533 aligned loads with a REALIGN_LOAD). There are two variants to the
6534 explicit realignment scheme: optimized, and unoptimized.
6535 We can optimize the realignment only if the step between consecutive
6536 vector loads is equal to the vector size. Since the vector memory
6537 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6538 is guaranteed that the misalignment amount remains the same throughout the
6539 execution of the vectorized loop. Therefore, we can create the
6540 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6541 at the loop preheader.
6542
6543 However, in the case of outer-loop vectorization, when vectorizing a
6544 memory access in the inner-loop nested within the LOOP that is now being
6545 vectorized, while it is guaranteed that the misalignment of the
6546 vectorized memory access will remain the same in different outer-loop
6547 iterations, it is *not* guaranteed that is will remain the same throughout
6548 the execution of the inner-loop. This is because the inner-loop advances
6549 with the original scalar step (and not in steps of VS). If the inner-loop
6550 step happens to be a multiple of VS, then the misalignment remains fixed
6551 and we can use the optimized realignment scheme. For example:
6552
6553 for (i=0; i<N; i++)
6554 for (j=0; j<M; j++)
6555 s += a[i+j];
6556
6557 When vectorizing the i-loop in the above example, the step between
6558 consecutive vector loads is 1, and so the misalignment does not remain
6559 fixed across the execution of the inner-loop, and the realignment cannot
6560 be optimized (as illustrated in the following pseudo vectorized loop):
6561
6562 for (i=0; i<N; i+=4)
6563 for (j=0; j<M; j++){
6564 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6565 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6566 // (assuming that we start from an aligned address).
6567 }
6568
6569 We therefore have to use the unoptimized realignment scheme:
6570
6571 for (i=0; i<N; i+=4)
6572 for (j=k; j<M; j+=4)
6573 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6574 // that the misalignment of the initial address is
6575 // 0).
6576
6577 The loop can then be vectorized as follows:
6578
6579 for (k=0; k<4; k++){
6580 rt = get_realignment_token (&vp[k]);
6581 for (i=0; i<N; i+=4){
6582 v1 = vp[i+k];
6583 for (j=k; j<M; j+=4){
6584 v2 = vp[i+j+VS-1];
6585 va = REALIGN_LOAD <v1,v2,rt>;
6586 vs += va;
6587 v1 = v2;
6588 }
6589 }
6590 } */
6591
6592 if (DR_IS_READ (dr))
6593 {
6594 bool is_packed = false;
6595 tree type = (TREE_TYPE (DR_REF (dr)));
6596
6597 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6598 && (!targetm.vectorize.builtin_mask_for_load
6599 || targetm.vectorize.builtin_mask_for_load ()))
6600 {
6601 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6602
6603 /* If we are doing SLP then the accesses need not have the
6604 same alignment, instead it depends on the SLP group size. */
6605 if (loop_vinfo
6606 && STMT_SLP_TYPE (stmt_info)
6607 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6608 * GROUP_SIZE (vinfo_for_stmt
6609 (GROUP_FIRST_ELEMENT (stmt_info))),
6610 TYPE_VECTOR_SUBPARTS (vectype)))
6611 ;
6612 else if (!loop_vinfo
6613 || (nested_in_vect_loop
6614 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6615 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6616 return dr_explicit_realign;
6617 else
6618 return dr_explicit_realign_optimized;
6619 }
6620 if (!known_alignment_for_access_p (dr))
6621 is_packed = not_size_aligned (DR_REF (dr));
6622
6623 if (targetm.vectorize.support_vector_misalignment
6624 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6625 /* Can't software pipeline the loads, but can at least do them. */
6626 return dr_unaligned_supported;
6627 }
6628 else
6629 {
6630 bool is_packed = false;
6631 tree type = (TREE_TYPE (DR_REF (dr)));
6632
6633 if (!known_alignment_for_access_p (dr))
6634 is_packed = not_size_aligned (DR_REF (dr));
6635
6636 if (targetm.vectorize.support_vector_misalignment
6637 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6638 return dr_unaligned_supported;
6639 }
6640
6641 /* Unsupported. */
6642 return dr_unaligned_unsupported;
6643 }
6644