1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
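/* For example (illustrative): a query with OPTAB == vec_load_lanes_optab,
   a V16QI VECTYPE and COUNT == 3 asks whether the target can load three
   such vectors as a single unit (e.g. an LD3-style instruction on targets
   that provide one). */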
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
81 }
82 }
83
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 {
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
91 }
92
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
97
98 return true;
99 }
100
101
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type differs from the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
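/* For example (illustrative): for the statement

     int_x = (int) short_y;

   the lhs type is 'int' but the smallest scalar type involved is 'short',
   so the 'short' type is returned and *RHS_SIZE_UNIT is smaller than
   *LHS_SIZE_UNIT. */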
118
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
140 {
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
142
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
146 }
147
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
151 }
152
153
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
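/* Note (illustrative usage): run-time alias versioning can be disabled
   entirely with --param vect-max-version-for-alias-checks=0, in which case
   this function rejects every DDR. */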
157
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
160 {
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
162
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
165
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
169
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
172 }
173
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
175
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
178 {
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
183
184 if (dump_enabled_p ())
185 {
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
189 }
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
191 }
192
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
196
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
199 {
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
202
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
207
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 SLP group are emitted at the position of the last scalar load and
210 all loads in an interleaving group are emitted at the position
211 of the first scalar load.
212 Stores in a group are emitted at the position of the last scalar store.
213 Compute that position and check whether the resulting order matches
214 the current one.
215 We have not yet decided between SLP and interleaving so we have
216 to conservatively assume both. */
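/* For example (illustrative): if STMT_A and STMT_B are stores in two
   different groups, each group is sunk to its last scalar store, so the
   scalar order is preserved only if the later of the two last stores
   belongs to the group of the later of STMT_A and STMT_B. */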
217 gimple *il_a;
218 gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
219 if (last_a)
220 {
221 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
222 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
223 last_a = get_later_stmt (last_a, s);
224 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
225 {
226 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s;
227 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
228 if (get_later_stmt (il_a, s) == il_a)
229 il_a = s;
230 }
231 else
232 il_a = last_a;
233 }
234 else
235 last_a = il_a = stmt_a;
236 gimple *il_b;
237 gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
238 if (last_b)
239 {
240 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
241 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
242 last_b = get_later_stmt (last_b, s);
243 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
244 {
245 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s;
246 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
247 if (get_later_stmt (il_b, s) == il_b)
248 il_b = s;
249 }
250 else
251 il_b = last_b;
252 }
253 else
254 last_b = il_b = stmt_b;
255 bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a);
256 return (/* SLP */
257 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
258 /* Interleaving */
259 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
260 /* Mixed */
261 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
262 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
263 }
264
265 /* A subroutine of vect_analyze_data_ref_dependence. Handle
266 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
267 distances. These distances are conservatively correct but they don't
268 reflect a guaranteed dependence.
269
270 Return true if this function does all the work necessary to avoid
271 an alias or false if the caller should use the dependence distances
272 to limit the vectorization factor in the usual way. LOOP_DEPTH is
273 the depth of the loop described by LOOP_VINFO and the other arguments
274 are as for vect_analyze_data_ref_dependence. */
275
276 static bool
277 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
278 loop_vec_info loop_vinfo,
279 int loop_depth, unsigned int *max_vf)
280 {
281 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 lambda_vector dist_v;
283 unsigned int i;
284 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
285 {
286 int dist = dist_v[loop_depth];
287 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
288 {
289 /* If the user asserted safelen >= DIST consecutive iterations
290 can be executed concurrently, assume independence.
291
292 ??? An alternative would be to add the alias check even
293 in this case, and vectorize the fallback loop with the
294 maximum VF set to safelen. However, if the user has
295 explicitly given a length, it's less likely that that
296 would be a win. */
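/* For example (illustrative): if the conservative dependence distance
   at this loop depth is 4 and the user asserted safelen == 8 (e.g. via
   #pragma omp simd safelen(8)), the dependence is ignored here and
   *max_vf is at most capped down to 8. */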
297 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
298 {
299 if ((unsigned int) loop->safelen < *max_vf)
300 *max_vf = loop->safelen;
301 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
302 continue;
303 }
304
305 /* For dependence distances of 2 or more, we have the option
306 of limiting VF or checking for an alias at runtime.
307 Prefer to check at runtime if we can, to avoid limiting
308 the VF unnecessarily when the bases are in fact independent.
309
310 Note that the alias checks will be removed if the VF ends up
311 being small enough. */
312 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
313 }
314 }
315 return true;
316 }
317
318
319 /* Function vect_analyze_data_ref_dependence.
320
321 Return TRUE if there (might) exist a dependence between a memory-reference
322 DRA and a memory-reference DRB. Return FALSE if versioning for alias
323 can instead check the dependence at run time. Adjust *MAX_VF according
324 to the data dependence. */
325
326 static bool
327 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
328 loop_vec_info loop_vinfo,
329 unsigned int *max_vf)
330 {
331 unsigned int i;
332 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
333 struct data_reference *dra = DDR_A (ddr);
334 struct data_reference *drb = DDR_B (ddr);
335 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
336 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
337 lambda_vector dist_v;
338 unsigned int loop_depth;
339
340 /* In loop analysis all data references should be vectorizable. */
341 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
342 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
343 gcc_unreachable ();
344
345 /* Independent data accesses. */
346 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
347 return false;
348
349 if (dra == drb
350 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
351 return false;
352
353 /* We do not have to consider dependences between accesses that belong
354 to the same group, unless the stride could be smaller than the
355 group size. */
356 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
357 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
358 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
359 return false;
360
361 /* Even if we have an anti-dependence then, as the vectorized loop covers at
362 least two scalar iterations, there is always also a true dependence.
363 As the vectorizer does not re-order loads and stores we can ignore
364 the anti-dependence if TBAA can disambiguate both DRs similar to the
365 case with known negative distance anti-dependences (positive
366 distance anti-dependences would violate TBAA constraints). */
367 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
368 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
369 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
370 get_alias_set (DR_REF (drb))))
371 return false;
372
373 /* Unknown data dependence. */
374 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
375 {
376 /* If user asserted safelen consecutive iterations can be
377 executed concurrently, assume independence. */
378 if (loop->safelen >= 2)
379 {
380 if ((unsigned int) loop->safelen < *max_vf)
381 *max_vf = loop->safelen;
382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383 return false;
384 }
385
386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
388 {
389 if (dump_enabled_p ())
390 {
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 "versioning for alias not supported for: "
393 "can't determine dependence between ");
394 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
395 DR_REF (dra));
396 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (drb));
399 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
400 }
401 return true;
402 }
403
404 if (dump_enabled_p ())
405 {
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "versioning for alias required: "
408 "can't determine dependence between ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
410 DR_REF (dra));
411 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
413 DR_REF (drb));
414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
415 }
416
417 /* Add to list of ddrs that need to be tested at run-time. */
418 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
419 }
420
421 /* Known data dependence. */
422 if (DDR_NUM_DIST_VECTS (ddr) == 0)
423 {
424 /* If user asserted safelen consecutive iterations can be
425 executed concurrently, assume independence. */
426 if (loop->safelen >= 2)
427 {
428 if ((unsigned int) loop->safelen < *max_vf)
429 *max_vf = loop->safelen;
430 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
431 return false;
432 }
433
434 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
435 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
436 {
437 if (dump_enabled_p ())
438 {
439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
440 "versioning for alias not supported for: "
441 "bad dist vector for ");
442 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
443 DR_REF (dra));
444 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
446 DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 }
449 return true;
450 }
451
452 if (dump_enabled_p ())
453 {
454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
455 "versioning for alias required: "
456 "bad dist vector for ");
457 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
458 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
459 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
460 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
461 }
462 /* Add to list of ddrs that need to be tested at run-time. */
463 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
464 }
465
466 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
467
468 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
469 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
470 loop_depth, max_vf))
471 return false;
472
473 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
474 {
475 int dist = dist_v[loop_depth];
476
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_NOTE, vect_location,
479 "dependence distance = %d.\n", dist);
480
481 if (dist == 0)
482 {
483 if (dump_enabled_p ())
484 {
485 dump_printf_loc (MSG_NOTE, vect_location,
486 "dependence distance == 0 between ");
487 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
488 dump_printf (MSG_NOTE, " and ");
489 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
490 dump_printf (MSG_NOTE, "\n");
491 }
492
493 /* When we perform grouped accesses and perform implicit CSE
494 by detecting equal accesses and doing disambiguation with
495 runtime alias tests like for
496 .. = a[i];
497 .. = a[i+1];
498 a[i] = ..;
499 a[i+1] = ..;
500 *p = ..;
501 .. = a[i];
502 .. = a[i+1];
503 where we will end up loading { a[i], a[i+1] } once, make
504 sure that inserting group loads before the first load and
505 stores after the last store will do the right thing.
506 Similar for groups like
507 a[i] = ...;
508 ... = a[i];
509 a[i+1] = ...;
510 where loads from the group interleave with the store. */
511 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
512 {
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
515 "READ_WRITE dependence in interleaving.\n");
516 return true;
517 }
518
519 if (loop->safelen < 2)
520 {
521 tree indicator = dr_zero_step_indicator (dra);
522 if (TREE_CODE (indicator) != INTEGER_CST)
523 vect_check_nonzero_value (loop_vinfo, indicator);
524 else if (integer_zerop (indicator))
525 {
526 if (dump_enabled_p ())
527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528 "access also has a zero step\n");
529 return true;
530 }
531 }
532 continue;
533 }
534
535 if (dist > 0 && DDR_REVERSED_P (ddr))
536 {
537 /* If DDR_REVERSED_P the order of the data-refs in DDR was
538 reversed (to make distance vector positive), and the actual
539 distance is negative. */
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "dependence distance negative.\n");
543 /* Record a negative dependence distance to later limit the
544 amount of stmt copying / unrolling we can perform.
545 Only need to handle read-after-write dependence. */
546 if (DR_IS_READ (drb)
547 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
548 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
549 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
550 continue;
551 }
552
553 unsigned int abs_dist = abs (dist);
554 if (abs_dist >= 2 && abs_dist < *max_vf)
555 {
556 /* The dependence distance requires reduction of the maximal
557 vectorization factor. */
558 *max_vf = abs (dist);
559 if (dump_enabled_p ())
560 dump_printf_loc (MSG_NOTE, vect_location,
561 "adjusting maximal vectorization factor to %i\n",
562 *max_vf);
563 }
564
565 if (abs_dist >= *max_vf)
566 {
567 /* Dependence distance does not create dependence, as far as
568 vectorization is concerned, in this case. */
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "dependence distance >= VF.\n");
572 continue;
573 }
574
575 if (dump_enabled_p ())
576 {
577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
578 "not vectorized, possible dependence "
579 "between data-refs ");
580 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
581 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
582 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
583 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
584 }
585
586 return true;
587 }
588
589 return false;
590 }
591
592 /* Function vect_analyze_data_ref_dependences.
593
594 Examine all the data references in the loop, and make sure there do not
595 exist any data dependences between them. Set *MAX_VF according to
596 the maximum vectorization factor the data dependences allow. */
597
598 bool
599 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
600 unsigned int *max_vf)
601 {
602 unsigned int i;
603 struct data_dependence_relation *ddr;
604
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_NOTE, vect_location,
607 "=== vect_analyze_data_ref_dependences ===\n");
608
609 LOOP_VINFO_DDRS (loop_vinfo)
610 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
611 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
612 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
613 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
614 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
615 &LOOP_VINFO_DDRS (loop_vinfo),
616 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
617 return false;
618
619 /* For epilogues we either have no aliases or alias versioning
620 was applied to original loop. Therefore we may just get max_vf
621 using VF of original loop. */
622 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
623 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
624 else
625 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
626 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
627 return false;
628
629 return true;
630 }
631
632
633 /* Function vect_slp_analyze_data_ref_dependence.
634
635 Return TRUE if there (might) exist a dependence between a memory-reference
636 DRA and a memory-reference DRB. */
639
640 static bool
641 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
642 {
643 struct data_reference *dra = DDR_A (ddr);
644 struct data_reference *drb = DDR_B (ddr);
645
646 /* We need to check dependences of statements marked as unvectorizable
647 as well, they still can prohibit vectorization. */
648
649 /* Independent data accesses. */
650 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
651 return false;
652
653 if (dra == drb)
654 return false;
655
656 /* Read-read is OK. */
657 if (DR_IS_READ (dra) && DR_IS_READ (drb))
658 return false;
659
660 /* If dra and drb are part of the same interleaving chain consider
661 them independent. */
662 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
663 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
664 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
665 return false;
666
667 /* Unknown data dependence. */
668 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
669 {
670 if (dump_enabled_p ())
671 {
672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
673 "can't determine dependence between ");
674 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
675 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
676 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
677 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
678 }
679 }
680 else if (dump_enabled_p ())
681 {
682 dump_printf_loc (MSG_NOTE, vect_location,
683 "determined dependence between ");
684 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
685 dump_printf (MSG_NOTE, " and ");
686 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
687 dump_printf (MSG_NOTE, "\n");
688 }
689
690 return true;
691 }
692
693
694 /* Analyze dependences involved in the transform of SLP NODE. STORES
695 contains the vector of scalar stores of this instance if we are
696 disambiguating the loads. */
697
698 static bool
699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700 vec<gimple *> stores, gimple *last_store)
701 {
702 /* This walks over all stmts involved in the SLP load/store done
703 in NODE verifying we can sink them up to the last stmt in the
704 group. */
705 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
706 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
707 {
708 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
709 if (access == last_access)
710 continue;
711 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
712 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
713 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
714 {
715 gimple *stmt = gsi_stmt (gsi);
716 if (! gimple_vuse (stmt)
717 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
718 continue;
719
720 /* If we couldn't record a (single) data reference for this
721 stmt we have to give up. */
722 /* ??? Here and below if dependence analysis fails we can resort
723 to the alias oracle which can handle more kinds of stmts. */
724 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
725 if (!dr_b)
726 return false;
727
728 bool dependent = false;
729 /* If we run into a store of this same instance (we've just
730 marked those) then delay dependence checking until we run
731 into the last store because this is where it will have
732 been sunk to (and we verify if we can do that as well). */
733 if (gimple_visited_p (stmt))
734 {
735 if (stmt != last_store)
736 continue;
737 unsigned i;
738 gimple *store;
739 FOR_EACH_VEC_ELT (stores, i, store)
740 {
741 data_reference *store_dr
742 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
743 ddr_p ddr = initialize_data_dependence_relation
744 (dr_a, store_dr, vNULL);
745 dependent = vect_slp_analyze_data_ref_dependence (ddr);
746 free_dependence_relation (ddr);
747 if (dependent)
748 break;
749 }
750 }
751 else
752 {
753 ddr_p ddr = initialize_data_dependence_relation (dr_a,
754 dr_b, vNULL);
755 dependent = vect_slp_analyze_data_ref_dependence (ddr);
756 free_dependence_relation (ddr);
757 }
758 if (dependent)
759 return false;
760 }
761 }
762 return true;
763 }
764
765
766 /* Function vect_slp_analyze_instance_dependence.
767
768 Examine all the data references in the SLP instance, and make sure there
769 do not exist any data dependences between them that would prevent the
770 transform. */
771
772 bool
773 vect_slp_analyze_instance_dependence (slp_instance instance)
774 {
775 if (dump_enabled_p ())
776 dump_printf_loc (MSG_NOTE, vect_location,
777 "=== vect_slp_analyze_instance_dependence ===\n");
778
779 /* The stores of this instance are at the root of the SLP tree. */
780 slp_tree store = SLP_INSTANCE_TREE (instance);
781 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
782 store = NULL;
783
784 /* Verify we can sink stores to the vectorized stmt insert location. */
785 gimple *last_store = NULL;
786 if (store)
787 {
788 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
789 return false;
790
791 /* Mark stores in this instance and remember the last one. */
792 last_store = vect_find_last_scalar_stmt_in_slp (store);
793 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
794 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
795 }
796
797 bool res = true;
798
799 /* Verify we can sink loads to the vectorized stmt insert location,
800 special-casing stores of this instance. */
801 slp_tree load;
802 unsigned int i;
803 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
804 if (! vect_slp_analyze_node_dependences (instance, load,
805 store
806 ? SLP_TREE_SCALAR_STMTS (store)
807 : vNULL, last_store))
808 {
809 res = false;
810 break;
811 }
812
813 /* Unset the visited flag. */
814 if (store)
815 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
816 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
817
818 return res;
819 }
820
821 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
822 the statement that contains DRB, which is useful for recording in the
823 dump file. */
824
825 static void
826 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
827 innermost_loop_behavior *drb)
828 {
829 bool existed;
830 innermost_loop_behavior *&entry
831 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
832 if (!existed || entry->base_alignment < drb->base_alignment)
833 {
834 entry = drb;
835 if (dump_enabled_p ())
836 {
837 dump_printf_loc (MSG_NOTE, vect_location,
838 "recording new base alignment for ");
839 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
840 dump_printf (MSG_NOTE, "\n");
841 dump_printf_loc (MSG_NOTE, vect_location,
842 " alignment: %d\n", drb->base_alignment);
843 dump_printf_loc (MSG_NOTE, vect_location,
844 " misalignment: %d\n", drb->base_misalignment);
845 dump_printf_loc (MSG_NOTE, vect_location,
846 " based on: ");
847 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
848 }
849 }
850 }
851
852 /* If the region we're going to vectorize is reached, all unconditional
853 data references occur at least once. We can therefore pool the base
854 alignment guarantees from each unconditional reference. Do this by
855 going through all the data references in VINFO and checking whether
856 the containing statement makes the reference unconditionally. If so,
857 record the alignment of the base address in VINFO so that it can be
858 used for all other references with the same base. */
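/* For example (illustrative): if one unconditional reference through base
   address P guarantees that P is 32-byte aligned, other references in the
   region that share the same base address can reuse that guarantee even if
   their own behavior only proves a smaller alignment. */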
859
860 void
861 vect_record_base_alignments (vec_info *vinfo)
862 {
863 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
864 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
865 data_reference *dr;
866 unsigned int i;
867 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
868 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
869 {
870 gimple *stmt = DR_STMT (dr);
871 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
872
873 /* If DR is nested in the loop that is being vectorized, we can also
874 record the alignment of the base wrt the outer loop. */
875 if (loop && nested_in_vect_loop_p (loop, stmt))
876 {
877 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
878 vect_record_base_alignment
879 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
880 }
881 }
882 }
883
884 /* Return the target alignment for the vectorized form of DR. */
885
886 static unsigned int
887 vect_calculate_target_alignment (struct data_reference *dr)
888 {
889 gimple *stmt = DR_STMT (dr);
890 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
891 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
892 return targetm.vectorize.preferred_vector_alignment (vectype);
893 }
894
895 /* Function vect_compute_data_ref_alignment
896
897 Compute the misalignment of the data reference DR.
898
899 Output:
900 1. If during the misalignment computation it is found that the data reference
901 cannot be vectorized then false is returned.
902 2. DR_MISALIGNMENT (DR) is defined.
903
904 FOR NOW: No analysis is actually performed. Misalignment is calculated
905 only for trivial cases. TODO. */
906
907 bool
908 vect_compute_data_ref_alignment (struct data_reference *dr)
909 {
910 gimple *stmt = DR_STMT (dr);
911 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
912 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
914 struct loop *loop = NULL;
915 tree ref = DR_REF (dr);
916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
917
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_NOTE, vect_location,
920 "vect_compute_data_ref_alignment:\n");
921
922 if (loop_vinfo)
923 loop = LOOP_VINFO_LOOP (loop_vinfo);
924
925 /* Initialize misalignment to unknown. */
926 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
927
928 innermost_loop_behavior *drb = vect_dr_behavior (dr);
929 bool step_preserves_misalignment_p;
930
931 unsigned HOST_WIDE_INT vector_alignment
932 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
933 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
934
935 /* No step for BB vectorization. */
936 if (!loop)
937 {
938 gcc_assert (integer_zerop (drb->step));
939 step_preserves_misalignment_p = true;
940 }
941
942 /* In case the dataref is in an inner-loop of the loop that is being
943 vectorized (LOOP), we use the base and misalignment information
944 relative to the outer-loop (LOOP). This is ok only if the misalignment
945 stays the same throughout the execution of the inner-loop, which is why
946 we have to check that the stride of the dataref in the inner-loop evenly
947 divides by the vector alignment. */
948 else if (nested_in_vect_loop_p (loop, stmt))
949 {
950 step_preserves_misalignment_p
951 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
952
953 if (dump_enabled_p ())
954 {
955 if (step_preserves_misalignment_p)
956 dump_printf_loc (MSG_NOTE, vect_location,
957 "inner step divides the vector alignment.\n");
958 else
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
960 "inner step doesn't divide the vector"
961 " alignment.\n");
962 }
963 }
964
965 /* Similarly we can only use base and misalignment information relative to
966 an innermost loop if the misalignment stays the same throughout the
967 execution of the loop. As above, this is the case if the stride of
968 the dataref evenly divides by the alignment. */
969 else
970 {
971 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
972 step_preserves_misalignment_p
973 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
974
975 if (!step_preserves_misalignment_p && dump_enabled_p ())
976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
977 "step doesn't divide the vector alignment.\n");
978 }
979
980 unsigned int base_alignment = drb->base_alignment;
981 unsigned int base_misalignment = drb->base_misalignment;
982
983 /* Calculate the maximum of the pooled base address alignment and the
984 alignment that we can compute for DR itself. */
985 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
986 if (entry && base_alignment < (*entry)->base_alignment)
987 {
988 base_alignment = (*entry)->base_alignment;
989 base_misalignment = (*entry)->base_misalignment;
990 }
991
992 if (drb->offset_alignment < vector_alignment
993 || !step_preserves_misalignment_p
994 /* We need to know whether the step wrt the vectorized loop is
995 negative when computing the starting misalignment below. */
996 || TREE_CODE (drb->step) != INTEGER_CST)
997 {
998 if (dump_enabled_p ())
999 {
1000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1001 "Unknown alignment for access: ");
1002 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1003 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1004 }
1005 return true;
1006 }
1007
1008 if (base_alignment < vector_alignment)
1009 {
1010 unsigned int max_alignment;
1011 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1012 if (max_alignment < vector_alignment
1013 || !vect_can_force_dr_alignment_p (base,
1014 vector_alignment * BITS_PER_UNIT))
1015 {
1016 if (dump_enabled_p ())
1017 {
1018 dump_printf_loc (MSG_NOTE, vect_location,
1019 "can't force alignment of ref: ");
1020 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1021 dump_printf (MSG_NOTE, "\n");
1022 }
1023 return true;
1024 }
1025
1026 /* Force the alignment of the decl.
1027 NOTE: This is the only change to the code we make during
1028 the analysis phase, before deciding to vectorize the loop. */
1029 if (dump_enabled_p ())
1030 {
1031 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1032 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1033 dump_printf (MSG_NOTE, "\n");
1034 }
1035
1036 DR_VECT_AUX (dr)->base_decl = base;
1037 DR_VECT_AUX (dr)->base_misaligned = true;
1038 base_misalignment = 0;
1039 }
1040 poly_int64 misalignment
1041 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1042
1043 /* If this is a backward running DR then first access in the larger
1044 vectype actually is N-1 elements before the address in the DR.
1045 Adjust misalign accordingly. */
1046 if (tree_int_cst_sgn (drb->step) < 0)
1047 /* PLUS because STEP is negative. */
1048 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1049 * TREE_INT_CST_LOW (drb->step));
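/* Illustrative example: for a 4-element vector type accessed with a step of
   -4 bytes, the statement above adds (4 - 1) * -4 == -12, i.e. the first
   vector access starts 12 bytes before the address recorded in DR. */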
1050
1051 unsigned int const_misalignment;
1052 if (!known_misalignment (misalignment, vector_alignment,
1053 &const_misalignment))
1054 {
1055 if (dump_enabled_p ())
1056 {
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "Non-constant misalignment for access: ");
1059 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1060 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1061 }
1062 return true;
1063 }
1064
1065 SET_DR_MISALIGNMENT (dr, const_misalignment);
1066
1067 if (dump_enabled_p ())
1068 {
1069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1070 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1071 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1072 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1073 }
1074
1075 return true;
1076 }
1077
1078 /* Function vect_update_misalignment_for_peel.
1079 Sets DR's misalignment
1080 - to 0 if it has the same alignment as DR_PEEL,
1081 - to the misalignment computed using NPEEL if DR's misalignment is known,
1082 - to -1 (unknown) otherwise.
1083
1084 DR - the data reference whose misalignment is to be adjusted.
1085 DR_PEEL - the data reference whose misalignment is being made
1086 zero in the vector loop by the peel.
1087 NPEEL - the number of iterations in the peel loop if the misalignment
1088 of DR_PEEL is known at compile time. */
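/* For example (illustrative): with a 16-byte target alignment, 4-byte
   elements and DR_MISALIGNMENT (DR) == 8, peeling NPEEL == 2 iterations of
   a forward-striding access gives (8 + 2 * 4) & 15 == 0, i.e. DR becomes
   aligned. */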
1089
1090 static void
1091 vect_update_misalignment_for_peel (struct data_reference *dr,
1092 struct data_reference *dr_peel, int npeel)
1093 {
1094 unsigned int i;
1095 vec<dr_p> same_aligned_drs;
1096 struct data_reference *current_dr;
1097 int dr_size = vect_get_scalar_dr_size (dr);
1098 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1099 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1100 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1101
1102 /* For interleaved data accesses the step in the loop must be multiplied by
1103 the size of the interleaving group. */
1104 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1105 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1106 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1107 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1108
1109 /* It can be assumed that the data refs with the same alignment as dr_peel
1110 are aligned in the vector loop. */
1111 same_aligned_drs
1112 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1113 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1114 {
1115 if (current_dr != dr)
1116 continue;
1117 gcc_assert (!known_alignment_for_access_p (dr)
1118 || !known_alignment_for_access_p (dr_peel)
1119 || (DR_MISALIGNMENT (dr) / dr_size
1120 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1121 SET_DR_MISALIGNMENT (dr, 0);
1122 return;
1123 }
1124
1125 if (known_alignment_for_access_p (dr)
1126 && known_alignment_for_access_p (dr_peel))
1127 {
1128 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1129 int misal = DR_MISALIGNMENT (dr);
1130 misal += negative ? -npeel * dr_size : npeel * dr_size;
1131 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1132 SET_DR_MISALIGNMENT (dr, misal);
1133 return;
1134 }
1135
1136 if (dump_enabled_p ())
1137 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1138 "to unknown (-1).\n");
1139 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1140 }
1141
1142
1143 /* Function verify_data_ref_alignment
1144
1145 Return TRUE if DR can be handled with respect to alignment. */
1146
1147 static bool
1148 verify_data_ref_alignment (data_reference_p dr)
1149 {
1150 enum dr_alignment_support supportable_dr_alignment
1151 = vect_supportable_dr_alignment (dr, false);
1152 if (!supportable_dr_alignment)
1153 {
1154 if (dump_enabled_p ())
1155 {
1156 if (DR_IS_READ (dr))
1157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1158 "not vectorized: unsupported unaligned load.");
1159 else
1160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161 "not vectorized: unsupported unaligned "
1162 "store.");
1163
1164 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1165 DR_REF (dr));
1166 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1167 }
1168 return false;
1169 }
1170
1171 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1172 dump_printf_loc (MSG_NOTE, vect_location,
1173 "Vectorizing an unaligned access.\n");
1174
1175 return true;
1176 }
1177
1178 /* Function vect_verify_datarefs_alignment
1179
1180 Return TRUE if all data references in the loop can be
1181 handled with respect to alignment. */
1182
1183 bool
1184 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1185 {
1186 vec<data_reference_p> datarefs = vinfo->datarefs;
1187 struct data_reference *dr;
1188 unsigned int i;
1189
1190 FOR_EACH_VEC_ELT (datarefs, i, dr)
1191 {
1192 gimple *stmt = DR_STMT (dr);
1193 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1194
1195 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1196 continue;
1197
1198 /* For interleaving, only the alignment of the first access matters. */
1199 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1200 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1201 continue;
1202
1203 /* Strided accesses perform only component accesses, alignment is
1204 irrelevant for them. */
1205 if (STMT_VINFO_STRIDED_P (stmt_info)
1206 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1207 continue;
1208
1209 if (! verify_data_ref_alignment (dr))
1210 return false;
1211 }
1212
1213 return true;
1214 }
1215
1216 /* Given a memory reference EXP return whether its alignment is less
1217 than its size. */
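/* E.g. (illustrative): a 4-byte integer member of a packed structure may
   only be guaranteed 1-byte alignment, in which case it is not
   size-aligned. */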
1218
1219 static bool
1220 not_size_aligned (tree exp)
1221 {
1222 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1223 return true;
1224
1225 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1226 > get_object_alignment (exp));
1227 }
1228
1229 /* Function vector_alignment_reachable_p
1230
1231 Return true if vector alignment for DR is reachable by peeling
1232 a few loop iterations. Return false otherwise. */
1233
1234 static bool
1235 vector_alignment_reachable_p (struct data_reference *dr)
1236 {
1237 gimple *stmt = DR_STMT (dr);
1238 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1239 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1240
1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 {
1243 /* For interleaved access we peel only if number of iterations in
1244 the prolog loop ({VF - misalignment}) is a multiple of the
1245 number of the interleaved accesses. */
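/* Illustrative example: with 8 elements per vector, a misalignment of
   2 elements and a group size of 3, the prolog needs 8 - 2 == 6 peeled
   iterations, which is a multiple of 3, so peeling is still possible;
   with a group size of 4 it would not be. */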
1246 int elem_size, mis_in_elements;
1247
1248 /* FORNOW: handle only known alignment. */
1249 if (!known_alignment_for_access_p (dr))
1250 return false;
1251
1252 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1253 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1254 elem_size = vector_element_size (vector_size, nelements);
1255 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1256
1257 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1258 return false;
1259 }
1260
1261 /* If misalignment is known at compile time then allow peeling
1262 only if natural alignment is reachable through peeling. */
1263 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1264 {
1265 HOST_WIDE_INT elmsize =
1266 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1267 if (dump_enabled_p ())
1268 {
1269 dump_printf_loc (MSG_NOTE, vect_location,
1270 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1271 dump_printf (MSG_NOTE,
1272 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1273 }
1274 if (DR_MISALIGNMENT (dr) % elmsize)
1275 {
1276 if (dump_enabled_p ())
1277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1278 "data size does not divide the misalignment.\n");
1279 return false;
1280 }
1281 }
1282
1283 if (!known_alignment_for_access_p (dr))
1284 {
1285 tree type = TREE_TYPE (DR_REF (dr));
1286 bool is_packed = not_size_aligned (DR_REF (dr));
1287 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "Unknown misalignment, %snaturally aligned\n",
1290 is_packed ? "not " : "");
1291 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1292 }
1293
1294 return true;
1295 }
1296
1297
1298 /* Calculate the cost of the memory access represented by DR. */
1299
1300 static void
1301 vect_get_data_access_cost (struct data_reference *dr,
1302 unsigned int *inside_cost,
1303 unsigned int *outside_cost,
1304 stmt_vector_for_cost *body_cost_vec)
1305 {
1306 gimple *stmt = DR_STMT (dr);
1307 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1308 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1309 int ncopies;
1310
1311 if (PURE_SLP_STMT (stmt_info))
1312 ncopies = 1;
1313 else
1314 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1315
1316 if (DR_IS_READ (dr))
1317 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1318 NULL, body_cost_vec, false);
1319 else
1320 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1321
1322 if (dump_enabled_p ())
1323 dump_printf_loc (MSG_NOTE, vect_location,
1324 "vect_get_data_access_cost: inside_cost = %d, "
1325 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1326 }
1327
1328
1329 typedef struct _vect_peel_info
1330 {
1331 struct data_reference *dr;
1332 int npeel;
1333 unsigned int count;
1334 } *vect_peel_info;
1335
1336 typedef struct _vect_peel_extended_info
1337 {
1338 struct _vect_peel_info peel_info;
1339 unsigned int inside_cost;
1340 unsigned int outside_cost;
1341 } *vect_peel_extended_info;
1342
1343
1344 /* Peeling hashtable helpers. */
1345
1346 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1347 {
1348 static inline hashval_t hash (const _vect_peel_info *);
1349 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1350 };
1351
1352 inline hashval_t
1353 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1354 {
1355 return (hashval_t) peel_info->npeel;
1356 }
1357
1358 inline bool
1359 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1360 {
1361 return (a->npeel == b->npeel);
1362 }
1363
1364
1365 /* Insert DR into peeling hash table with NPEEL as key. */
1366
1367 static void
1368 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1369 loop_vec_info loop_vinfo, struct data_reference *dr,
1370 int npeel)
1371 {
1372 struct _vect_peel_info elem, *slot;
1373 _vect_peel_info **new_slot;
1374 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1375
1376 elem.npeel = npeel;
1377 slot = peeling_htab->find (&elem);
1378 if (slot)
1379 slot->count++;
1380 else
1381 {
1382 slot = XNEW (struct _vect_peel_info);
1383 slot->npeel = npeel;
1384 slot->dr = dr;
1385 slot->count = 1;
1386 new_slot = peeling_htab->find_slot (slot, INSERT);
1387 *new_slot = slot;
1388 }
1389
1390 if (!supportable_dr_alignment
1391 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1392 slot->count += VECT_MAX_COST;
1393 }
1394
1395
1396 /* Traverse the peeling hash table to find the peeling option that aligns
1397 the maximum number of data accesses. */
1398
1399 int
1400 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1401 _vect_peel_extended_info *max)
1402 {
1403 vect_peel_info elem = *slot;
1404
1405 if (elem->count > max->peel_info.count
1406 || (elem->count == max->peel_info.count
1407 && max->peel_info.npeel > elem->npeel))
1408 {
1409 max->peel_info.npeel = elem->npeel;
1410 max->peel_info.count = elem->count;
1411 max->peel_info.dr = elem->dr;
1412 }
1413
1414 return 1;
1415 }
1416
1417 /* Get the costs of peeling NPEEL iterations checking data access costs
1418 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1419 misalignment will be zero after peeling. */
1420
1421 static void
1422 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1423 struct data_reference *dr0,
1424 unsigned int *inside_cost,
1425 unsigned int *outside_cost,
1426 stmt_vector_for_cost *body_cost_vec,
1427 unsigned int npeel,
1428 bool unknown_misalignment)
1429 {
1430 unsigned i;
1431 data_reference *dr;
1432
1433 FOR_EACH_VEC_ELT (datarefs, i, dr)
1434 {
1435 gimple *stmt = DR_STMT (dr);
1436 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1437 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1438 continue;
1439
1440 /* For interleaving, only the alignment of the first access
1441 matters. */
1442 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1443 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1444 continue;
1445
1446 /* Strided accesses perform only component accesses, alignment is
1447 irrelevant for them. */
1448 if (STMT_VINFO_STRIDED_P (stmt_info)
1449 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1450 continue;
1451
1452 int save_misalignment;
1453 save_misalignment = DR_MISALIGNMENT (dr);
1454 if (npeel == 0)
1455 ;
1456 else if (unknown_misalignment && dr == dr0)
1457 SET_DR_MISALIGNMENT (dr, 0);
1458 else
1459 vect_update_misalignment_for_peel (dr, dr0, npeel);
1460 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1461 body_cost_vec);
1462 SET_DR_MISALIGNMENT (dr, save_misalignment);
1463 }
1464 }
1465
1466 /* Traverse peeling hash table and calculate cost for each peeling option.
1467 Find the one with the lowest cost. */
1468
1469 int
1470 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1471 _vect_peel_extended_info *min)
1472 {
1473 vect_peel_info elem = *slot;
1474 int dummy;
1475 unsigned int inside_cost = 0, outside_cost = 0;
1476 gimple *stmt = DR_STMT (elem->dr);
1477 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1478 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1479 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1480 epilogue_cost_vec;
1481
1482 prologue_cost_vec.create (2);
1483 body_cost_vec.create (2);
1484 epilogue_cost_vec.create (2);
1485
1486 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1487 elem->dr, &inside_cost, &outside_cost,
1488 &body_cost_vec, elem->npeel, false);
1489
1490 body_cost_vec.release ();
1491
1492 outside_cost += vect_get_known_peeling_cost
1493 (loop_vinfo, elem->npeel, &dummy,
1494 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1495 &prologue_cost_vec, &epilogue_cost_vec);
1496
1497 /* Prologue and epilogue costs are added to the target model later.
1498 These costs depend only on the scalar iteration cost, the
1499 number of peeling iterations finally chosen, and the number of
1500 misaligned statements. So discard the information found here. */
1501 prologue_cost_vec.release ();
1502 epilogue_cost_vec.release ();
1503
1504 if (inside_cost < min->inside_cost
1505 || (inside_cost == min->inside_cost
1506 && outside_cost < min->outside_cost))
1507 {
1508 min->inside_cost = inside_cost;
1509 min->outside_cost = outside_cost;
1510 min->peel_info.dr = elem->dr;
1511 min->peel_info.npeel = elem->npeel;
1512 min->peel_info.count = elem->count;
1513 }
1514
1515 return 1;
1516 }
1517
1518
1519 /* Choose best peeling option by traversing peeling hash table and either
1520 choosing an option with the lowest cost (if cost model is enabled) or the
1521 option that aligns as many accesses as possible. */
1522
1523 static struct _vect_peel_extended_info
1524 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1525 loop_vec_info loop_vinfo)
1526 {
1527 struct _vect_peel_extended_info res;
1528
1529 res.peel_info.dr = NULL;
1530
1531 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1532 {
1533 res.inside_cost = INT_MAX;
1534 res.outside_cost = INT_MAX;
1535 peeling_htab->traverse <_vect_peel_extended_info *,
1536 vect_peeling_hash_get_lowest_cost> (&res);
1537 }
1538 else
1539 {
1540 res.peel_info.count = 0;
1541 peeling_htab->traverse <_vect_peel_extended_info *,
1542 vect_peeling_hash_get_most_frequent> (&res);
1543 res.inside_cost = 0;
1544 res.outside_cost = 0;
1545 }
1546
1547 return res;
1548 }
1549
1550 /* Return true if the new peeling NPEEL is supported. */
1551
1552 static bool
1553 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1554 unsigned npeel)
1555 {
1556 unsigned i;
1557 struct data_reference *dr = NULL;
1558 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1559 gimple *stmt;
1560 stmt_vec_info stmt_info;
1561 enum dr_alignment_support supportable_dr_alignment;
1562
1563 /* Ensure that all data refs can be vectorized after the peel. */
1564 FOR_EACH_VEC_ELT (datarefs, i, dr)
1565 {
1566 int save_misalignment;
1567
1568 if (dr == dr0)
1569 continue;
1570
1571 stmt = DR_STMT (dr);
1572 stmt_info = vinfo_for_stmt (stmt);
1573 /* For interleaving, only the alignment of the first access
1574 matters. */
1575 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1576 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1577 continue;
1578
1579 /* Strided accesses perform only component accesses, alignment is
1580 irrelevant for them. */
1581 if (STMT_VINFO_STRIDED_P (stmt_info)
1582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1583 continue;
1584
1585 save_misalignment = DR_MISALIGNMENT (dr);
1586 vect_update_misalignment_for_peel (dr, dr0, npeel);
1587 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1588 SET_DR_MISALIGNMENT (dr, save_misalignment);
1589
1590 if (!supportable_dr_alignment)
1591 return false;
1592 }
1593
1594 return true;
1595 }
1596
1597 /* Function vect_enhance_data_refs_alignment
1598
1599 This pass will use loop versioning and loop peeling in order to enhance
1600 the alignment of data references in the loop.
1601
1602 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1603 original loop is to be vectorized. Any other loops that are created by
1604 the transformations performed in this pass are not supposed to be
1605 vectorized. This restriction will be relaxed.
1606
1607 This pass will require a cost model to guide it whether to apply peeling
1608 or versioning or a combination of the two. For example, the scheme that
1609 Intel uses when given a loop with several memory accesses, is as follows:
1610 choose one memory access ('p') whose alignment you want to force by doing
1611 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1612 other accesses are not necessarily aligned, or (2) use loop versioning to
1613 generate one loop in which all accesses are aligned, and another loop in
1614 which only 'p' is necessarily aligned.
1615
1616 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1617 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1618 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1619
1620 Devising a cost model is the most critical aspect of this work. It will
1621 guide us on which access to peel for, whether to use loop versioning, how
1622 many versions to create, etc. The cost model will probably consist of
1623 generic considerations as well as target specific considerations (on
1624 powerpc for example, misaligned stores are more painful than misaligned
1625 loads).
1626
1627 Here are the general steps involved in alignment enhancements:
1628
1629 -- original loop, before alignment analysis:
1630 for (i=0; i<N; i++){
1631 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1633 }
1634
1635 -- After vect_compute_data_refs_alignment:
1636 for (i=0; i<N; i++){
1637 x = q[i]; # DR_MISALIGNMENT(q) = 3
1638 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1639 }
1640
1641 -- Possibility 1: we do loop versioning:
1642 if (p is aligned) {
1643 for (i=0; i<N; i++){ # loop 1A
1644 x = q[i]; # DR_MISALIGNMENT(q) = 3
1645 p[i] = y; # DR_MISALIGNMENT(p) = 0
1646 }
1647 }
1648 else {
1649 for (i=0; i<N; i++){ # loop 1B
1650 x = q[i]; # DR_MISALIGNMENT(q) = 3
1651 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1652 }
1653 }
1654
1655 -- Possibility 2: we do loop peeling:
1656 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1657 x = q[i];
1658 p[i] = y;
1659 }
1660 for (i = 3; i < N; i++){ # loop 2A
1661 x = q[i]; # DR_MISALIGNMENT(q) = 0
1662 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1663 }
1664
1665 -- Possibility 3: combination of loop peeling and versioning:
1666 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1667 x = q[i];
1668 p[i] = y;
1669 }
1670 if (p is aligned) {
1671 for (i = 3; i<N; i++){ # loop 3A
1672 x = q[i]; # DR_MISALIGNMENT(q) = 0
1673 p[i] = y; # DR_MISALIGNMENT(p) = 0
1674 }
1675 }
1676 else {
1677 for (i = 3; i<N; i++){ # loop 3B
1678 x = q[i]; # DR_MISALIGNMENT(q) = 0
1679 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1680 }
1681 }
1682
1683 These loops are later passed to loop_transform to be vectorized. The
1684 vectorizer will use the alignment information to guide the transformation
1685 (whether to generate regular loads/stores, or with special handling for
1686 misalignment). */
1687
1688 bool
1689 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1690 {
1691 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1692 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1693 enum dr_alignment_support supportable_dr_alignment;
1694 struct data_reference *dr0 = NULL, *first_store = NULL;
1695 struct data_reference *dr;
1696 unsigned int i, j;
1697 bool do_peeling = false;
1698 bool do_versioning = false;
1699 bool stat;
1700 gimple *stmt;
1701 stmt_vec_info stmt_info;
1702 unsigned int npeel = 0;
1703 bool one_misalignment_known = false;
1704 bool one_misalignment_unknown = false;
1705 bool one_dr_unsupportable = false;
1706 struct data_reference *unsupportable_dr = NULL;
1707 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1708 unsigned possible_npeel_number = 1;
1709 tree vectype;
1710 unsigned int mis, same_align_drs_max = 0;
1711 hash_table<peel_info_hasher> peeling_htab (1);
1712
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_NOTE, vect_location,
1715 "=== vect_enhance_data_refs_alignment ===\n");
1716
1717 /* Reset data so we can safely be called multiple times. */
1718 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1719 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1720
1721 /* While cost model enhancements are expected in the future, the high level
1722 view of the code at this time is as follows:
1723
1724 A) If there is a misaligned access then see if peeling to align
1725 this access can make all data references satisfy
1726 vect_supportable_dr_alignment. If so, update data structures
1727 as needed and return true.
1728
1729 B) If peeling wasn't possible and there is a data reference with an
1730 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1731 then see if loop versioning checks can be used to make all data
1732 references satisfy vect_supportable_dr_alignment. If so, update
1733 data structures as needed and return true.
1734
1735 C) If neither peeling nor versioning were successful then return false if
1736 any data reference does not satisfy vect_supportable_dr_alignment.
1737
1738 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1739
1740 Note, Possibility 3 above (which is peeling and versioning together) is not
1741 being done at this time. */
1742
1743 /* (1) Peeling to force alignment. */
1744
1745 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1746 Considerations:
1747 + How many accesses will become aligned due to the peeling
1748 - How many accesses will become unaligned due to the peeling,
1749 and the cost of misaligned accesses.
1750 - The cost of peeling (the extra runtime checks, the increase
1751 in code size). */
1752
1753 FOR_EACH_VEC_ELT (datarefs, i, dr)
1754 {
1755 stmt = DR_STMT (dr);
1756 stmt_info = vinfo_for_stmt (stmt);
1757
1758 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1759 continue;
1760
1761 /* For interleaving, only the alignment of the first access
1762 matters. */
1763 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1764 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1765 continue;
1766
1767 /* For invariant accesses there is nothing to enhance. */
1768 if (integer_zerop (DR_STEP (dr)))
1769 continue;
1770
1771 /* Strided accesses perform only component accesses, alignment is
1772 irrelevant for them. */
1773 if (STMT_VINFO_STRIDED_P (stmt_info)
1774 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1775 continue;
1776
1777 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1778 do_peeling = vector_alignment_reachable_p (dr);
1779 if (do_peeling)
1780 {
1781 if (known_alignment_for_access_p (dr))
1782 {
1783 unsigned int npeel_tmp = 0;
1784 bool negative = tree_int_cst_compare (DR_STEP (dr),
1785 size_zero_node) < 0;
1786
1787 vectype = STMT_VINFO_VECTYPE (stmt_info);
1788 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1789 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1790 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1791 if (DR_MISALIGNMENT (dr) != 0)
1792 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1793
1794 /* For multiple types, it is possible that the bigger type access
1795 will have more than one peeling option. E.g., a loop with two
1796 types: one of size (vector size / 4), and the other one of
1797 size (vector size / 8). The vectorization factor will be 8. If both
1798 accesses are misaligned by 3, the first one needs one scalar
1799 iteration to be aligned, and the second one needs 5. But the
1800 first one will be aligned also by peeling 5 scalar
1801 iterations, and in that case both accesses will be aligned.
1802 Hence, in addition to the immediate peeling amount, we also want
1803 to try adding full vector sizes, as long as we don't exceed the
1804 vectorization factor.
1805 We do this automatically for cost model, since we calculate
1806 cost for every peeling option. */
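/* A small worked illustration (byte values assumed, not taken from any
   particular target): with DR_MISALIGNMENT 8, a 4-byte scalar element and
   a 32-byte target alignment, the immediate amount is
   (32 - 8) / 4 = 6 elements, and the loop below then records the
   additional candidates 14, 22, ..., adding target_align / dr_size = 8
   each time, when the cost model asks for more than one option.  */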
1807 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1808 {
1809 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1810 ? vf * GROUP_SIZE (stmt_info) : vf);
1811 possible_npeel_number
1812 = vect_get_num_vectors (nscalars, vectype);
1813
1814 /* NPEEL_TMP is 0 when there is no misalignment, but also
1815 allow peeling NELEMENTS. */
1816 if (DR_MISALIGNMENT (dr) == 0)
1817 possible_npeel_number++;
1818 }
1819
1820 /* Save info about DR in the hash table. Also include peeling
1821 amounts according to the explanation above. */
1822 for (j = 0; j < possible_npeel_number; j++)
1823 {
1824 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1825 dr, npeel_tmp);
1826 npeel_tmp += target_align / dr_size;
1827 }
1828
1829 one_misalignment_known = true;
1830 }
1831 else
1832 {
1833 /* If we don't know any misalignment values, we prefer
1834 peeling for the data-ref that has the maximum number of data-refs
1835 with the same alignment, unless the target prefers to align
1836 stores over loads. */
1837 unsigned same_align_drs
1838 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1839 if (!dr0
1840 || same_align_drs_max < same_align_drs)
1841 {
1842 same_align_drs_max = same_align_drs;
1843 dr0 = dr;
1844 }
1845 /* For data-refs with the same number of related
1846 accesses prefer the one where the misalign
1847 computation will be invariant in the outermost loop. */
1848 else if (same_align_drs_max == same_align_drs)
1849 {
1850 struct loop *ivloop0, *ivloop;
1851 ivloop0 = outermost_invariant_loop_for_expr
1852 (loop, DR_BASE_ADDRESS (dr0));
1853 ivloop = outermost_invariant_loop_for_expr
1854 (loop, DR_BASE_ADDRESS (dr));
1855 if ((ivloop && !ivloop0)
1856 || (ivloop && ivloop0
1857 && flow_loop_nested_p (ivloop, ivloop0)))
1858 dr0 = dr;
1859 }
1860
1861 one_misalignment_unknown = true;
1862
1863 /* Check for data refs with unsupportable alignment that
1864 can be peeled. */
1865 if (!supportable_dr_alignment)
1866 {
1867 one_dr_unsupportable = true;
1868 unsupportable_dr = dr;
1869 }
1870
1871 if (!first_store && DR_IS_WRITE (dr))
1872 first_store = dr;
1873 }
1874 }
1875 else
1876 {
1877 if (!aligned_access_p (dr))
1878 {
1879 if (dump_enabled_p ())
1880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1881 "vector alignment may not be reachable\n");
1882 break;
1883 }
1884 }
1885 }
1886
1887 /* Check if we can possibly peel the loop. */
1888 if (!vect_can_advance_ivs_p (loop_vinfo)
1889 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1890 || loop->inner)
1891 do_peeling = false;
1892
1893 struct _vect_peel_extended_info peel_for_known_alignment;
1894 struct _vect_peel_extended_info peel_for_unknown_alignment;
1895 struct _vect_peel_extended_info best_peel;
1896
1897 peel_for_unknown_alignment.inside_cost = INT_MAX;
1898 peel_for_unknown_alignment.outside_cost = INT_MAX;
1899 peel_for_unknown_alignment.peel_info.count = 0;
1900
1901 if (do_peeling
1902 && one_misalignment_unknown)
1903 {
1904 /* Check whether the target prefers stores over loads, i.e., whether
1905 misaligned stores are more expensive than misaligned loads (taking
1906 drs with same alignment into account). */
1907 unsigned int load_inside_cost = 0;
1908 unsigned int load_outside_cost = 0;
1909 unsigned int store_inside_cost = 0;
1910 unsigned int store_outside_cost = 0;
1911 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1912
1913 stmt_vector_for_cost dummy;
1914 dummy.create (2);
1915 vect_get_peeling_costs_all_drs (datarefs, dr0,
1916 &load_inside_cost,
1917 &load_outside_cost,
1918 &dummy, estimated_npeels, true);
1919 dummy.release ();
1920
1921 if (first_store)
1922 {
1923 dummy.create (2);
1924 vect_get_peeling_costs_all_drs (datarefs, first_store,
1925 &store_inside_cost,
1926 &store_outside_cost,
1927 &dummy, estimated_npeels, true);
1928 dummy.release ();
1929 }
1930 else
1931 {
1932 store_inside_cost = INT_MAX;
1933 store_outside_cost = INT_MAX;
1934 }
1935
1936 if (load_inside_cost > store_inside_cost
1937 || (load_inside_cost == store_inside_cost
1938 && load_outside_cost > store_outside_cost))
1939 {
1940 dr0 = first_store;
1941 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1942 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1943 }
1944 else
1945 {
1946 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1947 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1948 }
1949
1950 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1951 prologue_cost_vec.create (2);
1952 epilogue_cost_vec.create (2);
1953
1954 int dummy2;
1955 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1956 (loop_vinfo, estimated_npeels, &dummy2,
1957 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1958 &prologue_cost_vec, &epilogue_cost_vec);
1959
1960 prologue_cost_vec.release ();
1961 epilogue_cost_vec.release ();
1962
1963 peel_for_unknown_alignment.peel_info.count = 1
1964 + STMT_VINFO_SAME_ALIGN_REFS
1965 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1966 }
1967
1968 peel_for_unknown_alignment.peel_info.npeel = 0;
1969 peel_for_unknown_alignment.peel_info.dr = dr0;
1970
1971 best_peel = peel_for_unknown_alignment;
1972
1973 peel_for_known_alignment.inside_cost = INT_MAX;
1974 peel_for_known_alignment.outside_cost = INT_MAX;
1975 peel_for_known_alignment.peel_info.count = 0;
1976 peel_for_known_alignment.peel_info.dr = NULL;
1977
1978 if (do_peeling && one_misalignment_known)
1979 {
1980 /* Peeling is possible, but no data access requires alignment in order
1981 to be supported. So we try to choose the best possible peeling from
1982 the hash table. */
1983 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1984 (&peeling_htab, loop_vinfo);
1985 }
1986
1987 /* Compare costs of peeling for known and unknown alignment. */
1988 if (peel_for_known_alignment.peel_info.dr != NULL
1989 && peel_for_unknown_alignment.inside_cost
1990 >= peel_for_known_alignment.inside_cost)
1991 {
1992 best_peel = peel_for_known_alignment;
1993
1994 /* If the best peeling for known alignment has NPEEL == 0, perform no
1995 peeling at all except if there is an unsupportable dr that we can
1996 align. */
1997 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1998 do_peeling = false;
1999 }
2000
2001 /* If there is an unsupportable data ref, prefer this over all choices so far
2002 since we'd have to discard a chosen peeling except when it accidentally
2003 aligned the unsupportable data ref. */
2004 if (one_dr_unsupportable)
2005 dr0 = unsupportable_dr;
2006 else if (do_peeling)
2007 {
2008 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2009 TODO: Use nopeel_outside_cost or get rid of it? */
2010 unsigned nopeel_inside_cost = 0;
2011 unsigned nopeel_outside_cost = 0;
2012
2013 stmt_vector_for_cost dummy;
2014 dummy.create (2);
2015 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
2016 &nopeel_outside_cost, &dummy, 0, false);
2017 dummy.release ();
2018
2019 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2020 costs will be recorded. */
2021 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2022 prologue_cost_vec.create (2);
2023 epilogue_cost_vec.create (2);
2024
2025 int dummy2;
2026 nopeel_outside_cost += vect_get_known_peeling_cost
2027 (loop_vinfo, 0, &dummy2,
2028 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2029 &prologue_cost_vec, &epilogue_cost_vec);
2030
2031 prologue_cost_vec.release ();
2032 epilogue_cost_vec.release ();
2033
2034 npeel = best_peel.peel_info.npeel;
2035 dr0 = best_peel.peel_info.dr;
2036
2037 /* If not peeling at all is no more expensive than the best peeling we
2038 have found so far, don't perform any peeling. */
2039 if (nopeel_inside_cost <= best_peel.inside_cost)
2040 do_peeling = false;
2041 }
2042
2043 if (do_peeling)
2044 {
2045 stmt = DR_STMT (dr0);
2046 stmt_info = vinfo_for_stmt (stmt);
2047 vectype = STMT_VINFO_VECTYPE (stmt_info);
2048
2049 if (known_alignment_for_access_p (dr0))
2050 {
2051 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2052 size_zero_node) < 0;
2053 if (!npeel)
2054 {
2055 /* Since it's known at compile time, compute the number of
2056 iterations in the peeled loop (the peeling factor) for use in
2057 updating DR_MISALIGNMENT values. The peeling factor is the
2058 vectorization factor minus the misalignment as an element
2059 count. */
2060 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2061 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2062 npeel = ((mis & (target_align - 1))
2063 / vect_get_scalar_dr_size (dr0));
2064 }
2065
2066 /* For interleaved data access every iteration accesses all the
2067 members of the group, therefore we divide the number of iterations
2068 by the group size. */
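/* Illustration (numbers assumed): if NPEEL was computed as 8 scalar
   elements and the group size is 4, every peeled iteration already
   covers 4 of those elements, so only 8 / 4 = 2 iterations are
   actually peeled.  */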
2069 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2070 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2071 npeel /= GROUP_SIZE (stmt_info);
2072
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "Try peeling by %d\n", npeel);
2076 }
2077
2078 /* Ensure that all datarefs can be vectorized after the peel. */
2079 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2080 do_peeling = false;
2081
2082 /* Check if all datarefs are supportable and log. */
2083 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2084 {
2085 stat = vect_verify_datarefs_alignment (loop_vinfo);
2086 if (!stat)
2087 do_peeling = false;
2088 else
2089 return stat;
2090 }
2091
2092 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2093 if (do_peeling)
2094 {
2095 unsigned max_allowed_peel
2096 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2097 if (max_allowed_peel != (unsigned)-1)
2098 {
2099 unsigned max_peel = npeel;
2100 if (max_peel == 0)
2101 {
2102 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2103 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2104 }
2105 if (max_peel > max_allowed_peel)
2106 {
2107 do_peeling = false;
2108 if (dump_enabled_p ())
2109 dump_printf_loc (MSG_NOTE, vect_location,
2110 "Disable peeling, max peels reached: %d\n", max_peel);
2111 }
2112 }
2113 }
2114
2115 /* Cost model #2 - if peeling may result in a remaining loop not
2116 iterating enough to be vectorized then do not peel. Since this
2117 is a cost heuristic rather than a correctness decision, use the
2118 most likely runtime value for variable vectorization factors. */
2119 if (do_peeling
2120 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2121 {
2122 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2123 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2124 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2125 < assumed_vf + max_peel)
2126 do_peeling = false;
2127 }
2128
2129 if (do_peeling)
2130 {
2131 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2132 If the misalignment of DR_i is identical to that of dr0 then set
2133 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2134 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2135 by the peeling factor times the element size of DR_i (MOD the
2136 vectorization factor times the size). Otherwise, the
2137 misalignment of DR_i must be set to unknown. */
2138 FOR_EACH_VEC_ELT (datarefs, i, dr)
2139 if (dr != dr0)
2140 {
2141 /* Strided accesses perform only component accesses, alignment
2142 is irrelevant for them. */
2143 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2144 if (STMT_VINFO_STRIDED_P (stmt_info)
2145 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146 continue;
2147
2148 vect_update_misalignment_for_peel (dr, dr0, npeel);
2149 }
2150
2151 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2152 if (npeel)
2153 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2154 else
2155 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2156 = DR_MISALIGNMENT (dr0);
2157 SET_DR_MISALIGNMENT (dr0, 0);
2158 if (dump_enabled_p ())
2159 {
2160 dump_printf_loc (MSG_NOTE, vect_location,
2161 "Alignment of access forced using peeling.\n");
2162 dump_printf_loc (MSG_NOTE, vect_location,
2163 "Peeling for alignment will be applied.\n");
2164 }
2165
2166 /* The inside-loop cost will be accounted for in vectorizable_load
2167 and vectorizable_store correctly with adjusted alignments.
2168 Drop the body_cst_vec on the floor here. */
2169 stat = vect_verify_datarefs_alignment (loop_vinfo);
2170 gcc_assert (stat);
2171 return stat;
2172 }
2173 }
2174
2175 /* (2) Versioning to force alignment. */
2176
2177 /* Try versioning if:
2178 1) optimize loop for speed
2179 2) there is at least one unsupported misaligned data ref with an unknown
2180 misalignment, and
2181 3) all misaligned data refs with a known misalignment are supported, and
2182 4) the number of runtime alignment checks is within reason. */
2183
2184 do_versioning =
2185 optimize_loop_nest_for_speed_p (loop)
2186 && (!loop->inner); /* FORNOW */
2187
2188 if (do_versioning)
2189 {
2190 FOR_EACH_VEC_ELT (datarefs, i, dr)
2191 {
2192 stmt = DR_STMT (dr);
2193 stmt_info = vinfo_for_stmt (stmt);
2194
2195 /* For interleaving, only the alignment of the first access
2196 matters. */
2197 if (aligned_access_p (dr)
2198 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2199 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2200 continue;
2201
2202 if (STMT_VINFO_STRIDED_P (stmt_info))
2203 {
2204 /* Strided loads perform only component accesses, alignment is
2205 irrelevant for them. */
2206 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2207 continue;
2208 do_versioning = false;
2209 break;
2210 }
2211
2212 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2213
2214 if (!supportable_dr_alignment)
2215 {
2216 gimple *stmt;
2217 int mask;
2218 tree vectype;
2219
2220 if (known_alignment_for_access_p (dr)
2221 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2222 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2223 {
2224 do_versioning = false;
2225 break;
2226 }
2227
2228 stmt = DR_STMT (dr);
2229 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2230 gcc_assert (vectype);
2231
2232 /* At present we don't support versioning for alignment
2233 with variable VF, since there's no guarantee that the
2234 VF is a power of two. We could relax this if we added
2235 a way of enforcing a power-of-two size. */
2236 unsigned HOST_WIDE_INT size;
2237 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2238 {
2239 do_versioning = false;
2240 break;
2241 }
2242
2243 /* The rightmost bits of an aligned address must be zeros.
2244 Construct the mask needed for this test. For example,
2245 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2246 mask must be 15 = 0xf. */
2247 mask = size - 1;
2248
2249 /* FORNOW: use the same mask to test all potentially unaligned
2250 references in the loop. The vectorizer currently supports
2251 a single vector size, see the reference to
2252 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2253 vectorization factor is computed. */
2254 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2255 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2256 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2257 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2258 DR_STMT (dr));
2259 }
2260 }
2261
2262 /* Versioning requires at least one misaligned data reference. */
2263 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2264 do_versioning = false;
2265 else if (!do_versioning)
2266 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2267 }
2268
2269 if (do_versioning)
2270 {
2271 vec<gimple *> may_misalign_stmts
2272 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2273 gimple *stmt;
2274
2275 /* It can now be assumed that the data references in the statements
2276 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2277 of the loop being vectorized. */
2278 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2279 {
2280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2281 dr = STMT_VINFO_DATA_REF (stmt_info);
2282 SET_DR_MISALIGNMENT (dr, 0);
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_NOTE, vect_location,
2285 "Alignment of access forced using versioning.\n");
2286 }
2287
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE, vect_location,
2290 "Versioning for alignment will be applied.\n");
2291
2292 /* Peeling and versioning can't be done together at this time. */
2293 gcc_assert (! (do_peeling && do_versioning));
2294
2295 stat = vect_verify_datarefs_alignment (loop_vinfo);
2296 gcc_assert (stat);
2297 return stat;
2298 }
2299
2300 /* This point is reached if neither peeling nor versioning is being done. */
2301 gcc_assert (! (do_peeling || do_versioning));
2302
2303 stat = vect_verify_datarefs_alignment (loop_vinfo);
2304 return stat;
2305 }
2306
2307
2308 /* Function vect_find_same_alignment_drs.
2309
2310 Update group and alignment relations according to the chosen
2311 vectorization factor. */
2312
2313 static void
2314 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2315 {
2316 struct data_reference *dra = DDR_A (ddr);
2317 struct data_reference *drb = DDR_B (ddr);
2318 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2319 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2320
2321 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2322 return;
2323
2324 if (dra == drb)
2325 return;
2326
2327 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2328 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2329 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2330 return;
2331
2332 /* Two references with distance zero have the same alignment. */
2333 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2334 - wi::to_poly_offset (DR_INIT (drb)));
2335 if (maybe_ne (diff, 0))
2336 {
2337 /* Get the wider of the two alignments. */
2338 unsigned int align_a = (vect_calculate_target_alignment (dra)
2339 / BITS_PER_UNIT);
2340 unsigned int align_b = (vect_calculate_target_alignment (drb)
2341 / BITS_PER_UNIT);
2342 unsigned int max_align = MAX (align_a, align_b);
2343
2344 /* Require the gap to be a multiple of the larger vector alignment. */
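/* Illustration (sizes assumed): accesses a[i] and a[i+8] with 4-byte
   elements and a maximum target alignment of 16 bytes are 32 bytes
   apart, a multiple of 16, so both always see the same misalignment;
   a gap of 20 bytes would fail the check below.  */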
2345 if (!multiple_p (diff, max_align))
2346 return;
2347 }
2348
2349 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2350 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2351 if (dump_enabled_p ())
2352 {
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "accesses have the same alignment: ");
2355 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2356 dump_printf (MSG_NOTE, " and ");
2357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2358 dump_printf (MSG_NOTE, "\n");
2359 }
2360 }
2361
2362
2363 /* Function vect_analyze_data_refs_alignment
2364
2365 Analyze the alignment of the data-references in the loop.
2366 Return FALSE if a data reference is found that cannot be vectorized. */
2367
2368 bool
2369 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2370 {
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "=== vect_analyze_data_refs_alignment ===\n");
2374
2375 /* Mark groups of data references with same alignment using
2376 data dependence information. */
2377 vec<ddr_p> ddrs = vinfo->ddrs;
2378 struct data_dependence_relation *ddr;
2379 unsigned int i;
2380
2381 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2382 vect_find_same_alignment_drs (ddr);
2383
2384 vec<data_reference_p> datarefs = vinfo->datarefs;
2385 struct data_reference *dr;
2386
2387 vect_record_base_alignments (vinfo);
2388 FOR_EACH_VEC_ELT (datarefs, i, dr)
2389 {
2390 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2391 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2392 && !vect_compute_data_ref_alignment (dr))
2393 {
2394 /* Strided accesses perform only component accesses, misalignment
2395 information is irrelevant for them. */
2396 if (STMT_VINFO_STRIDED_P (stmt_info)
2397 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2398 continue;
2399
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2402 "not vectorized: can't calculate alignment "
2403 "for data ref.\n");
2404
2405 return false;
2406 }
2407 }
2408
2409 return true;
2410 }
2411
2412
2413 /* Analyze alignment of DRs of stmts in NODE. */
2414
2415 static bool
2416 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2417 {
2418 /* We vectorize from the first scalar stmt in the node unless
2419 the node is permuted in which case we start from the first
2420 element in the group. */
2421 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2422 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2423 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2424 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2425
2426 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2427 if (! vect_compute_data_ref_alignment (dr)
2428 /* For creating the data-ref pointer we need alignment of the
2429 first element anyway. */
2430 || (dr != first_dr
2431 && ! vect_compute_data_ref_alignment (first_dr))
2432 || ! verify_data_ref_alignment (dr))
2433 {
2434 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2436 "not vectorized: bad data alignment in basic "
2437 "block.\n");
2438 return false;
2439 }
2440
2441 return true;
2442 }
2443
2444 /* Function vect_slp_analyze_and_verify_instance_alignment
2445
2446 Analyze the alignment of the data-references in the SLP instance.
2447 Return FALSE if a data reference is found that cannot be vectorized. */
2448
2449 bool
2450 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2451 {
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_NOTE, vect_location,
2454 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2455
2456 slp_tree node;
2457 unsigned i;
2458 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2459 if (! vect_slp_analyze_and_verify_node_alignment (node))
2460 return false;
2461
2462 node = SLP_INSTANCE_TREE (instance);
2463 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2464 && ! vect_slp_analyze_and_verify_node_alignment
2465 (SLP_INSTANCE_TREE (instance)))
2466 return false;
2467
2468 return true;
2469 }
2470
2471
2472 /* Analyze groups of accesses: check that DR belongs to a group of
2473 accesses of legal size, step, etc. Detect gaps, single element
2474 interleaving, and other special cases. Set grouped access info.
2475 Collect groups of strided stores for further use in SLP analysis.
2476 Worker for vect_analyze_group_access. */
2477
2478 static bool
2479 vect_analyze_group_access_1 (struct data_reference *dr)
2480 {
2481 tree step = DR_STEP (dr);
2482 tree scalar_type = TREE_TYPE (DR_REF (dr));
2483 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2484 gimple *stmt = DR_STMT (dr);
2485 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2486 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2487 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2488 HOST_WIDE_INT dr_step = -1;
2489 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2490 bool slp_impossible = false;
2491
2492 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2493 size of the interleaving group (including gaps). */
2494 if (tree_fits_shwi_p (step))
2495 {
2496 dr_step = tree_to_shwi (step);
2497 /* Check that STEP is a multiple of type size. Otherwise there is
2498 a non-element-sized gap at the end of the group which we
2499 cannot represent in GROUP_GAP or GROUP_SIZE.
2500 ??? As we can handle non-constant step fine here we should
2501 simply remove uses of GROUP_GAP between the last and first
2502 element and instead rely on DR_STEP. GROUP_SIZE then would
2503 simply not include that gap. */
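/* E.g. (sizes assumed for illustration) a 4-byte element accessed
   with a step of 6 bytes would leave a 2-byte tail gap that cannot
   be described, so such a step is rejected below.  */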
2504 if ((dr_step % type_size) != 0)
2505 {
2506 if (dump_enabled_p ())
2507 {
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "Step ");
2510 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2511 dump_printf (MSG_NOTE,
2512 " is not a multiple of the element size for ");
2513 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2514 dump_printf (MSG_NOTE, "\n");
2515 }
2516 return false;
2517 }
2518 groupsize = absu_hwi (dr_step) / type_size;
2519 }
2520 else
2521 groupsize = 0;
2522
2523 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2524 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2525 {
2526 /* Check if this DR is part of an interleaving group, and is a single
2527 element of the group that is accessed in the loop. */
2528
2529 /* Gaps are supported only for loads. STEP must be a multiple of the type
2530 size. */
2531 if (DR_IS_READ (dr)
2532 && (dr_step % type_size) == 0
2533 && groupsize > 0)
2534 {
2535 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2536 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2537 GROUP_GAP (stmt_info) = groupsize - 1;
2538 if (dump_enabled_p ())
2539 {
2540 dump_printf_loc (MSG_NOTE, vect_location,
2541 "Detected single element interleaving ");
2542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2543 dump_printf (MSG_NOTE, " step ");
2544 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2545 dump_printf (MSG_NOTE, "\n");
2546 }
2547
2548 return true;
2549 }
2550
2551 if (dump_enabled_p ())
2552 {
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 "not consecutive access ");
2555 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2556 }
2557
2558 if (bb_vinfo)
2559 {
2560 /* Mark the statement as unvectorizable. */
2561 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2562 return true;
2563 }
2564
2565 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2566 STMT_VINFO_STRIDED_P (stmt_info) = true;
2567 return true;
2568 }
2569
2570 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2571 {
2572 /* First stmt in the interleaving chain. Check the chain. */
2573 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2574 struct data_reference *data_ref = dr;
2575 unsigned int count = 1;
2576 tree prev_init = DR_INIT (data_ref);
2577 gimple *prev = stmt;
2578 HOST_WIDE_INT diff, gaps = 0;
2579
2580 /* By construction, all group members have INTEGER_CST DR_INITs. */
2581 while (next)
2582 {
2583 /* Skip identical data-refs. If two or more stmts share a
2584 data-ref (supported only for loads), we vectorize only the first
2585 stmt, and the rest get their vectorized loads from the first
2586 one. */
2587 if (!tree_int_cst_compare (DR_INIT (data_ref),
2588 DR_INIT (STMT_VINFO_DATA_REF (
2589 vinfo_for_stmt (next)))))
2590 {
2591 if (DR_IS_WRITE (data_ref))
2592 {
2593 if (dump_enabled_p ())
2594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2595 "Two store stmts share the same dr.\n");
2596 return false;
2597 }
2598
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2601 "Two or more load stmts share the same dr.\n");
2602
2603 /* For load use the same data-ref load. */
2604 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2605
2606 prev = next;
2607 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2608 continue;
2609 }
2610
2611 prev = next;
2612 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2613
2614 /* All group members have the same STEP by construction. */
2615 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2616
2617 /* Check that the distance between two accesses is equal to the type
2618 size. Otherwise, we have gaps. */
2619 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2620 - TREE_INT_CST_LOW (prev_init)) / type_size;
2621 if (diff != 1)
2622 {
2623 /* FORNOW: SLP of accesses with gaps is not supported. */
2624 slp_impossible = true;
2625 if (DR_IS_WRITE (data_ref))
2626 {
2627 if (dump_enabled_p ())
2628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2629 "interleaved store with gaps\n");
2630 return false;
2631 }
2632
2633 gaps += diff - 1;
2634 }
2635
2636 last_accessed_element += diff;
2637
2638 /* Store the gap from the previous member of the group. If there is no
2639 gap in the access, GROUP_GAP is always 1. */
2640 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2641
2642 prev_init = DR_INIT (data_ref);
2643 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2644 /* Count the number of data-refs in the chain. */
2645 count++;
2646 }
2647
2648 if (groupsize == 0)
2649 groupsize = count + gaps;
2650
2651 /* This could be UINT_MAX but as we are generating code in a very
2652 inefficient way we have to cap earlier. See PR78699 for example. */
2653 if (groupsize > 4096)
2654 {
2655 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2657 "group is too large\n");
2658 return false;
2659 }
2660
2661 /* Check that the size of the interleaving is equal to count for stores,
2662 i.e., that there are no gaps. */
2663 if (groupsize != count
2664 && !DR_IS_READ (dr))
2665 {
2666 if (dump_enabled_p ())
2667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2668 "interleaved store with gaps\n");
2669 return false;
2670 }
2671
2672 /* If there is a gap after the last load in the group it is the
2673 difference between the groupsize and the last accessed
2674 element.
2675 When there is no gap, this difference should be 0. */
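/* Worked illustration (offsets assumed): for loads at byte offsets
   0, 8 and 12 with 4-byte elements and step 16, GROUPSIZE is 4; the
   second load gets GROUP_GAP 2, the third GROUP_GAP 1,
   LAST_ACCESSED_ELEMENT ends up as 4, and the gap recorded here on
   the first statement is 4 - 4 = 0.  */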
2676 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2677
2678 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2679 if (dump_enabled_p ())
2680 {
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "Detected interleaving ");
2683 if (DR_IS_READ (dr))
2684 dump_printf (MSG_NOTE, "load ");
2685 else
2686 dump_printf (MSG_NOTE, "store ");
2687 dump_printf (MSG_NOTE, "of size %u starting with ",
2688 (unsigned)groupsize);
2689 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2690 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2691 dump_printf_loc (MSG_NOTE, vect_location,
2692 "There is a gap of %u elements after the group\n",
2693 GROUP_GAP (vinfo_for_stmt (stmt)));
2694 }
2695
2696 /* SLP: create an SLP data structure for every interleaving group of
2697 stores for further analysis in vect_analyze_slp. */
2698 if (DR_IS_WRITE (dr) && !slp_impossible)
2699 {
2700 if (loop_vinfo)
2701 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2702 if (bb_vinfo)
2703 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2704 }
2705 }
2706
2707 return true;
2708 }
2709
2710 /* Analyze groups of accesses: check that DR belongs to a group of
2711 accesses of legal size, step, etc. Detect gaps, single element
2712 interleaving, and other special cases. Set grouped access info.
2713 Collect groups of strided stores for further use in SLP analysis. */
2714
2715 static bool
2716 vect_analyze_group_access (struct data_reference *dr)
2717 {
2718 if (!vect_analyze_group_access_1 (dr))
2719 {
2720 /* Dissolve the group if present. */
2721 gimple *next;
2722 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2723 while (stmt)
2724 {
2725 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2726 next = GROUP_NEXT_ELEMENT (vinfo);
2727 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2728 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2729 stmt = next;
2730 }
2731 return false;
2732 }
2733 return true;
2734 }
2735
2736 /* Analyze the access pattern of the data-reference DR.
2737 In case of non-consecutive accesses call vect_analyze_group_access() to
2738 analyze groups of accesses. */
2739
2740 static bool
2741 vect_analyze_data_ref_access (struct data_reference *dr)
2742 {
2743 tree step = DR_STEP (dr);
2744 tree scalar_type = TREE_TYPE (DR_REF (dr));
2745 gimple *stmt = DR_STMT (dr);
2746 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2747 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2748 struct loop *loop = NULL;
2749
2750 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2751 return true;
2752
2753 if (loop_vinfo)
2754 loop = LOOP_VINFO_LOOP (loop_vinfo);
2755
2756 if (loop_vinfo && !step)
2757 {
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data-ref access in loop\n");
2761 return false;
2762 }
2763
2764 /* Allow loads with zero step in inner-loop vectorization. */
2765 if (loop_vinfo && integer_zerop (step))
2766 {
2767 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2768 if (!nested_in_vect_loop_p (loop, stmt))
2769 return DR_IS_READ (dr);
2770 /* Allow references with zero step for outer loops marked
2771 with pragma omp simd only - it guarantees absence of
2772 loop-carried dependencies between inner loop iterations. */
2773 if (loop->safelen < 2)
2774 {
2775 if (dump_enabled_p ())
2776 dump_printf_loc (MSG_NOTE, vect_location,
2777 "zero step in inner loop of nest\n");
2778 return false;
2779 }
2780 }
2781
2782 if (loop && nested_in_vect_loop_p (loop, stmt))
2783 {
2784 /* Interleaved accesses are not yet supported within outer-loop
2785 vectorization for references in the inner-loop. */
2786 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2787
2788 /* For the rest of the analysis we use the outer-loop step. */
2789 step = STMT_VINFO_DR_STEP (stmt_info);
2790 if (integer_zerop (step))
2791 {
2792 if (dump_enabled_p ())
2793 dump_printf_loc (MSG_NOTE, vect_location,
2794 "zero step in outer loop.\n");
2795 return DR_IS_READ (dr);
2796 }
2797 }
2798
2799 /* Consecutive? */
2800 if (TREE_CODE (step) == INTEGER_CST)
2801 {
2802 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2803 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2804 || (dr_step < 0
2805 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2806 {
2807 /* Mark that it is not interleaving. */
2808 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2809 return true;
2810 }
2811 }
2812
2813 if (loop && nested_in_vect_loop_p (loop, stmt))
2814 {
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_NOTE, vect_location,
2817 "grouped access in outer loop.\n");
2818 return false;
2819 }
2820
2821
2822 /* Assume this is a DR handled by non-constant strided load case. */
2823 if (TREE_CODE (step) != INTEGER_CST)
2824 return (STMT_VINFO_STRIDED_P (stmt_info)
2825 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2826 || vect_analyze_group_access (dr)));
2827
2828 /* Not consecutive access - check if it's a part of interleaving group. */
2829 return vect_analyze_group_access (dr);
2830 }
2831
2832 /* Compare two data-references DRA and DRB in order to sort them into
2833 chunks suitable for grouping. */
2834
2835 static int
2836 dr_group_sort_cmp (const void *dra_, const void *drb_)
2837 {
2838 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2839 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2840 int cmp;
2841
2842 /* Stabilize sort. */
2843 if (dra == drb)
2844 return 0;
2845
2846 /* DRs in different loops never belong to the same group. */
2847 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2848 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2849 if (loopa != loopb)
2850 return loopa->num < loopb->num ? -1 : 1;
2851
2852 /* Ordering of DRs according to base. */
2853 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2854 DR_BASE_ADDRESS (drb));
2855 if (cmp != 0)
2856 return cmp;
2857
2858 /* And according to DR_OFFSET. */
2859 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2860 if (cmp != 0)
2861 return cmp;
2862
2863 /* Put reads before writes. */
2864 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2865 return DR_IS_READ (dra) ? -1 : 1;
2866
2867 /* Then sort by access size. */
2868 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2869 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2870 if (cmp != 0)
2871 return cmp;
2872
2873 /* And by step. */
2874 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2875 if (cmp != 0)
2876 return cmp;
2877
2878 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2879 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2880 if (cmp == 0)
2881 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2882 return cmp;
2883 }
2884
2885 /* If OP is the result of a conversion, return the unconverted value,
2886 otherwise return null. */
2887
2888 static tree
2889 strip_conversion (tree op)
2890 {
2891 if (TREE_CODE (op) != SSA_NAME)
2892 return NULL_TREE;
2893 gimple *stmt = SSA_NAME_DEF_STMT (op);
2894 if (!is_gimple_assign (stmt)
2895 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2896 return NULL_TREE;
2897 return gimple_assign_rhs1 (stmt);
2898 }
2899
2900 /* Return true if vectorizable_* routines can handle statements STMT1
2901 and STMT2 being in a single group. */
2902
2903 static bool
2904 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2905 {
2906 if (gimple_assign_single_p (stmt1))
2907 return gimple_assign_single_p (stmt2);
2908
2909 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2910 {
2911 /* Check for two masked loads or two masked stores. */
2912 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2913 return false;
2914 internal_fn ifn = gimple_call_internal_fn (stmt1);
2915 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2916 return false;
2917 if (ifn != gimple_call_internal_fn (stmt2))
2918 return false;
2919
2920 /* Check that the masks are the same. Cope with casts of masks,
2921 like those created by build_mask_conversion. */
2922 tree mask1 = gimple_call_arg (stmt1, 2);
2923 tree mask2 = gimple_call_arg (stmt2, 2);
2924 if (!operand_equal_p (mask1, mask2, 0))
2925 {
2926 mask1 = strip_conversion (mask1);
2927 if (!mask1)
2928 return false;
2929 mask2 = strip_conversion (mask2);
2930 if (!mask2)
2931 return false;
2932 if (!operand_equal_p (mask1, mask2, 0))
2933 return false;
2934 }
2935 return true;
2936 }
2937
2938 return false;
2939 }
2940
2941 /* Function vect_analyze_data_ref_accesses.
2942
2943 Analyze the access pattern of all the data references in the loop.
2944
2945 FORNOW: the only access pattern that is considered vectorizable is a
2946 simple step 1 (consecutive) access.
2947
2948 FORNOW: handle only arrays and pointer accesses. */
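/* As a minimal illustration (layout assumed): for the stores
     p[i].x = a;  p[i].y = b;
   where both fields are 4 bytes, of the same type, and the struct is
   8 bytes, the two data-refs share DR_BASE_ADDRESS, DR_OFFSET and
   DR_STEP and their DR_INITs differ by exactly the field size, so the
   code below links them into one interleaving chain that
   vect_analyze_group_access later accepts as a group of size 2.  */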
2949
2950 bool
2951 vect_analyze_data_ref_accesses (vec_info *vinfo)
2952 {
2953 unsigned int i;
2954 vec<data_reference_p> datarefs = vinfo->datarefs;
2955 struct data_reference *dr;
2956
2957 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_NOTE, vect_location,
2959 "=== vect_analyze_data_ref_accesses ===\n");
2960
2961 if (datarefs.is_empty ())
2962 return true;
2963
2964 /* Sort the array of datarefs to make building the interleaving chains
2965 linear. Don't modify the original vector's order, it is needed for
2966 determining what dependencies are reversed. */
2967 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2968 datarefs_copy.qsort (dr_group_sort_cmp);
2969
2970 /* Build the interleaving chains. */
2971 for (i = 0; i < datarefs_copy.length () - 1;)
2972 {
2973 data_reference_p dra = datarefs_copy[i];
2974 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2975 stmt_vec_info lastinfo = NULL;
2976 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2977 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2978 {
2979 ++i;
2980 continue;
2981 }
2982 for (i = i + 1; i < datarefs_copy.length (); ++i)
2983 {
2984 data_reference_p drb = datarefs_copy[i];
2985 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2986 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2987 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2988 break;
2989
2990 /* ??? Imperfect sorting (non-compatible types, non-modulo
2991 accesses, same accesses) can lead to a group being artificially
2992 split here as we don't just skip over those. If it really
2993 matters we can push those to a worklist and re-iterate
2994 over them. Then we can just skip ahead to the next DR here. */
2995
2996 /* DRs in a different loop should not be put into the same
2997 interleaving group. */
2998 if (gimple_bb (DR_STMT (dra))->loop_father
2999 != gimple_bb (DR_STMT (drb))->loop_father)
3000 break;
3001
3002 /* Check that the data-refs have same first location (except init)
3003 and they are both either store or load (not load and store,
3004 not masked loads or stores). */
3005 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3006 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3007 DR_BASE_ADDRESS (drb)) != 0
3008 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3009 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
3010 break;
3011
3012 /* Check that the data-refs have the same constant size. */
3013 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3014 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3015 if (!tree_fits_uhwi_p (sza)
3016 || !tree_fits_uhwi_p (szb)
3017 || !tree_int_cst_equal (sza, szb))
3018 break;
3019
3020 /* Check that the data-refs have the same step. */
3021 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3022 break;
3023
3024 /* Check the types are compatible.
3025 ??? We don't distinguish this during sorting. */
3026 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3027 TREE_TYPE (DR_REF (drb))))
3028 break;
3029
3030 /* Check that the DR_INITs are compile-time constants. */
3031 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3032 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3033 break;
3034
3035 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3036 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3037 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3038 HOST_WIDE_INT init_prev
3039 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3040 gcc_assert (init_a <= init_b
3041 && init_a <= init_prev
3042 && init_prev <= init_b);
3043
3044 /* Do not place the same access in the interleaving chain twice. */
3045 if (init_b == init_prev)
3046 {
3047 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3048 < gimple_uid (DR_STMT (drb)));
3049 /* ??? For now we simply "drop" the later reference which is
3050 otherwise the same rather than finishing off this group.
3051 In the end we'd want to re-process duplicates forming
3052 multiple groups from the refs, likely by just collecting
3053 all candidates (including duplicates and split points
3054 below) in a vector and then process them together. */
3055 continue;
3056 }
3057
3058 /* If init_b == init_a + the size of the type * k, we have an
3059 interleaving, and DRA is accessed before DRB. */
3060 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3061 if (type_size_a == 0
3062 || (init_b - init_a) % type_size_a != 0)
3063 break;
3064
3065 /* If we have a store, the accesses are adjacent. This splits
3066 groups into chunks we support (we don't support vectorization
3067 of stores with gaps). */
3068 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3069 break;
3070
3071 /* If the step (when constant and nonzero) is greater than the
3072 difference between the data-refs' inits, this splits groups into
3073 suitable sizes. */
3074 if (tree_fits_shwi_p (DR_STEP (dra)))
3075 {
3076 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3077 if (step != 0 && step <= (init_b - init_a))
3078 break;
3079 }
3080
3081 if (dump_enabled_p ())
3082 {
3083 dump_printf_loc (MSG_NOTE, vect_location,
3084 "Detected interleaving ");
3085 if (DR_IS_READ (dra))
3086 dump_printf (MSG_NOTE, "load ");
3087 else
3088 dump_printf (MSG_NOTE, "store ");
3089 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3090 dump_printf (MSG_NOTE, " and ");
3091 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3092 dump_printf (MSG_NOTE, "\n");
3093 }
3094
3095 /* Link the found element into the group list. */
3096 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3097 {
3098 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3099 lastinfo = stmtinfo_a;
3100 }
3101 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3102 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3103 lastinfo = stmtinfo_b;
3104 }
3105 }
3106
3107 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3108 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3109 && !vect_analyze_data_ref_access (dr))
3110 {
3111 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3113 "not vectorized: complicated access pattern.\n");
3114
3115 if (is_a <bb_vec_info> (vinfo))
3116 {
3117 /* Mark the statement as not vectorizable. */
3118 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3119 continue;
3120 }
3121 else
3122 {
3123 datarefs_copy.release ();
3124 return false;
3125 }
3126 }
3127
3128 datarefs_copy.release ();
3129 return true;
3130 }
3131
3132 /* Function vect_vfa_segment_size.
3133
3134 Input:
3135 DR: The data reference.
3136 LENGTH_FACTOR: segment length to consider.
3137
3138 Return a value suitable for the dr_with_seg_len::seg_len field.
3139 This is the "distance travelled" by the pointer from the first
3140 iteration in the segment to the last. Note that it does not include
3141 the size of the access; in effect it only describes the first byte. */
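/* For example (values assumed): with DR_STEP 4 and LENGTH_FACTOR 8 the
   returned segment length is (8 - 1) * 4 = 28 bytes; the bytes touched
   by the access itself are accounted for separately, see
   vect_vfa_access_size.  */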
3142
3143 static tree
3144 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3145 {
3146 length_factor = size_binop (MINUS_EXPR,
3147 fold_convert (sizetype, length_factor),
3148 size_one_node);
3149 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3150 length_factor);
3151 }
3152
3153 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3154 gives the worst-case number of bytes covered by the segment. */
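/* Illustration (sizes assumed): a 4-byte scalar heading a group of
   size 4 with a trailing gap of 1 gives 4 * (4 - 1) = 12 bytes; if the
   access uses the optimized explicit-realignment scheme, the size is
   further increased by the vector size minus the scalar size.  */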
3155
3156 static unsigned HOST_WIDE_INT
3157 vect_vfa_access_size (data_reference *dr)
3158 {
3159 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3160 tree ref_type = TREE_TYPE (DR_REF (dr));
3161 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3162 unsigned HOST_WIDE_INT access_size = ref_size;
3163 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3164 {
3165 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3166 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3167 }
3168 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3169 && (vect_supportable_dr_alignment (dr, false)
3170 == dr_explicit_realign_optimized))
3171 {
3172 /* We might access a full vector's worth. */
3173 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3174 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3175 }
3176 return access_size;
3177 }
3178
3179 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3180
3181 static unsigned int
3182 vect_vfa_align (const data_reference *dr)
3183 {
3184 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3185 }
3186
3187 /* Function vect_compile_time_alias.
3188
3189 Given data references A and B with equal base and offset, see whether
3190 the alias relation can be decided at compilation time. Return 1 if
3191 it can and the references alias, 0 if it can and the references do
3192 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3193 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3194 of dr_with_seg_len::{seg_len,access_size} for A and B. */
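/* Illustration (positive steps and byte values assumed): with segment
   lengths of 28 and access sizes of 4, each reference covers
   [DR_INIT, DR_INIT + 32).  DR_INITs of 0 and 64 give ranges [0, 32)
   and [64, 96), which provably do not overlap (result 0), while
   DR_INITs of 0 and 16 provably do overlap (result 1).  */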
3195
3196 static int
3197 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3198 tree segment_length_a, tree segment_length_b,
3199 unsigned HOST_WIDE_INT access_size_a,
3200 unsigned HOST_WIDE_INT access_size_b)
3201 {
3202 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3203 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3204 poly_uint64 const_length_a;
3205 poly_uint64 const_length_b;
3206
3207 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3208 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3209 [a, a+12) */
3210 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3211 {
3212 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3213 offset_a = (offset_a + access_size_a) - const_length_a;
3214 }
3215 else
3216 const_length_a = tree_to_poly_uint64 (segment_length_a);
3217 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3218 {
3219 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3220 offset_b = (offset_b + access_size_b) - const_length_b;
3221 }
3222 else
3223 const_length_b = tree_to_poly_uint64 (segment_length_b);
3224
3225 const_length_a += access_size_a;
3226 const_length_b += access_size_b;
3227
3228 if (ranges_known_overlap_p (offset_a, const_length_a,
3229 offset_b, const_length_b))
3230 return 1;
3231
3232 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3233 offset_b, const_length_b))
3234 return 0;
3235
3236 return -1;
3237 }
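
/* A worked example for the function above, with made-up values: if
   DR_INIT (A) is 0, DR_INIT (B) is 32, both steps are positive, both
   segment lengths are 16 and both access sizes are 4, the ranges tested
   are [0, 20) and [32, 52), which cannot overlap, so the result is 0
   (no alias).  If DR_INIT (B) were 16 instead, the ranges [0, 20) and
   [16, 36) are known to overlap and the result is 1.  */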
3238
3239 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3240 in DDR is >= VF. */
3241
3242 static bool
3243 dependence_distance_ge_vf (data_dependence_relation *ddr,
3244 unsigned int loop_depth, poly_uint64 vf)
3245 {
3246 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3247 || DDR_NUM_DIST_VECTS (ddr) == 0)
3248 return false;
3249
3250 /* If the dependence is exact, we should have limited the VF instead. */
3251 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3252
3253 unsigned int i;
3254 lambda_vector dist_v;
3255 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3256 {
3257 HOST_WIDE_INT dist = dist_v[loop_depth];
3258 if (dist != 0
3259 && !(dist > 0 && DDR_REVERSED_P (ddr))
3260 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3261 return false;
3262 }
3263
3264 if (dump_enabled_p ())
3265 {
3266 dump_printf_loc (MSG_NOTE, vect_location,
3267 "dependence distance between ");
3268 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3269 dump_printf (MSG_NOTE, " and ");
3270 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3271 dump_printf (MSG_NOTE, " is >= VF\n");
3272 }
3273
3274 return true;
3275 }
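
/* For instance (a hypothetical, non-reversed dependence): with a single
   distance vector whose entry at LOOP_DEPTH is 8 and a vectorization
   factor of 4, the function above returns true, because the two accesses
   stay at least 8 scalar iterations apart; with a distance of 2 it would
   return false.  */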
3276
3277 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3278
3279 static void
3280 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3281 {
3282 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3283 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3284 dump_printf (dump_kind, ") >= ");
3285 dump_dec (dump_kind, lower_bound.min_value);
3286 }
3287
3288 /* Record that the vectorized loop requires the vec_lower_bound described
3289 by EXPR, UNSIGNED_P and MIN_VALUE. */
3290
3291 static void
3292 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3293 poly_uint64 min_value)
3294 {
3295 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3296 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3297 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3298 {
3299 unsigned_p &= lower_bounds[i].unsigned_p;
3300 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3301 if (lower_bounds[i].unsigned_p != unsigned_p
3302 || maybe_lt (lower_bounds[i].min_value, min_value))
3303 {
3304 lower_bounds[i].unsigned_p = unsigned_p;
3305 lower_bounds[i].min_value = min_value;
3306 if (dump_enabled_p ())
3307 {
3308 dump_printf_loc (MSG_NOTE, vect_location,
3309 "updating run-time check to ");
3310 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3311 dump_printf (MSG_NOTE, "\n");
3312 }
3313 }
3314 return;
3315 }
3316
3317 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3318 if (dump_enabled_p ())
3319 {
3320 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3321 dump_lower_bound (MSG_NOTE, lower_bound);
3322 dump_printf (MSG_NOTE, "\n");
3323 }
3324 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3325 }
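
/* A small example of the merging done above (illustrative only): if a
   check "unsigned (step) >= 4" has already been recorded for some STEP
   expression and a new requirement "abs (step) >= 8" arrives for the same
   expression, the existing entry is updated to the weaker signedness
   assumption and the larger bound, i.e. "abs (step) >= 8", so only one
   run-time test is kept for that expression.  */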
3326
3327 /* Return true if it's unlikely that the step of the vectorized form of DR
3328 will span fewer than GAP bytes. */
3329
3330 static bool
3331 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3332 {
3333 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3334 HOST_WIDE_INT count
3335 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3336 if (GROUP_FIRST_ELEMENT (stmt_info))
3337 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3338 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3339 }
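
/* For example, with assumed numbers: given an estimated vectorization
   factor of 4 and a 4-byte scalar access that is not part of a group, a
   GAP of 12 bytes makes the function above return true (12 <= 4 * 4),
   whereas a GAP of 64 bytes would be considered too large for a
   step-based check to be worthwhile.  */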
3340
3341 /* Return true if we know that there is no alias between DR_A and DR_B
3342 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3343 *LOWER_BOUND_OUT to this N. */
3344
3345 static bool
3346 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3347 poly_uint64 *lower_bound_out)
3348 {
3349 /* Check that there is a constant gap of known sign between DR_A
3350 and DR_B. */
3351 poly_int64 init_a, init_b;
3352 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3353 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3354 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3355 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3356 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3357 || !ordered_p (init_a, init_b))
3358 return false;
3359
3360 /* Sort DR_A and DR_B by the address they access. */
3361 if (maybe_lt (init_b, init_a))
3362 {
3363 std::swap (init_a, init_b);
3364 std::swap (dr_a, dr_b);
3365 }
3366
3367 /* If the two accesses could be dependent within a scalar iteration,
3368 make sure that we'd retain their order. */
3369 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3370 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3371 return false;
3372
3373 /* There is no alias if abs (DR_STEP) is greater than or equal to
3374 the bytes spanned by the combination of the two accesses. */
3375 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3376 return true;
3377 }
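
/* A minimal example for the function above: for accesses a[i] and a[i + 1]
   to a 4-byte element type (same base, offset and step), the sorted inits
   are 0 and 4, so *LOWER_BOUND_OUT is set to 4 + 4 - 0 = 8: the two
   references cannot alias once abs (DR_STEP) is known to be at least
   8 bytes.  */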
3378
3379 /* Function vect_prune_runtime_alias_test_list.
3380
3381 Prune a list of ddrs to be tested at run-time by versioning for alias.
3382 Merge several alias checks into one if possible.
3383 Return FALSE if the resulting list of ddrs is longer than allowed by
3384 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3385
3386 bool
3387 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3388 {
3389 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3390 hash_set <tree_pair_hash> compared_objects;
3391
3392 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3393 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3394 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3395 vec<vec_object_pair> &check_unequal_addrs
3396 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3397 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3398 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3399
3400 ddr_p ddr;
3401 unsigned int i;
3402 tree length_factor;
3403
3404 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_NOTE, vect_location,
3406 "=== vect_prune_runtime_alias_test_list ===\n");
3407
3408 /* Step values are irrelevant for aliasing if the number of vector
3409 iterations is equal to the number of scalar iterations (which can
3410 happen for fully-SLP loops). */
3411 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3412
3413 if (!ignore_step_p)
3414 {
3415 /* Convert the checks for nonzero steps into bound tests. */
3416 tree value;
3417 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3418 vect_check_lower_bound (loop_vinfo, value, true, 1);
3419 }
3420
3421 if (may_alias_ddrs.is_empty ())
3422 return true;
3423
3424 comp_alias_ddrs.create (may_alias_ddrs.length ());
3425
3426 unsigned int loop_depth
3427 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3428 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3429
3430 /* First, we collect all data ref pairs for aliasing checks. */
3431 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3432 {
3433 int comp_res;
3434 poly_uint64 lower_bound;
3435 struct data_reference *dr_a, *dr_b;
3436 gimple *dr_group_first_a, *dr_group_first_b;
3437 tree segment_length_a, segment_length_b;
3438 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3439 unsigned int align_a, align_b;
3440 gimple *stmt_a, *stmt_b;
3441
3442 /* Ignore the alias if the VF we chose ended up being no greater
3443 than the dependence distance. */
3444 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3445 continue;
3446
3447 if (DDR_OBJECT_A (ddr))
3448 {
3449 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3450 if (!compared_objects.add (new_pair))
3451 {
3452 if (dump_enabled_p ())
3453 {
3454 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3455 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3456 dump_printf (MSG_NOTE, " and ");
3457 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3458 dump_printf (MSG_NOTE, " have different addresses\n");
3459 }
3460 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3461 }
3462 continue;
3463 }
3464
3465 dr_a = DDR_A (ddr);
3466 stmt_a = DR_STMT (DDR_A (ddr));
3467
3468 dr_b = DDR_B (ddr);
3469 stmt_b = DR_STMT (DDR_B (ddr));
3470
3471 /* Skip the pair if inter-iteration dependencies are irrelevant
3472 and intra-iteration dependencies are guaranteed to be honored. */
3473 if (ignore_step_p
3474 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3475 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3476 {
3477 if (dump_enabled_p ())
3478 {
3479 dump_printf_loc (MSG_NOTE, vect_location,
3480 "no need for alias check between ");
3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3482 dump_printf (MSG_NOTE, " and ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3484 dump_printf (MSG_NOTE, " when VF is 1\n");
3485 }
3486 continue;
3487 }
3488
3489 /* See whether we can handle the alias using a bounds check on
3490 the step, and whether that's likely to be the best approach.
3491 (It might not be, for example, if the minimum step is much larger
3492 than the number of bytes handled by one vector iteration.) */
3493 if (!ignore_step_p
3494 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3495 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3496 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3497 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3498 {
3499 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3500 if (dump_enabled_p ())
3501 {
3502 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3503 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3504 dump_printf (MSG_NOTE, " and ");
3505 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3506 dump_printf (MSG_NOTE, " when the step ");
3507 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3508 dump_printf (MSG_NOTE, " is outside ");
3509 if (unsigned_p)
3510 dump_printf (MSG_NOTE, "[0");
3511 else
3512 {
3513 dump_printf (MSG_NOTE, "(");
3514 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3515 }
3516 dump_printf (MSG_NOTE, ", ");
3517 dump_dec (MSG_NOTE, lower_bound);
3518 dump_printf (MSG_NOTE, ")\n");
3519 }
3520 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3521 lower_bound);
3522 continue;
3523 }
3524
3525 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3526 if (dr_group_first_a)
3527 {
3528 stmt_a = dr_group_first_a;
3529 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3530 }
3531
3532 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3533 if (dr_group_first_b)
3534 {
3535 stmt_b = dr_group_first_b;
3536 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3537 }
3538
3539 if (ignore_step_p)
3540 {
3541 segment_length_a = size_zero_node;
3542 segment_length_b = size_zero_node;
3543 }
3544 else
3545 {
3546 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3547 length_factor = scalar_loop_iters;
3548 else
3549 length_factor = size_int (vect_factor);
3550 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3551 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3552 }
3553 access_size_a = vect_vfa_access_size (dr_a);
3554 access_size_b = vect_vfa_access_size (dr_b);
3555 align_a = vect_vfa_align (dr_a);
3556 align_b = vect_vfa_align (dr_b);
3557
3558 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3559 DR_BASE_ADDRESS (dr_b));
3560 if (comp_res == 0)
3561 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3562 DR_OFFSET (dr_b));
3563
3564 /* See whether the alias is known at compilation time. */
3565 if (comp_res == 0
3566 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3567 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3568 && poly_int_tree_p (segment_length_a)
3569 && poly_int_tree_p (segment_length_b))
3570 {
3571 int res = vect_compile_time_alias (dr_a, dr_b,
3572 segment_length_a,
3573 segment_length_b,
3574 access_size_a,
3575 access_size_b);
3576 if (res >= 0 && dump_enabled_p ())
3577 {
3578 dump_printf_loc (MSG_NOTE, vect_location,
3579 "can tell at compile time that ");
3580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3581 dump_printf (MSG_NOTE, " and ");
3582 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3583 if (res == 0)
3584 dump_printf (MSG_NOTE, " do not alias\n");
3585 else
3586 dump_printf (MSG_NOTE, " alias\n");
3587 }
3588
3589 if (res == 0)
3590 continue;
3591
3592 if (res == 1)
3593 {
3594 if (dump_enabled_p ())
3595 dump_printf_loc (MSG_NOTE, vect_location,
3596 "not vectorized: compilation time alias.\n");
3597 return false;
3598 }
3599 }
3600
3601 dr_with_seg_len_pair_t dr_with_seg_len_pair
3602 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3603 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3604
3605 /* Canonicalize pairs by sorting the two DR members. */
3606 if (comp_res > 0)
3607 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3608
3609 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3610 }
3611
3612 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3613
3614 unsigned int count = (comp_alias_ddrs.length ()
3615 + check_unequal_addrs.length ());
3616
3617 dump_printf_loc (MSG_NOTE, vect_location,
3618 "improved number of alias checks from %d to %d\n",
3619 may_alias_ddrs.length (), count);
3620 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3621 {
3622 if (dump_enabled_p ())
3623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3624 "number of versioning for alias "
3625 "run-time tests exceeds %d "
3626 "(--param vect-max-version-for-alias-checks)\n",
3627 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3628 return false;
3629 }
3630
3631 return true;
3632 }
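
/* As an illustration of the kind of loop that reaches the function above
   (a sketch, not a specific testcase):

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   Without restrict qualifiers the dependence between the store to a[] and
   the load from b[] is unknown at compile time, so a segment-overlap test
   for the two data references is pushed onto LOOP_VINFO_COMP_ALIAS_DDRS
   and the loop can then be versioned at run time.  */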
3633
3634 /* Check whether we can use an internal function for a gather load
3635 or scatter store. READ_P is true for loads and false for stores.
3636 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3637 the type of the memory elements being loaded or stored. OFFSET_BITS
3638 is the number of bits in each scalar offset and OFFSET_SIGN is the
3639 sign of the offset. SCALE is the amount by which the offset should
3640 be multiplied *after* it has been converted to address width.
3641
3642 Return true if the function is supported, storing the function
3643 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3644
3645 bool
3646 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3647 tree memory_type, unsigned int offset_bits,
3648 signop offset_sign, int scale,
3649 internal_fn *ifn_out, tree *element_type_out)
3650 {
3651 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3652 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3653 if (offset_bits > element_bits)
3654 /* Internal functions require the offset to be the same width as
3655 the vector elements. We can extend narrower offsets, but it isn't
3656 safe to truncate wider offsets. */
3657 return false;
3658
3659 if (element_bits != memory_bits)
3660 /* For now the vector elements must be the same width as the
3661 memory elements. */
3662 return false;
3663
3664 /* Work out which function we need. */
3665 internal_fn ifn;
3666 if (read_p)
3667 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3668 else
3669 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3670
3671 /* Test whether the target supports this combination. */
3672 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3673 offset_sign, scale))
3674 return false;
3675
3676 *ifn_out = ifn;
3677 *element_type_out = TREE_TYPE (vectype);
3678 return true;
3679 }
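
/* For example, a gather of 32-bit vector elements using 64-bit scalar
   offsets is rejected by the function above (the offset may not be wider
   than the element), as is an access whose in-memory element width differs
   from the vector element width; otherwise the answer reduces to whether
   the target implements the chosen IFN_*GATHER_LOAD or IFN_*SCATTER_STORE
   pattern for this vector type, memory type, offset sign and scale.  */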
3680
3681 /* CALL is a call to an internal gather load or scatter store function.
3682 Describe the operation in INFO. */
3683
3684 static void
3685 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3686 {
3687 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3688 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3689 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3690
3691 info->ifn = gimple_call_internal_fn (call);
3692 info->decl = NULL_TREE;
3693 info->base = gimple_call_arg (call, 0);
3694 info->offset = gimple_call_arg (call, 1);
3695 info->offset_dt = vect_unknown_def_type;
3696 info->offset_vectype = NULL_TREE;
3697 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3698 info->element_type = TREE_TYPE (vectype);
3699 info->memory_type = TREE_TYPE (DR_REF (dr));
3700 }
3701
3702 /* Return true if a non-affine read or write in STMT is suitable for a
3703 gather load or scatter store. Describe the operation in *INFO if so. */
3704
3705 bool
3706 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3707 gather_scatter_info *info)
3708 {
3709 HOST_WIDE_INT scale = 1;
3710 poly_int64 pbitpos, pbitsize;
3711 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3712 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3713 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3714 tree offtype = NULL_TREE;
3715 tree decl = NULL_TREE, base, off;
3716 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3717 tree memory_type = TREE_TYPE (DR_REF (dr));
3718 machine_mode pmode;
3719 int punsignedp, reversep, pvolatilep = 0;
3720 internal_fn ifn;
3721 tree element_type;
3722 bool masked_p = false;
3723
3724 /* See whether this is already a call to a gather/scatter internal function.
3725 If not, see whether it's a masked load or store. */
3726 gcall *call = dyn_cast <gcall *> (stmt);
3727 if (call && gimple_call_internal_p (call))
3728 {
3729 ifn = gimple_call_internal_fn (stmt);
3730 if (internal_gather_scatter_fn_p (ifn))
3731 {
3732 vect_describe_gather_scatter_call (call, info);
3733 return true;
3734 }
3735 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3736 }
3737
3738 /* True if we should aim to use internal functions rather than
3739 built-in functions. */
3740 bool use_ifn_p = (DR_IS_READ (dr)
3741 ? supports_vec_gather_load_p ()
3742 : supports_vec_scatter_store_p ());
3743
3744 base = DR_REF (dr);
3745 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3746 see if we can use the def stmt of the address. */
3747 if (masked_p
3748 && TREE_CODE (base) == MEM_REF
3749 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3750 && integer_zerop (TREE_OPERAND (base, 1))
3751 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3752 {
3753 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3754 if (is_gimple_assign (def_stmt)
3755 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3756 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3757 }
3758
3759 /* The gather and scatter builtins need address of the form
3760 loop_invariant + vector * {1, 2, 4, 8}
3761 or
3762 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3763 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3764 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3765 multiplications and additions in it. To get a vector, we need
3766 a single SSA_NAME that will be defined in the loop and will
3767 contain everything that is not loop invariant and that can be
3768 vectorized. The following code attempts to find such a preexisting
3769 SSA_NAME OFF and put the loop invariants into a tree BASE
3770 that can be gimplified before the loop. */
3771 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3772 &punsignedp, &reversep, &pvolatilep);
3773 gcc_assert (base && !reversep);
3774 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3775
3776 if (TREE_CODE (base) == MEM_REF)
3777 {
3778 if (!integer_zerop (TREE_OPERAND (base, 1)))
3779 {
3780 if (off == NULL_TREE)
3781 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3782 else
3783 off = size_binop (PLUS_EXPR, off,
3784 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3785 }
3786 base = TREE_OPERAND (base, 0);
3787 }
3788 else
3789 base = build_fold_addr_expr (base);
3790
3791 if (off == NULL_TREE)
3792 off = size_zero_node;
3793
3794 /* If base is not loop invariant, then if off is 0 we start with just
3795 the constant offset in the loop invariant BASE and continue with base
3796 as OFF; otherwise give up.
3797 We could handle that case by gimplifying the addition of base + off
3798 into some SSA_NAME and using that as OFF, but for now punt. */
3799 if (!expr_invariant_in_loop_p (loop, base))
3800 {
3801 if (!integer_zerop (off))
3802 return false;
3803 off = base;
3804 base = size_int (pbytepos);
3805 }
3806 /* Otherwise put base + constant offset into the loop invariant BASE
3807 and continue with OFF. */
3808 else
3809 {
3810 base = fold_convert (sizetype, base);
3811 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3812 }
3813
3814 /* OFF at this point may be either a SSA_NAME or some tree expression
3815 from get_inner_reference. Try to peel off loop invariants from it
3816 into BASE as long as possible. */
3817 STRIP_NOPS (off);
3818 while (offtype == NULL_TREE)
3819 {
3820 enum tree_code code;
3821 tree op0, op1, add = NULL_TREE;
3822
3823 if (TREE_CODE (off) == SSA_NAME)
3824 {
3825 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3826
3827 if (expr_invariant_in_loop_p (loop, off))
3828 return false;
3829
3830 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3831 break;
3832
3833 op0 = gimple_assign_rhs1 (def_stmt);
3834 code = gimple_assign_rhs_code (def_stmt);
3835 op1 = gimple_assign_rhs2 (def_stmt);
3836 }
3837 else
3838 {
3839 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3840 return false;
3841 code = TREE_CODE (off);
3842 extract_ops_from_tree (off, &code, &op0, &op1);
3843 }
3844 switch (code)
3845 {
3846 case POINTER_PLUS_EXPR:
3847 case PLUS_EXPR:
3848 if (expr_invariant_in_loop_p (loop, op0))
3849 {
3850 add = op0;
3851 off = op1;
3852 do_add:
3853 add = fold_convert (sizetype, add);
3854 if (scale != 1)
3855 add = size_binop (MULT_EXPR, add, size_int (scale));
3856 base = size_binop (PLUS_EXPR, base, add);
3857 continue;
3858 }
3859 if (expr_invariant_in_loop_p (loop, op1))
3860 {
3861 add = op1;
3862 off = op0;
3863 goto do_add;
3864 }
3865 break;
3866 case MINUS_EXPR:
3867 if (expr_invariant_in_loop_p (loop, op1))
3868 {
3869 add = fold_convert (sizetype, op1);
3870 add = size_binop (MINUS_EXPR, size_zero_node, add);
3871 off = op0;
3872 goto do_add;
3873 }
3874 break;
3875 case MULT_EXPR:
3876 if (scale == 1 && tree_fits_shwi_p (op1))
3877 {
3878 int new_scale = tree_to_shwi (op1);
3879 /* Only treat this as a scaling operation if the target
3880 supports it. */
3881 if (use_ifn_p
3882 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3883 vectype, memory_type, 1,
3884 TYPE_SIGN (TREE_TYPE (op0)),
3885 new_scale, &ifn,
3886 &element_type))
3887 break;
3888 scale = new_scale;
3889 off = op0;
3890 continue;
3891 }
3892 break;
3893 case SSA_NAME:
3894 off = op0;
3895 continue;
3896 CASE_CONVERT:
3897 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3898 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3899 break;
3900 if (TYPE_PRECISION (TREE_TYPE (op0))
3901 == TYPE_PRECISION (TREE_TYPE (off)))
3902 {
3903 off = op0;
3904 continue;
3905 }
3906
3907 /* The internal functions need the offset to be the same width
3908 as the elements of VECTYPE. Don't include operations that
3909 cast the offset from that width to a different width. */
3910 if (use_ifn_p
3911 && (int_size_in_bytes (TREE_TYPE (vectype))
3912 == int_size_in_bytes (TREE_TYPE (off))))
3913 break;
3914
3915 if (TYPE_PRECISION (TREE_TYPE (op0))
3916 < TYPE_PRECISION (TREE_TYPE (off)))
3917 {
3918 off = op0;
3919 offtype = TREE_TYPE (off);
3920 STRIP_NOPS (off);
3921 continue;
3922 }
3923 break;
3924 default:
3925 break;
3926 }
3927 break;
3928 }
3929
3930 /* If at the end OFF still isn't an SSA_NAME or isn't
3931 defined in the loop, punt. */
3932 if (TREE_CODE (off) != SSA_NAME
3933 || expr_invariant_in_loop_p (loop, off))
3934 return false;
3935
3936 if (offtype == NULL_TREE)
3937 offtype = TREE_TYPE (off);
3938
3939 if (use_ifn_p)
3940 {
3941 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3942 memory_type, TYPE_PRECISION (offtype),
3943 TYPE_SIGN (offtype), scale, &ifn,
3944 &element_type))
3945 return false;
3946 }
3947 else
3948 {
3949 if (DR_IS_READ (dr))
3950 {
3951 if (targetm.vectorize.builtin_gather)
3952 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3953 }
3954 else
3955 {
3956 if (targetm.vectorize.builtin_scatter)
3957 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3958 }
3959
3960 if (!decl)
3961 return false;
3962
3963 ifn = IFN_LAST;
3964 element_type = TREE_TYPE (vectype);
3965 }
3966
3967 info->ifn = ifn;
3968 info->decl = decl;
3969 info->base = base;
3970 info->offset = off;
3971 info->offset_dt = vect_unknown_def_type;
3972 info->offset_vectype = NULL_TREE;
3973 info->scale = scale;
3974 info->element_type = element_type;
3975 info->memory_type = memory_type;
3976 return true;
3977 }
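
/* A typical input to the function above looks like (illustrative only):

     for (i = 0; i < n; i++)
       x[i] = a[b[i]];

   The load from a[b[i]] has no affine DR_STEP; the routine peels the
   loop-invariant address of "a" into BASE, keeps the SSA_NAME computing
   "b[i]" (possibly after a sign or zero extension) as the OFFSET, and,
   when the target can handle it, folds the constant element-size
   multiplication into SCALE so that a gather load can be emitted.  */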
3978
3979 /* Function vect_analyze_data_refs.
3980
3981 Find all the data references in the loop or basic block.
3982
3983 The general structure of the analysis of data refs in the vectorizer is as
3984 follows:
3985 1- vect_analyze_data_refs(loop/bb): call
3986 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3987 in the loop/bb and their dependences.
3988 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3989 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3990 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3991
3992 */
3993
3994 bool
3995 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3996 {
3997 struct loop *loop = NULL;
3998 unsigned int i;
3999 struct data_reference *dr;
4000 tree scalar_type;
4001
4002 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_NOTE, vect_location,
4004 "=== vect_analyze_data_refs ===\n");
4005
4006 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4007 loop = LOOP_VINFO_LOOP (loop_vinfo);
4008
4009 /* Go through the data-refs, check that the analysis succeeded. Update
4010 pointer from stmt_vec_info struct to DR and vectype. */
4011
4012 vec<data_reference_p> datarefs = vinfo->datarefs;
4013 FOR_EACH_VEC_ELT (datarefs, i, dr)
4014 {
4015 gimple *stmt;
4016 stmt_vec_info stmt_info;
4017 tree base, offset, init;
4018 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4019 bool simd_lane_access = false;
4020 poly_uint64 vf;
4021
4022 again:
4023 if (!dr || !DR_REF (dr))
4024 {
4025 if (dump_enabled_p ())
4026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4027 "not vectorized: unhandled data-ref\n");
4028 return false;
4029 }
4030
4031 stmt = DR_STMT (dr);
4032 stmt_info = vinfo_for_stmt (stmt);
4033
4034 /* Discard clobbers from the dataref vector. We will remove
4035 clobber stmts during vectorization. */
4036 if (gimple_clobber_p (stmt))
4037 {
4038 free_data_ref (dr);
4039 if (i == datarefs.length () - 1)
4040 {
4041 datarefs.pop ();
4042 break;
4043 }
4044 datarefs.ordered_remove (i);
4045 dr = datarefs[i];
4046 goto again;
4047 }
4048
4049 /* Check that analysis of the data-ref succeeded. */
4050 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4051 || !DR_STEP (dr))
4052 {
4053 bool maybe_gather
4054 = DR_IS_READ (dr)
4055 && !TREE_THIS_VOLATILE (DR_REF (dr))
4056 && (targetm.vectorize.builtin_gather != NULL
4057 || supports_vec_gather_load_p ());
4058 bool maybe_scatter
4059 = DR_IS_WRITE (dr)
4060 && !TREE_THIS_VOLATILE (DR_REF (dr))
4061 && (targetm.vectorize.builtin_scatter != NULL
4062 || supports_vec_scatter_store_p ());
4063 bool maybe_simd_lane_access
4064 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4065
4066 /* If the target supports vector gather loads or scatter stores, or if
4067 this might be a SIMD lane access, see whether they can be used. */
4068 if (is_a <loop_vec_info> (vinfo)
4069 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4070 && !nested_in_vect_loop_p (loop, stmt))
4071 {
4072 struct data_reference *newdr
4073 = create_data_ref (NULL, loop_containing_stmt (stmt),
4074 DR_REF (dr), stmt, !maybe_scatter,
4075 DR_IS_CONDITIONAL_IN_STMT (dr));
4076 gcc_assert (newdr != NULL && DR_REF (newdr));
4077 if (DR_BASE_ADDRESS (newdr)
4078 && DR_OFFSET (newdr)
4079 && DR_INIT (newdr)
4080 && DR_STEP (newdr)
4081 && integer_zerop (DR_STEP (newdr)))
4082 {
4083 if (maybe_simd_lane_access)
4084 {
4085 tree off = DR_OFFSET (newdr);
4086 STRIP_NOPS (off);
4087 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4088 && TREE_CODE (off) == MULT_EXPR
4089 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4090 {
4091 tree step = TREE_OPERAND (off, 1);
4092 off = TREE_OPERAND (off, 0);
4093 STRIP_NOPS (off);
4094 if (CONVERT_EXPR_P (off)
4095 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4096 0)))
4097 < TYPE_PRECISION (TREE_TYPE (off)))
4098 off = TREE_OPERAND (off, 0);
4099 if (TREE_CODE (off) == SSA_NAME)
4100 {
4101 gimple *def = SSA_NAME_DEF_STMT (off);
4102 tree reft = TREE_TYPE (DR_REF (newdr));
4103 if (is_gimple_call (def)
4104 && gimple_call_internal_p (def)
4105 && (gimple_call_internal_fn (def)
4106 == IFN_GOMP_SIMD_LANE))
4107 {
4108 tree arg = gimple_call_arg (def, 0);
4109 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4110 arg = SSA_NAME_VAR (arg);
4111 if (arg == loop->simduid
4112 /* For now. */
4113 && tree_int_cst_equal
4114 (TYPE_SIZE_UNIT (reft),
4115 step))
4116 {
4117 DR_OFFSET (newdr) = ssize_int (0);
4118 DR_STEP (newdr) = step;
4119 DR_OFFSET_ALIGNMENT (newdr)
4120 = BIGGEST_ALIGNMENT;
4121 DR_STEP_ALIGNMENT (newdr)
4122 = highest_pow2_factor (step);
4123 dr = newdr;
4124 simd_lane_access = true;
4125 }
4126 }
4127 }
4128 }
4129 }
4130 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4131 {
4132 dr = newdr;
4133 if (maybe_gather)
4134 gatherscatter = GATHER;
4135 else
4136 gatherscatter = SCATTER;
4137 }
4138 }
4139 if (gatherscatter == SG_NONE && !simd_lane_access)
4140 free_data_ref (newdr);
4141 }
4142
4143 if (gatherscatter == SG_NONE && !simd_lane_access)
4144 {
4145 if (dump_enabled_p ())
4146 {
4147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4148 "not vectorized: data ref analysis "
4149 "failed ");
4150 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4151 }
4152
4153 if (is_a <bb_vec_info> (vinfo))
4154 break;
4155
4156 return false;
4157 }
4158 }
4159
4160 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4161 {
4162 if (dump_enabled_p ())
4163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4164 "not vectorized: base addr of dr is a "
4165 "constant\n");
4166
4167 if (is_a <bb_vec_info> (vinfo))
4168 break;
4169
4170 if (gatherscatter != SG_NONE || simd_lane_access)
4171 free_data_ref (dr);
4172 return false;
4173 }
4174
4175 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4176 {
4177 if (dump_enabled_p ())
4178 {
4179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4180 "not vectorized: volatile type ");
4181 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4182 }
4183
4184 if (is_a <bb_vec_info> (vinfo))
4185 break;
4186
4187 return false;
4188 }
4189
4190 if (stmt_can_throw_internal (stmt))
4191 {
4192 if (dump_enabled_p ())
4193 {
4194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4195 "not vectorized: statement can throw an "
4196 "exception ");
4197 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4198 }
4199
4200 if (is_a <bb_vec_info> (vinfo))
4201 break;
4202
4203 if (gatherscatter != SG_NONE || simd_lane_access)
4204 free_data_ref (dr);
4205 return false;
4206 }
4207
4208 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4209 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4210 {
4211 if (dump_enabled_p ())
4212 {
4213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4214 "not vectorized: statement is bitfield "
4215 "access ");
4216 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4217 }
4218
4219 if (is_a <bb_vec_info> (vinfo))
4220 break;
4221
4222 if (gatherscatter != SG_NONE || simd_lane_access)
4223 free_data_ref (dr);
4224 return false;
4225 }
4226
4227 base = unshare_expr (DR_BASE_ADDRESS (dr));
4228 offset = unshare_expr (DR_OFFSET (dr));
4229 init = unshare_expr (DR_INIT (dr));
4230
4231 if (is_gimple_call (stmt)
4232 && (!gimple_call_internal_p (stmt)
4233 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4234 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4235 {
4236 if (dump_enabled_p ())
4237 {
4238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4239 "not vectorized: dr in a call ");
4240 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4241 }
4242
4243 if (is_a <bb_vec_info> (vinfo))
4244 break;
4245
4246 if (gatherscatter != SG_NONE || simd_lane_access)
4247 free_data_ref (dr);
4248 return false;
4249 }
4250
4251 /* Update DR field in stmt_vec_info struct. */
4252
4253 /* If the dataref is in an inner-loop of the loop that is considered
4254 for vectorization, we also want to analyze the access relative to
4255 the outer-loop (DR contains information only relative to the
4256 inner-most enclosing loop). We do that by building a reference to the
4257 first location accessed by the inner-loop, and analyze it relative to
4258 the outer-loop. */
4259 if (loop && nested_in_vect_loop_p (loop, stmt))
4260 {
4261 /* Build a reference to the first location accessed by the
4262 inner loop: *(BASE + INIT + OFFSET). By construction,
4263 this address must be invariant in the inner loop, so we
4264 can consider it as being used in the outer loop. */
4265 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4266 init, offset);
4267 tree init_addr = fold_build_pointer_plus (base, init_offset);
4268 tree init_ref = build_fold_indirect_ref (init_addr);
4269
4270 if (dump_enabled_p ())
4271 {
4272 dump_printf_loc (MSG_NOTE, vect_location,
4273 "analyze in outer loop: ");
4274 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4275 dump_printf (MSG_NOTE, "\n");
4276 }
4277
4278 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4279 init_ref, loop))
4280 /* dr_analyze_innermost already explained the failure. */
4281 return false;
4282
4283 if (dump_enabled_p ())
4284 {
4285 dump_printf_loc (MSG_NOTE, vect_location,
4286 "\touter base_address: ");
4287 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4288 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4289 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4290 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4291 STMT_VINFO_DR_OFFSET (stmt_info));
4292 dump_printf (MSG_NOTE,
4293 "\n\touter constant offset from base address: ");
4294 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4295 STMT_VINFO_DR_INIT (stmt_info));
4296 dump_printf (MSG_NOTE, "\n\touter step: ");
4297 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4298 STMT_VINFO_DR_STEP (stmt_info));
4299 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4300 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4301 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4302 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4303 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4304 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4305 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4306 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4307 }
4308 }
4309
4310 if (STMT_VINFO_DATA_REF (stmt_info))
4311 {
4312 if (dump_enabled_p ())
4313 {
4314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4315 "not vectorized: more than one data ref "
4316 "in stmt: ");
4317 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4318 }
4319
4320 if (is_a <bb_vec_info> (vinfo))
4321 break;
4322
4323 if (gatherscatter != SG_NONE || simd_lane_access)
4324 free_data_ref (dr);
4325 return false;
4326 }
4327
4328 STMT_VINFO_DATA_REF (stmt_info) = dr;
4329 if (simd_lane_access)
4330 {
4331 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4332 free_data_ref (datarefs[i]);
4333 datarefs[i] = dr;
4334 }
4335
4336 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4337 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4338 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4339 {
4340 if (dump_enabled_p ())
4341 {
4342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4343 "not vectorized: base object not addressable "
4344 "for stmt: ");
4345 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4346 }
4347 if (is_a <bb_vec_info> (vinfo))
4348 {
4349 /* In BB vectorization the ref can still participate
4350 in dependence analysis, we just can't vectorize it. */
4351 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4352 continue;
4353 }
4354 return false;
4355 }
4356
4357 /* Set vectype for STMT. */
4358 scalar_type = TREE_TYPE (DR_REF (dr));
4359 STMT_VINFO_VECTYPE (stmt_info)
4360 = get_vectype_for_scalar_type (scalar_type);
4361 if (!STMT_VINFO_VECTYPE (stmt_info))
4362 {
4363 if (dump_enabled_p ())
4364 {
4365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4366 "not vectorized: no vectype for stmt: ");
4367 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4368 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4369 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4370 scalar_type);
4371 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4372 }
4373
4374 if (is_a <bb_vec_info> (vinfo))
4375 {
4376 /* No vector type is fine, the ref can still participate
4377 in dependence analysis, we just can't vectorize it. */
4378 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4379 continue;
4380 }
4381
4382 if (gatherscatter != SG_NONE || simd_lane_access)
4383 {
4384 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4385 if (gatherscatter != SG_NONE)
4386 free_data_ref (dr);
4387 }
4388 return false;
4389 }
4390 else
4391 {
4392 if (dump_enabled_p ())
4393 {
4394 dump_printf_loc (MSG_NOTE, vect_location,
4395 "got vectype for stmt: ");
4396 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4397 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4398 STMT_VINFO_VECTYPE (stmt_info));
4399 dump_printf (MSG_NOTE, "\n");
4400 }
4401 }
4402
4403 /* Adjust the minimal vectorization factor according to the
4404 vector type. */
4405 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4406 *min_vf = upper_bound (*min_vf, vf);
4407
4408 if (gatherscatter != SG_NONE)
4409 {
4410 gather_scatter_info gs_info;
4411 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4412 &gs_info)
4413 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4414 {
4415 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4416 free_data_ref (dr);
4417 if (dump_enabled_p ())
4418 {
4419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4420 (gatherscatter == GATHER) ?
4421 "not vectorized: not suitable for gather "
4422 "load " :
4423 "not vectorized: not suitable for scatter "
4424 "store ");
4425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4426 }
4427 return false;
4428 }
4429
4430 free_data_ref (datarefs[i]);
4431 datarefs[i] = dr;
4432 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4433 }
4434
4435 else if (is_a <loop_vec_info> (vinfo)
4436 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4437 {
4438 if (nested_in_vect_loop_p (loop, stmt))
4439 {
4440 if (dump_enabled_p ())
4441 {
4442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4443 "not vectorized: not suitable for strided "
4444 "load ");
4445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4446 }
4447 return false;
4448 }
4449 STMT_VINFO_STRIDED_P (stmt_info) = true;
4450 }
4451 }
4452
4453 /* If we stopped analysis at the first dataref we could not analyze
4454 when trying to vectorize a basic block, mark the rest of the datarefs
4455 as not vectorizable and truncate the vector of datarefs. That
4456 avoids spending useless time analyzing their dependences. */
4457 if (i != datarefs.length ())
4458 {
4459 gcc_assert (is_a <bb_vec_info> (vinfo));
4460 for (unsigned j = i; j < datarefs.length (); ++j)
4461 {
4462 data_reference_p dr = datarefs[j];
4463 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4464 free_data_ref (dr);
4465 }
4466 datarefs.truncate (i);
4467 }
4468
4469 return true;
4470 }
4471
4472
4473 /* Function vect_get_new_vect_var.
4474
4475 Returns a name for a new variable. The current naming scheme appends the
4476 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4477 the name of vectorizer generated variables, and appends that to NAME if
4478 provided. */
4479
4480 tree
4481 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4482 {
4483 const char *prefix;
4484 tree new_vect_var;
4485
4486 switch (var_kind)
4487 {
4488 case vect_simple_var:
4489 prefix = "vect";
4490 break;
4491 case vect_scalar_var:
4492 prefix = "stmp";
4493 break;
4494 case vect_mask_var:
4495 prefix = "mask";
4496 break;
4497 case vect_pointer_var:
4498 prefix = "vectp";
4499 break;
4500 default:
4501 gcc_unreachable ();
4502 }
4503
4504 if (name)
4505 {
4506 char* tmp = concat (prefix, "_", name, NULL);
4507 new_vect_var = create_tmp_reg (type, tmp);
4508 free (tmp);
4509 }
4510 else
4511 new_vect_var = create_tmp_reg (type, prefix);
4512
4513 return new_vect_var;
4514 }
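
/* For instance, calling the function above with a pointer-to-vector type,
   vect_pointer_var and the name "arr" produces a temporary whose name is
   built from the "vectp_arr" stem (a unique suffix is appended), while
   vect_scalar_var with no name simply yields an "stmp" temporary; compare
   vect_get_new_ssa_name below, which returns an SSA name instead.  */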
4515
4516 /* Like vect_get_new_vect_var but return an SSA name. */
4517
4518 tree
4519 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4520 {
4521 const char *prefix;
4522 tree new_vect_var;
4523
4524 switch (var_kind)
4525 {
4526 case vect_simple_var:
4527 prefix = "vect";
4528 break;
4529 case vect_scalar_var:
4530 prefix = "stmp";
4531 break;
4532 case vect_pointer_var:
4533 prefix = "vectp";
4534 break;
4535 default:
4536 gcc_unreachable ();
4537 }
4538
4539 if (name)
4540 {
4541 char* tmp = concat (prefix, "_", name, NULL);
4542 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4543 free (tmp);
4544 }
4545 else
4546 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4547
4548 return new_vect_var;
4549 }
4550
4551 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4552
4553 static void
4554 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4555 {
4556 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4557 int misalign = DR_MISALIGNMENT (dr);
4558 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4559 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4560 else
4561 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4562 DR_TARGET_ALIGNMENT (dr), misalign);
4563 }
4564
4565 /* Function vect_create_addr_base_for_vector_ref.
4566
4567 Create an expression that computes the address of the first memory location
4568 that will be accessed for a data reference.
4569
4570 Input:
4571 STMT: The statement containing the data reference.
4572 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4573 OFFSET: Optional. If supplied, it is added to the initial address.
4574 LOOP: Specify the loop-nest relative to which the address should be computed.
4575 For example, when the dataref is in an inner-loop nested in an
4576 outer-loop that is now being vectorized, LOOP can be either the
4577 outer-loop, or the inner-loop. The first memory location accessed
4578 by the following dataref ('in' points to short):
4579
4580 for (i=0; i<N; i++)
4581 for (j=0; j<M; j++)
4582 s += in[i+j]
4583
4584 is as follows:
4585 if LOOP=i_loop: &in (relative to i_loop)
4586 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4587 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4588 initial address. Unlike OFFSET, which is a number of elements to
4589 be added, BYTE_OFFSET is measured in bytes.
4590
4591 Output:
4592 1. Return an SSA_NAME whose value is the address of the memory location of
4593 the first vector of the data reference.
4594 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4595 these statement(s) which define the returned SSA_NAME.
4596
4597 FORNOW: We are only handling array accesses with step 1. */
4598
4599 tree
4600 vect_create_addr_base_for_vector_ref (gimple *stmt,
4601 gimple_seq *new_stmt_list,
4602 tree offset,
4603 tree byte_offset)
4604 {
4605 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4606 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4607 const char *base_name;
4608 tree addr_base;
4609 tree dest;
4610 gimple_seq seq = NULL;
4611 tree vect_ptr_type;
4612 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4613 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4614 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4615
4616 tree data_ref_base = unshare_expr (drb->base_address);
4617 tree base_offset = unshare_expr (drb->offset);
4618 tree init = unshare_expr (drb->init);
4619
4620 if (loop_vinfo)
4621 base_name = get_name (data_ref_base);
4622 else
4623 {
4624 base_offset = ssize_int (0);
4625 init = ssize_int (0);
4626 base_name = get_name (DR_REF (dr));
4627 }
4628
4629 /* Create base_offset */
4630 base_offset = size_binop (PLUS_EXPR,
4631 fold_convert (sizetype, base_offset),
4632 fold_convert (sizetype, init));
4633
4634 if (offset)
4635 {
4636 offset = fold_build2 (MULT_EXPR, sizetype,
4637 fold_convert (sizetype, offset), step);
4638 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4639 base_offset, offset);
4640 }
4641 if (byte_offset)
4642 {
4643 byte_offset = fold_convert (sizetype, byte_offset);
4644 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4645 base_offset, byte_offset);
4646 }
4647
4648 /* base + base_offset */
4649 if (loop_vinfo)
4650 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4651 else
4652 {
4653 addr_base = build1 (ADDR_EXPR,
4654 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4655 unshare_expr (DR_REF (dr)));
4656 }
4657
4658 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4659 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4660 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4661 gimple_seq_add_seq (new_stmt_list, seq);
4662
4663 if (DR_PTR_INFO (dr)
4664 && TREE_CODE (addr_base) == SSA_NAME
4665 && !SSA_NAME_PTR_INFO (addr_base))
4666 {
4667 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4668 if (offset || byte_offset)
4669 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4670 }
4671
4672 if (dump_enabled_p ())
4673 {
4674 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4675 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4676 dump_printf (MSG_NOTE, "\n");
4677 }
4678
4679 return addr_base;
4680 }
4681
4682
4683 /* Function vect_create_data_ref_ptr.
4684
4685 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4686 location accessed in the loop by STMT, along with the def-use update
4687 chain to appropriately advance the pointer through the loop iterations.
4688 Also set aliasing information for the pointer. This pointer is used by
4689 the callers to this function to create a memory reference expression for
4690 vector load/store access.
4691
4692 Input:
4693 1. STMT: a stmt that references memory. Expected to be of the form
4694 GIMPLE_ASSIGN <name, data-ref> or
4695 GIMPLE_ASSIGN <data-ref, name>.
4696 2. AGGR_TYPE: the type of the reference, which should be either a vector
4697 or an array.
4698 3. AT_LOOP: the loop where the vector memref is to be created.
4699 4. OFFSET (optional): an offset to be added to the initial address accessed
4700 by the data-ref in STMT.
4701 5. BSI: location where the new stmts are to be placed if there is no loop
4702 6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
4703 pointing to the initial address.
4704 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4705 to the initial address accessed by the data-ref in STMT. This is
4706 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4707 in bytes.
4708 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4709 to the IV during each iteration of the loop. NULL says to move
4710 by one copy of AGGR_TYPE up or down, depending on the step of the
4711 data reference.
4712
4713 Output:
4714 1. Declare a new ptr to vector_type, and have it point to the base of the
4715 data reference (the initial address accessed by the data reference).
4716 For example, for vector of type V8HI, the following code is generated:
4717
4718 v8hi *ap;
4719 ap = (v8hi *)initial_address;
4720
4721 if OFFSET is not supplied:
4722 initial_address = &a[init];
4723 if OFFSET is supplied:
4724 initial_address = &a[init + OFFSET];
4725 if BYTE_OFFSET is supplied:
4726 initial_address = &a[init] + BYTE_OFFSET;
4727
4728 Return the initial_address in INITIAL_ADDRESS.
4729
4730 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4731 update the pointer in each iteration of the loop.
4732
4733 Return the increment stmt that updates the pointer in PTR_INCR.
4734
4735 3. Set INV_P to true if the access pattern of the data reference in the
4736 vectorized loop is invariant. Set it to false otherwise.
4737
4738 4. Return the pointer. */
4739
4740 tree
4741 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4742 tree offset, tree *initial_address,
4743 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4744 bool only_init, bool *inv_p, tree byte_offset,
4745 tree iv_step)
4746 {
4747 const char *base_name;
4748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4749 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4750 struct loop *loop = NULL;
4751 bool nested_in_vect_loop = false;
4752 struct loop *containing_loop = NULL;
4753 tree aggr_ptr_type;
4754 tree aggr_ptr;
4755 tree new_temp;
4756 gimple_seq new_stmt_list = NULL;
4757 edge pe = NULL;
4758 basic_block new_bb;
4759 tree aggr_ptr_init;
4760 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4761 tree aptr;
4762 gimple_stmt_iterator incr_gsi;
4763 bool insert_after;
4764 tree indx_before_incr, indx_after_incr;
4765 gimple *incr;
4766 tree step;
4767 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4768
4769 gcc_assert (iv_step != NULL_TREE
4770 || TREE_CODE (aggr_type) == ARRAY_TYPE
4771 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4772
4773 if (loop_vinfo)
4774 {
4775 loop = LOOP_VINFO_LOOP (loop_vinfo);
4776 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4777 containing_loop = (gimple_bb (stmt))->loop_father;
4778 pe = loop_preheader_edge (loop);
4779 }
4780 else
4781 {
4782 gcc_assert (bb_vinfo);
4783 only_init = true;
4784 *ptr_incr = NULL;
4785 }
4786
4787 /* Check the step (evolution) of the load in LOOP, and record
4788 whether it's invariant. */
4789 step = vect_dr_behavior (dr)->step;
4790 if (integer_zerop (step))
4791 *inv_p = true;
4792 else
4793 *inv_p = false;
4794
4795 /* Create an expression for the first address accessed by this load
4796 in LOOP. */
4797 base_name = get_name (DR_BASE_ADDRESS (dr));
4798
4799 if (dump_enabled_p ())
4800 {
4801 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4802 dump_printf_loc (MSG_NOTE, vect_location,
4803 "create %s-pointer variable to type: ",
4804 get_tree_code_name (TREE_CODE (aggr_type)));
4805 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4806 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4807 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4808 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4809 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4810 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4811 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4812 else
4813 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4814 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4815 dump_printf (MSG_NOTE, "\n");
4816 }
4817
4818 /* (1) Create the new aggregate-pointer variable.
4819 Vector and array types inherit the alias set of their component
4820 type by default so we need to use a ref-all pointer if the data
4821 reference does not conflict with the created aggregated data
4822 reference because it is not addressable. */
4823 bool need_ref_all = false;
4824 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4825 get_alias_set (DR_REF (dr))))
4826 need_ref_all = true;
4827 /* Likewise for any of the data references in the stmt group. */
4828 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4829 {
4830 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4831 do
4832 {
4833 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4834 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4835 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4836 get_alias_set (DR_REF (sdr))))
4837 {
4838 need_ref_all = true;
4839 break;
4840 }
4841 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4842 }
4843 while (orig_stmt);
4844 }
4845 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4846 need_ref_all);
4847 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4848
4849
4850 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4851 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4852 def-use update cycles for the pointer: one relative to the outer-loop
4853 (LOOP), which is what steps (3) and (4) below do. The other is relative
4854 to the inner-loop (which is the inner-most loop containing the dataref),
4855 and this is done by step (5) below.
4856
4857 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4858 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4859 redundant. Steps (3),(4) create the following:
4860
4861 vp0 = &base_addr;
4862 LOOP: vp1 = phi(vp0,vp2)
4863 ...
4864 ...
4865 vp2 = vp1 + step
4866 goto LOOP
4867
4868 If there is an inner-loop nested in loop, then step (5) will also be
4869 applied, and an additional update in the inner-loop will be created:
4870
4871 vp0 = &base_addr;
4872 LOOP: vp1 = phi(vp0,vp2)
4873 ...
4874 inner: vp3 = phi(vp1,vp4)
4875 vp4 = vp3 + inner_step
4876 if () goto inner
4877 ...
4878 vp2 = vp1 + step
4879 if () goto LOOP */
4880
4881 /* (2) Calculate the initial address of the aggregate-pointer, and set
4882 the aggregate-pointer to point to it before the loop. */
4883
4884 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4885
4886 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4887 offset, byte_offset);
4888 if (new_stmt_list)
4889 {
4890 if (pe)
4891 {
4892 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4893 gcc_assert (!new_bb);
4894 }
4895 else
4896 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4897 }
4898
4899 *initial_address = new_temp;
4900 aggr_ptr_init = new_temp;
4901
4902 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4903 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4904 inner-loop nested in LOOP (during outer-loop vectorization). */
4905
4906 /* No update in loop is required. */
4907 if (only_init && (!loop_vinfo || at_loop == loop))
4908 aptr = aggr_ptr_init;
4909 else
4910 {
4911 if (iv_step == NULL_TREE)
4912 {
4913 /* The step of the aggregate pointer is the type size. */
4914 iv_step = TYPE_SIZE_UNIT (aggr_type);
4915 /* One exception to the above is when the scalar step of the load in
4916 LOOP is zero. In this case the step here is also zero. */
4917 if (*inv_p)
4918 iv_step = size_zero_node;
4919 else if (tree_int_cst_sgn (step) == -1)
4920 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4921 }
4922
4923 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4924
4925 create_iv (aggr_ptr_init,
4926 fold_convert (aggr_ptr_type, iv_step),
4927 aggr_ptr, loop, &incr_gsi, insert_after,
4928 &indx_before_incr, &indx_after_incr);
4929 incr = gsi_stmt (incr_gsi);
4930 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4931
4932 /* Copy the points-to information if it exists. */
4933 if (DR_PTR_INFO (dr))
4934 {
4935 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4936 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4937 }
4938 if (ptr_incr)
4939 *ptr_incr = incr;
4940
4941 aptr = indx_before_incr;
4942 }
4943
4944 if (!nested_in_vect_loop || only_init)
4945 return aptr;
4946
4947
4948 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4949      nested in LOOP, if it exists.  */
4950
4951 gcc_assert (nested_in_vect_loop);
4952 if (!only_init)
4953 {
4954 standard_iv_increment_position (containing_loop, &incr_gsi,
4955 &insert_after);
4956 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4957 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4958 &indx_after_incr);
4959 incr = gsi_stmt (incr_gsi);
4960 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4961
4962 /* Copy the points-to information if it exists. */
4963 if (DR_PTR_INFO (dr))
4964 {
4965 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4966 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4967 }
4968 if (ptr_incr)
4969 *ptr_incr = incr;
4970
4971 return indx_before_incr;
4972 }
4973 else
4974 gcc_unreachable ();
4975 }
4976
4977
4978 /* Function bump_vector_ptr
4979
4980 Increment a pointer (to a vector type) by vector-size. If requested,
4981 i.e. if PTR-INCR is given, then also connect the new increment stmt
4982 to the existing def-use update-chain of the pointer, by modifying
4983 the PTR_INCR as illustrated below:
4984
4985 The pointer def-use update-chain before this function:
4986 DATAREF_PTR = phi (p_0, p_2)
4987 ....
4988 PTR_INCR: p_2 = DATAREF_PTR + step
4989
4990 The pointer def-use update-chain after this function:
4991 DATAREF_PTR = phi (p_0, p_2)
4992 ....
4993 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4994 ....
4995 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4996
4997 Input:
4998 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4999 in the loop.
5000 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5001 the loop. The increment amount across iterations is expected
5002 to be vector_size.
5003 BSI - location where the new update stmt is to be placed.
5004 STMT - the original scalar memory-access stmt that is being vectorized.
5005 BUMP - optional. The offset by which to bump the pointer. If not given,
5006 the offset is assumed to be vector_size.
5007
5008 Output: Return NEW_DATAREF_PTR as illustrated above.
5009
5010 */
5011
5012 tree
5013 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5014 gimple *stmt, tree bump)
5015 {
5016 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5017 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5018 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5019 tree update = TYPE_SIZE_UNIT (vectype);
5020 gassign *incr_stmt;
5021 ssa_op_iter iter;
5022 use_operand_p use_p;
5023 tree new_dataref_ptr;
5024
5025 if (bump)
5026 update = bump;
5027
5028 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5029 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5030 else
5031 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5032 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5033 dataref_ptr, update);
5034 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
5035
5036 /* Copy the points-to information if it exists. */
5037 if (DR_PTR_INFO (dr))
5038 {
5039 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5040 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5041 }
5042
5043 if (!ptr_incr)
5044 return new_dataref_ptr;
5045
5046 /* Update the vector-pointer's cross-iteration increment. */
5047 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5048 {
5049 tree use = USE_FROM_PTR (use_p);
5050
5051 if (use == dataref_ptr)
5052 SET_USE (use_p, new_dataref_ptr);
5053 else
5054 gcc_assert (operand_equal_p (use, update, 0));
5055 }
5056
5057 return new_dataref_ptr;
5058 }
5059
5060
5061 /* Copy memory reference info such as base/clique from the SRC reference
5062 to the DEST MEM_REF. */
5063
5064 void
5065 vect_copy_ref_info (tree dest, tree src)
5066 {
5067 if (TREE_CODE (dest) != MEM_REF)
5068 return;
5069
5070 tree src_base = src;
5071 while (handled_component_p (src_base))
5072 src_base = TREE_OPERAND (src_base, 0);
5073 if (TREE_CODE (src_base) != MEM_REF
5074 && TREE_CODE (src_base) != TARGET_MEM_REF)
5075 return;
5076
5077 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5078 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5079 }
5080
5081
5082 /* Function vect_create_destination_var.
5083
5084 Create a new temporary of type VECTYPE. */
5085
5086 tree
5087 vect_create_destination_var (tree scalar_dest, tree vectype)
5088 {
5089 tree vec_dest;
5090 const char *name;
5091 char *new_name;
5092 tree type;
5093 enum vect_var_kind kind;
5094
5095 kind = vectype
5096 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5097 ? vect_mask_var
5098 : vect_simple_var
5099 : vect_scalar_var;
5100 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5101
5102 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5103
5104 name = get_name (scalar_dest);
5105 if (name)
5106 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5107 else
5108 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5109 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5110 free (new_name);
5111
5112 return vec_dest;
5113 }
5114
5115 /* Function vect_grouped_store_supported.
5116
5117 Returns TRUE if interleave high and interleave low permutations
5118 are supported, and FALSE otherwise. */
5119
5120 bool
5121 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5122 {
5123 machine_mode mode = TYPE_MODE (vectype);
5124
5125 /* vect_permute_store_chain requires the group size to be equal to 3 or
5126 be a power of two. */
5127 if (count != 3 && exact_log2 (count) == -1)
5128 {
5129 if (dump_enabled_p ())
5130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5131 "the size of the group of accesses"
5132 			     " is not a power of 2 or not equal to 3\n");
5133 return false;
5134 }
5135
5136 /* Check that the permutation is supported. */
5137 if (VECTOR_MODE_P (mode))
5138 {
5139 unsigned int i;
5140 if (count == 3)
5141 {
5142 unsigned int j0 = 0, j1 = 0, j2 = 0;
5143 unsigned int i, j;
5144
5145 unsigned int nelt;
5146 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5147 {
5148 if (dump_enabled_p ())
5149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5150 "cannot handle groups of 3 stores for"
5151 " variable-length vectors\n");
5152 return false;
5153 }
5154
5155 vec_perm_builder sel (nelt, nelt, 1);
5156 sel.quick_grow (nelt);
5157 vec_perm_indices indices;
5158 for (j = 0; j < 3; j++)
5159 {
5160 int nelt0 = ((3 - j) * nelt) % 3;
5161 int nelt1 = ((3 - j) * nelt + 1) % 3;
5162 int nelt2 = ((3 - j) * nelt + 2) % 3;
5163 for (i = 0; i < nelt; i++)
5164 {
5165 if (3 * i + nelt0 < nelt)
5166 sel[3 * i + nelt0] = j0++;
5167 if (3 * i + nelt1 < nelt)
5168 sel[3 * i + nelt1] = nelt + j1++;
5169 if (3 * i + nelt2 < nelt)
5170 sel[3 * i + nelt2] = 0;
5171 }
5172 indices.new_vector (sel, 2, nelt);
5173 if (!can_vec_perm_const_p (mode, indices))
5174 {
5175 if (dump_enabled_p ())
5176 dump_printf (MSG_MISSED_OPTIMIZATION,
5177 "permutation op not supported by target.\n");
5178 return false;
5179 }
5180
5181 for (i = 0; i < nelt; i++)
5182 {
5183 if (3 * i + nelt0 < nelt)
5184 sel[3 * i + nelt0] = 3 * i + nelt0;
5185 if (3 * i + nelt1 < nelt)
5186 sel[3 * i + nelt1] = 3 * i + nelt1;
5187 if (3 * i + nelt2 < nelt)
5188 sel[3 * i + nelt2] = nelt + j2++;
5189 }
5190 indices.new_vector (sel, 2, nelt);
5191 if (!can_vec_perm_const_p (mode, indices))
5192 {
5193 if (dump_enabled_p ())
5194 dump_printf (MSG_MISSED_OPTIMIZATION,
5195 "permutation op not supported by target.\n");
5196 return false;
5197 }
5198 }
5199 return true;
5200 }
5201 else
5202 {
5203       /* If length is not equal to 3 then only a power of 2 is supported.  */
5204 gcc_assert (pow2p_hwi (count));
5205 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5206
5207 /* The encoding has 2 interleaved stepped patterns. */
5208 vec_perm_builder sel (nelt, 2, 3);
5209 sel.quick_grow (6);
5210 for (i = 0; i < 3; i++)
5211 {
5212 sel[i * 2] = i;
5213 sel[i * 2 + 1] = i + nelt;
5214 }
5215 vec_perm_indices indices (sel, 2, nelt);
5216 if (can_vec_perm_const_p (mode, indices))
5217 {
5218 for (i = 0; i < 6; i++)
5219 sel[i] += exact_div (nelt, 2);
5220 indices.new_vector (sel, 2, nelt);
5221 if (can_vec_perm_const_p (mode, indices))
5222 return true;
5223 }
5224 }
5225 }
5226
5227 if (dump_enabled_p ())
5228 dump_printf (MSG_MISSED_OPTIMIZATION,
5229 		 "permutation op not supported by target.\n");
5230 return false;
5231 }
5232
5233
5234 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5235 type VECTYPE. MASKED_P says whether the masked form is needed. */
5236
5237 bool
5238 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5239 bool masked_p)
5240 {
5241 if (masked_p)
5242 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5243 vec_mask_store_lanes_optab,
5244 vectype, count);
5245 else
5246 return vect_lanes_optab_supported_p ("vec_store_lanes",
5247 vec_store_lanes_optab,
5248 vectype, count);
5249 }
5250
5251
5252 /* Function vect_permute_store_chain.
5253
5254 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5255 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5256 the data correctly for the stores. Return the final references for stores
5257 in RESULT_CHAIN.
5258
5259 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5260 The input is 4 vectors each containing 8 elements. We assign a number to
5261 each element, the input sequence is:
5262
5263 1st vec: 0 1 2 3 4 5 6 7
5264 2nd vec: 8 9 10 11 12 13 14 15
5265 3rd vec: 16 17 18 19 20 21 22 23
5266 4th vec: 24 25 26 27 28 29 30 31
5267
5268 The output sequence should be:
5269
5270 1st vec: 0 8 16 24 1 9 17 25
5271 2nd vec: 2 10 18 26 3 11 19 27
5272    3rd vec:  4 12 20 28  5 13 21 29
5273 4th vec: 6 14 22 30 7 15 23 31
5274
5275 i.e., we interleave the contents of the four vectors in their order.
5276
5277 We use interleave_high/low instructions to create such output. The input of
5278 each interleave_high/low operation is two vectors:
5279 1st vec 2nd vec
5280 0 1 2 3 4 5 6 7
5281 the even elements of the result vector are obtained left-to-right from the
5282 high/low elements of the first vector. The odd elements of the result are
5283 obtained left-to-right from the high/low elements of the second vector.
5284 The output of interleave_high will be: 0 4 1 5
5285 and of interleave_low: 2 6 3 7
5286
5287
5288 The permutation is done in log LENGTH stages. In each stage interleave_high
5289 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5290 where the first argument is taken from the first half of DR_CHAIN and the
5291    second argument from its second half.
5292 In our example,
5293
5294 I1: interleave_high (1st vec, 3rd vec)
5295 I2: interleave_low (1st vec, 3rd vec)
5296 I3: interleave_high (2nd vec, 4th vec)
5297 I4: interleave_low (2nd vec, 4th vec)
5298
5299 The output for the first stage is:
5300
5301 I1: 0 16 1 17 2 18 3 19
5302 I2: 4 20 5 21 6 22 7 23
5303 I3: 8 24 9 25 10 26 11 27
5304 I4: 12 28 13 29 14 30 15 31
5305
5306 The output of the second stage, i.e. the final result is:
5307
5308 I1: 0 8 16 24 1 9 17 25
5309 I2: 2 10 18 26 3 11 19 27
5310    I3:  4 12 20 28  5 13 21 29
5311 I4: 6 14 22 30 7 15 23 31. */
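
/* Illustrative sketch only (not GCC code): the net effect of the stages
   above on a group of LENGTH vectors of NELT elements each can be modelled
   on plain arrays as a transposition of the element numbering: output
   position j * LENGTH + v receives input element v * NELT + j.  A
   hypothetical stand-alone model:

     void
     interleave_model (const int *in, int *out,
		       unsigned int length, unsigned int nelt)
     {
       for (unsigned int j = 0; j < nelt; j++)
	 for (unsigned int v = 0; v < length; v++)
	   out[j * length + v] = in[v * nelt + j];
     }

   With LENGTH == 4, NELT == 8 and in[i] == i this produces exactly the
   output sequence listed above.  */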
5312
5313 void
5314 vect_permute_store_chain (vec<tree> dr_chain,
5315 unsigned int length,
5316 gimple *stmt,
5317 gimple_stmt_iterator *gsi,
5318 vec<tree> *result_chain)
5319 {
5320 tree vect1, vect2, high, low;
5321 gimple *perm_stmt;
5322 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5323 tree perm_mask_low, perm_mask_high;
5324 tree data_ref;
5325 tree perm3_mask_low, perm3_mask_high;
5326 unsigned int i, j, n, log_length = exact_log2 (length);
5327
5328 result_chain->quick_grow (length);
5329 memcpy (result_chain->address (), dr_chain.address (),
5330 length * sizeof (tree));
5331
5332 if (length == 3)
5333 {
5334 /* vect_grouped_store_supported ensures that this is constant. */
5335 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5336 unsigned int j0 = 0, j1 = 0, j2 = 0;
5337
5338 vec_perm_builder sel (nelt, nelt, 1);
5339 sel.quick_grow (nelt);
5340 vec_perm_indices indices;
5341 for (j = 0; j < 3; j++)
5342 {
5343 int nelt0 = ((3 - j) * nelt) % 3;
5344 int nelt1 = ((3 - j) * nelt + 1) % 3;
5345 int nelt2 = ((3 - j) * nelt + 2) % 3;
5346
5347 for (i = 0; i < nelt; i++)
5348 {
5349 if (3 * i + nelt0 < nelt)
5350 sel[3 * i + nelt0] = j0++;
5351 if (3 * i + nelt1 < nelt)
5352 sel[3 * i + nelt1] = nelt + j1++;
5353 if (3 * i + nelt2 < nelt)
5354 sel[3 * i + nelt2] = 0;
5355 }
5356 indices.new_vector (sel, 2, nelt);
5357 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5358
5359 for (i = 0; i < nelt; i++)
5360 {
5361 if (3 * i + nelt0 < nelt)
5362 sel[3 * i + nelt0] = 3 * i + nelt0;
5363 if (3 * i + nelt1 < nelt)
5364 sel[3 * i + nelt1] = 3 * i + nelt1;
5365 if (3 * i + nelt2 < nelt)
5366 sel[3 * i + nelt2] = nelt + j2++;
5367 }
5368 indices.new_vector (sel, 2, nelt);
5369 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5370
5371 vect1 = dr_chain[0];
5372 vect2 = dr_chain[1];
5373
5374 /* Create interleaving stmt:
5375 low = VEC_PERM_EXPR <vect1, vect2,
5376 {j, nelt, *, j + 1, nelt + j + 1, *,
5377 j + 2, nelt + j + 2, *, ...}> */
5378 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5379 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5380 vect2, perm3_mask_low);
5381 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5382
5383 vect1 = data_ref;
5384 vect2 = dr_chain[2];
5385 /* Create interleaving stmt:
5386 low = VEC_PERM_EXPR <vect1, vect2,
5387 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5388 6, 7, nelt + j + 2, ...}> */
5389 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5390 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5391 vect2, perm3_mask_high);
5392 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5393 (*result_chain)[j] = data_ref;
5394 }
5395 }
5396 else
5397 {
5398       /* If length is not equal to 3 then only a power of 2 is supported.  */
5399 gcc_assert (pow2p_hwi (length));
5400
5401 /* The encoding has 2 interleaved stepped patterns. */
5402 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5403 vec_perm_builder sel (nelt, 2, 3);
5404 sel.quick_grow (6);
5405 for (i = 0; i < 3; i++)
5406 {
5407 sel[i * 2] = i;
5408 sel[i * 2 + 1] = i + nelt;
5409 }
5410 vec_perm_indices indices (sel, 2, nelt);
5411 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5412
5413 for (i = 0; i < 6; i++)
5414 sel[i] += exact_div (nelt, 2);
5415 indices.new_vector (sel, 2, nelt);
5416 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5417
5418 for (i = 0, n = log_length; i < n; i++)
5419 {
5420 for (j = 0; j < length/2; j++)
5421 {
5422 vect1 = dr_chain[j];
5423 vect2 = dr_chain[j+length/2];
5424
5425 /* Create interleaving stmt:
5426 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5427 ...}> */
5428 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5429 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5430 vect2, perm_mask_high);
5431 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5432 (*result_chain)[2*j] = high;
5433
5434 /* Create interleaving stmt:
5435 low = VEC_PERM_EXPR <vect1, vect2,
5436 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5437 ...}> */
5438 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5439 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5440 vect2, perm_mask_low);
5441 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5442 (*result_chain)[2*j+1] = low;
5443 }
5444 memcpy (dr_chain.address (), result_chain->address (),
5445 length * sizeof (tree));
5446 }
5447 }
5448 }
5449
5450 /* Function vect_setup_realignment
5451
5452 This function is called when vectorizing an unaligned load using
5453 the dr_explicit_realign[_optimized] scheme.
5454 This function generates the following code at the loop prolog:
5455
5456 p = initial_addr;
5457 x msq_init = *(floor(p)); # prolog load
5458 realignment_token = call target_builtin;
5459 loop:
5460 x msq = phi (msq_init, ---)
5461
5462 The stmts marked with x are generated only for the case of
5463 dr_explicit_realign_optimized.
5464
5465 The code above sets up a new (vector) pointer, pointing to the first
5466 location accessed by STMT, and a "floor-aligned" load using that pointer.
5467 It also generates code to compute the "realignment-token" (if the relevant
5468 target hook was defined), and creates a phi-node at the loop-header bb
5469 whose arguments are the result of the prolog-load (created by this
5470 function) and the result of a load that takes place in the loop (to be
5471 created by the caller to this function).
5472
5473 For the case of dr_explicit_realign_optimized:
5474 The caller to this function uses the phi-result (msq) to create the
5475 realignment code inside the loop, and sets up the missing phi argument,
5476 as follows:
5477 loop:
5478 msq = phi (msq_init, lsq)
5479 lsq = *(floor(p')); # load in loop
5480 result = realign_load (msq, lsq, realignment_token);
5481
5482 For the case of dr_explicit_realign:
5483 loop:
5484 msq = *(floor(p)); # load in loop
5485 p' = p + (VS-1);
5486 lsq = *(floor(p')); # load in loop
5487 result = realign_load (msq, lsq, realignment_token);
5488
5489 Input:
5490 STMT - (scalar) load stmt to be vectorized. This load accesses
5491 a memory location that may be unaligned.
5492 BSI - place where new code is to be inserted.
5493 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5494 is used.
5495
5496 Output:
5497 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5498 target hook, if defined.
5499 Return value - the result of the loop-header phi node. */
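
/* Illustrative sketch only (an assumed model, not the target's actual
   REALIGN_LOAD semantics): the scheme above can be thought of as reading
   VS bytes from an unaligned address P by combining the two aligned loads
   at floor(P) and floor(P + VS - 1) and selecting a VS-byte window
   according to the misalignment.  On plain bytes, with a hypothetical VS:

     #include <stdint.h>
     #include <string.h>

     #define VS 16

     void
     realign_load_model (const unsigned char *p, unsigned char out[VS])
     {
       const unsigned char *lo
	 = (const unsigned char *) ((uintptr_t) p & ~(uintptr_t) (VS - 1));
       unsigned int misalign = p - lo;	// what the realignment token encodes
       unsigned char msq[VS], lsq[VS];

       memcpy (msq, lo, VS);		// aligned load at floor (p)
       memcpy (lsq, lo + VS, VS);	// aligned load at floor (p + VS - 1)
       for (unsigned int i = 0; i < VS; i++)
	 out[i] = (misalign + i < VS
		   ? msq[misalign + i] : lsq[misalign + i - VS]);
     }

   (When P is already aligned the second load is unnecessary; the real
   scheme reuses the first load in that case.)  */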
5500
5501 tree
5502 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5503 tree *realignment_token,
5504 enum dr_alignment_support alignment_support_scheme,
5505 tree init_addr,
5506 struct loop **at_loop)
5507 {
5508 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5509 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5510 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5511 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5512 struct loop *loop = NULL;
5513 edge pe = NULL;
5514 tree scalar_dest = gimple_assign_lhs (stmt);
5515 tree vec_dest;
5516 gimple *inc;
5517 tree ptr;
5518 tree data_ref;
5519 basic_block new_bb;
5520 tree msq_init = NULL_TREE;
5521 tree new_temp;
5522 gphi *phi_stmt;
5523 tree msq = NULL_TREE;
5524 gimple_seq stmts = NULL;
5525 bool inv_p;
5526 bool compute_in_loop = false;
5527 bool nested_in_vect_loop = false;
5528 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5529 struct loop *loop_for_initial_load = NULL;
5530
5531 if (loop_vinfo)
5532 {
5533 loop = LOOP_VINFO_LOOP (loop_vinfo);
5534 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5535 }
5536
5537 gcc_assert (alignment_support_scheme == dr_explicit_realign
5538 || alignment_support_scheme == dr_explicit_realign_optimized);
5539
5540 /* We need to generate three things:
5541 1. the misalignment computation
5542 2. the extra vector load (for the optimized realignment scheme).
5543 3. the phi node for the two vectors from which the realignment is
5544 done (for the optimized realignment scheme). */
5545
5546 /* 1. Determine where to generate the misalignment computation.
5547
5548 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5549 calculation will be generated by this function, outside the loop (in the
5550      preheader). Otherwise, INIT_ADDR has already been computed for us by the
5551 caller, inside the loop.
5552
5553 Background: If the misalignment remains fixed throughout the iterations of
5554 the loop, then both realignment schemes are applicable, and also the
5555 misalignment computation can be done outside LOOP. This is because we are
5556 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5557 are a multiple of VS (the Vector Size), and therefore the misalignment in
5558 different vectorized LOOP iterations is always the same.
5559 The problem arises only if the memory access is in an inner-loop nested
5560 inside LOOP, which is now being vectorized using outer-loop vectorization.
5561 This is the only case when the misalignment of the memory access may not
5562 remain fixed throughout the iterations of the inner-loop (as explained in
5563 detail in vect_supportable_dr_alignment). In this case, not only is the
5564 optimized realignment scheme not applicable, but also the misalignment
5565 computation (and generation of the realignment token that is passed to
5566 REALIGN_LOAD) have to be done inside the loop.
5567
5568 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5569 or not, which in turn determines if the misalignment is computed inside
5570 the inner-loop, or outside LOOP. */
5571
5572 if (init_addr != NULL_TREE || !loop_vinfo)
5573 {
5574 compute_in_loop = true;
5575 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5576 }
5577
5578
5579 /* 2. Determine where to generate the extra vector load.
5580
5581 For the optimized realignment scheme, instead of generating two vector
5582 loads in each iteration, we generate a single extra vector load in the
5583 preheader of the loop, and in each iteration reuse the result of the
5584 vector load from the previous iteration. In case the memory access is in
5585 an inner-loop nested inside LOOP, which is now being vectorized using
5586 outer-loop vectorization, we need to determine whether this initial vector
5587 load should be generated at the preheader of the inner-loop, or can be
5588 generated at the preheader of LOOP. If the memory access has no evolution
5589 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5590 to be generated inside LOOP (in the preheader of the inner-loop). */
5591
5592 if (nested_in_vect_loop)
5593 {
5594 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5595 bool invariant_in_outerloop =
5596 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5597 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5598 }
5599 else
5600 loop_for_initial_load = loop;
5601 if (at_loop)
5602 *at_loop = loop_for_initial_load;
5603
5604 if (loop_for_initial_load)
5605 pe = loop_preheader_edge (loop_for_initial_load);
5606
5607 /* 3. For the case of the optimized realignment, create the first vector
5608 load at the loop preheader. */
5609
5610 if (alignment_support_scheme == dr_explicit_realign_optimized)
5611 {
5612 /* Create msq_init = *(floor(p1)) in the loop preheader */
5613 gassign *new_stmt;
5614
5615 gcc_assert (!compute_in_loop);
5616 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5617 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5618 NULL_TREE, &init_addr, NULL, &inc,
5619 true, &inv_p);
5620 if (TREE_CODE (ptr) == SSA_NAME)
5621 new_temp = copy_ssa_name (ptr);
5622 else
5623 new_temp = make_ssa_name (TREE_TYPE (ptr));
5624 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5625 new_stmt = gimple_build_assign
5626 (new_temp, BIT_AND_EXPR, ptr,
5627 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5628 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5629 gcc_assert (!new_bb);
5630 data_ref
5631 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5632 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5633 vect_copy_ref_info (data_ref, DR_REF (dr));
5634 new_stmt = gimple_build_assign (vec_dest, data_ref);
5635 new_temp = make_ssa_name (vec_dest, new_stmt);
5636 gimple_assign_set_lhs (new_stmt, new_temp);
5637 if (pe)
5638 {
5639 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5640 gcc_assert (!new_bb);
5641 }
5642 else
5643 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5644
5645 msq_init = gimple_assign_lhs (new_stmt);
5646 }
5647
5648 /* 4. Create realignment token using a target builtin, if available.
5649 It is done either inside the containing loop, or before LOOP (as
5650 determined above). */
5651
5652 if (targetm.vectorize.builtin_mask_for_load)
5653 {
5654 gcall *new_stmt;
5655 tree builtin_decl;
5656
5657       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5658 if (!init_addr)
5659 {
5660 /* Generate the INIT_ADDR computation outside LOOP. */
5661 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5662 NULL_TREE);
5663 if (loop)
5664 {
5665 pe = loop_preheader_edge (loop);
5666 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5667 gcc_assert (!new_bb);
5668 }
5669 else
5670 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5671 }
5672
5673 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5674 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5675 vec_dest =
5676 vect_create_destination_var (scalar_dest,
5677 gimple_call_return_type (new_stmt));
5678 new_temp = make_ssa_name (vec_dest, new_stmt);
5679 gimple_call_set_lhs (new_stmt, new_temp);
5680
5681 if (compute_in_loop)
5682 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5683 else
5684 {
5685 /* Generate the misalignment computation outside LOOP. */
5686 pe = loop_preheader_edge (loop);
5687 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5688 gcc_assert (!new_bb);
5689 }
5690
5691 *realignment_token = gimple_call_lhs (new_stmt);
5692
5693 /* The result of the CALL_EXPR to this builtin is determined from
5694 the value of the parameter and no global variables are touched
5695 which makes the builtin a "const" function. Requiring the
5696 builtin to have the "const" attribute makes it unnecessary
5697 to call mark_call_clobbered. */
5698 gcc_assert (TREE_READONLY (builtin_decl));
5699 }
5700
5701 if (alignment_support_scheme == dr_explicit_realign)
5702 return msq;
5703
5704 gcc_assert (!compute_in_loop);
5705 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5706
5707
5708 /* 5. Create msq = phi <msq_init, lsq> in loop */
5709
5710 pe = loop_preheader_edge (containing_loop);
5711 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5712 msq = make_ssa_name (vec_dest);
5713 phi_stmt = create_phi_node (msq, containing_loop->header);
5714 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5715
5716 return msq;
5717 }
5718
5719
5720 /* Function vect_grouped_load_supported.
5721
5722 COUNT is the size of the load group (the number of statements plus the
5723 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5724 only one statement, with a gap of COUNT - 1.
5725
5726 Returns true if a suitable permute exists. */
5727
5728 bool
5729 vect_grouped_load_supported (tree vectype, bool single_element_p,
5730 unsigned HOST_WIDE_INT count)
5731 {
5732 machine_mode mode = TYPE_MODE (vectype);
5733
5734 /* If this is single-element interleaving with an element distance
5735      that leaves unused vector loads around, punt: we would at least create
5736 very sub-optimal code in that case (and blow up memory,
5737 see PR65518). */
5738 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5739 {
5740 if (dump_enabled_p ())
5741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5742 "single-element interleaving not supported "
5743 			 "for non-adjacent vector loads\n");
5744 return false;
5745 }
5746
5747 /* vect_permute_load_chain requires the group size to be equal to 3 or
5748 be a power of two. */
5749 if (count != 3 && exact_log2 (count) == -1)
5750 {
5751 if (dump_enabled_p ())
5752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5753 "the size of the group of accesses"
5754 " is not a power of 2 or not equal to 3\n");
5755 return false;
5756 }
5757
5758 /* Check that the permutation is supported. */
5759 if (VECTOR_MODE_P (mode))
5760 {
5761 unsigned int i, j;
5762 if (count == 3)
5763 {
5764 unsigned int nelt;
5765 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5766 {
5767 if (dump_enabled_p ())
5768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5769 "cannot handle groups of 3 loads for"
5770 " variable-length vectors\n");
5771 return false;
5772 }
5773
5774 vec_perm_builder sel (nelt, nelt, 1);
5775 sel.quick_grow (nelt);
5776 vec_perm_indices indices;
5777 unsigned int k;
5778 for (k = 0; k < 3; k++)
5779 {
5780 for (i = 0; i < nelt; i++)
5781 if (3 * i + k < 2 * nelt)
5782 sel[i] = 3 * i + k;
5783 else
5784 sel[i] = 0;
5785 indices.new_vector (sel, 2, nelt);
5786 if (!can_vec_perm_const_p (mode, indices))
5787 {
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5790 "shuffle of 3 loads is not supported by"
5791 " target\n");
5792 return false;
5793 }
5794 for (i = 0, j = 0; i < nelt; i++)
5795 if (3 * i + k < 2 * nelt)
5796 sel[i] = i;
5797 else
5798 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5799 indices.new_vector (sel, 2, nelt);
5800 if (!can_vec_perm_const_p (mode, indices))
5801 {
5802 if (dump_enabled_p ())
5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804 "shuffle of 3 loads is not supported by"
5805 " target\n");
5806 return false;
5807 }
5808 }
5809 return true;
5810 }
5811 else
5812 {
5813       /* If length is not equal to 3 then only a power of 2 is supported.  */
5814 gcc_assert (pow2p_hwi (count));
5815 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5816
5817 /* The encoding has a single stepped pattern. */
5818 vec_perm_builder sel (nelt, 1, 3);
5819 sel.quick_grow (3);
5820 for (i = 0; i < 3; i++)
5821 sel[i] = i * 2;
5822 vec_perm_indices indices (sel, 2, nelt);
5823 if (can_vec_perm_const_p (mode, indices))
5824 {
5825 for (i = 0; i < 3; i++)
5826 sel[i] = i * 2 + 1;
5827 indices.new_vector (sel, 2, nelt);
5828 if (can_vec_perm_const_p (mode, indices))
5829 return true;
5830 }
5831 }
5832 }
5833
5834 if (dump_enabled_p ())
5835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5836 "extract even/odd not supported by target\n");
5837 return false;
5838 }
5839
5840 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5841 type VECTYPE. MASKED_P says whether the masked form is needed. */
5842
5843 bool
5844 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5845 bool masked_p)
5846 {
5847 if (masked_p)
5848 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5849 vec_mask_load_lanes_optab,
5850 vectype, count);
5851 else
5852 return vect_lanes_optab_supported_p ("vec_load_lanes",
5853 vec_load_lanes_optab,
5854 vectype, count);
5855 }
5856
5857 /* Function vect_permute_load_chain.
5858
5859 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5860 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5861 the input data correctly. Return the final references for loads in
5862 RESULT_CHAIN.
5863
5864 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5865 The input is 4 vectors each containing 8 elements. We assign a number to each
5866 element, the input sequence is:
5867
5868 1st vec: 0 1 2 3 4 5 6 7
5869 2nd vec: 8 9 10 11 12 13 14 15
5870 3rd vec: 16 17 18 19 20 21 22 23
5871 4th vec: 24 25 26 27 28 29 30 31
5872
5873 The output sequence should be:
5874
5875 1st vec: 0 4 8 12 16 20 24 28
5876 2nd vec: 1 5 9 13 17 21 25 29
5877 3rd vec: 2 6 10 14 18 22 26 30
5878 4th vec: 3 7 11 15 19 23 27 31
5879
5880 i.e., the first output vector should contain the first elements of each
5881 interleaving group, etc.
5882
5883 We use extract_even/odd instructions to create such output. The input of
5884 each extract_even/odd operation is two vectors
5885 1st vec 2nd vec
5886 0 1 2 3 4 5 6 7
5887
5888 and the output is the vector of extracted even/odd elements. The output of
5889 extract_even will be: 0 2 4 6
5890 and of extract_odd: 1 3 5 7
5891
5892
5893 The permutation is done in log LENGTH stages. In each stage extract_even
5894 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5895 their order. In our example,
5896
5897 E1: extract_even (1st vec, 2nd vec)
5898 E2: extract_odd (1st vec, 2nd vec)
5899 E3: extract_even (3rd vec, 4th vec)
5900 E4: extract_odd (3rd vec, 4th vec)
5901
5902 The output for the first stage will be:
5903
5904 E1: 0 2 4 6 8 10 12 14
5905 E2: 1 3 5 7 9 11 13 15
5906 E3: 16 18 20 22 24 26 28 30
5907 E4: 17 19 21 23 25 27 29 31
5908
5909 In order to proceed and create the correct sequence for the next stage (or
5910 for the correct output, if the second stage is the last one, as in our
5911 example), we first put the output of extract_even operation and then the
5912 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5913 The input for the second stage is:
5914
5915 1st vec (E1): 0 2 4 6 8 10 12 14
5916 2nd vec (E3): 16 18 20 22 24 26 28 30
5917 3rd vec (E2): 1 3 5 7 9 11 13 15
5918 4th vec (E4): 17 19 21 23 25 27 29 31
5919
5920 The output of the second stage:
5921
5922 E1: 0 4 8 12 16 20 24 28
5923 E2: 2 6 10 14 18 22 26 30
5924 E3: 1 5 9 13 17 21 25 29
5925 E4: 3 7 11 15 19 23 27 31
5926
5927 And RESULT_CHAIN after reordering:
5928
5929 1st vec (E1): 0 4 8 12 16 20 24 28
5930 2nd vec (E3): 1 5 9 13 17 21 25 29
5931 3rd vec (E2): 2 6 10 14 18 22 26 30
5932 4th vec (E4): 3 7 11 15 19 23 27 31. */
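
/* Illustrative sketch only (not GCC code): the combined effect of the
   extract_even/odd stages is the inverse of the interleaving done for
   stores: output vector V receives every LENGTH-th element of the
   concatenated input, starting at position V.  A hypothetical stand-alone
   model on plain arrays:

     void
     deinterleave_model (const int *in, int *out,
			 unsigned int length, unsigned int nelt)
     {
       for (unsigned int v = 0; v < length; v++)
	 for (unsigned int j = 0; j < nelt; j++)
	   out[v * nelt + j] = in[j * length + v];
     }

   With LENGTH == 4, NELT == 8 and in[i] == i this yields the output
   sequence listed above (0 4 8 12 ..., 1 5 9 13 ..., and so on).  */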
5933
5934 static void
5935 vect_permute_load_chain (vec<tree> dr_chain,
5936 unsigned int length,
5937 gimple *stmt,
5938 gimple_stmt_iterator *gsi,
5939 vec<tree> *result_chain)
5940 {
5941 tree data_ref, first_vect, second_vect;
5942 tree perm_mask_even, perm_mask_odd;
5943 tree perm3_mask_low, perm3_mask_high;
5944 gimple *perm_stmt;
5945 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5946 unsigned int i, j, log_length = exact_log2 (length);
5947
5948 result_chain->quick_grow (length);
5949 memcpy (result_chain->address (), dr_chain.address (),
5950 length * sizeof (tree));
5951
5952 if (length == 3)
5953 {
5954 /* vect_grouped_load_supported ensures that this is constant. */
5955 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5956 unsigned int k;
5957
5958 vec_perm_builder sel (nelt, nelt, 1);
5959 sel.quick_grow (nelt);
5960 vec_perm_indices indices;
5961 for (k = 0; k < 3; k++)
5962 {
5963 for (i = 0; i < nelt; i++)
5964 if (3 * i + k < 2 * nelt)
5965 sel[i] = 3 * i + k;
5966 else
5967 sel[i] = 0;
5968 indices.new_vector (sel, 2, nelt);
5969 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5970
5971 for (i = 0, j = 0; i < nelt; i++)
5972 if (3 * i + k < 2 * nelt)
5973 sel[i] = i;
5974 else
5975 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5976 indices.new_vector (sel, 2, nelt);
5977 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5978
5979 first_vect = dr_chain[0];
5980 second_vect = dr_chain[1];
5981
5982 /* Create interleaving stmt (low part of):
5983 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5984 ...}> */
5985 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5986 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5987 second_vect, perm3_mask_low);
5988 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5989
5990 /* Create interleaving stmt (high part of):
5991 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5992 ...}> */
5993 first_vect = data_ref;
5994 second_vect = dr_chain[2];
5995 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5996 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5997 second_vect, perm3_mask_high);
5998 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5999 (*result_chain)[k] = data_ref;
6000 }
6001 }
6002 else
6003 {
6004       /* If length is not equal to 3 then only a power of 2 is supported.  */
6005 gcc_assert (pow2p_hwi (length));
6006
6007 /* The encoding has a single stepped pattern. */
6008 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6009 vec_perm_builder sel (nelt, 1, 3);
6010 sel.quick_grow (3);
6011 for (i = 0; i < 3; ++i)
6012 sel[i] = i * 2;
6013 vec_perm_indices indices (sel, 2, nelt);
6014 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6015
6016 for (i = 0; i < 3; ++i)
6017 sel[i] = i * 2 + 1;
6018 indices.new_vector (sel, 2, nelt);
6019 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6020
6021 for (i = 0; i < log_length; i++)
6022 {
6023 for (j = 0; j < length; j += 2)
6024 {
6025 first_vect = dr_chain[j];
6026 second_vect = dr_chain[j+1];
6027
6028 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6029 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6030 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6031 first_vect, second_vect,
6032 perm_mask_even);
6033 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6034 (*result_chain)[j/2] = data_ref;
6035
6036 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6037 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6038 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6039 first_vect, second_vect,
6040 perm_mask_odd);
6041 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6042 (*result_chain)[j/2+length/2] = data_ref;
6043 }
6044 memcpy (dr_chain.address (), result_chain->address (),
6045 length * sizeof (tree));
6046 }
6047 }
6048 }
6049
6050 /* Function vect_shift_permute_load_chain.
6051
6052 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6053 sequence of stmts to reorder the input data accordingly.
6054 Return the final references for loads in RESULT_CHAIN.
6055    Return true if it succeeded, false otherwise.
6056
6057 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6058 The input is 3 vectors each containing 8 elements. We assign a
6059 number to each element, the input sequence is:
6060
6061 1st vec: 0 1 2 3 4 5 6 7
6062 2nd vec: 8 9 10 11 12 13 14 15
6063 3rd vec: 16 17 18 19 20 21 22 23
6064
6065 The output sequence should be:
6066
6067 1st vec: 0 3 6 9 12 15 18 21
6068 2nd vec: 1 4 7 10 13 16 19 22
6069 3rd vec: 2 5 8 11 14 17 20 23
6070
6071 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6072
6073 First we shuffle all 3 vectors to get correct elements order:
6074
6075 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6076 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6077 3rd vec: (16 19 22) (17 20 23) (18 21)
6078
6079 Next we unite and shift vector 3 times:
6080
6081 1st step:
6082 shift right by 6 the concatenation of:
6083 "1st vec" and "2nd vec"
6084 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6085 "2nd vec" and "3rd vec"
6086 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6087 "3rd vec" and "1st vec"
6088 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6089 | New vectors |
6090
6091 So that now new vectors are:
6092
6093 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6094 2nd vec: (10 13) (16 19 22) (17 20 23)
6095 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6096
6097 2nd step:
6098 shift right by 5 the concatenation of:
6099 "1st vec" and "3rd vec"
6100 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6101 "2nd vec" and "1st vec"
6102 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6103 "3rd vec" and "2nd vec"
6104 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6105 | New vectors |
6106
6107 So that now new vectors are:
6108
6109 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6110 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6111 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6112
6113 3rd step:
6114 shift right by 5 the concatenation of:
6115 "1st vec" and "1st vec"
6116 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6117 shift right by 3 the concatenation of:
6118 "2nd vec" and "2nd vec"
6119 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6120 | New vectors |
6121
6122 So that now all vectors are READY:
6123 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6124 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6125 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6126
6127 This algorithm is faster than one in vect_permute_load_chain if:
6128    1. "shift of a concatenation" is faster than general permutation.
6129 This is usually so.
6130 2. The TARGET machine can't execute vector instructions in parallel.
6131       This is because each step of the algorithm depends on the previous one.
6132 The algorithm in vect_permute_load_chain is much more parallel.
6133
6134 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6135 */
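
/* Illustrative sketch only (not GCC code): each "shift right by S the
   concatenation of A and B" step above is a single
   VEC_PERM_EXPR <A, B, {S, S+1, ..., S+NELT-1}>.  A hypothetical
   stand-alone model on plain arrays:

     void
     shift_concat_model (const int *a, const int *b, unsigned int s,
			 int *out, unsigned int nelt)
     {
       for (unsigned int i = 0; i < nelt; i++)
	 out[i] = (s + i < nelt ? a[s + i] : b[s + i - nelt]);
     }

   E.g. with NELT == 8 and S == 6 the result is the last two elements of A
   followed by the first six elements of B, matching the 1st step above.  */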
6136
6137 static bool
6138 vect_shift_permute_load_chain (vec<tree> dr_chain,
6139 unsigned int length,
6140 gimple *stmt,
6141 gimple_stmt_iterator *gsi,
6142 vec<tree> *result_chain)
6143 {
6144 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6145 tree perm2_mask1, perm2_mask2, perm3_mask;
6146 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6147 gimple *perm_stmt;
6148
6149 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6150 unsigned int i;
6151 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6152 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6153
6154 unsigned HOST_WIDE_INT nelt, vf;
6155 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6156 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6157 /* Not supported for variable-length vectors. */
6158 return false;
6159
6160 vec_perm_builder sel (nelt, nelt, 1);
6161 sel.quick_grow (nelt);
6162
6163 result_chain->quick_grow (length);
6164 memcpy (result_chain->address (), dr_chain.address (),
6165 length * sizeof (tree));
6166
6167 if (pow2p_hwi (length) && vf > 4)
6168 {
6169 unsigned int j, log_length = exact_log2 (length);
6170 for (i = 0; i < nelt / 2; ++i)
6171 sel[i] = i * 2;
6172 for (i = 0; i < nelt / 2; ++i)
6173 sel[nelt / 2 + i] = i * 2 + 1;
6174 vec_perm_indices indices (sel, 2, nelt);
6175 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6176 {
6177 if (dump_enabled_p ())
6178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6179 			     "shuffle of 2 fields structure is not "
6180 			     "supported by target\n");
6181 return false;
6182 }
6183 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6184
6185 for (i = 0; i < nelt / 2; ++i)
6186 sel[i] = i * 2 + 1;
6187 for (i = 0; i < nelt / 2; ++i)
6188 sel[nelt / 2 + i] = i * 2;
6189 indices.new_vector (sel, 2, nelt);
6190 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6191 {
6192 if (dump_enabled_p ())
6193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6194 			     "shuffle of 2 fields structure is not "
6195 			     "supported by target\n");
6196 return false;
6197 }
6198 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6199
6200 /* Generating permutation constant to shift all elements.
6201 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6202 for (i = 0; i < nelt; i++)
6203 sel[i] = nelt / 2 + i;
6204 indices.new_vector (sel, 2, nelt);
6205 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6206 {
6207 if (dump_enabled_p ())
6208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6209 "shift permutation is not supported by target\n");
6210 return false;
6211 }
6212 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6213
6214 /* Generating permutation constant to select vector from 2.
6215 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6216 for (i = 0; i < nelt / 2; i++)
6217 sel[i] = i;
6218 for (i = nelt / 2; i < nelt; i++)
6219 sel[i] = nelt + i;
6220 indices.new_vector (sel, 2, nelt);
6221 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6222 {
6223 if (dump_enabled_p ())
6224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6225 "select is not supported by target\n");
6226 return false;
6227 }
6228 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6229
6230 for (i = 0; i < log_length; i++)
6231 {
6232 for (j = 0; j < length; j += 2)
6233 {
6234 first_vect = dr_chain[j];
6235 second_vect = dr_chain[j + 1];
6236
6237 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6238 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6239 first_vect, first_vect,
6240 perm2_mask1);
6241 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6242 vect[0] = data_ref;
6243
6244 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6245 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6246 second_vect, second_vect,
6247 perm2_mask2);
6248 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6249 vect[1] = data_ref;
6250
6251 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6252 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6253 vect[0], vect[1], shift1_mask);
6254 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6255 (*result_chain)[j/2 + length/2] = data_ref;
6256
6257 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6258 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6259 vect[0], vect[1], select_mask);
6260 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6261 (*result_chain)[j/2] = data_ref;
6262 }
6263 memcpy (dr_chain.address (), result_chain->address (),
6264 length * sizeof (tree));
6265 }
6266 return true;
6267 }
6268 if (length == 3 && vf > 2)
6269 {
6270 unsigned int k = 0, l = 0;
6271
6272       /* Generating permutation constant to get all elements in right order.
6273 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6274 for (i = 0; i < nelt; i++)
6275 {
6276 if (3 * k + (l % 3) >= nelt)
6277 {
6278 k = 0;
6279 l += (3 - (nelt % 3));
6280 }
6281 sel[i] = 3 * k + (l % 3);
6282 k++;
6283 }
6284 vec_perm_indices indices (sel, 2, nelt);
6285 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6286 {
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 			     "shuffle of 3 fields structure is not "
6290 			     "supported by target\n");
6291 return false;
6292 }
6293 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6294
6295 /* Generating permutation constant to shift all elements.
6296 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6297 for (i = 0; i < nelt; i++)
6298 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6299 indices.new_vector (sel, 2, nelt);
6300 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6301 {
6302 if (dump_enabled_p ())
6303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6304 "shift permutation is not supported by target\n");
6305 return false;
6306 }
6307 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6308
6309 /* Generating permutation constant to shift all elements.
6310 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6311 for (i = 0; i < nelt; i++)
6312 sel[i] = 2 * (nelt / 3) + 1 + i;
6313 indices.new_vector (sel, 2, nelt);
6314 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6315 {
6316 if (dump_enabled_p ())
6317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6318 "shift permutation is not supported by target\n");
6319 return false;
6320 }
6321 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6322
6323 /* Generating permutation constant to shift all elements.
6324 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6325 for (i = 0; i < nelt; i++)
6326 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6327 indices.new_vector (sel, 2, nelt);
6328 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6329 {
6330 if (dump_enabled_p ())
6331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6332 "shift permutation is not supported by target\n");
6333 return false;
6334 }
6335 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6336
6337 /* Generating permutation constant to shift all elements.
6338 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6339 for (i = 0; i < nelt; i++)
6340 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6341 indices.new_vector (sel, 2, nelt);
6342 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6343 {
6344 if (dump_enabled_p ())
6345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6346 "shift permutation is not supported by target\n");
6347 return false;
6348 }
6349 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6350
6351 for (k = 0; k < 3; k++)
6352 {
6353 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6354 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6355 dr_chain[k], dr_chain[k],
6356 perm3_mask);
6357 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6358 vect[k] = data_ref;
6359 }
6360
6361 for (k = 0; k < 3; k++)
6362 {
6363 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6364 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6365 vect[k % 3], vect[(k + 1) % 3],
6366 shift1_mask);
6367 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6368 vect_shift[k] = data_ref;
6369 }
6370
6371 for (k = 0; k < 3; k++)
6372 {
6373 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6374 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6375 vect_shift[(4 - k) % 3],
6376 vect_shift[(3 - k) % 3],
6377 shift2_mask);
6378 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6379 vect[k] = data_ref;
6380 }
6381
6382 (*result_chain)[3 - (nelt % 3)] = vect[2];
6383
6384 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6385 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6386 vect[0], shift3_mask);
6387 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6388 (*result_chain)[nelt % 3] = data_ref;
6389
6390 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6391 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6392 vect[1], shift4_mask);
6393 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6394 (*result_chain)[0] = data_ref;
6395 return true;
6396 }
6397 return false;
6398 }
6399
6400 /* Function vect_transform_grouped_load.
6401
6402 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6403    to perform their permutation and ascribe the resulting vectorized statements to
6404 the scalar statements.
6405 */
6406
6407 void
6408 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6409 gimple_stmt_iterator *gsi)
6410 {
6411 machine_mode mode;
6412 vec<tree> result_chain = vNULL;
6413
6414 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6415      RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6416      permuted vectors, which are ready for vector computation.  */
6417 result_chain.create (size);
6418
6419   /* If the reassociation width for the vector type is 2 or greater, the target
6420      machine can execute 2 or more vector instructions in parallel.  Otherwise
6421      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6422 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6423 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6424 || pow2p_hwi (size)
6425 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6426 gsi, &result_chain))
6427 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6428 vect_record_grouped_load_vectors (stmt, result_chain);
6429 result_chain.release ();
6430 }
6431
6432 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6433 generated as part of the vectorization of STMT. Assign the statement
6434 for each vector to the associated scalar statement. */
6435
6436 void
6437 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6438 {
6439 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6440 gimple *next_stmt, *new_stmt;
6441 unsigned int i, gap_count;
6442 tree tmp_data_ref;
6443
6444 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6445      Since we scan the chain starting from its first node, their order
6446      corresponds to the order of data-refs in RESULT_CHAIN.  */
6447 next_stmt = first_stmt;
6448 gap_count = 1;
6449 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6450 {
6451 if (!next_stmt)
6452 break;
6453
6454       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6455 code elimination pass later. No need to check for the first stmt in
6456 the group, since it always exists.
6457 GROUP_GAP is the number of steps in elements from the previous
6458 access (if there is no gap GROUP_GAP is 1). We skip loads that
6459 correspond to the gaps. */
6460 if (next_stmt != first_stmt
6461 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6462 {
6463 gap_count++;
6464 continue;
6465 }
6466
6467 while (next_stmt)
6468 {
6469 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6470 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6471 copies, and we put the new vector statement in the first available
6472 RELATED_STMT. */
6473 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6474 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6475 else
6476 {
6477 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6478 {
6479 gimple *prev_stmt =
6480 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6481 gimple *rel_stmt =
6482 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6483 while (rel_stmt)
6484 {
6485 prev_stmt = rel_stmt;
6486 rel_stmt =
6487 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6488 }
6489
6490 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6491 new_stmt;
6492 }
6493 }
6494
6495 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6496 gap_count = 1;
6497 /* If NEXT_STMT accesses the same DR as the previous statement,
6498 put the same TMP_DATA_REF as its vectorized statement; otherwise
6499 get the next data-ref from RESULT_CHAIN. */
6500 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6501 break;
6502 }
6503 }
6504 }
6505
6506 /* Function vect_can_force_dr_alignment_p.
6507 
6508    Returns whether the alignment of a DECL can be forced to be aligned
6509    on an ALIGNMENT-bit boundary.  */
6510
6511 bool
6512 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6513 {
6514 if (!VAR_P (decl))
6515 return false;
6516
6517 if (decl_in_symtab_p (decl)
6518 && !symtab_node::get (decl)->can_increase_alignment_p ())
6519 return false;
6520
6521 if (TREE_STATIC (decl))
6522 return (alignment <= MAX_OFILE_ALIGNMENT);
6523 else
6524 return (alignment <= MAX_STACK_ALIGNMENT);
6525 }
6526
6527
6528 /* Return whether the data reference DR is supported with respect to its
6529 alignment.
6530 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6531    if it is aligned, i.e., check if it is possible to vectorize it with different
6532 alignment. */
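
/* The possible answers, all of which appear in the function below, are:
   dr_aligned, dr_unaligned_supported (the target can handle the misaligned
   access directly), dr_explicit_realign and dr_explicit_realign_optimized
   (the two REALIGN_LOAD based schemes described further down), and
   dr_unaligned_unsupported.  */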

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;
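
  /* (Illustrative sketch, assuming the usual dump form of internal
     functions: such a conditional access appears in the IL roughly as
         _1 = .MASK_LOAD (ptr, align, mask_2);
     and its (mis)alignment handling is deferred entirely to the target's
     masked load/store support.)  */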

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

       for (i=0; i<N; i++)
         for (j=0; j<M; j++)
           s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

       for (i=0; i<N; i+=4)
         for (j=0; j<M; j++)
           {
             vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                            // when j is {0,1,2,3,4,5,6,7,...} respectively.
                            // (assuming that we start from an aligned address).
           }

     We therefore have to use the unoptimized realignment scheme:

       for (i=0; i<N; i+=4)
         for (j=k; j<M; j+=4)
           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                          // that the misalignment of the initial address is
                          // 0).

     The loop can then be vectorized as follows:

       for (k=0; k<4; k++)
         {
           rt = get_realignment_token (&vp[k]);
           for (i=0; i<N; i+=4)
             {
               v1 = vp[i+k];
               for (j=k; j<M; j+=4)
                 {
                   v2 = vp[i+j+VS-1];
                   va = REALIGN_LOAD <v1, v2, rt>;
                   vs += va;
                   v1 = v2;
                 }
             }
         }  */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = TREE_TYPE (DR_REF (dr));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                              * GROUP_SIZE (vinfo_for_stmt
                                            (GROUP_FIRST_ELEMENT (stmt_info))),
                              TYPE_VECTOR_SUBPARTS (vectype)))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
                                    GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = TREE_TYPE (DR_REF (dr));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
