1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60
61 static bool
vect_lanes_optab_supported_p(const char * name,convert_optab optab,tree vectype,unsigned HOST_WIDE_INT count)62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
81 }
82 }
83
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 {
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
91 }
92
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
97
98 return true;
99 }
100
101
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
			       HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  /* For conversions and widening operations the input type can be
     narrower than the result type; in that case the rhs type is the
     smallest scalar type involved.  */
  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt))
    {
      /* I is the index of the first call argument whose type we should
	 consider; ~0U means "no argument is relevant".  */
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
	{
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
	    /* gimple_expr_type already picked the type of the loaded
	       or stored data.  */
	    i = ~0U;
	  else if (internal_fn_mask_index (ifn) == 0)
	    /* Skip the mask argument in slot 0; the data arguments
	       start at 1.  */
	    i = 1;
	}
      if (i < gimple_call_num_args (call))
	{
	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
	    {
	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
	      if (rhs < lhs)
		scalar_type = rhs_type;
	    }
	}
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
176
177
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
181
182 static bool
vect_mark_for_runtime_alias_test(ddr_p ddr,loop_vec_info loop_vinfo)183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184 {
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186
187 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
188 return false;
189
190 if (!runtime_alias_check_p (ddr, loop,
191 optimize_loop_nest_for_speed_p (loop)))
192 return false;
193
194 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
195 return true;
196 }
197
198 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
199
200 static void
vect_check_nonzero_value(loop_vec_info loop_vinfo,tree value)201 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
202 {
203 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
204 for (unsigned int i = 0; i < checks.length(); ++i)
205 if (checks[i] == value)
206 return;
207
208 if (dump_enabled_p ())
209 {
210 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
211 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
212 dump_printf (MSG_NOTE, " is nonzero\n");
213 }
214 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
215 }
216
217 /* Return true if we know that the order of vectorized STMT_A and
218 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
219 At least one of the statements is a write. */
220
221 static bool
vect_preserves_scalar_order_p(gimple * stmt_a,gimple * stmt_b)222 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
223 {
224 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
225 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
226
227 /* Single statements are always kept in their original order. */
228 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
229 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
230 return true;
231
232 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
233 SLP group are emitted at the position of the last scalar load and
234 all loads in an interleaving group are emitted at the position
235 of the first scalar load.
236 Stores in a group are emitted at the position of the last scalar store.
237 Compute that position and check whether the resulting order matches
238 the current one.
239 We have not yet decided between SLP and interleaving so we have
240 to conservatively assume both. */
241 gimple *il_a;
242 gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
243 if (last_a)
244 {
245 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
246 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
247 last_a = get_later_stmt (last_a, s);
248 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 {
250 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s;
251 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
252 if (get_later_stmt (il_a, s) == il_a)
253 il_a = s;
254 }
255 else
256 il_a = last_a;
257 }
258 else
259 last_a = il_a = stmt_a;
260 gimple *il_b;
261 gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
262 if (last_b)
263 {
264 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
265 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
266 last_b = get_later_stmt (last_b, s);
267 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
268 {
269 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s;
270 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
271 if (get_later_stmt (il_b, s) == il_b)
272 il_b = s;
273 }
274 else
275 il_b = last_b;
276 }
277 else
278 last_b = il_b = stmt_b;
279 bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a);
280 return (/* SLP */
281 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
282 /* Interleaving */
283 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
284 /* Mixed */
285 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
286 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
287 }
288
289 /* A subroutine of vect_analyze_data_ref_dependence. Handle
290 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
291 distances. These distances are conservatively correct but they don't
292 reflect a guaranteed dependence.
293
294 Return true if this function does all the work necessary to avoid
295 an alias or false if the caller should use the dependence distances
296 to limit the vectorization factor in the usual way. LOOP_DEPTH is
297 the depth of the loop described by LOOP_VINFO and the other arguments
298 are as for vect_analyze_data_ref_dependence. */
299
300 static bool
vect_analyze_possibly_independent_ddr(data_dependence_relation * ddr,loop_vec_info loop_vinfo,int loop_depth,unsigned int * max_vf)301 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
302 loop_vec_info loop_vinfo,
303 int loop_depth, unsigned int *max_vf)
304 {
305 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
306 lambda_vector dist_v;
307 unsigned int i;
308 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
309 {
310 int dist = dist_v[loop_depth];
311 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
312 {
313 /* If the user asserted safelen >= DIST consecutive iterations
314 can be executed concurrently, assume independence.
315
316 ??? An alternative would be to add the alias check even
317 in this case, and vectorize the fallback loop with the
318 maximum VF set to safelen. However, if the user has
319 explicitly given a length, it's less likely that that
320 would be a win. */
321 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
322 {
323 if ((unsigned int) loop->safelen < *max_vf)
324 *max_vf = loop->safelen;
325 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 continue;
327 }
328
329 /* For dependence distances of 2 or more, we have the option
330 of limiting VF or checking for an alias at runtime.
331 Prefer to check at runtime if we can, to avoid limiting
332 the VF unnecessarily when the bases are in fact independent.
333
334 Note that the alias checks will be removed if the VF ends up
335 being small enough. */
336 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
337 }
338 }
339 return true;
340 }
341
342
343 /* Function vect_analyze_data_ref_dependence.
344
345 Return TRUE if there (might) exist a dependence between a memory-reference
346 DRA and a memory-reference DRB. When versioning for alias may check a
347 dependence at run-time, return FALSE. Adjust *MAX_VF according to
348 the data dependence. */
349
350 static bool
vect_analyze_data_ref_dependence(struct data_dependence_relation * ddr,loop_vec_info loop_vinfo,unsigned int * max_vf)351 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
352 loop_vec_info loop_vinfo,
353 unsigned int *max_vf)
354 {
355 unsigned int i;
356 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
357 struct data_reference *dra = DDR_A (ddr);
358 struct data_reference *drb = DDR_B (ddr);
359 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
360 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
361 lambda_vector dist_v;
362 unsigned int loop_depth;
363
364 /* In loop analysis all data references should be vectorizable. */
365 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
366 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
367 gcc_unreachable ();
368
369 /* Independent data accesses. */
370 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
371 return false;
372
373 if (dra == drb
374 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
375 return false;
376
377 /* We do not have to consider dependences between accesses that belong
378 to the same group, unless the stride could be smaller than the
379 group size. */
380 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
381 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
382 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
383 return false;
384
385 /* Even if we have an anti-dependence then, as the vectorized loop covers at
386 least two scalar iterations, there is always also a true dependence.
387 As the vectorizer does not re-order loads and stores we can ignore
388 the anti-dependence if TBAA can disambiguate both DRs similar to the
389 case with known negative distance anti-dependences (positive
390 distance anti-dependences would violate TBAA constraints). */
391 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
392 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
393 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
394 get_alias_set (DR_REF (drb))))
395 return false;
396
397 /* Unknown data dependence. */
398 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
399 {
400 /* If user asserted safelen consecutive iterations can be
401 executed concurrently, assume independence. */
402 if (loop->safelen >= 2)
403 {
404 if ((unsigned int) loop->safelen < *max_vf)
405 *max_vf = loop->safelen;
406 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
407 return false;
408 }
409
410 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
411 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
412 {
413 if (dump_enabled_p ())
414 {
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "versioning for alias not supported for: "
417 "can't determine dependence between ");
418 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
419 DR_REF (dra));
420 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
421 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
422 DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
424 }
425 return true;
426 }
427
428 if (dump_enabled_p ())
429 {
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 "versioning for alias required: "
432 "can't determine dependence between ");
433 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
434 DR_REF (dra));
435 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
436 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
437 DR_REF (drb));
438 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
439 }
440
441 /* Add to list of ddrs that need to be tested at run-time. */
442 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
443 }
444
445 /* Known data dependence. */
446 if (DDR_NUM_DIST_VECTS (ddr) == 0)
447 {
448 /* If user asserted safelen consecutive iterations can be
449 executed concurrently, assume independence. */
450 if (loop->safelen >= 2)
451 {
452 if ((unsigned int) loop->safelen < *max_vf)
453 *max_vf = loop->safelen;
454 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
455 return false;
456 }
457
458 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
459 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
460 {
461 if (dump_enabled_p ())
462 {
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
464 "versioning for alias not supported for: "
465 "bad dist vector for ");
466 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
467 DR_REF (dra));
468 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
469 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
470 DR_REF (drb));
471 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
472 }
473 return true;
474 }
475
476 if (dump_enabled_p ())
477 {
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "versioning for alias required: "
480 "bad dist vector for ");
481 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
482 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
483 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
484 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
485 }
486 /* Add to list of ddrs that need to be tested at run-time. */
487 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
488 }
489
490 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
491
492 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
493 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
494 loop_depth, max_vf))
495 return false;
496
497 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
498 {
499 int dist = dist_v[loop_depth];
500
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "dependence distance = %d.\n", dist);
504
505 if (dist == 0)
506 {
507 if (dump_enabled_p ())
508 {
509 dump_printf_loc (MSG_NOTE, vect_location,
510 "dependence distance == 0 between ");
511 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
512 dump_printf (MSG_NOTE, " and ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
514 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
515 }
516
517 /* When we perform grouped accesses and perform implicit CSE
518 by detecting equal accesses and doing disambiguation with
519 runtime alias tests like for
520 .. = a[i];
521 .. = a[i+1];
522 a[i] = ..;
523 a[i+1] = ..;
524 *p = ..;
525 .. = a[i];
526 .. = a[i+1];
527 where we will end up loading { a[i], a[i+1] } once, make
528 sure that inserting group loads before the first load and
529 stores after the last store will do the right thing.
530 Similar for groups like
531 a[i] = ...;
532 ... = a[i];
533 a[i+1] = ...;
534 where loads from the group interleave with the store. */
535 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
536 {
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
539 "READ_WRITE dependence in interleaving.\n");
540 return true;
541 }
542
543 if (loop->safelen < 2)
544 {
545 tree indicator = dr_zero_step_indicator (dra);
546 if (TREE_CODE (indicator) != INTEGER_CST)
547 vect_check_nonzero_value (loop_vinfo, indicator);
548 else if (integer_zerop (indicator))
549 {
550 if (dump_enabled_p ())
551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
552 "access also has a zero step\n");
553 return true;
554 }
555 }
556 continue;
557 }
558
559 if (dist > 0 && DDR_REVERSED_P (ddr))
560 {
561 /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 reversed (to make distance vector positive), and the actual
563 distance is negative. */
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "dependence distance negative.\n");
567 /* When doing outer loop vectorization, we need to check if there is
568 a backward dependence at the inner loop level if the dependence
569 at the outer loop is reversed. See PR81740. */
570 if (nested_in_vect_loop_p (loop, DR_STMT (dra))
571 || nested_in_vect_loop_p (loop, DR_STMT (drb)))
572 {
573 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 DDR_LOOP_NEST (ddr));
575 if (dist_v[inner_depth] < 0)
576 return true;
577 }
578 /* Record a negative dependence distance to later limit the
579 amount of stmt copying / unrolling we can perform.
580 Only need to handle read-after-write dependence. */
581 if (DR_IS_READ (drb)
582 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
583 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
584 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
585 continue;
586 }
587
588 unsigned int abs_dist = abs (dist);
589 if (abs_dist >= 2 && abs_dist < *max_vf)
590 {
591 /* The dependence distance requires reduction of the maximal
592 vectorization factor. */
593 *max_vf = abs_dist;
594 if (dump_enabled_p ())
595 dump_printf_loc (MSG_NOTE, vect_location,
596 "adjusting maximal vectorization factor to %i\n",
597 *max_vf);
598 }
599
600 if (abs_dist >= *max_vf)
601 {
602 /* Dependence distance does not create dependence, as far as
603 vectorization is concerned, in this case. */
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_NOTE, vect_location,
606 "dependence distance >= VF.\n");
607 continue;
608 }
609
610 if (dump_enabled_p ())
611 {
612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
613 "not vectorized, possible dependence "
614 "between data-refs ");
615 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
616 dump_printf (MSG_NOTE, " and ");
617 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
618 dump_printf (MSG_NOTE, "\n");
619 }
620
621 return true;
622 }
623
624 return false;
625 }
626
627 /* Function vect_analyze_data_ref_dependences.
628
629 Examine all the data references in the loop, and make sure there do not
630 exist any data dependences between them. Set *MAX_VF according to
631 the maximum vectorization factor the data dependences allow. */
632
633 bool
vect_analyze_data_ref_dependences(loop_vec_info loop_vinfo,unsigned int * max_vf)634 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
635 unsigned int *max_vf)
636 {
637 unsigned int i;
638 struct data_dependence_relation *ddr;
639
640 if (dump_enabled_p ())
641 dump_printf_loc (MSG_NOTE, vect_location,
642 "=== vect_analyze_data_ref_dependences ===\n");
643
644 LOOP_VINFO_DDRS (loop_vinfo)
645 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
646 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
647 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
648 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
649 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
650 &LOOP_VINFO_DDRS (loop_vinfo),
651 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
652 return false;
653
654 /* For epilogues we either have no aliases or alias versioning
655 was applied to original loop. Therefore we may just get max_vf
656 using VF of original loop. */
657 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
658 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
659 else
660 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
661 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
662 return false;
663
664 return true;
665 }
666
667
668 /* Function vect_slp_analyze_data_ref_dependence.
669
670 Return TRUE if there (might) exist a dependence between a memory-reference
671 DRA and a memory-reference DRB. When versioning for alias may check a
672 dependence at run-time, return FALSE. Adjust *MAX_VF according to
673 the data dependence. */
674
675 static bool
vect_slp_analyze_data_ref_dependence(struct data_dependence_relation * ddr)676 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
677 {
678 struct data_reference *dra = DDR_A (ddr);
679 struct data_reference *drb = DDR_B (ddr);
680
681 /* We need to check dependences of statements marked as unvectorizable
682 as well, they still can prohibit vectorization. */
683
684 /* Independent data accesses. */
685 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
686 return false;
687
688 if (dra == drb)
689 return false;
690
691 /* Read-read is OK. */
692 if (DR_IS_READ (dra) && DR_IS_READ (drb))
693 return false;
694
695 /* If dra and drb are part of the same interleaving chain consider
696 them independent. */
697 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
698 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
699 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
700 return false;
701
702 /* Unknown data dependence. */
703 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
704 {
705 if (dump_enabled_p ())
706 {
707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
708 "can't determine dependence between ");
709 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
710 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
711 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
712 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
713 }
714 }
715 else if (dump_enabled_p ())
716 {
717 dump_printf_loc (MSG_NOTE, vect_location,
718 "determined dependence between ");
719 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
720 dump_printf (MSG_NOTE, " and ");
721 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
722 dump_printf (MSG_NOTE, "\n");
723 }
724
725 return true;
726 }
727
728
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  LAST_STORE, when nonnull, is the position
   the marked stores will be sunk to.  Return true if every scalar
   access in NODE can be sunk to the vectorized stmt insert location
   without violating a dependence.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      /* Walk all stmts between ACCESS and the sink position; only
	 stmts that touch memory can create a dependence with DR_A.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Reads never conflict with reads, so when DR_A is a load
	     only intervening stores matter.  */
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ??? Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      /* At the sink position, DR_A must be checked against
		 every store of the instance.  */
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}
799
800
801 /* Function vect_analyze_data_ref_dependences.
802
803 Examine all the data references in the basic-block, and make sure there
804 do not exist any data dependences between them. Set *MAX_VF according to
805 the maximum vectorization factor the data dependences allow. */
806
807 bool
vect_slp_analyze_instance_dependence(slp_instance instance)808 vect_slp_analyze_instance_dependence (slp_instance instance)
809 {
810 if (dump_enabled_p ())
811 dump_printf_loc (MSG_NOTE, vect_location,
812 "=== vect_slp_analyze_instance_dependence ===\n");
813
814 /* The stores of this instance are at the root of the SLP tree. */
815 slp_tree store = SLP_INSTANCE_TREE (instance);
816 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
817 store = NULL;
818
819 /* Verify we can sink stores to the vectorized stmt insert location. */
820 gimple *last_store = NULL;
821 if (store)
822 {
823 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
824 return false;
825
826 /* Mark stores in this instance and remember the last one. */
827 last_store = vect_find_last_scalar_stmt_in_slp (store);
828 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
829 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
830 }
831
832 bool res = true;
833
834 /* Verify we can sink loads to the vectorized stmt insert location,
835 special-casing stores of this instance. */
836 slp_tree load;
837 unsigned int i;
838 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
839 if (! vect_slp_analyze_node_dependences (instance, load,
840 store
841 ? SLP_TREE_SCALAR_STMTS (store)
842 : vNULL, last_store))
843 {
844 res = false;
845 break;
846 }
847
848 /* Unset the visited flag. */
849 if (store)
850 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
851 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
852
853 return res;
854 }
855
856 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
857 the statement that contains DRB, which is useful for recording in the
858 dump file. */
859
860 static void
vect_record_base_alignment(vec_info * vinfo,gimple * stmt,innermost_loop_behavior * drb)861 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
862 innermost_loop_behavior *drb)
863 {
864 bool existed;
865 innermost_loop_behavior *&entry
866 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
867 if (!existed || entry->base_alignment < drb->base_alignment)
868 {
869 entry = drb;
870 if (dump_enabled_p ())
871 {
872 dump_printf_loc (MSG_NOTE, vect_location,
873 "recording new base alignment for ");
874 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
875 dump_printf (MSG_NOTE, "\n");
876 dump_printf_loc (MSG_NOTE, vect_location,
877 " alignment: %d\n", drb->base_alignment);
878 dump_printf_loc (MSG_NOTE, vect_location,
879 " misalignment: %d\n", drb->base_misalignment);
880 dump_printf_loc (MSG_NOTE, vect_location,
881 " based on: ");
882 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
883 }
884 }
885 }
886
887 /* If the region we're going to vectorize is reached, all unconditional
888 data references occur at least once. We can therefore pool the base
889 alignment guarantees from each unconditional reference. Do this by
890 going through all the data references in VINFO and checking whether
891 the containing statement makes the reference unconditionally. If so,
892 record the alignment of the base address in VINFO so that it can be
893 used for all other references with the same base. */
894
895 void
vect_record_base_alignments(vec_info * vinfo)896 vect_record_base_alignments (vec_info *vinfo)
897 {
898 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
899 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
900 data_reference *dr;
901 unsigned int i;
902 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
903 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
904 {
905 gimple *stmt = DR_STMT (dr);
906 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
907
908 /* If DR is nested in the loop that is being vectorized, we can also
909 record the alignment of the base wrt the outer loop. */
910 if (loop && nested_in_vect_loop_p (loop, stmt))
911 {
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
913 vect_record_base_alignment
914 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
915 }
916 }
917 }
918
919 /* Return the target alignment for the vectorized form of DR. */
920
921 static unsigned int
vect_calculate_target_alignment(struct data_reference * dr)922 vect_calculate_target_alignment (struct data_reference *dr)
923 {
924 gimple *stmt = DR_STMT (dr);
925 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
926 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
927 return targetm.vectorize.preferred_vector_alignment (vectype);
928 }
929
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
   cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   Note: an "unanalyzable" access (DR_MISALIGNMENT_UNKNOWN) is not a
   failure; the function still returns true in that case.

   FOR NOW: No analysis is actually performed. Misalignment is calculated
   only for trivial cases. TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  /* Base-alignment guarantees pooled by vect_record_base_alignments.  */
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  /* Target alignment in bytes; also recorded on DR for later phases.  */
  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      /* One vector iteration advances the address by step * VF, so that is
	 the quantity that must preserve the misalignment.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  /* Leave the misalignment as DR_MISALIGNMENT_UNKNOWN if it cannot be
     tracked across iterations or the offset/step are not constant
     enough to reason about.  */
  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  /* If the base object is not aligned enough, try to force-align the
     underlying decl; if that is not possible, keep the misalignment
     unknown.  */
  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
	  || !vect_can_force_dr_alignment_p (base,
					     vector_alignment * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
	                       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      /* The actual realignment happens later; here we only note intent.  */
      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		     * TREE_INT_CST_LOW (drb->step));

  /* The misalignment is only usable if it is the same for all runtime
     vector lengths (constant in the poly_int sense).  */
  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
			   &const_misalignment))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Non-constant misalignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
1112
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's alignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
   zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
   of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
				   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      /* The loop only looks for DR itself in DR_PEEL's same-alignment
	 list; other entries are irrelevant here.  */
      if (current_dr != dr)
	continue;
      /* Sanity check: if both misalignments are known, they must agree
	 in units of their respective (group-scaled) element sizes.  */
      gcc_assert (!known_alignment_for_access_p (dr)
		  || !known_alignment_for_access_p (dr_peel)
		  || (DR_MISALIGNMENT (dr) / dr_size
		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      /* Peeling NPEEL scalar iterations moves DR by NPEEL * DR_SIZE bytes
	 in the direction of DR's step; reduce modulo the target
	 alignment (a power of two, hence the mask).  */
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
1176
1177
1178 /* Function verify_data_ref_alignment
1179
1180 Return TRUE if DR can be handled with respect to alignment. */
1181
1182 static bool
verify_data_ref_alignment(data_reference_p dr)1183 verify_data_ref_alignment (data_reference_p dr)
1184 {
1185 enum dr_alignment_support supportable_dr_alignment
1186 = vect_supportable_dr_alignment (dr, false);
1187 if (!supportable_dr_alignment)
1188 {
1189 if (dump_enabled_p ())
1190 {
1191 if (DR_IS_READ (dr))
1192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 "not vectorized: unsupported unaligned load.");
1194 else
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: unsupported unaligned "
1197 "store.");
1198
1199 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1200 DR_REF (dr));
1201 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1202 }
1203 return false;
1204 }
1205
1206 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1207 dump_printf_loc (MSG_NOTE, vect_location,
1208 "Vectorizing an unaligned access.\n");
1209
1210 return true;
1211 }
1212
1213 /* Function vect_verify_datarefs_alignment
1214
1215 Return TRUE if all data references in the loop can be
1216 handled with respect to alignment. */
1217
1218 bool
vect_verify_datarefs_alignment(loop_vec_info vinfo)1219 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1220 {
1221 vec<data_reference_p> datarefs = vinfo->datarefs;
1222 struct data_reference *dr;
1223 unsigned int i;
1224
1225 FOR_EACH_VEC_ELT (datarefs, i, dr)
1226 {
1227 gimple *stmt = DR_STMT (dr);
1228 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1229
1230 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1231 continue;
1232
1233 /* For interleaving, only the alignment of the first access matters. */
1234 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1235 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1236 continue;
1237
1238 /* Strided accesses perform only component accesses, alignment is
1239 irrelevant for them. */
1240 if (STMT_VINFO_STRIDED_P (stmt_info)
1241 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242 continue;
1243
1244 if (! verify_data_ref_alignment (dr))
1245 return false;
1246 }
1247
1248 return true;
1249 }
1250
1251 /* Given an memory reference EXP return whether its alignment is less
1252 than its size. */
1253
1254 static bool
not_size_aligned(tree exp)1255 not_size_aligned (tree exp)
1256 {
1257 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1258 return true;
1259
1260 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1261 > get_object_alignment (exp));
1262 }
1263
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      /* Misalignment measured in elements rather than bytes, so that it
	 can be compared against the group size below.  */
      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      /* Peeling advances the address in whole elements, so a misalignment
	 that is not a multiple of the element size can never be removed.  */
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      /* Unknown misalignment: let the target decide whether alignment
	 is still reachable for this (possibly packed) reference.  */
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                 "Unknown misalignment, %snaturally aligned\n",
	                 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
1331
1332
1333 /* Calculate the cost of the memory access represented by DR. */
1334
1335 static void
vect_get_data_access_cost(struct data_reference * dr,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec)1336 vect_get_data_access_cost (struct data_reference *dr,
1337 unsigned int *inside_cost,
1338 unsigned int *outside_cost,
1339 stmt_vector_for_cost *body_cost_vec)
1340 {
1341 gimple *stmt = DR_STMT (dr);
1342 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1343 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1344 int ncopies;
1345
1346 if (PURE_SLP_STMT (stmt_info))
1347 ncopies = 1;
1348 else
1349 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1350
1351 if (DR_IS_READ (dr))
1352 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1353 NULL, body_cost_vec, false);
1354 else
1355 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1356
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_NOTE, vect_location,
1359 "vect_get_data_access_cost: inside_cost = %d, "
1360 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1361 }
1362
1363
/* One peeling option considered during alignment analysis.  */
typedef struct _vect_peel_info
{
  /* The data reference that this amount of peeling would align.  */
  struct data_reference *dr;
  /* Number of scalar iterations to peel.  */
  int npeel;
  /* Number of data references that would become aligned; inflated by
     VECT_MAX_COST for unsupportable refs under the unlimited cost model
     (see vect_peeling_hash_insert).  */
  unsigned int count;
} *vect_peel_info;
1370
/* A peeling option together with its estimated costs; used as the
   accumulator when traversing the peeling hash table.  */
typedef struct _vect_peel_extended_info
{
  /* The candidate peeling option.  */
  struct _vect_peel_info peel_info;
  /* Estimated cost inside the vectorized loop body.  */
  unsigned int inside_cost;
  /* Estimated cost outside the loop body (prologue/epilogue).  */
  unsigned int outside_cost;
} *vect_peel_extended_info;
1377
1378
/* Peeling hashtable helpers.  */

/* Hash traits for _vect_peel_info entries, keyed on the peeling amount
   NPEEL (see hash/equal below).  Entries are freed on table teardown
   via free_ptr_hash.  */
struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};
1386
1387 inline hashval_t
hash(const _vect_peel_info * peel_info)1388 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1389 {
1390 return (hashval_t) peel_info->npeel;
1391 }
1392
1393 inline bool
equal(const _vect_peel_info * a,const _vect_peel_info * b)1394 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1395 {
1396 return (a->npeel == b->npeel);
1397 }
1398
1399
1400 /* Insert DR into peeling hash table with NPEEL as key. */
1401
1402 static void
vect_peeling_hash_insert(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo,struct data_reference * dr,int npeel)1403 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1404 loop_vec_info loop_vinfo, struct data_reference *dr,
1405 int npeel)
1406 {
1407 struct _vect_peel_info elem, *slot;
1408 _vect_peel_info **new_slot;
1409 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1410
1411 elem.npeel = npeel;
1412 slot = peeling_htab->find (&elem);
1413 if (slot)
1414 slot->count++;
1415 else
1416 {
1417 slot = XNEW (struct _vect_peel_info);
1418 slot->npeel = npeel;
1419 slot->dr = dr;
1420 slot->count = 1;
1421 new_slot = peeling_htab->find_slot (slot, INSERT);
1422 *new_slot = slot;
1423 }
1424
1425 if (!supportable_dr_alignment
1426 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1427 slot->count += VECT_MAX_COST;
1428 }
1429
1430
1431 /* Traverse peeling hash table to find peeling option that aligns maximum
1432 number of data accesses. */
1433
1434 int
vect_peeling_hash_get_most_frequent(_vect_peel_info ** slot,_vect_peel_extended_info * max)1435 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1436 _vect_peel_extended_info *max)
1437 {
1438 vect_peel_info elem = *slot;
1439
1440 if (elem->count > max->peel_info.count
1441 || (elem->count == max->peel_info.count
1442 && max->peel_info.npeel > elem->npeel))
1443 {
1444 max->peel_info.npeel = elem->npeel;
1445 max->peel_info.count = elem->count;
1446 max->peel_info.dr = elem->dr;
1447 }
1448
1449 return 1;
1450 }
1451
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  Costs are accumulated into
   *INSIDE_COST / *OUTSIDE_COST and BODY_COST_VEC.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
				struct data_reference *dr0,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      /* Temporarily install the misalignment DR would have after peeling
	 NPEEL iterations, cost the access in that state, and then restore
	 the saved value so callers see the unpeeled state.  */
      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
	;
      else if (unknown_misalignment && dr == dr0)
	/* By assumption, peeling makes DR0 aligned.  */
	SET_DR_MISALIGNMENT (dr, 0);
      else
	vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
				 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
1500
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  MIN accumulates the best option
   seen so far; non-zero return keeps the traversal going.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  /* Cost every data access as if ELEM->npeel iterations were peeled.  */
  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
				  elem->dr, &inside_cost, &outside_cost,
				  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  /* Prefer lower inside cost; break ties on outside cost.  */
  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
1552
1553
1554 /* Choose best peeling option by traversing peeling hash table and either
1555 choosing an option with the lowest cost (if cost model is enabled) or the
1556 option that aligns as many accesses as possible. */
1557
1558 static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo)1559 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1560 loop_vec_info loop_vinfo)
1561 {
1562 struct _vect_peel_extended_info res;
1563
1564 res.peel_info.dr = NULL;
1565
1566 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1567 {
1568 res.inside_cost = INT_MAX;
1569 res.outside_cost = INT_MAX;
1570 peeling_htab->traverse <_vect_peel_extended_info *,
1571 vect_peeling_hash_get_lowest_cost> (&res);
1572 }
1573 else
1574 {
1575 res.peel_info.count = 0;
1576 peeling_htab->traverse <_vect_peel_extended_info *,
1577 vect_peeling_hash_get_most_frequent> (&res);
1578 res.inside_cost = 0;
1579 res.outside_cost = 0;
1580 }
1581
1582 return res;
1583 }
1584
1585 /* Return true if the new peeling NPEEL is supported. */
1586
1587 static bool
vect_peeling_supportable(loop_vec_info loop_vinfo,struct data_reference * dr0,unsigned npeel)1588 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1589 unsigned npeel)
1590 {
1591 unsigned i;
1592 struct data_reference *dr = NULL;
1593 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1594 gimple *stmt;
1595 stmt_vec_info stmt_info;
1596 enum dr_alignment_support supportable_dr_alignment;
1597
1598 /* Ensure that all data refs can be vectorized after the peel. */
1599 FOR_EACH_VEC_ELT (datarefs, i, dr)
1600 {
1601 int save_misalignment;
1602
1603 if (dr == dr0)
1604 continue;
1605
1606 stmt = DR_STMT (dr);
1607 stmt_info = vinfo_for_stmt (stmt);
1608 /* For interleaving, only the alignment of the first access
1609 matters. */
1610 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1611 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1612 continue;
1613
1614 /* Strided accesses perform only component accesses, alignment is
1615 irrelevant for them. */
1616 if (STMT_VINFO_STRIDED_P (stmt_info)
1617 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1618 continue;
1619
1620 save_misalignment = DR_MISALIGNMENT (dr);
1621 vect_update_misalignment_for_peel (dr, dr0, npeel);
1622 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1623 SET_DR_MISALIGNMENT (dr, save_misalignment);
1624
1625 if (!supportable_dr_alignment)
1626 return false;
1627 }
1628
1629 return true;
1630 }
1631
1632 /* Function vect_enhance_data_refs_alignment
1633
1634 This pass will use loop versioning and loop peeling in order to enhance
1635 the alignment of data references in the loop.
1636
1637 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1638 original loop is to be vectorized. Any other loops that are created by
1639 the transformations performed in this pass - are not supposed to be
1640 vectorized. This restriction will be relaxed.
1641
1642 This pass will require a cost model to guide it whether to apply peeling
1643 or versioning or a combination of the two. For example, the scheme that
1644 intel uses when given a loop with several memory accesses, is as follows:
1645 choose one memory access ('p') which alignment you want to force by doing
1646 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1647 other accesses are not necessarily aligned, or (2) use loop versioning to
1648 generate one loop in which all accesses are aligned, and another loop in
1649 which only 'p' is necessarily aligned.
1650
1651 ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1653 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1654
1655 Devising a cost model is the most critical aspect of this work. It will
1656 guide us on which access to peel for, whether to use loop versioning, how
1657 many versions to create, etc. The cost model will probably consist of
1658 generic considerations as well as target specific considerations (on
1659 powerpc for example, misaligned stores are more painful than misaligned
1660 loads).
1661
1662 Here are the general steps involved in alignment enhancements:
1663
1664 -- original loop, before alignment analysis:
1665 for (i=0; i<N; i++){
1666 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1667 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1668 }
1669
1670 -- After vect_compute_data_refs_alignment:
1671 for (i=0; i<N; i++){
1672 x = q[i]; # DR_MISALIGNMENT(q) = 3
1673 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1674 }
1675
1676 -- Possibility 1: we do loop versioning:
1677 if (p is aligned) {
1678 for (i=0; i<N; i++){ # loop 1A
1679 x = q[i]; # DR_MISALIGNMENT(q) = 3
1680 p[i] = y; # DR_MISALIGNMENT(p) = 0
1681 }
1682 }
1683 else {
1684 for (i=0; i<N; i++){ # loop 1B
1685 x = q[i]; # DR_MISALIGNMENT(q) = 3
1686 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1687 }
1688 }
1689
1690 -- Possibility 2: we do loop peeling:
1691 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1692 x = q[i];
1693 p[i] = y;
1694 }
1695 for (i = 3; i < N; i++){ # loop 2A
1696 x = q[i]; # DR_MISALIGNMENT(q) = 0
1697 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1698 }
1699
1700 -- Possibility 3: combination of loop peeling and versioning:
1701 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1702 x = q[i];
1703 p[i] = y;
1704 }
1705 if (p is aligned) {
1706 for (i = 3; i<N; i++){ # loop 3A
1707 x = q[i]; # DR_MISALIGNMENT(q) = 0
1708 p[i] = y; # DR_MISALIGNMENT(p) = 0
1709 }
1710 }
1711 else {
1712 for (i = 3; i<N; i++){ # loop 3B
1713 x = q[i]; # DR_MISALIGNMENT(q) = 0
1714 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1715 }
1716 }
1717
1718 These loops are later passed to loop_transform to be vectorized. The
1719 vectorizer will use the alignment information to guide the transformation
1720 (whether to generate regular loads/stores, or with special handling for
1721 misalignment). */
1722
1723 bool
vect_enhance_data_refs_alignment(loop_vec_info loop_vinfo)1724 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1725 {
1726 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1727 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1728 enum dr_alignment_support supportable_dr_alignment;
1729 struct data_reference *dr0 = NULL, *first_store = NULL;
1730 struct data_reference *dr;
1731 unsigned int i, j;
1732 bool do_peeling = false;
1733 bool do_versioning = false;
1734 bool stat;
1735 gimple *stmt;
1736 stmt_vec_info stmt_info;
1737 unsigned int npeel = 0;
1738 bool one_misalignment_known = false;
1739 bool one_misalignment_unknown = false;
1740 bool one_dr_unsupportable = false;
1741 struct data_reference *unsupportable_dr = NULL;
1742 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1743 unsigned possible_npeel_number = 1;
1744 tree vectype;
1745 unsigned int mis, same_align_drs_max = 0;
1746 hash_table<peel_info_hasher> peeling_htab (1);
1747
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "=== vect_enhance_data_refs_alignment ===\n");
1751
1752 /* Reset data so we can safely be called multiple times. */
1753 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1754 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1755
1756 /* While cost model enhancements are expected in the future, the high level
1757 view of the code at this time is as follows:
1758
1759 A) If there is a misaligned access then see if peeling to align
1760 this access can make all data references satisfy
1761 vect_supportable_dr_alignment. If so, update data structures
1762 as needed and return true.
1763
1764 B) If peeling wasn't possible and there is a data reference with an
1765 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1766 then see if loop versioning checks can be used to make all data
1767 references satisfy vect_supportable_dr_alignment. If so, update
1768 data structures as needed and return true.
1769
1770 C) If neither peeling nor versioning were successful then return false if
1771 any data reference does not satisfy vect_supportable_dr_alignment.
1772
1773 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1774
1775 Note, Possibility 3 above (which is peeling and versioning together) is not
1776 being done at this time. */
1777
1778 /* (1) Peeling to force alignment. */
1779
1780 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1781 Considerations:
1782 + How many accesses will become aligned due to the peeling
1783 - How many accesses will become unaligned due to the peeling,
1784 and the cost of misaligned accesses.
1785 - The cost of peeling (the extra runtime checks, the increase
1786 in code size). */
1787
1788 FOR_EACH_VEC_ELT (datarefs, i, dr)
1789 {
1790 stmt = DR_STMT (dr);
1791 stmt_info = vinfo_for_stmt (stmt);
1792
1793 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1794 continue;
1795
1796 /* For interleaving, only the alignment of the first access
1797 matters. */
1798 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1799 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1800 continue;
1801
1802 /* For invariant accesses there is nothing to enhance. */
1803 if (integer_zerop (DR_STEP (dr)))
1804 continue;
1805
1806 /* Strided accesses perform only component accesses, alignment is
1807 irrelevant for them. */
1808 if (STMT_VINFO_STRIDED_P (stmt_info)
1809 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1810 continue;
1811
1812 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1813 do_peeling = vector_alignment_reachable_p (dr);
1814 if (do_peeling)
1815 {
1816 if (known_alignment_for_access_p (dr))
1817 {
1818 unsigned int npeel_tmp = 0;
1819 bool negative = tree_int_cst_compare (DR_STEP (dr),
1820 size_zero_node) < 0;
1821
1822 vectype = STMT_VINFO_VECTYPE (stmt_info);
1823 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1824 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1825 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1826 if (DR_MISALIGNMENT (dr) != 0)
1827 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1828
1829 /* For multiple types, it is possible that the bigger type access
1830 will have more than one peeling option. E.g., a loop with two
1831 types: one of size (vector size / 4), and the other one of
1832 size (vector size / 8). Vectorization factor will 8. If both
1833 accesses are misaligned by 3, the first one needs one scalar
1834 iteration to be aligned, and the second one needs 5. But the
1835 first one will be aligned also by peeling 5 scalar
1836 iterations, and in that case both accesses will be aligned.
1837 Hence, except for the immediate peeling amount, we also want
1838 to try to add full vector size, while we don't exceed
1839 vectorization factor.
1840 We do this automatically for cost model, since we calculate
1841 cost for every peeling option. */
1842 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1843 {
1844 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1845 ? vf * GROUP_SIZE (stmt_info) : vf);
1846 possible_npeel_number
1847 = vect_get_num_vectors (nscalars, vectype);
1848
1849 /* NPEEL_TMP is 0 when there is no misalignment, but also
1850 allow peeling NELEMENTS. */
1851 if (DR_MISALIGNMENT (dr) == 0)
1852 possible_npeel_number++;
1853 }
1854
1855 /* Save info about DR in the hash table. Also include peeling
1856 amounts according to the explanation above. */
1857 for (j = 0; j < possible_npeel_number; j++)
1858 {
1859 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1860 dr, npeel_tmp);
1861 npeel_tmp += target_align / dr_size;
1862 }
1863
1864 one_misalignment_known = true;
1865 }
1866 else
1867 {
1868 /* If we don't know any misalignment values, we prefer
1869 peeling for data-ref that has the maximum number of data-refs
1870 with the same alignment, unless the target prefers to align
1871 stores over load. */
1872 unsigned same_align_drs
1873 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1874 if (!dr0
1875 || same_align_drs_max < same_align_drs)
1876 {
1877 same_align_drs_max = same_align_drs;
1878 dr0 = dr;
1879 }
1880 /* For data-refs with the same number of related
1881 accesses prefer the one where the misalign
1882 computation will be invariant in the outermost loop. */
1883 else if (same_align_drs_max == same_align_drs)
1884 {
1885 struct loop *ivloop0, *ivloop;
1886 ivloop0 = outermost_invariant_loop_for_expr
1887 (loop, DR_BASE_ADDRESS (dr0));
1888 ivloop = outermost_invariant_loop_for_expr
1889 (loop, DR_BASE_ADDRESS (dr));
1890 if ((ivloop && !ivloop0)
1891 || (ivloop && ivloop0
1892 && flow_loop_nested_p (ivloop, ivloop0)))
1893 dr0 = dr;
1894 }
1895
1896 one_misalignment_unknown = true;
1897
1898 /* Check for data refs with unsupportable alignment that
1899 can be peeled. */
1900 if (!supportable_dr_alignment)
1901 {
1902 one_dr_unsupportable = true;
1903 unsupportable_dr = dr;
1904 }
1905
1906 if (!first_store && DR_IS_WRITE (dr))
1907 first_store = dr;
1908 }
1909 }
1910 else
1911 {
1912 if (!aligned_access_p (dr))
1913 {
1914 if (dump_enabled_p ())
1915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1916 "vector alignment may not be reachable\n");
1917 break;
1918 }
1919 }
1920 }
1921
1922 /* Check if we can possibly peel the loop. */
1923 if (!vect_can_advance_ivs_p (loop_vinfo)
1924 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1925 || loop->inner)
1926 do_peeling = false;
1927
1928 struct _vect_peel_extended_info peel_for_known_alignment;
1929 struct _vect_peel_extended_info peel_for_unknown_alignment;
1930 struct _vect_peel_extended_info best_peel;
1931
1932 peel_for_unknown_alignment.inside_cost = INT_MAX;
1933 peel_for_unknown_alignment.outside_cost = INT_MAX;
1934 peel_for_unknown_alignment.peel_info.count = 0;
1935
1936 if (do_peeling
1937 && one_misalignment_unknown)
1938 {
1939 /* Check if the target requires to prefer stores over loads, i.e., if
1940 misaligned stores are more expensive than misaligned loads (taking
1941 drs with same alignment into account). */
1942 unsigned int load_inside_cost = 0;
1943 unsigned int load_outside_cost = 0;
1944 unsigned int store_inside_cost = 0;
1945 unsigned int store_outside_cost = 0;
1946 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1947
1948 stmt_vector_for_cost dummy;
1949 dummy.create (2);
1950 vect_get_peeling_costs_all_drs (datarefs, dr0,
1951 &load_inside_cost,
1952 &load_outside_cost,
1953 &dummy, estimated_npeels, true);
1954 dummy.release ();
1955
1956 if (first_store)
1957 {
1958 dummy.create (2);
1959 vect_get_peeling_costs_all_drs (datarefs, first_store,
1960 &store_inside_cost,
1961 &store_outside_cost,
1962 &dummy, estimated_npeels, true);
1963 dummy.release ();
1964 }
1965 else
1966 {
1967 store_inside_cost = INT_MAX;
1968 store_outside_cost = INT_MAX;
1969 }
1970
1971 if (load_inside_cost > store_inside_cost
1972 || (load_inside_cost == store_inside_cost
1973 && load_outside_cost > store_outside_cost))
1974 {
1975 dr0 = first_store;
1976 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1977 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1978 }
1979 else
1980 {
1981 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1982 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1983 }
1984
1985 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1986 prologue_cost_vec.create (2);
1987 epilogue_cost_vec.create (2);
1988
1989 int dummy2;
1990 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1991 (loop_vinfo, estimated_npeels, &dummy2,
1992 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1993 &prologue_cost_vec, &epilogue_cost_vec);
1994
1995 prologue_cost_vec.release ();
1996 epilogue_cost_vec.release ();
1997
1998 peel_for_unknown_alignment.peel_info.count = 1
1999 + STMT_VINFO_SAME_ALIGN_REFS
2000 (vinfo_for_stmt (DR_STMT (dr0))).length ();
2001 }
2002
2003 peel_for_unknown_alignment.peel_info.npeel = 0;
2004 peel_for_unknown_alignment.peel_info.dr = dr0;
2005
2006 best_peel = peel_for_unknown_alignment;
2007
2008 peel_for_known_alignment.inside_cost = INT_MAX;
2009 peel_for_known_alignment.outside_cost = INT_MAX;
2010 peel_for_known_alignment.peel_info.count = 0;
2011 peel_for_known_alignment.peel_info.dr = NULL;
2012
2013 if (do_peeling && one_misalignment_known)
2014 {
2015 /* Peeling is possible, but there is no data access that is not supported
2016 unless aligned. So we try to choose the best possible peeling from
2017 the hash table. */
2018 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2019 (&peeling_htab, loop_vinfo);
2020 }
2021
2022 /* Compare costs of peeling for known and unknown alignment. */
2023 if (peel_for_known_alignment.peel_info.dr != NULL
2024 && peel_for_unknown_alignment.inside_cost
2025 >= peel_for_known_alignment.inside_cost)
2026 {
2027 best_peel = peel_for_known_alignment;
2028
2029 /* If the best peeling for known alignment has NPEEL == 0, perform no
2030 peeling at all except if there is an unsupportable dr that we can
2031 align. */
2032 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2033 do_peeling = false;
2034 }
2035
2036 /* If there is an unsupportable data ref, prefer this over all choices so far
2037 since we'd have to discard a chosen peeling except when it accidentally
2038 aligned the unsupportable data ref. */
2039 if (one_dr_unsupportable)
2040 dr0 = unsupportable_dr;
2041 else if (do_peeling)
2042 {
2043 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2044 TODO: Use nopeel_outside_cost or get rid of it? */
2045 unsigned nopeel_inside_cost = 0;
2046 unsigned nopeel_outside_cost = 0;
2047
2048 stmt_vector_for_cost dummy;
2049 dummy.create (2);
2050 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
2051 &nopeel_outside_cost, &dummy, 0, false);
2052 dummy.release ();
2053
2054 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2055 costs will be recorded. */
2056 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2057 prologue_cost_vec.create (2);
2058 epilogue_cost_vec.create (2);
2059
2060 int dummy2;
2061 nopeel_outside_cost += vect_get_known_peeling_cost
2062 (loop_vinfo, 0, &dummy2,
2063 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2064 &prologue_cost_vec, &epilogue_cost_vec);
2065
2066 prologue_cost_vec.release ();
2067 epilogue_cost_vec.release ();
2068
2069 npeel = best_peel.peel_info.npeel;
2070 dr0 = best_peel.peel_info.dr;
2071
2072 /* If no peeling is not more expensive than the best peeling we
2073 have so far, don't perform any peeling. */
2074 if (nopeel_inside_cost <= best_peel.inside_cost)
2075 do_peeling = false;
2076 }
2077
2078 if (do_peeling)
2079 {
2080 stmt = DR_STMT (dr0);
2081 stmt_info = vinfo_for_stmt (stmt);
2082 vectype = STMT_VINFO_VECTYPE (stmt_info);
2083
2084 if (known_alignment_for_access_p (dr0))
2085 {
2086 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2087 size_zero_node) < 0;
2088 if (!npeel)
2089 {
2090 /* Since it's known at compile time, compute the number of
2091 iterations in the peeled loop (the peeling factor) for use in
2092 updating DR_MISALIGNMENT values. The peeling factor is the
2093 vectorization factor minus the misalignment as an element
2094 count. */
2095 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2096 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2097 npeel = ((mis & (target_align - 1))
2098 / vect_get_scalar_dr_size (dr0));
2099 }
2100
2101 /* For interleaved data access every iteration accesses all the
2102 members of the group, therefore we divide the number of iterations
2103 by the group size. */
2104 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2105 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2106 npeel /= GROUP_SIZE (stmt_info);
2107
2108 if (dump_enabled_p ())
2109 dump_printf_loc (MSG_NOTE, vect_location,
2110 "Try peeling by %d\n", npeel);
2111 }
2112
2113 /* Ensure that all datarefs can be vectorized after the peel. */
2114 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2115 do_peeling = false;
2116
2117 /* Check if all datarefs are supportable and log. */
2118 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2119 {
2120 stat = vect_verify_datarefs_alignment (loop_vinfo);
2121 if (!stat)
2122 do_peeling = false;
2123 else
2124 return stat;
2125 }
2126
2127 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2128 if (do_peeling)
2129 {
2130 unsigned max_allowed_peel
2131 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2132 if (max_allowed_peel != (unsigned)-1)
2133 {
2134 unsigned max_peel = npeel;
2135 if (max_peel == 0)
2136 {
2137 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2138 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2139 }
2140 if (max_peel > max_allowed_peel)
2141 {
2142 do_peeling = false;
2143 if (dump_enabled_p ())
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "Disable peeling, max peels reached: %d\n", max_peel);
2146 }
2147 }
2148 }
2149
2150 /* Cost model #2 - if peeling may result in a remaining loop not
2151 iterating enough to be vectorized then do not peel. Since this
2152 is a cost heuristic rather than a correctness decision, use the
2153 most likely runtime value for variable vectorization factors. */
2154 if (do_peeling
2155 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2156 {
2157 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2158 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2159 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2160 < assumed_vf + max_peel)
2161 do_peeling = false;
2162 }
2163
2164 if (do_peeling)
2165 {
2166 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2167 If the misalignment of DR_i is identical to that of dr0 then set
2168 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2169 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2170 by the peeling factor times the element size of DR_i (MOD the
2171 vectorization factor times the size). Otherwise, the
2172 misalignment of DR_i must be set to unknown. */
2173 FOR_EACH_VEC_ELT (datarefs, i, dr)
2174 if (dr != dr0)
2175 {
2176 /* Strided accesses perform only component accesses, alignment
2177 is irrelevant for them. */
2178 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2179 if (STMT_VINFO_STRIDED_P (stmt_info)
2180 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2181 continue;
2182
2183 vect_update_misalignment_for_peel (dr, dr0, npeel);
2184 }
2185
2186 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2187 if (npeel)
2188 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2189 else
2190 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2191 = DR_MISALIGNMENT (dr0);
2192 SET_DR_MISALIGNMENT (dr0, 0);
2193 if (dump_enabled_p ())
2194 {
2195 dump_printf_loc (MSG_NOTE, vect_location,
2196 "Alignment of access forced using peeling.\n");
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "Peeling for alignment will be applied.\n");
2199 }
2200
2201 /* The inside-loop cost will be accounted for in vectorizable_load
2202 and vectorizable_store correctly with adjusted alignments.
2203 Drop the body_cst_vec on the floor here. */
2204 stat = vect_verify_datarefs_alignment (loop_vinfo);
2205 gcc_assert (stat);
2206 return stat;
2207 }
2208 }
2209
2210 /* (2) Versioning to force alignment. */
2211
2212 /* Try versioning if:
2213 1) optimize loop for speed
2214 2) there is at least one unsupported misaligned data ref with an unknown
2215 misalignment, and
2216 3) all misaligned data refs with a known misalignment are supported, and
2217 4) the number of runtime alignment checks is within reason. */
2218
2219 do_versioning =
2220 optimize_loop_nest_for_speed_p (loop)
2221 && (!loop->inner); /* FORNOW */
2222
2223 if (do_versioning)
2224 {
2225 FOR_EACH_VEC_ELT (datarefs, i, dr)
2226 {
2227 stmt = DR_STMT (dr);
2228 stmt_info = vinfo_for_stmt (stmt);
2229
2230 /* For interleaving, only the alignment of the first access
2231 matters. */
2232 if (aligned_access_p (dr)
2233 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2234 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2235 continue;
2236
2237 if (STMT_VINFO_STRIDED_P (stmt_info))
2238 {
2239 /* Strided loads perform only component accesses, alignment is
2240 irrelevant for them. */
2241 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2242 continue;
2243 do_versioning = false;
2244 break;
2245 }
2246
2247 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2248
2249 if (!supportable_dr_alignment)
2250 {
2251 gimple *stmt;
2252 int mask;
2253 tree vectype;
2254
2255 if (known_alignment_for_access_p (dr)
2256 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2257 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2258 {
2259 do_versioning = false;
2260 break;
2261 }
2262
2263 stmt = DR_STMT (dr);
2264 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2265 gcc_assert (vectype);
2266
2267 /* At present we don't support versioning for alignment
2268 with variable VF, since there's no guarantee that the
2269 VF is a power of two. We could relax this if we added
2270 a way of enforcing a power-of-two size. */
2271 unsigned HOST_WIDE_INT size;
2272 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2273 {
2274 do_versioning = false;
2275 break;
2276 }
2277
2278 /* The rightmost bits of an aligned address must be zeros.
2279 Construct the mask needed for this test. For example,
2280 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2281 mask must be 15 = 0xf. */
2282 mask = size - 1;
2283
2284 /* FORNOW: use the same mask to test all potentially unaligned
2285 references in the loop. The vectorizer currently supports
2286 a single vector size, see the reference to
2287 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2288 vectorization factor is computed. */
2289 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2290 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2291 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2292 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2293 DR_STMT (dr));
2294 }
2295 }
2296
2297 /* Versioning requires at least one misaligned data reference. */
2298 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2299 do_versioning = false;
2300 else if (!do_versioning)
2301 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2302 }
2303
2304 if (do_versioning)
2305 {
2306 vec<gimple *> may_misalign_stmts
2307 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2308 gimple *stmt;
2309
2310 /* It can now be assumed that the data references in the statements
2311 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2312 of the loop being vectorized. */
2313 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2314 {
2315 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2316 dr = STMT_VINFO_DATA_REF (stmt_info);
2317 SET_DR_MISALIGNMENT (dr, 0);
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "Alignment of access forced using versioning.\n");
2321 }
2322
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "Versioning for alignment will be applied.\n");
2326
2327 /* Peeling and versioning can't be done together at this time. */
2328 gcc_assert (! (do_peeling && do_versioning));
2329
2330 stat = vect_verify_datarefs_alignment (loop_vinfo);
2331 gcc_assert (stat);
2332 return stat;
2333 }
2334
2335 /* This point is reached if neither peeling nor versioning is being done. */
2336 gcc_assert (! (do_peeling || do_versioning));
2337
2338 stat = vect_verify_datarefs_alignment (loop_vinfo);
2339 return stat;
2340 }
2341
2342
2343 /* Function vect_find_same_alignment_drs.
2344
2345 Update group and alignment relations according to the chosen
2346 vectorization factor. */
2347
2348 static void
vect_find_same_alignment_drs(struct data_dependence_relation * ddr)2349 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2350 {
2351 struct data_reference *dra = DDR_A (ddr);
2352 struct data_reference *drb = DDR_B (ddr);
2353 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2354 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2355
2356 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2357 return;
2358
2359 if (dra == drb)
2360 return;
2361
2362 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2363 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2364 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2365 return;
2366
2367 /* Two references with distance zero have the same alignment. */
2368 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2369 - wi::to_poly_offset (DR_INIT (drb)));
2370 if (maybe_ne (diff, 0))
2371 {
2372 /* Get the wider of the two alignments. */
2373 unsigned int align_a = (vect_calculate_target_alignment (dra)
2374 / BITS_PER_UNIT);
2375 unsigned int align_b = (vect_calculate_target_alignment (drb)
2376 / BITS_PER_UNIT);
2377 unsigned int max_align = MAX (align_a, align_b);
2378
2379 /* Require the gap to be a multiple of the larger vector alignment. */
2380 if (!multiple_p (diff, max_align))
2381 return;
2382 }
2383
2384 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2385 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2386 if (dump_enabled_p ())
2387 {
2388 dump_printf_loc (MSG_NOTE, vect_location,
2389 "accesses have the same alignment: ");
2390 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2391 dump_printf (MSG_NOTE, " and ");
2392 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2393 dump_printf (MSG_NOTE, "\n");
2394 }
2395 }
2396
2397
2398 /* Function vect_analyze_data_refs_alignment
2399
2400 Analyze the alignment of the data-references in the loop.
2401 Return FALSE if a data reference is found that cannot be vectorized. */
2402
2403 bool
vect_analyze_data_refs_alignment(loop_vec_info vinfo)2404 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2405 {
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_NOTE, vect_location,
2408 "=== vect_analyze_data_refs_alignment ===\n");
2409
2410 /* Mark groups of data references with same alignment using
2411 data dependence information. */
2412 vec<ddr_p> ddrs = vinfo->ddrs;
2413 struct data_dependence_relation *ddr;
2414 unsigned int i;
2415
2416 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2417 vect_find_same_alignment_drs (ddr);
2418
2419 vec<data_reference_p> datarefs = vinfo->datarefs;
2420 struct data_reference *dr;
2421
2422 vect_record_base_alignments (vinfo);
2423 FOR_EACH_VEC_ELT (datarefs, i, dr)
2424 {
2425 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2426 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2427 && !vect_compute_data_ref_alignment (dr))
2428 {
2429 /* Strided accesses perform only component accesses, misalignment
2430 information is irrelevant for them. */
2431 if (STMT_VINFO_STRIDED_P (stmt_info)
2432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2433 continue;
2434
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2437 "not vectorized: can't calculate alignment "
2438 "for data ref.\n");
2439
2440 return false;
2441 }
2442 }
2443
2444 return true;
2445 }
2446
2447
2448 /* Analyze alignment of DRs of stmts in NODE. */
2449
2450 static bool
vect_slp_analyze_and_verify_node_alignment(slp_tree node)2451 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2452 {
2453 /* We vectorize from the first scalar stmt in the node unless
2454 the node is permuted in which case we start from the first
2455 element in the group. */
2456 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2457 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2458 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2459 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2460
2461 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2462 if (! vect_compute_data_ref_alignment (dr)
2463 /* For creating the data-ref pointer we need alignment of the
2464 first element anyway. */
2465 || (dr != first_dr
2466 && ! vect_compute_data_ref_alignment (first_dr))
2467 || ! verify_data_ref_alignment (dr))
2468 {
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 "not vectorized: bad data alignment in basic "
2472 "block.\n");
2473 return false;
2474 }
2475
2476 return true;
2477 }
2478
2479 /* Function vect_slp_analyze_instance_alignment
2480
2481 Analyze the alignment of the data-references in the SLP instance.
2482 Return FALSE if a data reference is found that cannot be vectorized. */
2483
2484 bool
vect_slp_analyze_and_verify_instance_alignment(slp_instance instance)2485 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2486 {
2487 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_NOTE, vect_location,
2489 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2490
2491 slp_tree node;
2492 unsigned i;
2493 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2494 if (! vect_slp_analyze_and_verify_node_alignment (node))
2495 return false;
2496
2497 node = SLP_INSTANCE_TREE (instance);
2498 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2499 && ! vect_slp_analyze_and_verify_node_alignment
2500 (SLP_INSTANCE_TREE (instance)))
2501 return false;
2502
2503 return true;
2504 }
2505
2506
2507 /* Analyze groups of accesses: check that DR belongs to a group of
2508 accesses of legal size, step, etc. Detect gaps, single element
2509 interleaving, and other special cases. Set grouped access info.
2510 Collect groups of strided stores for further use in SLP analysis.
2511 Worker for vect_analyze_group_access. */
2512
2513 static bool
vect_analyze_group_access_1(struct data_reference * dr)2514 vect_analyze_group_access_1 (struct data_reference *dr)
2515 {
2516 tree step = DR_STEP (dr);
2517 tree scalar_type = TREE_TYPE (DR_REF (dr));
2518 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2519 gimple *stmt = DR_STMT (dr);
2520 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2521 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2522 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2523 HOST_WIDE_INT dr_step = -1;
2524 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2525 bool slp_impossible = false;
2526
2527 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2528 size of the interleaving group (including gaps). */
2529 if (tree_fits_shwi_p (step))
2530 {
2531 dr_step = tree_to_shwi (step);
2532 /* Check that STEP is a multiple of type size. Otherwise there is
2533 a non-element-sized gap at the end of the group which we
2534 cannot represent in GROUP_GAP or GROUP_SIZE.
2535 ??? As we can handle non-constant step fine here we should
2536 simply remove uses of GROUP_GAP between the last and first
2537 element and instead rely on DR_STEP. GROUP_SIZE then would
2538 simply not include that gap. */
2539 if ((dr_step % type_size) != 0)
2540 {
2541 if (dump_enabled_p ())
2542 {
2543 dump_printf_loc (MSG_NOTE, vect_location,
2544 "Step ");
2545 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2546 dump_printf (MSG_NOTE,
2547 " is not a multiple of the element size for ");
2548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2549 dump_printf (MSG_NOTE, "\n");
2550 }
2551 return false;
2552 }
2553 groupsize = absu_hwi (dr_step) / type_size;
2554 }
2555 else
2556 groupsize = 0;
2557
2558 /* Not consecutive access is possible only if it is a part of interleaving. */
2559 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2560 {
2561 /* Check if it this DR is a part of interleaving, and is a single
2562 element of the group that is accessed in the loop. */
2563
2564 /* Gaps are supported only for loads. STEP must be a multiple of the type
2565 size. */
2566 if (DR_IS_READ (dr)
2567 && (dr_step % type_size) == 0
2568 && groupsize > 0)
2569 {
2570 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2571 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2572 GROUP_GAP (stmt_info) = groupsize - 1;
2573 if (dump_enabled_p ())
2574 {
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "Detected single element interleaving ");
2577 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2578 dump_printf (MSG_NOTE, " step ");
2579 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2580 dump_printf (MSG_NOTE, "\n");
2581 }
2582
2583 return true;
2584 }
2585
2586 if (dump_enabled_p ())
2587 {
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2589 "not consecutive access ");
2590 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2591 }
2592
2593 if (bb_vinfo)
2594 {
2595 /* Mark the statement as unvectorizable. */
2596 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2597 return true;
2598 }
2599
2600 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2601 STMT_VINFO_STRIDED_P (stmt_info) = true;
2602 return true;
2603 }
2604
2605 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2606 {
2607 /* First stmt in the interleaving chain. Check the chain. */
2608 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2609 struct data_reference *data_ref = dr;
2610 unsigned int count = 1;
2611 tree prev_init = DR_INIT (data_ref);
2612 gimple *prev = stmt;
2613 HOST_WIDE_INT diff, gaps = 0;
2614
2615 /* By construction, all group members have INTEGER_CST DR_INITs. */
2616 while (next)
2617 {
2618 /* Skip same data-refs. In case that two or more stmts share
2619 data-ref (supported only for loads), we vectorize only the first
2620 stmt, and the rest get their vectorized loads from the first
2621 one. */
2622 if (!tree_int_cst_compare (DR_INIT (data_ref),
2623 DR_INIT (STMT_VINFO_DATA_REF (
2624 vinfo_for_stmt (next)))))
2625 {
2626 if (DR_IS_WRITE (data_ref))
2627 {
2628 if (dump_enabled_p ())
2629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630 "Two store stmts share the same dr.\n");
2631 return false;
2632 }
2633
2634 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2636 "Two or more load stmts share the same dr.\n");
2637
2638 /* For load use the same data-ref load. */
2639 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2640
2641 prev = next;
2642 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2643 continue;
2644 }
2645
2646 prev = next;
2647 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2648
2649 /* All group members have the same STEP by construction. */
2650 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2651
2652 /* Check that the distance between two accesses is equal to the type
2653 size. Otherwise, we have gaps. */
2654 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2655 - TREE_INT_CST_LOW (prev_init)) / type_size;
2656 if (diff != 1)
2657 {
2658 /* FORNOW: SLP of accesses with gaps is not supported. */
2659 slp_impossible = true;
2660 if (DR_IS_WRITE (data_ref))
2661 {
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2664 "interleaved store with gaps\n");
2665 return false;
2666 }
2667
2668 gaps += diff - 1;
2669 }
2670
2671 last_accessed_element += diff;
2672
2673 /* Store the gap from the previous member of the group. If there is no
2674 gap in the access, GROUP_GAP is always 1. */
2675 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2676
2677 prev_init = DR_INIT (data_ref);
2678 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2679 /* Count the number of data-refs in the chain. */
2680 count++;
2681 }
2682
2683 if (groupsize == 0)
2684 groupsize = count + gaps;
2685
2686 /* This could be UINT_MAX but as we are generating code in a very
2687 inefficient way we have to cap earlier. See PR78699 for example. */
2688 if (groupsize > 4096)
2689 {
2690 if (dump_enabled_p ())
2691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2692 "group is too large\n");
2693 return false;
2694 }
2695
2696 /* Check that the size of the interleaving is equal to count for stores,
2697 i.e., that there are no gaps. */
2698 if (groupsize != count
2699 && !DR_IS_READ (dr))
2700 {
2701 if (dump_enabled_p ())
2702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2703 "interleaved store with gaps\n");
2704 return false;
2705 }
2706
2707 /* If there is a gap after the last load in the group it is the
2708 difference between the groupsize and the last accessed
2709 element.
2710 When there is no gap, this difference should be 0. */
2711 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2712
2713 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2714 if (dump_enabled_p ())
2715 {
2716 dump_printf_loc (MSG_NOTE, vect_location,
2717 "Detected interleaving ");
2718 if (DR_IS_READ (dr))
2719 dump_printf (MSG_NOTE, "load ");
2720 else
2721 dump_printf (MSG_NOTE, "store ");
2722 dump_printf (MSG_NOTE, "of size %u starting with ",
2723 (unsigned)groupsize);
2724 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2725 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2726 dump_printf_loc (MSG_NOTE, vect_location,
2727 "There is a gap of %u elements after the group\n",
2728 GROUP_GAP (vinfo_for_stmt (stmt)));
2729 }
2730
2731 /* SLP: create an SLP data structure for every interleaving group of
2732 stores for further analysis in vect_analyse_slp. */
2733 if (DR_IS_WRITE (dr) && !slp_impossible)
2734 {
2735 if (loop_vinfo)
2736 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2737 if (bb_vinfo)
2738 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2739 }
2740 }
2741
2742 return true;
2743 }
2744
2745 /* Analyze groups of accesses: check that DR belongs to a group of
2746 accesses of legal size, step, etc. Detect gaps, single element
2747 interleaving, and other special cases. Set grouped access info.
2748 Collect groups of strided stores for further use in SLP analysis. */
2749
2750 static bool
vect_analyze_group_access(struct data_reference * dr)2751 vect_analyze_group_access (struct data_reference *dr)
2752 {
2753 if (!vect_analyze_group_access_1 (dr))
2754 {
2755 /* Dissolve the group if present. */
2756 gimple *next;
2757 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2758 while (stmt)
2759 {
2760 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2761 next = GROUP_NEXT_ELEMENT (vinfo);
2762 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2763 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2764 stmt = next;
2765 }
2766 return false;
2767 }
2768 return true;
2769 }
2770
2771 /* Analyze the access pattern of the data-reference DR.
2772 In case of non-consecutive accesses call vect_analyze_group_access() to
2773 analyze groups of accesses. */
2774
static bool
vect_analyze_data_ref_access (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;

  /* Gather/scatter accesses have no regular step to analyze; they are
     always acceptable here.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return true;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* In loop vectorization every data-ref must have a computed step.  */
  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                 "bad data-ref access in loop\n");
      return false;
    }

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
    {
      /* An invariant access is never part of an interleaving group.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt))
	return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
	 with pragma omp simd only - it guarantees absence of
	 loop-carried dependencies between inner loop iterations.  */
      if (loop->safelen < 2)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in inner loop of nest\n");
	  return false;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      /* Interleaved accesses are not yet supported within outer-loop
	 vectorization for references in the inner-loop.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in outer loop.\n");
	  /* Invariant in the outer loop: OK for loads only.  */
	  return DR_IS_READ (dr);
	}
    }

  /* Consecutive?  The access is consecutive when the step equals the
     scalar type size (forward) or its negation (backward).  */
  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
	  || (dr_step < 0
	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
	{
	  /* Mark that it is not interleaving.  */
	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
	  return true;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "grouped access in outer loop.\n");
      return false;
    }


  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
		|| vect_analyze_group_access (dr)));

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (dr);
}
2866
2867 /* Compare two data-references DRA and DRB to group them into chunks
2868 suitable for grouping. */
2869
2870 static int
dr_group_sort_cmp(const void * dra_,const void * drb_)2871 dr_group_sort_cmp (const void *dra_, const void *drb_)
2872 {
2873 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2874 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2875 int cmp;
2876
2877 /* Stabilize sort. */
2878 if (dra == drb)
2879 return 0;
2880
2881 /* DRs in different loops never belong to the same group. */
2882 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2883 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2884 if (loopa != loopb)
2885 return loopa->num < loopb->num ? -1 : 1;
2886
2887 /* Ordering of DRs according to base. */
2888 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2889 DR_BASE_ADDRESS (drb));
2890 if (cmp != 0)
2891 return cmp;
2892
2893 /* And according to DR_OFFSET. */
2894 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2895 if (cmp != 0)
2896 return cmp;
2897
2898 /* Put reads before writes. */
2899 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2900 return DR_IS_READ (dra) ? -1 : 1;
2901
2902 /* Then sort after access size. */
2903 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2904 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2905 if (cmp != 0)
2906 return cmp;
2907
2908 /* And after step. */
2909 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2910 if (cmp != 0)
2911 return cmp;
2912
2913 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2914 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2915 if (cmp == 0)
2916 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2917 return cmp;
2918 }
2919
2920 /* If OP is the result of a conversion, return the unconverted value,
2921 otherwise return null. */
2922
2923 static tree
strip_conversion(tree op)2924 strip_conversion (tree op)
2925 {
2926 if (TREE_CODE (op) != SSA_NAME)
2927 return NULL_TREE;
2928 gimple *stmt = SSA_NAME_DEF_STMT (op);
2929 if (!is_gimple_assign (stmt)
2930 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2931 return NULL_TREE;
2932 return gimple_assign_rhs1 (stmt);
2933 }
2934
2935 /* Return true if vectorizable_* routines can handle statements STMT1
2936 and STMT2 being in a single group. */
2937
2938 static bool
can_group_stmts_p(gimple * stmt1,gimple * stmt2)2939 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2940 {
2941 if (gimple_assign_single_p (stmt1))
2942 return gimple_assign_single_p (stmt2);
2943
2944 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2945 {
2946 /* Check for two masked loads or two masked stores. */
2947 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2948 return false;
2949 internal_fn ifn = gimple_call_internal_fn (stmt1);
2950 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2951 return false;
2952 if (ifn != gimple_call_internal_fn (stmt2))
2953 return false;
2954
2955 /* Check that the masks are the same. Cope with casts of masks,
2956 like those created by build_mask_conversion. */
2957 tree mask1 = gimple_call_arg (stmt1, 2);
2958 tree mask2 = gimple_call_arg (stmt2, 2);
2959 if (!operand_equal_p (mask1, mask2, 0))
2960 {
2961 mask1 = strip_conversion (mask1);
2962 if (!mask1)
2963 return false;
2964 mask2 = strip_conversion (mask2);
2965 if (!mask2)
2966 return false;
2967 if (!operand_equal_p (mask1, mask2, 0))
2968 return false;
2969 }
2970 return true;
2971 }
2972
2973 return false;
2974 }
2975
2976 /* Function vect_analyze_data_ref_accesses.
2977
2978 Analyze the access pattern of all the data references in the loop.
2979
2980 FORNOW: the only access pattern that is considered vectorizable is a
2981 simple step 1 (consecutive) access.
2982
2983 FORNOW: handle only arrays and pointer accesses. */
2984
bool
vect_analyze_data_ref_accesses (vec_info *vinfo)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_accesses ===\n");

  if (datarefs.is_empty ())
    return true;

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<data_reference_p> datarefs_copy = datarefs.copy ();
  datarefs_copy.qsort (dr_group_sort_cmp);

  /* Build the interleaving chains.  The outer loop picks a group leader
     DRA; the inner loop advances the shared index I over candidates DRB,
     so when it breaks, the outer loop resumes at the first DR that did
     not join DRA's group.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      data_reference_p dra = datarefs_copy[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
	{
	  ++i;
	  continue;
	}
      for (i = i + 1; i < datarefs_copy.length (); ++i)
	{
	  data_reference_p drb = datarefs_copy[i];
	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	    break;

	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group to be artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  Then we can just skip ahead to the next DR here.  */

	  /* DRs in a different loop should not be put into the same
	     interleaving group.  */
	  if (gimple_bb (DR_STMT (dra))->loop_father
	      != gimple_bb (DR_STMT (drb))->loop_father)
	    break;

	  /* Check that the data-refs have same first location (except init)
	     and they are both either store or load (not load and store,
	     not masked loads or stores).  */
	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
					DR_BASE_ADDRESS (drb)) != 0
	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
	      || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
	    break;

	  /* Check that the data-refs have the same constant size.  */
	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
	  if (!tree_fits_uhwi_p (sza)
	      || !tree_fits_uhwi_p (szb)
	      || !tree_int_cst_equal (sza, szb))
	    break;

	  /* Check that the data-refs have the same step.  */
	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
	    break;

	  /* Check the types are compatible.
	     ??? We don't distinguish this during sorting.  */
	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
				   TREE_TYPE (DR_REF (drb))))
	    break;

	  /* Check that the DR_INITs are compile-time constants.  */
	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
	    break;

	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
	  HOST_WIDE_INT init_prev
	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
	  gcc_assert (init_a <= init_b
		      && init_a <= init_prev
		      && init_prev <= init_b);

	  /* Do not place the same access in the interleaving chain twice.  */
	  if (init_b == init_prev)
	    {
	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
			  < gimple_uid (DR_STMT (drb)));
	      /* ??? For now we simply "drop" the later reference which is
	         otherwise the same rather than finishing off this group.
	         In the end we'd want to re-process duplicates forming
	         multiple groups from the refs, likely by just collecting
	         all candidates (including duplicates and split points
	         below) in a vector and then process them together.  */
	      continue;
	    }

	  /* If init_b == init_a + the size of the type * k, we have an
	     interleaving, and DRA is accessed before DRB.  */
	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
	  if (type_size_a == 0
	      || (init_b - init_a) % type_size_a != 0)
	    break;

	  /* If we have a store, the accesses are adjacent.  This splits
	     groups into chunks we support (we don't support vectorization
	     of stores with gaps).  */
	  if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
	    break;

	  /* If the step (if not zero or non-constant) is greater than the
	     difference between data-refs' inits this splits groups into
	     suitable sizes.  */
	  if (tree_fits_shwi_p (DR_STEP (dra)))
	    {
	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
	      if (step != 0 && step <= (init_b - init_a))
		break;
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Detected interleaving ");
	      if (DR_IS_READ (dra))
		dump_printf (MSG_NOTE, "load ");
	      else
		dump_printf (MSG_NOTE, "store ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_NOTE, "\n");
	    }

	  /* Link the found element into the group list.  */
	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
	    {
	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
	      lastinfo = stmtinfo_a;
	    }
	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
	  lastinfo = stmtinfo_b;
	}
    }

  /* Now analyze each access pattern; in the loop-vectorization case a
     single bad access fails the whole analysis, while for basic-block
     vectorization the offending statement is merely marked
     non-vectorizable.  */
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
	&& !vect_analyze_data_ref_access (dr))
      {
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: complicated access pattern.\n");

	if (is_a <bb_vec_info> (vinfo))
	  {
	    /* Mark the statement as not vectorizable.  */
	    STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	    continue;
	  }
	else
	  {
	    datarefs_copy.release ();
	    return false;
	  }
      }

  datarefs_copy.release ();
  return true;
}
3166
3167 /* Function vect_vfa_segment_size.
3168
3169 Input:
3170 DR: The data reference.
3171 LENGTH_FACTOR: segment length to consider.
3172
3173 Return a value suitable for the dr_with_seg_len::seg_len field.
3174 This is the "distance travelled" by the pointer from the first
3175 iteration in the segment to the last. Note that it does not include
3176 the size of the access; in effect it only describes the first byte. */
3177
3178 static tree
vect_vfa_segment_size(struct data_reference * dr,tree length_factor)3179 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3180 {
3181 length_factor = size_binop (MINUS_EXPR,
3182 fold_convert (sizetype, length_factor),
3183 size_one_node);
3184 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3185 length_factor);
3186 }
3187
3188 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3189 gives the worst-case number of bytes covered by the segment. */
3190
3191 static unsigned HOST_WIDE_INT
vect_vfa_access_size(data_reference * dr)3192 vect_vfa_access_size (data_reference *dr)
3193 {
3194 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3195 tree ref_type = TREE_TYPE (DR_REF (dr));
3196 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3197 unsigned HOST_WIDE_INT access_size = ref_size;
3198 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3199 {
3200 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3201 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3202 }
3203 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3204 && (vect_supportable_dr_alignment (dr, false)
3205 == dr_explicit_realign_optimized))
3206 {
3207 /* We might access a full vector's worth. */
3208 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3209 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3210 }
3211 return access_size;
3212 }
3213
3214 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3215
3216 static unsigned int
vect_vfa_align(const data_reference * dr)3217 vect_vfa_align (const data_reference *dr)
3218 {
3219 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3220 }
3221
3222 /* Function vect_no_alias_p.
3223
3224 Given data references A and B with equal base and offset, see whether
3225 the alias relation can be decided at compilation time. Return 1 if
3226 it can and the references alias, 0 if it can and the references do
3227 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3228 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3229 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3230
static int
vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
			 tree segment_length_a, tree segment_length_b,
			 unsigned HOST_WIDE_INT access_size_a,
			 unsigned HOST_WIDE_INT access_size_b)
{
  /* With equal bases and offsets, the DR_INITs alone determine where the
     two segments start.  */
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12) */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      /* Negative step: the segment extends downwards, so negate the
	 (negative) length and move the range start down by it.  */
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  /* seg_len only describes the first byte of the last access; add the
     access size to obtain the total number of bytes covered.  */
  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
			      offset_b, const_length_b))
    return 1;

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
			       offset_b, const_length_b))
    return 0;

  /* Neither overlap nor disjointness is provable at compile time.  */
  return -1;
}
3273
3274 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3275 in DDR is >= VF. */
3276
3277 static bool
dependence_distance_ge_vf(data_dependence_relation * ddr,unsigned int loop_depth,poly_uint64 vf)3278 dependence_distance_ge_vf (data_dependence_relation *ddr,
3279 unsigned int loop_depth, poly_uint64 vf)
3280 {
3281 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3282 || DDR_NUM_DIST_VECTS (ddr) == 0)
3283 return false;
3284
3285 /* If the dependence is exact, we should have limited the VF instead. */
3286 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3287
3288 unsigned int i;
3289 lambda_vector dist_v;
3290 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3291 {
3292 HOST_WIDE_INT dist = dist_v[loop_depth];
3293 if (dist != 0
3294 && !(dist > 0 && DDR_REVERSED_P (ddr))
3295 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3296 return false;
3297 }
3298
3299 if (dump_enabled_p ())
3300 {
3301 dump_printf_loc (MSG_NOTE, vect_location,
3302 "dependence distance between ");
3303 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3304 dump_printf (MSG_NOTE, " and ");
3305 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3306 dump_printf (MSG_NOTE, " is >= VF\n");
3307 }
3308
3309 return true;
3310 }
3311
3312 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3313
3314 static void
dump_lower_bound(int dump_kind,const vec_lower_bound & lower_bound)3315 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3316 {
3317 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3318 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3319 dump_printf (dump_kind, ") >= ");
3320 dump_dec (dump_kind, lower_bound.min_value);
3321 }
3322
3323 /* Record that the vectorized loop requires the vec_lower_bound described
3324 by EXPR, UNSIGNED_P and MIN_VALUE. */
3325
static void
vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
			poly_uint64 min_value)
{
  /* NOTE: this copies the vec wrapper, but element writes below still
     modify the shared storage owned by LOOP_VINFO.  */
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    if (operand_equal_p (lower_bounds[i].expr, expr, 0))
      {
	/* Merge with the existing check for EXPR: the combined check is
	   unsigned only if both are, and uses the larger minimum.  */
	unsigned_p &= lower_bounds[i].unsigned_p;
	min_value = upper_bound (lower_bounds[i].min_value, min_value);
	/* Rewrite (and dump) only if the merge actually strengthened
	   the recorded check.  */
	if (lower_bounds[i].unsigned_p != unsigned_p
	    || maybe_lt (lower_bounds[i].min_value, min_value))
	  {
	    lower_bounds[i].unsigned_p = unsigned_p;
	    lower_bounds[i].min_value = min_value;
	    if (dump_enabled_p ())
	      {
		dump_printf_loc (MSG_NOTE, vect_location,
				 "updating run-time check to ");
		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
		dump_printf (MSG_NOTE, "\n");
	      }
	  }
	return;
      }

  /* No existing check for EXPR: record a fresh one.  */
  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
      dump_lower_bound (MSG_NOTE, lower_bound);
      dump_printf (MSG_NOTE, "\n");
    }
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
}
3361
3362 /* Return true if it's unlikely that the step of the vectorized form of DR
3363 will span fewer than GAP bytes. */
3364
3365 static bool
vect_small_gap_p(loop_vec_info loop_vinfo,data_reference * dr,poly_int64 gap)3366 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3367 {
3368 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3369 HOST_WIDE_INT count
3370 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3371 if (GROUP_FIRST_ELEMENT (stmt_info))
3372 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3373 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3374 }
3375
3376 /* Return true if we know that there is no alias between DR_A and DR_B
3377 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3378 *LOWER_BOUND_OUT to this N. */
3379
static bool
vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
				poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  Both must share base, offset and step, and their inits
     must be compile-time (poly) constants with a known ordering.  */
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
      && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
  return true;
}
3413
3414 /* Function vect_prune_runtime_alias_test_list.
3415
3416 Prune a list of ddrs to be tested at run-time by versioning for alias.
3417 Merge several alias checks into one if possible.
   Return FALSE if resulting list of ddrs is longer than allowed by
3419 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3420
3421 bool
vect_prune_runtime_alias_test_list(loop_vec_info loop_vinfo)3422 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3423 {
3424 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3425 hash_set <tree_pair_hash> compared_objects;
3426
3427 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3428 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3429 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3430 vec<vec_object_pair> &check_unequal_addrs
3431 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3432 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3433 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3434
3435 ddr_p ddr;
3436 unsigned int i;
3437 tree length_factor;
3438
3439 if (dump_enabled_p ())
3440 dump_printf_loc (MSG_NOTE, vect_location,
3441 "=== vect_prune_runtime_alias_test_list ===\n");
3442
3443 /* Step values are irrelevant for aliasing if the number of vector
3444 iterations is equal to the number of scalar iterations (which can
3445 happen for fully-SLP loops). */
3446 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3447
3448 if (!ignore_step_p)
3449 {
3450 /* Convert the checks for nonzero steps into bound tests. */
3451 tree value;
3452 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3453 vect_check_lower_bound (loop_vinfo, value, true, 1);
3454 }
3455
3456 if (may_alias_ddrs.is_empty ())
3457 return true;
3458
3459 comp_alias_ddrs.create (may_alias_ddrs.length ());
3460
3461 unsigned int loop_depth
3462 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3463 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3464
3465 /* First, we collect all data ref pairs for aliasing checks. */
3466 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3467 {
3468 int comp_res;
3469 poly_uint64 lower_bound;
3470 struct data_reference *dr_a, *dr_b;
3471 gimple *dr_group_first_a, *dr_group_first_b;
3472 tree segment_length_a, segment_length_b;
3473 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3474 unsigned int align_a, align_b;
3475 gimple *stmt_a, *stmt_b;
3476
3477 /* Ignore the alias if the VF we chose ended up being no greater
3478 than the dependence distance. */
3479 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3480 continue;
3481
3482 if (DDR_OBJECT_A (ddr))
3483 {
3484 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3485 if (!compared_objects.add (new_pair))
3486 {
3487 if (dump_enabled_p ())
3488 {
3489 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3490 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3491 dump_printf (MSG_NOTE, " and ");
3492 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3493 dump_printf (MSG_NOTE, " have different addresses\n");
3494 }
3495 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3496 }
3497 continue;
3498 }
3499
3500 dr_a = DDR_A (ddr);
3501 stmt_a = DR_STMT (DDR_A (ddr));
3502
3503 dr_b = DDR_B (ddr);
3504 stmt_b = DR_STMT (DDR_B (ddr));
3505
3506 /* Skip the pair if inter-iteration dependencies are irrelevant
3507 and intra-iteration dependencies are guaranteed to be honored. */
3508 if (ignore_step_p
3509 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3510 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3511 {
3512 if (dump_enabled_p ())
3513 {
3514 dump_printf_loc (MSG_NOTE, vect_location,
3515 "no need for alias check between ");
3516 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3517 dump_printf (MSG_NOTE, " and ");
3518 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3519 dump_printf (MSG_NOTE, " when VF is 1\n");
3520 }
3521 continue;
3522 }
3523
3524 /* See whether we can handle the alias using a bounds check on
3525 the step, and whether that's likely to be the best approach.
3526 (It might not be, for example, if the minimum step is much larger
3527 than the number of bytes handled by one vector iteration.) */
3528 if (!ignore_step_p
3529 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3530 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3531 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3532 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3533 {
3534 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3535 if (dump_enabled_p ())
3536 {
3537 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3538 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3539 dump_printf (MSG_NOTE, " and ");
3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3541 dump_printf (MSG_NOTE, " when the step ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3543 dump_printf (MSG_NOTE, " is outside ");
3544 if (unsigned_p)
3545 dump_printf (MSG_NOTE, "[0");
3546 else
3547 {
3548 dump_printf (MSG_NOTE, "(");
3549 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3550 }
3551 dump_printf (MSG_NOTE, ", ");
3552 dump_dec (MSG_NOTE, lower_bound);
3553 dump_printf (MSG_NOTE, ")\n");
3554 }
3555 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3556 lower_bound);
3557 continue;
3558 }
3559
3560 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3561 if (dr_group_first_a)
3562 {
3563 stmt_a = dr_group_first_a;
3564 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3565 }
3566
3567 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3568 if (dr_group_first_b)
3569 {
3570 stmt_b = dr_group_first_b;
3571 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3572 }
3573
3574 if (ignore_step_p)
3575 {
3576 segment_length_a = size_zero_node;
3577 segment_length_b = size_zero_node;
3578 }
3579 else
3580 {
3581 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3582 length_factor = scalar_loop_iters;
3583 else
3584 length_factor = size_int (vect_factor);
3585 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3586 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3587 }
3588 access_size_a = vect_vfa_access_size (dr_a);
3589 access_size_b = vect_vfa_access_size (dr_b);
3590 align_a = vect_vfa_align (dr_a);
3591 align_b = vect_vfa_align (dr_b);
3592
3593 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3594 DR_BASE_ADDRESS (dr_b));
3595 if (comp_res == 0)
3596 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3597 DR_OFFSET (dr_b));
3598
3599 /* See whether the alias is known at compilation time. */
3600 if (comp_res == 0
3601 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3602 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3603 && poly_int_tree_p (segment_length_a)
3604 && poly_int_tree_p (segment_length_b))
3605 {
3606 int res = vect_compile_time_alias (dr_a, dr_b,
3607 segment_length_a,
3608 segment_length_b,
3609 access_size_a,
3610 access_size_b);
3611 if (res >= 0 && dump_enabled_p ())
3612 {
3613 dump_printf_loc (MSG_NOTE, vect_location,
3614 "can tell at compile time that ");
3615 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3616 dump_printf (MSG_NOTE, " and ");
3617 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3618 if (res == 0)
3619 dump_printf (MSG_NOTE, " do not alias\n");
3620 else
3621 dump_printf (MSG_NOTE, " alias\n");
3622 }
3623
3624 if (res == 0)
3625 continue;
3626
3627 if (res == 1)
3628 {
3629 if (dump_enabled_p ())
3630 dump_printf_loc (MSG_NOTE, vect_location,
3631 "not vectorized: compilation time alias.\n");
3632 return false;
3633 }
3634 }
3635
3636 dr_with_seg_len_pair_t dr_with_seg_len_pair
3637 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3638 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3639
3640 /* Canonicalize pairs by sorting the two DR members. */
3641 if (comp_res > 0)
3642 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3643
3644 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3645 }
3646
3647 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3648
3649 unsigned int count = (comp_alias_ddrs.length ()
3650 + check_unequal_addrs.length ());
3651
3652 dump_printf_loc (MSG_NOTE, vect_location,
3653 "improved number of alias checks from %d to %d\n",
3654 may_alias_ddrs.length (), count);
3655 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3656 {
3657 if (dump_enabled_p ())
3658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3659 "number of versioning for alias "
3660 "run-time tests exceeds %d "
3661 "(--param vect-max-version-for-alias-checks)\n",
3662 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3663 return false;
3664 }
3665
3666 return true;
3667 }
3668
3669 /* Check whether we can use an internal function for a gather load
3670 or scatter store. READ_P is true for loads and false for stores.
3671 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3672 the type of the memory elements being loaded or stored. OFFSET_BITS
3673 is the number of bits in each scalar offset and OFFSET_SIGN is the
3674 sign of the offset. SCALE is the amount by which the offset should
3675 be multiplied *after* it has been converted to address width.
3676
3677 Return true if the function is supported, storing the function
3678 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3679
3680 bool
vect_gather_scatter_fn_p(bool read_p,bool masked_p,tree vectype,tree memory_type,unsigned int offset_bits,signop offset_sign,int scale,internal_fn * ifn_out,tree * element_type_out)3681 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3682 tree memory_type, unsigned int offset_bits,
3683 signop offset_sign, int scale,
3684 internal_fn *ifn_out, tree *element_type_out)
3685 {
3686 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3687 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3688 if (offset_bits > element_bits)
3689 /* Internal functions require the offset to be the same width as
3690 the vector elements. We can extend narrower offsets, but it isn't
3691 safe to truncate wider offsets. */
3692 return false;
3693
3694 if (element_bits != memory_bits)
3695 /* For now the vector elements must be the same width as the
3696 memory elements. */
3697 return false;
3698
3699 /* Work out which function we need. */
3700 internal_fn ifn;
3701 if (read_p)
3702 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3703 else
3704 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3705
3706 /* Test whether the target supports this combination. */
3707 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3708 offset_sign, scale))
3709 return false;
3710
3711 *ifn_out = ifn;
3712 *element_type_out = TREE_TYPE (vectype);
3713 return true;
3714 }
3715
3716 /* CALL is a call to an internal gather load or scatter store function.
3717 Describe the operation in INFO. */
3718
3719 static void
vect_describe_gather_scatter_call(gcall * call,gather_scatter_info * info)3720 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3721 {
3722 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3723 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3724 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3725
3726 info->ifn = gimple_call_internal_fn (call);
3727 info->decl = NULL_TREE;
3728 info->base = gimple_call_arg (call, 0);
3729 info->offset = gimple_call_arg (call, 1);
3730 info->offset_dt = vect_unknown_def_type;
3731 info->offset_vectype = NULL_TREE;
3732 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3733 info->element_type = TREE_TYPE (vectype);
3734 info->memory_type = TREE_TYPE (DR_REF (dr));
3735 }
3736
3737 /* Return true if a non-affine read or write in STMT is suitable for a
3738 gather load or scatter store. Describe the operation in *INFO if so. */
3739
bool
vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
			   gather_scatter_info *info)
{
  HOST_WIDE_INT scale = 1;
  poly_int64 pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl = NULL_TREE, base, off;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree memory_type = TREE_TYPE (DR_REF (dr));
  machine_mode pmode;
  int punsignedp, reversep, pvolatilep = 0;
  internal_fn ifn;
  tree element_type;
  bool masked_p = false;

  /* See whether this is already a call to a gather/scatter internal function.
     If not, see whether it's a masked load or store.  */
  gcall *call = dyn_cast <gcall *> (stmt);
  if (call && gimple_call_internal_p (call))
    {
      ifn = gimple_call_internal_fn (stmt);
      if (internal_gather_scatter_fn_p (ifn))
	{
	  vect_describe_gather_scatter_call (call, info);
	  return true;
	}
      masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
    }

  /* True if we should aim to use internal functions rather than
     built-in functions.  */
  bool use_ifn_p = (DR_IS_READ (dr)
		    ? supports_vec_gather_load_p ()
		    : supports_vec_scatter_store_p ());

  base = DR_REF (dr);
  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
     see if we can use the def stmt of the address.  */
  if (masked_p
      && TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
      && integer_zerop (TREE_OPERAND (base, 1))
      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
    {
      gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
      if (is_gimple_assign (def_stmt)
	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
    }

  /* The gather and scatter builtins need address of the form
       loop_invariant + vector * {1, 2, 4, 8}
     or
       loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
  base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
			      &punsignedp, &reversep, &pvolatilep);
  gcc_assert (base && !reversep);
  poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);

  if (TREE_CODE (base) == MEM_REF)
    {
      /* Fold a constant MEM_REF offset into OFF and strip the MEM_REF
	 so that BASE becomes the underlying address.  */
      if (!integer_zerop (TREE_OPERAND (base, 1)))
	{
	  if (off == NULL_TREE)
	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
	  else
	    off = size_binop (PLUS_EXPR, off,
			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
	}
      base = TREE_OPERAND (base, 0);
    }
  else
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, either off is 0, then we start with just
     the constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and use that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
    {
      if (!integer_zerop (off))
	return false;
      off = base;
      base = size_int (pbytepos);
    }
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
  else
    {
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
    }

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.  Each iteration either moves one
     invariant operand into BASE or looks through one operation; the loop
     stops when a widening conversion fixes OFFTYPE or nothing more can
     be peeled.  */
  STRIP_NOPS (off);
  while (offtype == NULL_TREE)
    {
      enum tree_code code;
      tree op0, op1, add = NULL_TREE;

      if (TREE_CODE (off) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);

	  if (expr_invariant_in_loop_p (loop, off))
	    return false;

	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
	    break;

	  op0 = gimple_assign_rhs1 (def_stmt);
	  code = gimple_assign_rhs_code (def_stmt);
	  op1 = gimple_assign_rhs2 (def_stmt);
	}
      else
	{
	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
	    return false;
	  code = TREE_CODE (off);
	  extract_ops_from_tree (off, &code, &op0, &op1);
	}
      switch (code)
	{
	case POINTER_PLUS_EXPR:
	case PLUS_EXPR:
	  if (expr_invariant_in_loop_p (loop, op0))
	    {
	      add = op0;
	      off = op1;
	    do_add:
	      /* Fold the invariant addend (scaled, since BASE is in
		 element units once SCALE != 1) into BASE and continue
		 peeling the remaining operand.  */
	      add = fold_convert (sizetype, add);
	      if (scale != 1)
		add = size_binop (MULT_EXPR, add, size_int (scale));
	      base = size_binop (PLUS_EXPR, base, add);
	      continue;
	    }
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = op1;
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MINUS_EXPR:
	  /* x - inv is handled as x + (-inv).  */
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = fold_convert (sizetype, op1);
	      add = size_binop (MINUS_EXPR, size_zero_node, add);
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MULT_EXPR:
	  /* A constant multiplier can become the gather/scatter scale,
	     but only once (scale == 1) and only if the target supports
	     that scale when using internal functions.  */
	  if (scale == 1 && tree_fits_shwi_p (op1))
	    {
	      int new_scale = tree_to_shwi (op1);
	      /* Only treat this as a scaling operation if the target
		 supports it.  */
	      if (use_ifn_p
		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
						vectype, memory_type, 1,
						TYPE_SIGN (TREE_TYPE (op0)),
						new_scale, &ifn,
						&element_type))
		break;
	      scale = new_scale;
	      off = op0;
	      continue;
	    }
	  break;
	case SSA_NAME:
	  off = op0;
	  continue;
	CASE_CONVERT:
	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
	    break;
	  /* Same-precision conversions can be looked through freely.  */
	  if (TYPE_PRECISION (TREE_TYPE (op0))
	      == TYPE_PRECISION (TREE_TYPE (off)))
	    {
	      off = op0;
	      continue;
	    }

	  /* The internal functions need the offset to be the same width
	     as the elements of VECTYPE.  Don't include operations that
	     cast the offset from that width to a different width.  */
	  if (use_ifn_p
	      && (int_size_in_bytes (TREE_TYPE (vectype))
		  == int_size_in_bytes (TREE_TYPE (off))))
	    break;

	  /* A widening conversion: remember the narrower type as OFFTYPE
	     (which ends the peeling loop) and continue from the narrower
	     operand.  */
	  if (TYPE_PRECISION (TREE_TYPE (op0))
	      < TYPE_PRECISION (TREE_TYPE (off)))
	    {
	      off = op0;
	      offtype = TREE_TYPE (off);
	      STRIP_NOPS (off);
	      continue;
	    }
	  break;
	default:
	  break;
	}
      break;
    }

  /* If at the end OFF still isn't a SSA_NAME or isn't
     defined in the loop, punt.  */
  if (TREE_CODE (off) != SSA_NAME
      || expr_invariant_in_loop_p (loop, off))
    return false;

  if (offtype == NULL_TREE)
    offtype = TREE_TYPE (off);

  if (use_ifn_p)
    {
      /* Internal-function path: check target support for the final
	 offset type and scale.  */
      if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
				     memory_type, TYPE_PRECISION (offtype),
				     TYPE_SIGN (offtype), scale, &ifn,
				     &element_type))
	return false;
    }
  else
    {
      /* Built-in path: ask the target for a gather/scatter builtin decl.  */
      if (DR_IS_READ (dr))
	{
	  if (targetm.vectorize.builtin_gather)
	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
	}
      else
	{
	  if (targetm.vectorize.builtin_scatter)
	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
	}

      if (!decl)
	return false;

      ifn = IFN_LAST;
      element_type = TREE_TYPE (vectype);
    }

  info->ifn = ifn;
  info->decl = decl;
  info->base = base;
  info->offset = off;
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = scale;
  info->element_type = element_type;
  info->memory_type = memory_type;
  return true;
}
4013
4014 /* Function vect_analyze_data_refs.
4015
4016 Find all the data references in the loop or basic block.
4017
4018 The general structure of the analysis of data refs in the vectorizer is as
4019 follows:
4020 1- vect_analyze_data_refs(loop/bb): call
4021 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4022 in the loop/bb and their dependences.
4023 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4024 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4025 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4026
4027 */
4028
bool
vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
{
  struct loop *loop = NULL;
  unsigned int i;
  struct data_reference *dr;
  tree scalar_type;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_refs ===\n");

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Go through the data-refs, check that the analysis succeeded.  Update
     pointer from stmt_vec_info struct to DR and vectype.  */

  vec<data_reference_p> datarefs = vinfo->datarefs;
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt;
      stmt_vec_info stmt_info;
      tree base, offset, init;
      enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
      bool simd_lane_access = false;
      poly_uint64 vf;

    again:
      /* Re-entry point after removing a clobber: DR has been replaced
	 by the next element at the same index.  */
      if (!dr || !DR_REF (dr))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unhandled data-ref\n");
	  return false;
	}

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      /* Discard clobbers from the dataref vector.  We will remove
	 clobber stmts during vectorization.  */
      if (gimple_clobber_p (stmt))
	{
	  free_data_ref (dr);
	  if (i == datarefs.length () - 1)
	    {
	      datarefs.pop ();
	      break;
	    }
	  datarefs.ordered_remove (i);
	  dr = datarefs[i];
	  goto again;
	}

      /* Check that analysis of the data-ref succeeded.  */
      if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
	  || !DR_STEP (dr))
	{
	  bool maybe_gather
	    = DR_IS_READ (dr)
	      && !TREE_THIS_VOLATILE (DR_REF (dr))
	      && (targetm.vectorize.builtin_gather != NULL
		  || supports_vec_gather_load_p ());
	  bool maybe_scatter
	    = DR_IS_WRITE (dr)
	      && !TREE_THIS_VOLATILE (DR_REF (dr))
	      && (targetm.vectorize.builtin_scatter != NULL
		  || supports_vec_scatter_store_p ());
	  bool maybe_simd_lane_access
	    = is_a <loop_vec_info> (vinfo) && loop->simduid;

	  /* If target supports vector gather loads or scatter stores, or if
	     this might be a SIMD lane access, see if they can't be used.  */
	  if (is_a <loop_vec_info> (vinfo)
	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
	      && !nested_in_vect_loop_p (loop, stmt))
	    {
	      /* Re-analyze the reference without requiring an affine
		 access function; gather/scatter and SIMD lane accesses
		 have zero step in the classical sense.  */
	      struct data_reference *newdr
		= create_data_ref (NULL, loop_containing_stmt (stmt),
				   DR_REF (dr), stmt, !maybe_scatter,
				   DR_IS_CONDITIONAL_IN_STMT (dr));
	      gcc_assert (newdr != NULL && DR_REF (newdr));
	      if (DR_BASE_ADDRESS (newdr)
		  && DR_OFFSET (newdr)
		  && DR_INIT (newdr)
		  && DR_STEP (newdr)
		  && integer_zerop (DR_STEP (newdr)))
		{
		  if (maybe_simd_lane_access)
		    {
		      /* Recognize the pattern offset = simd_lane * step
			 where simd_lane comes from IFN_GOMP_SIMD_LANE for
			 this loop's simduid; such accesses are per-lane
			 private and get a synthesized unit step.  */
		      tree off = DR_OFFSET (newdr);
		      STRIP_NOPS (off);
		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
			  && TREE_CODE (off) == MULT_EXPR
			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
			{
			  tree step = TREE_OPERAND (off, 1);
			  off = TREE_OPERAND (off, 0);
			  STRIP_NOPS (off);
			  if (CONVERT_EXPR_P (off)
			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
									  0)))
				 < TYPE_PRECISION (TREE_TYPE (off)))
			    off = TREE_OPERAND (off, 0);
			  if (TREE_CODE (off) == SSA_NAME)
			    {
			      gimple *def = SSA_NAME_DEF_STMT (off);
			      tree reft = TREE_TYPE (DR_REF (newdr));
			      if (is_gimple_call (def)
				  && gimple_call_internal_p (def)
				  && (gimple_call_internal_fn (def)
				      == IFN_GOMP_SIMD_LANE))
				{
				  tree arg = gimple_call_arg (def, 0);
				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
				  arg = SSA_NAME_VAR (arg);
				  if (arg == loop->simduid
				      /* For now.  */
				      && tree_int_cst_equal
					   (TYPE_SIZE_UNIT (reft),
					    step))
				    {
				      DR_OFFSET (newdr) = ssize_int (0);
				      DR_STEP (newdr) = step;
				      DR_OFFSET_ALIGNMENT (newdr)
					= BIGGEST_ALIGNMENT;
				      DR_STEP_ALIGNMENT (newdr)
					= highest_pow2_factor (step);
				      dr = newdr;
				      simd_lane_access = true;
				    }
				}
			    }
			}
		    }
		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
		    {
		      dr = newdr;
		      if (maybe_gather)
			gatherscatter = GATHER;
		      else
			gatherscatter = SCATTER;
		    }
		}
	      /* NEWDR was only adopted in the success paths above;
		 otherwise release it.  */
	      if (gatherscatter == SG_NONE && !simd_lane_access)
		free_data_ref (newdr);
	    }

	  if (gatherscatter == SG_NONE && !simd_lane_access)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: data ref analysis "
				   "failed ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* For BB vectorization just stop here; the remaining
		 datarefs are marked unvectorizable after the loop.  */
	      if (is_a <bb_vec_info> (vinfo))
		break;

	      return false;
	    }
	}

      if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: base addr of dr is a "
			     "constant\n");

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  /* DR may be a locally created replacement; free it in that case.  */
	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      if (TREE_THIS_VOLATILE (DR_REF (dr)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: volatile type ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  return false;
	}

      if (stmt_can_throw_internal (stmt))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: statement can throw an "
			       "exception ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: statement is bitfield "
			       "access ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      base = unshare_expr (DR_BASE_ADDRESS (dr));
      offset = unshare_expr (DR_OFFSET (dr));
      init = unshare_expr (DR_INIT (dr));

      /* Only plain refs and IFN_MASK_LOAD/STORE calls are supported;
	 any other call with a dataref cannot be vectorized.  */
      if (is_gimple_call (stmt)
	  && (!gimple_call_internal_p (stmt)
	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: dr in a call ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      /* Update DR field in stmt_vec_info struct.  */

      /* If the dataref is in an inner-loop of the loop that is considered for
	 for vectorization, we also want to analyze the access relative to
	 the outer-loop (DR contains information only relative to the
	 inner-most enclosing loop).  We do that by building a reference to the
	 first location accessed by the inner-loop, and analyze it relative to
	 the outer-loop.  */
      if (loop && nested_in_vect_loop_p (loop, stmt))
	{
	  /* Build a reference to the first location accessed by the
	     inner loop: *(BASE + INIT + OFFSET).  By construction,
	     this address must be invariant in the inner loop, so we
	     can consider it as being used in the outer loop.  */
	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
					  init, offset);
	  tree init_addr = fold_build_pointer_plus (base, init_offset);
	  tree init_ref = build_fold_indirect_ref (init_addr);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "analyze in outer loop: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
				     init_ref, loop))
	    /* dr_analyze_innermost already explained the failure.  */
	    return false;

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "\touter base_address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_OFFSET (stmt_info));
	      dump_printf (MSG_NOTE,
			   "\n\touter constant offset from base address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_INIT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter step: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_STEP (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
			   STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
			   STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
			   STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
			   STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
	    }
	}

      if (STMT_VINFO_DATA_REF (stmt_info))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: more than one data ref "
			       "in stmt: ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    break;

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    free_data_ref (dr);
	  return false;
	}

      STMT_VINFO_DATA_REF (stmt_info) = dr;
      if (simd_lane_access)
	{
	  /* Replace the original dataref with the synthesized one.  */
	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
	  free_data_ref (datarefs[i]);
	  datarefs[i] = dr;
	}

      if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
	  && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
	  && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: base object not addressable "
			       "for stmt: ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  if (is_a <bb_vec_info> (vinfo))
	    {
	      /* In BB vectorization the ref can still participate
		 in dependence analysis, we just can't vectorize it.  */
	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	      continue;
	    }
	  return false;
	}

      /* Set vectype for STMT.  */
      scalar_type = TREE_TYPE (DR_REF (dr));
      STMT_VINFO_VECTYPE (stmt_info)
	= get_vectype_for_scalar_type (scalar_type);
      if (!STMT_VINFO_VECTYPE (stmt_info))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: no vectype for stmt: ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	      dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
				 scalar_type);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  if (is_a <bb_vec_info> (vinfo))
	    {
	      /* No vector type is fine, the ref can still participate
		 in dependence analysis, we just can't vectorize it.  */
	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
	      continue;
	    }

	  if (gatherscatter != SG_NONE || simd_lane_access)
	    {
	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
	      if (gatherscatter != SG_NONE)
		free_data_ref (dr);
	    }
	  return false;
	}
      else
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "got vectype for stmt: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_VECTYPE (stmt_info));
	      dump_printf (MSG_NOTE, "\n");
	    }
	}

      /* Adjust the minimal vectorization factor according to the
	 vector type.  */
      vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
      *min_vf = upper_bound (*min_vf, vf);

      if (gatherscatter != SG_NONE)
	{
	  /* Verify the access really is a usable gather/scatter and that
	     its offset has a vector type.  */
	  gather_scatter_info gs_info;
	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
					  &gs_info)
	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
	    {
	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
	      free_data_ref (dr);
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   (gatherscatter == GATHER) ?
				   "not vectorized: not suitable for gather "
				   "load " :
				   "not vectorized: not suitable for scatter "
				   "store ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      return false;
	    }

	  free_data_ref (datarefs[i]);
	  datarefs[i] = dr;
	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
	}

      else if (is_a <loop_vec_info> (vinfo)
	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
	{
	  /* Non-constant step: only supported as a strided access in the
	     loop being vectorized itself, not in a nested inner loop.  */
	  if (nested_in_vect_loop_p (loop, stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: not suitable for strided "
				   "load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      return false;
	    }
	  STMT_VINFO_STRIDED_P (stmt_info) = true;
	}
    }

  /* If we stopped analysis at the first dataref we could not analyze
     when trying to vectorize a basic-block mark the rest of the datarefs
     as not vectorizable and truncate the vector of datarefs.  That
     avoids spending useless time in analyzing their dependence.  */
  if (i != datarefs.length ())
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      for (unsigned j = i; j < datarefs.length (); ++j)
	{
	  data_reference_p dr = datarefs[j];
	  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	  free_data_ref (dr);
	}
      datarefs.truncate (i);
    }

  return true;
}
4506
4507
4508 /* Function vect_get_new_vect_var.
4509
4510 Returns a name for a new variable. The current naming scheme appends the
4511 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4512 the name of vectorizer generated variables, and appends that to NAME if
4513 provided. */
4514
4515 tree
vect_get_new_vect_var(tree type,enum vect_var_kind var_kind,const char * name)4516 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4517 {
4518 const char *prefix;
4519 tree new_vect_var;
4520
4521 switch (var_kind)
4522 {
4523 case vect_simple_var:
4524 prefix = "vect";
4525 break;
4526 case vect_scalar_var:
4527 prefix = "stmp";
4528 break;
4529 case vect_mask_var:
4530 prefix = "mask";
4531 break;
4532 case vect_pointer_var:
4533 prefix = "vectp";
4534 break;
4535 default:
4536 gcc_unreachable ();
4537 }
4538
4539 if (name)
4540 {
4541 char* tmp = concat (prefix, "_", name, NULL);
4542 new_vect_var = create_tmp_reg (type, tmp);
4543 free (tmp);
4544 }
4545 else
4546 new_vect_var = create_tmp_reg (type, prefix);
4547
4548 return new_vect_var;
4549 }
4550
4551 /* Like vect_get_new_vect_var but return an SSA name. */
4552
4553 tree
vect_get_new_ssa_name(tree type,enum vect_var_kind var_kind,const char * name)4554 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4555 {
4556 const char *prefix;
4557 tree new_vect_var;
4558
4559 switch (var_kind)
4560 {
4561 case vect_simple_var:
4562 prefix = "vect";
4563 break;
4564 case vect_scalar_var:
4565 prefix = "stmp";
4566 break;
4567 case vect_pointer_var:
4568 prefix = "vectp";
4569 break;
4570 default:
4571 gcc_unreachable ();
4572 }
4573
4574 if (name)
4575 {
4576 char* tmp = concat (prefix, "_", name, NULL);
4577 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4578 free (tmp);
4579 }
4580 else
4581 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4582
4583 return new_vect_var;
4584 }
4585
4586 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4587
4588 static void
vect_duplicate_ssa_name_ptr_info(tree name,data_reference * dr)4589 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4590 {
4591 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4592 int misalign = DR_MISALIGNMENT (dr);
4593 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4594 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4595 else
4596 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4597 DR_TARGET_ALIGNMENT (dr), misalign);
4598 }
4599
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
            For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i=0; i<N; i++)
		   for (j=0; j<M; j++)
		     s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is number of elements to
	    be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

tree
vect_create_addr_base_for_vector_ref (gimple *stmt,
				      gimple_seq *new_stmt_list,
				      tree offset,
				      tree byte_offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  const char *base_name;
  tree addr_base;
  tree dest;
  gimple_seq seq = NULL;
  tree vect_ptr_type;
  /* Size of one scalar element, used to scale OFFSET (which is counted
     in elements) into bytes.  */
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  innermost_loop_behavior *drb = vect_dr_behavior (dr);

  tree data_ref_base = unshare_expr (drb->base_address);
  tree base_offset = unshare_expr (drb->offset);
  tree init = unshare_expr (drb->init);

  if (loop_vinfo)
    base_name = get_name (data_ref_base);
  else
    {
      /* Basic-block vectorization: the address is taken directly from
	 DR_REF below, so the offset/init components are zeroed out.  */
      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));
    }

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
			    fold_convert (sizetype, base_offset),
			    fold_convert (sizetype, init));

  if (offset)
    {
      /* OFFSET is in elements; scale by the element size first.  */
      offset = fold_build2 (MULT_EXPR, sizetype,
			    fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, offset);
    }
  if (byte_offset)
    {
      /* BYTE_OFFSET is already in bytes; add it unscaled.  */
      byte_offset = fold_convert (sizetype, byte_offset);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, byte_offset);
    }

  /* base + base_offset */
  if (loop_vinfo)
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
  else
    {
      addr_base = build1 (ADDR_EXPR,
			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
			  unshare_expr (DR_REF (dr)));
    }

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      && !SSA_NAME_PTR_INFO (addr_base))
    {
      vect_duplicate_ssa_name_ptr_info (addr_base, dr);
      /* Any added offset invalidates DR's recorded misalignment.  */
      if (offset || byte_offset)
	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "created ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
      dump_printf (MSG_NOTE, "\n");
    }

  return addr_base;
}
4716
4717
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT: a stmt that references memory. Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
	 GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
	to the initial address accessed by the data-ref in STMT.  This is
	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
	in bytes.
   8. IV_STEP (optional, defaults to NULL): the amount that should be added
	to the IV during each iteration of the loop.  NULL says to move
	by one copy of AGGR_TYPE up or down, depending on the step of the
	data reference.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
	 initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

tree
vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
			  tree offset, tree *initial_address,
			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
			  bool only_init, bool *inv_p, tree byte_offset,
			  tree iv_step)
{
  const char *base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = NULL;
  tree aggr_ptr_type;
  tree aggr_ptr;
  tree new_temp;
  gimple_seq new_stmt_list = NULL;
  edge pe = NULL;
  basic_block new_bb;
  tree aggr_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree aptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple *incr;
  tree step;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);

  /* A NULL IV_STEP is only meaningful when the step can be derived from
     AGGR_TYPE's size, i.e. for array or vector aggregates.  */
  gcc_assert (iv_step != NULL_TREE
	      || TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
      containing_loop = (gimple_bb (stmt))->loop_father;
      pe = loop_preheader_edge (loop);
    }
  else
    {
      /* Basic-block vectorization: there is no loop to update the
	 pointer in, so only the initial pointer is created.  */
      gcc_assert (bb_vinfo);
      only_init = true;
      *ptr_incr = NULL;
    }

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  step = vect_dr_behavior (dr)->step;
  if (integer_zerop (step))
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
    {
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
                       "create %s-pointer variable to type: ",
		       get_tree_code_name (TREE_CODE (aggr_type)));
      dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
      else
        dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
      dump_printf (MSG_NOTE, "\n");
    }

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
			      get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
    {
      gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
      do
	{
	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
				      get_alias_set (DR_REF (sdr))))
	    {
	      need_ref_all = true;
	      break;
	    }
	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
	}
      while (orig_stmt);
    }
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
					       need_ref_all);
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);


  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
		...
		vp2 = vp1 + step
		goto LOOP

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

	vp0 = &base_addr;
	LOOP:   vp1 = phi(vp0,vp2)
		...
        inner:     vp3 = phi(vp1,vp4)
	           vp4 = vp3 + inner_step
	           if () goto inner
		...
		vp2 = vp1 + step
		if () goto LOOP   */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
						   offset, byte_offset);
  if (new_stmt_list)
    {
      if (pe)
	{
	  /* Loop case: the address computation belongs in the preheader;
	     inserting it must not split the edge.  */
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
    }

  *initial_address = new_temp;
  aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      if (iv_step == NULL_TREE)
	{
	  /* The step of the aggregate pointer is the type size.  */
	  iv_step = TYPE_SIZE_UNIT (aggr_type);
	  /* One exception to the above is when the scalar step of the load in
	     LOOP is zero. In this case the step here is also zero.  */
	  if (*inv_p)
	    iv_step = size_zero_node;
	  else if (tree_int_cst_sgn (step) == -1)
	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
	}

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
		 fold_convert (aggr_ptr_type, iv_step),
		 aggr_ptr, loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      aptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return aptr;


  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
				      &insert_after);
      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
		 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
5011
5012
5013 /* Function bump_vector_ptr
5014
5015 Increment a pointer (to a vector type) by vector-size. If requested,
5016 i.e. if PTR-INCR is given, then also connect the new increment stmt
5017 to the existing def-use update-chain of the pointer, by modifying
5018 the PTR_INCR as illustrated below:
5019
5020 The pointer def-use update-chain before this function:
5021 DATAREF_PTR = phi (p_0, p_2)
5022 ....
5023 PTR_INCR: p_2 = DATAREF_PTR + step
5024
5025 The pointer def-use update-chain after this function:
5026 DATAREF_PTR = phi (p_0, p_2)
5027 ....
5028 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5029 ....
5030 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5031
5032 Input:
5033 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5034 in the loop.
5035 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5036 the loop. The increment amount across iterations is expected
5037 to be vector_size.
5038 BSI - location where the new update stmt is to be placed.
5039 STMT - the original scalar memory-access stmt that is being vectorized.
5040 BUMP - optional. The offset by which to bump the pointer. If not given,
5041 the offset is assumed to be vector_size.
5042
5043 Output: Return NEW_DATAREF_PTR as illustrated above.
5044
5045 */
5046
5047 tree
bump_vector_ptr(tree dataref_ptr,gimple * ptr_incr,gimple_stmt_iterator * gsi,gimple * stmt,tree bump)5048 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5049 gimple *stmt, tree bump)
5050 {
5051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5052 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5053 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5054 tree update = TYPE_SIZE_UNIT (vectype);
5055 gassign *incr_stmt;
5056 ssa_op_iter iter;
5057 use_operand_p use_p;
5058 tree new_dataref_ptr;
5059
5060 if (bump)
5061 update = bump;
5062
5063 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5064 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5065 else
5066 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5067 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5068 dataref_ptr, update);
5069 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
5070
5071 /* Copy the points-to information if it exists. */
5072 if (DR_PTR_INFO (dr))
5073 {
5074 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5075 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5076 }
5077
5078 if (!ptr_incr)
5079 return new_dataref_ptr;
5080
5081 /* Update the vector-pointer's cross-iteration increment. */
5082 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5083 {
5084 tree use = USE_FROM_PTR (use_p);
5085
5086 if (use == dataref_ptr)
5087 SET_USE (use_p, new_dataref_ptr);
5088 else
5089 gcc_assert (operand_equal_p (use, update, 0));
5090 }
5091
5092 return new_dataref_ptr;
5093 }
5094
5095
5096 /* Copy memory reference info such as base/clique from the SRC reference
5097 to the DEST MEM_REF. */
5098
5099 void
vect_copy_ref_info(tree dest,tree src)5100 vect_copy_ref_info (tree dest, tree src)
5101 {
5102 if (TREE_CODE (dest) != MEM_REF)
5103 return;
5104
5105 tree src_base = src;
5106 while (handled_component_p (src_base))
5107 src_base = TREE_OPERAND (src_base, 0);
5108 if (TREE_CODE (src_base) != MEM_REF
5109 && TREE_CODE (src_base) != TARGET_MEM_REF)
5110 return;
5111
5112 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5113 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5114 }
5115
5116
5117 /* Function vect_create_destination_var.
5118
5119 Create a new temporary of type VECTYPE. */
5120
5121 tree
vect_create_destination_var(tree scalar_dest,tree vectype)5122 vect_create_destination_var (tree scalar_dest, tree vectype)
5123 {
5124 tree vec_dest;
5125 const char *name;
5126 char *new_name;
5127 tree type;
5128 enum vect_var_kind kind;
5129
5130 kind = vectype
5131 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5132 ? vect_mask_var
5133 : vect_simple_var
5134 : vect_scalar_var;
5135 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5136
5137 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5138
5139 name = get_name (scalar_dest);
5140 if (name)
5141 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5142 else
5143 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5144 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5145 free (new_name);
5146
5147 return vec_dest;
5148 }
5149
5150 /* Function vect_grouped_store_supported.
5151
5152 Returns TRUE if interleave high and interleave low permutations
5153 are supported, and FALSE otherwise. */
5154
5155 bool
vect_grouped_store_supported(tree vectype,unsigned HOST_WIDE_INT count)5156 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5157 {
5158 machine_mode mode = TYPE_MODE (vectype);
5159
5160 /* vect_permute_store_chain requires the group size to be equal to 3 or
5161 be a power of two. */
5162 if (count != 3 && exact_log2 (count) == -1)
5163 {
5164 if (dump_enabled_p ())
5165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5166 "the size of the group of accesses"
5167 " is not a power of 2 or not eqaul to 3\n");
5168 return false;
5169 }
5170
5171 /* Check that the permutation is supported. */
5172 if (VECTOR_MODE_P (mode))
5173 {
5174 unsigned int i;
5175 if (count == 3)
5176 {
5177 unsigned int j0 = 0, j1 = 0, j2 = 0;
5178 unsigned int i, j;
5179
5180 unsigned int nelt;
5181 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5182 {
5183 if (dump_enabled_p ())
5184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5185 "cannot handle groups of 3 stores for"
5186 " variable-length vectors\n");
5187 return false;
5188 }
5189
5190 vec_perm_builder sel (nelt, nelt, 1);
5191 sel.quick_grow (nelt);
5192 vec_perm_indices indices;
5193 for (j = 0; j < 3; j++)
5194 {
5195 int nelt0 = ((3 - j) * nelt) % 3;
5196 int nelt1 = ((3 - j) * nelt + 1) % 3;
5197 int nelt2 = ((3 - j) * nelt + 2) % 3;
5198 for (i = 0; i < nelt; i++)
5199 {
5200 if (3 * i + nelt0 < nelt)
5201 sel[3 * i + nelt0] = j0++;
5202 if (3 * i + nelt1 < nelt)
5203 sel[3 * i + nelt1] = nelt + j1++;
5204 if (3 * i + nelt2 < nelt)
5205 sel[3 * i + nelt2] = 0;
5206 }
5207 indices.new_vector (sel, 2, nelt);
5208 if (!can_vec_perm_const_p (mode, indices))
5209 {
5210 if (dump_enabled_p ())
5211 dump_printf (MSG_MISSED_OPTIMIZATION,
5212 "permutation op not supported by target.\n");
5213 return false;
5214 }
5215
5216 for (i = 0; i < nelt; i++)
5217 {
5218 if (3 * i + nelt0 < nelt)
5219 sel[3 * i + nelt0] = 3 * i + nelt0;
5220 if (3 * i + nelt1 < nelt)
5221 sel[3 * i + nelt1] = 3 * i + nelt1;
5222 if (3 * i + nelt2 < nelt)
5223 sel[3 * i + nelt2] = nelt + j2++;
5224 }
5225 indices.new_vector (sel, 2, nelt);
5226 if (!can_vec_perm_const_p (mode, indices))
5227 {
5228 if (dump_enabled_p ())
5229 dump_printf (MSG_MISSED_OPTIMIZATION,
5230 "permutation op not supported by target.\n");
5231 return false;
5232 }
5233 }
5234 return true;
5235 }
5236 else
5237 {
5238 /* If length is not equal to 3 then only power of 2 is supported. */
5239 gcc_assert (pow2p_hwi (count));
5240 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5241
5242 /* The encoding has 2 interleaved stepped patterns. */
5243 vec_perm_builder sel (nelt, 2, 3);
5244 sel.quick_grow (6);
5245 for (i = 0; i < 3; i++)
5246 {
5247 sel[i * 2] = i;
5248 sel[i * 2 + 1] = i + nelt;
5249 }
5250 vec_perm_indices indices (sel, 2, nelt);
5251 if (can_vec_perm_const_p (mode, indices))
5252 {
5253 for (i = 0; i < 6; i++)
5254 sel[i] += exact_div (nelt, 2);
5255 indices.new_vector (sel, 2, nelt);
5256 if (can_vec_perm_const_p (mode, indices))
5257 return true;
5258 }
5259 }
5260 }
5261
5262 if (dump_enabled_p ())
5263 dump_printf (MSG_MISSED_OPTIMIZATION,
5264 "permutaion op not supported by target.\n");
5265 return false;
5266 }
5267
5268
5269 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5270 type VECTYPE. MASKED_P says whether the masked form is needed. */
5271
5272 bool
vect_store_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5273 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5274 bool masked_p)
5275 {
5276 if (masked_p)
5277 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5278 vec_mask_store_lanes_optab,
5279 vectype, count);
5280 else
5281 return vect_lanes_optab_supported_p ("vec_store_lanes",
5282 vec_store_lanes_optab,
5283 vectype, count);
5284 }
5285
5286
/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
   the data correctly for the stores.  Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0  8 16 24  1  9 17 25
   2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
   4th vec:  6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input of
   each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7
   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7


   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
   In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */

void
vect_permute_store_chain (vec<tree> dr_chain,
			  unsigned int length,
			  gimple *stmt,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, j, n, log_length = exact_log2 (length);

  /* Seed RESULT_CHAIN with the inputs; it is rewritten stage by stage.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_store_supported ensures that this is constant.  */
      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (j = 0; j < 3; j++)
	{
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
	vec_perm_indices indices (sel, 2, nelt);
	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	for (i = 0; i < 6; i++)
	  sel[i] += exact_div (nelt, 2);
	indices.new_vector (sel, 2, nelt);
	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	for (i = 0, n = log_length; i < n; i++)
	  {
	    /* Each stage interleaves the first half of DR_CHAIN with the
	       second half, feeding the results back for the next stage.  */
	    for (j = 0; j < length/2; j++)
	      {
		vect1 = dr_chain[j];
		vect2 = dr_chain[j+length/2];

		/* Create interleaving stmt:
		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
							...}>  */
		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_high);
		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
		(*result_chain)[2*j] = high;

		/* Create interleaving stmt:
		   low = VEC_PERM_EXPR <vect1, vect2,
					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
					 ...}>  */
		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_low);
		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
		(*result_chain)[2*j+1] = low;
	      }
	    memcpy (dr_chain.address (), result_chain->address (),
		    length * sizeof (tree));
	  }
    }
}
5484
5485 /* Function vect_setup_realignment
5486
5487 This function is called when vectorizing an unaligned load using
5488 the dr_explicit_realign[_optimized] scheme.
5489 This function generates the following code at the loop prolog:
5490
5491 p = initial_addr;
5492 x msq_init = *(floor(p)); # prolog load
5493 realignment_token = call target_builtin;
5494 loop:
5495 x msq = phi (msq_init, ---)
5496
5497 The stmts marked with x are generated only for the case of
5498 dr_explicit_realign_optimized.
5499
5500 The code above sets up a new (vector) pointer, pointing to the first
5501 location accessed by STMT, and a "floor-aligned" load using that pointer.
5502 It also generates code to compute the "realignment-token" (if the relevant
5503 target hook was defined), and creates a phi-node at the loop-header bb
5504 whose arguments are the result of the prolog-load (created by this
5505 function) and the result of a load that takes place in the loop (to be
5506 created by the caller to this function).
5507
5508 For the case of dr_explicit_realign_optimized:
5509 The caller to this function uses the phi-result (msq) to create the
5510 realignment code inside the loop, and sets up the missing phi argument,
5511 as follows:
5512 loop:
5513 msq = phi (msq_init, lsq)
5514 lsq = *(floor(p')); # load in loop
5515 result = realign_load (msq, lsq, realignment_token);
5516
5517 For the case of dr_explicit_realign:
5518 loop:
5519 msq = *(floor(p)); # load in loop
5520 p' = p + (VS-1);
5521 lsq = *(floor(p')); # load in loop
5522 result = realign_load (msq, lsq, realignment_token);
5523
5524 Input:
5525 STMT - (scalar) load stmt to be vectorized. This load accesses
5526 a memory location that may be unaligned.
5527 BSI - place where new code is to be inserted.
5528 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5529 is used.
5530
5531 Output:
5532 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5533 target hook, if defined.
5534 Return value - the result of the loop-header phi node. */
5535
tree
vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
			tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			struct loop **at_loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool inv_p;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  /* LOOP_VINFO may be null for basic-block vectorization; in that case
     there is no loop to hoist anything out of.  */
  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }


  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      /* A zero outer-loop step means the address is invariant in LOOP, so
	 the initial load can be hoisted all the way to LOOP's preheader.  */
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
				      NULL_TREE, &init_addr, NULL, &inc,
				      true, &inv_p);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      /* Round PTR down to the target alignment boundary:
	 new_temp = ptr & -align.  */
      unsigned int align = DR_TARGET_ALIGNMENT (dr);
      new_stmt = gimple_build_assign
		   (new_temp, BIT_AND_EXPR, ptr,
		    build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      /* Inserting on a preheader edge must not have split the edge.  */
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
	 the value of the parameter and no global variables are touched
	 which makes the builtin a "const" function.  Requiring the
	 builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  /* For dr_explicit_realign everything else happens in the loop body;
     MSQ is still NULL_TREE here and the caller ignores it.  */
  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop.  The latch argument (LSQ)
     is intentionally missing here; the caller adds it once the in-loop load
     exists (see the function comment above).  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
5753
5754
5755 /* Function vect_grouped_load_supported.
5756
5757 COUNT is the size of the load group (the number of statements plus the
5758 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5759 only one statement, with a gap of COUNT - 1.
5760
5761 Returns true if a suitable permute exists. */
5762
5763 bool
vect_grouped_load_supported(tree vectype,bool single_element_p,unsigned HOST_WIDE_INT count)5764 vect_grouped_load_supported (tree vectype, bool single_element_p,
5765 unsigned HOST_WIDE_INT count)
5766 {
5767 machine_mode mode = TYPE_MODE (vectype);
5768
5769 /* If this is single-element interleaving with an element distance
5770 that leaves unused vector loads around punt - we at least create
5771 very sub-optimal code in that case (and blow up memory,
5772 see PR65518). */
5773 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5774 {
5775 if (dump_enabled_p ())
5776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5777 "single-element interleaving not supported "
5778 "for not adjacent vector loads\n");
5779 return false;
5780 }
5781
5782 /* vect_permute_load_chain requires the group size to be equal to 3 or
5783 be a power of two. */
5784 if (count != 3 && exact_log2 (count) == -1)
5785 {
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 "the size of the group of accesses"
5789 " is not a power of 2 or not equal to 3\n");
5790 return false;
5791 }
5792
5793 /* Check that the permutation is supported. */
5794 if (VECTOR_MODE_P (mode))
5795 {
5796 unsigned int i, j;
5797 if (count == 3)
5798 {
5799 unsigned int nelt;
5800 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5801 {
5802 if (dump_enabled_p ())
5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804 "cannot handle groups of 3 loads for"
5805 " variable-length vectors\n");
5806 return false;
5807 }
5808
5809 vec_perm_builder sel (nelt, nelt, 1);
5810 sel.quick_grow (nelt);
5811 vec_perm_indices indices;
5812 unsigned int k;
5813 for (k = 0; k < 3; k++)
5814 {
5815 for (i = 0; i < nelt; i++)
5816 if (3 * i + k < 2 * nelt)
5817 sel[i] = 3 * i + k;
5818 else
5819 sel[i] = 0;
5820 indices.new_vector (sel, 2, nelt);
5821 if (!can_vec_perm_const_p (mode, indices))
5822 {
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5825 "shuffle of 3 loads is not supported by"
5826 " target\n");
5827 return false;
5828 }
5829 for (i = 0, j = 0; i < nelt; i++)
5830 if (3 * i + k < 2 * nelt)
5831 sel[i] = i;
5832 else
5833 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5834 indices.new_vector (sel, 2, nelt);
5835 if (!can_vec_perm_const_p (mode, indices))
5836 {
5837 if (dump_enabled_p ())
5838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5839 "shuffle of 3 loads is not supported by"
5840 " target\n");
5841 return false;
5842 }
5843 }
5844 return true;
5845 }
5846 else
5847 {
5848 /* If length is not equal to 3 then only power of 2 is supported. */
5849 gcc_assert (pow2p_hwi (count));
5850 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5851
5852 /* The encoding has a single stepped pattern. */
5853 vec_perm_builder sel (nelt, 1, 3);
5854 sel.quick_grow (3);
5855 for (i = 0; i < 3; i++)
5856 sel[i] = i * 2;
5857 vec_perm_indices indices (sel, 2, nelt);
5858 if (can_vec_perm_const_p (mode, indices))
5859 {
5860 for (i = 0; i < 3; i++)
5861 sel[i] = i * 2 + 1;
5862 indices.new_vector (sel, 2, nelt);
5863 if (can_vec_perm_const_p (mode, indices))
5864 return true;
5865 }
5866 }
5867 }
5868
5869 if (dump_enabled_p ())
5870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5871 "extract even/odd not supported by target\n");
5872 return false;
5873 }
5874
5875 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5876 type VECTYPE. MASKED_P says whether the masked form is needed. */
5877
5878 bool
vect_load_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5879 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5880 bool masked_p)
5881 {
5882 if (masked_p)
5883 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5884 vec_mask_load_lanes_optab,
5885 vectype, count);
5886 else
5887 return vect_lanes_optab_supported_p ("vec_load_lanes",
5888 vec_load_lanes_optab,
5889 vectype, count);
5890 }
5891
5892 /* Function vect_permute_load_chain.
5893
5894 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5895 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5896 the input data correctly. Return the final references for loads in
5897 RESULT_CHAIN.
5898
5899 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5900 The input is 4 vectors each containing 8 elements. We assign a number to each
5901 element, the input sequence is:
5902
5903 1st vec: 0 1 2 3 4 5 6 7
5904 2nd vec: 8 9 10 11 12 13 14 15
5905 3rd vec: 16 17 18 19 20 21 22 23
5906 4th vec: 24 25 26 27 28 29 30 31
5907
5908 The output sequence should be:
5909
5910 1st vec: 0 4 8 12 16 20 24 28
5911 2nd vec: 1 5 9 13 17 21 25 29
5912 3rd vec: 2 6 10 14 18 22 26 30
5913 4th vec: 3 7 11 15 19 23 27 31
5914
5915 i.e., the first output vector should contain the first elements of each
5916 interleaving group, etc.
5917
5918 We use extract_even/odd instructions to create such output. The input of
5919 each extract_even/odd operation is two vectors
5920 1st vec 2nd vec
5921 0 1 2 3 4 5 6 7
5922
5923 and the output is the vector of extracted even/odd elements. The output of
5924 extract_even will be: 0 2 4 6
5925 and of extract_odd: 1 3 5 7
5926
5927
5928 The permutation is done in log LENGTH stages. In each stage extract_even
5929 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5930 their order. In our example,
5931
5932 E1: extract_even (1st vec, 2nd vec)
5933 E2: extract_odd (1st vec, 2nd vec)
5934 E3: extract_even (3rd vec, 4th vec)
5935 E4: extract_odd (3rd vec, 4th vec)
5936
5937 The output for the first stage will be:
5938
5939 E1: 0 2 4 6 8 10 12 14
5940 E2: 1 3 5 7 9 11 13 15
5941 E3: 16 18 20 22 24 26 28 30
5942 E4: 17 19 21 23 25 27 29 31
5943
5944 In order to proceed and create the correct sequence for the next stage (or
5945 for the correct output, if the second stage is the last one, as in our
5946 example), we first put the output of extract_even operation and then the
5947 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5948 The input for the second stage is:
5949
5950 1st vec (E1): 0 2 4 6 8 10 12 14
5951 2nd vec (E3): 16 18 20 22 24 26 28 30
5952 3rd vec (E2): 1 3 5 7 9 11 13 15
5953 4th vec (E4): 17 19 21 23 25 27 29 31
5954
5955 The output of the second stage:
5956
5957 E1: 0 4 8 12 16 20 24 28
5958 E2: 2 6 10 14 18 22 26 30
5959 E3: 1 5 9 13 17 21 25 29
5960 E4: 3 7 11 15 19 23 27 31
5961
5962 And RESULT_CHAIN after reordering:
5963
5964 1st vec (E1): 0 4 8 12 16 20 24 28
5965 2nd vec (E3): 1 5 9 13 17 21 25 29
5966 3rd vec (E2): 2 6 10 14 18 22 26 30
5967 4th vec (E4): 3 7 11 15 19 23 27 31. */
5968
static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple *stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);

  /* Start with RESULT_CHAIN as a copy of DR_CHAIN; it is overwritten
     stage by stage below.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      /* One pair of shuffles per output vector K; the masks here mirror
	 the ones checked in vect_grouped_load_supported.  */
      for (k = 0; k < 3; k++)
	{
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							     ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  /* Create interleaving stmt (high part): combine the low result
	     with the third input vector using PERM3_MASK_HIGH.  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      /* log2(LENGTH) stages of extract-even/extract-odd, as described in
	 the function comment; RESULT_CHAIN is copied back into DR_CHAIN
	 after each stage to feed the next one.  */
      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
6084
6085 /* Function vect_shift_permute_load_chain.
6086
6087 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6088 sequence of stmts to reorder the input data accordingly.
6089 Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.
6091
6092 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6093 The input is 3 vectors each containing 8 elements. We assign a
6094 number to each element, the input sequence is:
6095
6096 1st vec: 0 1 2 3 4 5 6 7
6097 2nd vec: 8 9 10 11 12 13 14 15
6098 3rd vec: 16 17 18 19 20 21 22 23
6099
6100 The output sequence should be:
6101
6102 1st vec: 0 3 6 9 12 15 18 21
6103 2nd vec: 1 4 7 10 13 16 19 22
6104 3rd vec: 2 5 8 11 14 17 20 23
6105
6106 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6107
6108 First we shuffle all 3 vectors to get correct elements order:
6109
6110 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6111 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6112 3rd vec: (16 19 22) (17 20 23) (18 21)
6113
6114 Next we unite and shift vector 3 times:
6115
6116 1st step:
6117 shift right by 6 the concatenation of:
6118 "1st vec" and "2nd vec"
6119 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6120 "2nd vec" and "3rd vec"
6121 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6122 "3rd vec" and "1st vec"
6123 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6124 | New vectors |
6125
6126 So that now new vectors are:
6127
6128 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6129 2nd vec: (10 13) (16 19 22) (17 20 23)
6130 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6131
6132 2nd step:
6133 shift right by 5 the concatenation of:
6134 "1st vec" and "3rd vec"
6135 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6136 "2nd vec" and "1st vec"
6137 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6138 "3rd vec" and "2nd vec"
6139 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6140 | New vectors |
6141
6142 So that now new vectors are:
6143
6144 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6145 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6146 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6147
6148 3rd step:
6149 shift right by 5 the concatenation of:
6150 "1st vec" and "1st vec"
6151 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6152 shift right by 3 the concatenation of:
6153 "2nd vec" and "2nd vec"
6154 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6155 | New vectors |
6156
6157 So that now all vectors are READY:
6158 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6159 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6160 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6161
6162 This algorithm is faster than one in vect_permute_load_chain if:
     1. "shift of a concatenation" is faster than general permutation.
6164 This is usually so.
6165 2. The TARGET machine can't execute vector instructions in parallel.
6166 This is because each step of the algorithm depends on previous.
6167 The algorithm in vect_permute_load_chain is much more parallel.
6168
6169 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6170 */
6171
static bool
vect_shift_permute_load_chain (vec<tree> dr_chain,
			       unsigned int length,
			       gimple *stmt,
			       gimple_stmt_iterator *gsi,
			       vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.  */
    return false;

  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  /* Start with RESULT_CHAIN as a copy of DR_CHAIN.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  /* Power-of-two group size.  Every permutation mask is validated with
     can_vec_perm_const_p before any statement is generated, so a FALSE
     return leaves no code behind.  */
  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
	sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
	sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "select is not supported by target\n");
	  return false;
	}
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* log2(LENGTH) stages; after each stage RESULT_CHAIN is copied back
	 into DR_CHAIN to feed the next stage.  */
      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j + 1];

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, first_vect,
					       perm2_mask1);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[0] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       second_vect, second_vect,
					       perm2_mask2);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[1] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], shift1_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2 + length/2] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], select_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
      return true;
    }
  /* Group size 3: the shuffle/shift sequence described in the function
     comment above.  */
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * k + (l % 3) >= nelt)
	    {
	      k = 0;
	      l += (3 - (nelt % 3));
	    }
	  sel[i] = 3 * k + (l % 3);
	  k++;
	}
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Step 1 of the function comment: shuffle each input vector.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   dr_chain[k], dr_chain[k],
					   perm3_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      /* Step 2: shift each concatenation of neighbouring vectors.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect[k % 3], vect[(k + 1) % 3],
					   shift1_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect_shift[k] = data_ref;
	}

      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect_shift[(4 - k) % 3],
					   vect_shift[(3 - k) % 3],
					   shift2_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      /* Step 3: final self-shifts putting the remaining two vectors into
	 their RESULT_CHAIN slots.  */
      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
				       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
				       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
6434
6435 /* Function vect_transform_grouped_load.
6436
6437 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6438 to perform their permutation and ascribe the result vectorized statements to
6439 the scalar statements.
6440 */
6441
6442 void
vect_transform_grouped_load(gimple * stmt,vec<tree> dr_chain,int size,gimple_stmt_iterator * gsi)6443 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6444 gimple_stmt_iterator *gsi)
6445 {
6446 machine_mode mode;
6447 vec<tree> result_chain = vNULL;
6448
6449 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6450 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6451 vectors, that are ready for vector computation. */
6452 result_chain.create (size);
6453
6454 /* If reassociation width for vector type is 2 or greater target machine can
6455 execute 2 or more vector instructions in parallel. Otherwise try to
6456 get chain for loads group using vect_shift_permute_load_chain. */
6457 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6458 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6459 || pow2p_hwi (size)
6460 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6461 gsi, &result_chain))
6462 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6463 vect_record_grouped_load_vectors (stmt, result_chain);
6464 result_chain.release ();
6465 }
6466
6467 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6468 generated as part of the vectorization of STMT. Assign the statement
6469 for each vector to the associated scalar statement. */
6470
void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  /* First scalar statement of the interleaving group STMT belongs to.  */
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      /* More permuted vectors than remaining scalar group members: the
	 rest feed no scalar statement and are left for DCE.  */
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT.  */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  /* Walk to the end of the RELATED_STMT chain of the
		     existing vector statement and append the new copy.  */
		  gimple *prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple *rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}
6540
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */
6545
6546 bool
vect_can_force_dr_alignment_p(const_tree decl,unsigned int alignment)6547 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6548 {
6549 if (!VAR_P (decl))
6550 return false;
6551
6552 if (decl_in_symtab_p (decl)
6553 && !symtab_node::get (decl)->can_increase_alignment_p ())
6554 return false;
6555
6556 if (TREE_STATIC (decl))
6557 return (alignment <= MAX_OFILE_ALIGNMENT);
6558 else
6559 return (alignment <= MAX_STACK_ALIGNMENT);
6560 }
6561
6562
6563 /* Return whether the data reference DR is supported with respect to its
6564 alignment.
6565 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6566 it is aligned, i.e., check if it is possible to vectorize it with different
6567 alignment. */
6568
enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  /* An access known to be aligned needs no further support checks unless
     the caller explicitly asked us to classify aligned accesses too.  */
  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      /* Prefer the explicit realignment scheme when the target implements
	 the vec_realign_load optab and, if it defines a mask-building
	 builtin, that builtin is actually available.  */
      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  /* Shadows the outer VECTYPE; both are STMT_VINFO_VECTYPE of the
	     same stmt, so the value is identical.  */
	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.
	     When VF * group-size is not a multiple of the vector element
	     count, neither realignment variant applies; fall through to
	     the misalignment-support query below.  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			      * GROUP_SIZE (vinfo_for_stmt
					    (GROUP_FIRST_ELEMENT (stmt_info))),
			      TYPE_VECTOR_SUBPARTS (vectype)))
	    ;
	  /* The optimized variant requires a loop and, for nested
	     accesses, an inner-loop step equal to the vector size (see
	     the big comment above); otherwise use the unoptimized one.  */
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      /* Ask the target whether it can handle this misaligned load.  */
      if (targetm.vectorize.support_vector_misalignment
	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
  else
    {
      /* Stores: no realignment scheme exists; only the target's direct
	 misalignment support can help.  */
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

     if (targetm.vectorize.support_vector_misalignment
	   (mode, type, DR_MISALIGNMENT (dr), is_packed))
       return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
6713