138fd1498Szrj /* Data References Analysis and Manipulation Utilities for Vectorization.
238fd1498Szrj Copyright (C) 2003-2018 Free Software Foundation, Inc.
338fd1498Szrj Contributed by Dorit Naishlos <dorit@il.ibm.com>
438fd1498Szrj and Ira Rosen <irar@il.ibm.com>
538fd1498Szrj
638fd1498Szrj This file is part of GCC.
738fd1498Szrj
838fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
938fd1498Szrj the terms of the GNU General Public License as published by the Free
1038fd1498Szrj Software Foundation; either version 3, or (at your option) any later
1138fd1498Szrj version.
1238fd1498Szrj
1338fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
1438fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
1538fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1638fd1498Szrj for more details.
1738fd1498Szrj
1838fd1498Szrj You should have received a copy of the GNU General Public License
1938fd1498Szrj along with GCC; see the file COPYING3. If not see
2038fd1498Szrj <http://www.gnu.org/licenses/>. */
2138fd1498Szrj
2238fd1498Szrj #include "config.h"
2338fd1498Szrj #include "system.h"
2438fd1498Szrj #include "coretypes.h"
2538fd1498Szrj #include "backend.h"
2638fd1498Szrj #include "target.h"
2738fd1498Szrj #include "rtl.h"
2838fd1498Szrj #include "tree.h"
2938fd1498Szrj #include "gimple.h"
3038fd1498Szrj #include "predict.h"
3138fd1498Szrj #include "memmodel.h"
3238fd1498Szrj #include "tm_p.h"
3338fd1498Szrj #include "ssa.h"
3438fd1498Szrj #include "optabs-tree.h"
3538fd1498Szrj #include "cgraph.h"
3638fd1498Szrj #include "dumpfile.h"
3738fd1498Szrj #include "alias.h"
3838fd1498Szrj #include "fold-const.h"
3938fd1498Szrj #include "stor-layout.h"
4038fd1498Szrj #include "tree-eh.h"
4138fd1498Szrj #include "gimplify.h"
4238fd1498Szrj #include "gimple-iterator.h"
4338fd1498Szrj #include "gimplify-me.h"
4438fd1498Szrj #include "tree-ssa-loop-ivopts.h"
4538fd1498Szrj #include "tree-ssa-loop-manip.h"
4638fd1498Szrj #include "tree-ssa-loop.h"
4738fd1498Szrj #include "cfgloop.h"
4838fd1498Szrj #include "tree-scalar-evolution.h"
4938fd1498Szrj #include "tree-vectorizer.h"
5038fd1498Szrj #include "expr.h"
5138fd1498Szrj #include "builtins.h"
5238fd1498Szrj #include "params.h"
5338fd1498Szrj #include "tree-cfg.h"
5438fd1498Szrj #include "tree-hash-traits.h"
5538fd1498Szrj #include "vec-perm-indices.h"
5638fd1498Szrj #include "internal-fn.h"
5738fd1498Szrj
5838fd1498Szrj /* Return true if load- or store-lanes optab OPTAB is implemented for
5938fd1498Szrj COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
6038fd1498Szrj
6138fd1498Szrj static bool
vect_lanes_optab_supported_p(const char * name,convert_optab optab,tree vectype,unsigned HOST_WIDE_INT count)6238fd1498Szrj vect_lanes_optab_supported_p (const char *name, convert_optab optab,
6338fd1498Szrj tree vectype, unsigned HOST_WIDE_INT count)
6438fd1498Szrj {
6538fd1498Szrj machine_mode mode, array_mode;
6638fd1498Szrj bool limit_p;
6738fd1498Szrj
6838fd1498Szrj mode = TYPE_MODE (vectype);
6938fd1498Szrj if (!targetm.array_mode (mode, count).exists (&array_mode))
7038fd1498Szrj {
7138fd1498Szrj poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
7238fd1498Szrj limit_p = !targetm.array_mode_supported_p (mode, count);
7338fd1498Szrj if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
7438fd1498Szrj {
7538fd1498Szrj if (dump_enabled_p ())
7638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7738fd1498Szrj "no array mode for %s["
7838fd1498Szrj HOST_WIDE_INT_PRINT_DEC "]\n",
7938fd1498Szrj GET_MODE_NAME (mode), count);
8038fd1498Szrj return false;
8138fd1498Szrj }
8238fd1498Szrj }
8338fd1498Szrj
8438fd1498Szrj if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
8538fd1498Szrj {
8638fd1498Szrj if (dump_enabled_p ())
8738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8838fd1498Szrj "cannot use %s<%s><%s>\n", name,
8938fd1498Szrj GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
9038fd1498Szrj return false;
9138fd1498Szrj }
9238fd1498Szrj
9338fd1498Szrj if (dump_enabled_p ())
9438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
9538fd1498Szrj "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
9638fd1498Szrj GET_MODE_NAME (mode));
9738fd1498Szrj
9838fd1498Szrj return true;
9938fd1498Szrj }
10038fd1498Szrj
10138fd1498Szrj
10238fd1498Szrj /* Return the smallest scalar part of STMT.
10338fd1498Szrj This is used to determine the vectype of the stmt. We generally set the
10438fd1498Szrj vectype according to the type of the result (lhs). For stmts whose
10538fd1498Szrj result-type is different than the type of the arguments (e.g., demotion,
10638fd1498Szrj promotion), vectype will be reset appropriately (later). Note that we have
10738fd1498Szrj to visit the smallest datatype in this function, because that determines the
10838fd1498Szrj VF. If the smallest datatype in the loop is present only as the rhs of a
10938fd1498Szrj promotion operation - we'd miss it.
11038fd1498Szrj Such a case, where a variable of this datatype does not appear in the lhs
11138fd1498Szrj anywhere in the loop, can only occur if it's an invariant: e.g.:
11238fd1498Szrj 'int_x = (int) short_inv', which we'd expect to have been optimized away by
11338fd1498Szrj invariant motion. However, we cannot rely on invariant motion to always
11438fd1498Szrj take invariants out of the loop, and so in the case of promotion we also
11538fd1498Szrj have to check the rhs.
11638fd1498Szrj LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
11738fd1498Szrj types. */
11838fd1498Szrj
11938fd1498Szrj tree
vect_get_smallest_scalar_type(gimple * stmt,HOST_WIDE_INT * lhs_size_unit,HOST_WIDE_INT * rhs_size_unit)12038fd1498Szrj vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
12138fd1498Szrj HOST_WIDE_INT *rhs_size_unit)
12238fd1498Szrj {
12338fd1498Szrj tree scalar_type = gimple_expr_type (stmt);
12438fd1498Szrj HOST_WIDE_INT lhs, rhs;
12538fd1498Szrj
12638fd1498Szrj /* During the analysis phase, this function is called on arbitrary
12738fd1498Szrj statements that might not have scalar results. */
12838fd1498Szrj if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
12938fd1498Szrj return scalar_type;
13038fd1498Szrj
13138fd1498Szrj lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
13238fd1498Szrj
13338fd1498Szrj if (is_gimple_assign (stmt)
13438fd1498Szrj && (gimple_assign_cast_p (stmt)
13538fd1498Szrj || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
13638fd1498Szrj || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
13738fd1498Szrj || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
13838fd1498Szrj || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
13938fd1498Szrj || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
14038fd1498Szrj {
14138fd1498Szrj tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
14238fd1498Szrj
14338fd1498Szrj rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
14438fd1498Szrj if (rhs < lhs)
14538fd1498Szrj scalar_type = rhs_type;
14638fd1498Szrj }
14738fd1498Szrj
14838fd1498Szrj *lhs_size_unit = lhs;
14938fd1498Szrj *rhs_size_unit = rhs;
15038fd1498Szrj return scalar_type;
15138fd1498Szrj }
15238fd1498Szrj
15338fd1498Szrj
15438fd1498Szrj /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
15538fd1498Szrj tested at run-time. Return TRUE if DDR was successfully inserted.
15638fd1498Szrj Return false if versioning is not supported. */
15738fd1498Szrj
15838fd1498Szrj static bool
vect_mark_for_runtime_alias_test(ddr_p ddr,loop_vec_info loop_vinfo)15938fd1498Szrj vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
16038fd1498Szrj {
16138fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16238fd1498Szrj
16338fd1498Szrj if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
16438fd1498Szrj return false;
16538fd1498Szrj
16638fd1498Szrj if (!runtime_alias_check_p (ddr, loop,
16738fd1498Szrj optimize_loop_nest_for_speed_p (loop)))
16838fd1498Szrj return false;
16938fd1498Szrj
17038fd1498Szrj LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
17138fd1498Szrj return true;
17238fd1498Szrj }
17338fd1498Szrj
17438fd1498Szrj /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
17538fd1498Szrj
17638fd1498Szrj static void
vect_check_nonzero_value(loop_vec_info loop_vinfo,tree value)17738fd1498Szrj vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
17838fd1498Szrj {
17938fd1498Szrj vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
18038fd1498Szrj for (unsigned int i = 0; i < checks.length(); ++i)
18138fd1498Szrj if (checks[i] == value)
18238fd1498Szrj return;
18338fd1498Szrj
18438fd1498Szrj if (dump_enabled_p ())
18538fd1498Szrj {
18638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
18738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
18838fd1498Szrj dump_printf (MSG_NOTE, " is nonzero\n");
18938fd1498Szrj }
19038fd1498Szrj LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
19138fd1498Szrj }
19238fd1498Szrj
/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     SLP group are emitted at the position of the last scalar load and
     all loads in an interleaving group are emitted at the position
     of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.
     We have not yet decided between SLP and interleaving so we have
     to conservatively assume both.  */
  /* LAST_A: the emission point for SLP loads and for stores (the latest
     group member in statement order).  IL_A: the emission point for
     interleaved loads (the earliest group member).  */
  gimple *il_a;
  gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (last_a)
    {
      /* Walk the group to find the statement that comes last.  */
      for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
	   s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
	last_a = get_later_stmt (last_a, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
	{
	  /* For a load group also find the earliest statement: S becomes
	     the new IL_A whenever IL_A is the later of the two.  */
	  for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s;
	       s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
	    if (get_later_stmt (il_a, s) == il_a)
	      il_a = s;
	}
      else
	/* Stores are always emitted at the last scalar store.  */
	il_a = last_a;
    }
  else
    /* Not a grouped access: the statement stays where it is.  */
    last_a = il_a = stmt_a;
  /* Same computation for STMT_B.  */
  gimple *il_b;
  gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (last_b)
    {
      for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
	   s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
	last_b = get_later_stmt (last_b, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
	{
	  for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s;
	       s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
	    if (get_later_stmt (il_b, s) == il_b)
	      il_b = s;
	}
      else
	il_b = last_b;
    }
  else
    last_b = il_b = stmt_b;
  /* The order is preserved only if every possible pairing of emission
     points (SLP vs. interleaving on either side) agrees with the
     current scalar order of STMT_A and STMT_B.  */
  bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a);
  return (/* SLP */
	  (get_later_stmt (last_a, last_b) == last_a) == a_after_b
	  /* Interleaving */
	  && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
	  /* Mixed */
	  && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
	  && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
}
26438fd1498Szrj
26538fd1498Szrj /* A subroutine of vect_analyze_data_ref_dependence. Handle
26638fd1498Szrj DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
26738fd1498Szrj distances. These distances are conservatively correct but they don't
26838fd1498Szrj reflect a guaranteed dependence.
26938fd1498Szrj
27038fd1498Szrj Return true if this function does all the work necessary to avoid
27138fd1498Szrj an alias or false if the caller should use the dependence distances
27238fd1498Szrj to limit the vectorization factor in the usual way. LOOP_DEPTH is
27338fd1498Szrj the depth of the loop described by LOOP_VINFO and the other arguments
27438fd1498Szrj are as for vect_analyze_data_ref_dependence. */
27538fd1498Szrj
27638fd1498Szrj static bool
vect_analyze_possibly_independent_ddr(data_dependence_relation * ddr,loop_vec_info loop_vinfo,int loop_depth,unsigned int * max_vf)27738fd1498Szrj vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
27838fd1498Szrj loop_vec_info loop_vinfo,
27938fd1498Szrj int loop_depth, unsigned int *max_vf)
28038fd1498Szrj {
28138fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
28238fd1498Szrj lambda_vector dist_v;
28338fd1498Szrj unsigned int i;
28438fd1498Szrj FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
28538fd1498Szrj {
28638fd1498Szrj int dist = dist_v[loop_depth];
28738fd1498Szrj if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
28838fd1498Szrj {
28938fd1498Szrj /* If the user asserted safelen >= DIST consecutive iterations
29038fd1498Szrj can be executed concurrently, assume independence.
29138fd1498Szrj
29238fd1498Szrj ??? An alternative would be to add the alias check even
29338fd1498Szrj in this case, and vectorize the fallback loop with the
29438fd1498Szrj maximum VF set to safelen. However, if the user has
29538fd1498Szrj explicitly given a length, it's less likely that that
29638fd1498Szrj would be a win. */
29738fd1498Szrj if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
29838fd1498Szrj {
29938fd1498Szrj if ((unsigned int) loop->safelen < *max_vf)
30038fd1498Szrj *max_vf = loop->safelen;
30138fd1498Szrj LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
30238fd1498Szrj continue;
30338fd1498Szrj }
30438fd1498Szrj
30538fd1498Szrj /* For dependence distances of 2 or more, we have the option
30638fd1498Szrj of limiting VF or checking for an alias at runtime.
30738fd1498Szrj Prefer to check at runtime if we can, to avoid limiting
30838fd1498Szrj the VF unnecessarily when the bases are in fact independent.
30938fd1498Szrj
31038fd1498Szrj Note that the alias checks will be removed if the VF ends up
31138fd1498Szrj being small enough. */
31238fd1498Szrj return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
31338fd1498Szrj }
31438fd1498Szrj }
31538fd1498Szrj return true;
31638fd1498Szrj }
31738fd1498Szrj
31838fd1498Szrj
31938fd1498Szrj /* Function vect_analyze_data_ref_dependence.
32038fd1498Szrj
32138fd1498Szrj Return TRUE if there (might) exist a dependence between a memory-reference
32238fd1498Szrj DRA and a memory-reference DRB. When versioning for alias may check a
32338fd1498Szrj dependence at run-time, return FALSE. Adjust *MAX_VF according to
32438fd1498Szrj the data dependence. */
32538fd1498Szrj
32638fd1498Szrj static bool
vect_analyze_data_ref_dependence(struct data_dependence_relation * ddr,loop_vec_info loop_vinfo,unsigned int * max_vf)32738fd1498Szrj vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
32838fd1498Szrj loop_vec_info loop_vinfo,
32938fd1498Szrj unsigned int *max_vf)
33038fd1498Szrj {
33138fd1498Szrj unsigned int i;
33238fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
33338fd1498Szrj struct data_reference *dra = DDR_A (ddr);
33438fd1498Szrj struct data_reference *drb = DDR_B (ddr);
33538fd1498Szrj stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
33638fd1498Szrj stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
33738fd1498Szrj lambda_vector dist_v;
33838fd1498Szrj unsigned int loop_depth;
33938fd1498Szrj
34038fd1498Szrj /* In loop analysis all data references should be vectorizable. */
34138fd1498Szrj if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
34238fd1498Szrj || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
34338fd1498Szrj gcc_unreachable ();
34438fd1498Szrj
34538fd1498Szrj /* Independent data accesses. */
34638fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
34738fd1498Szrj return false;
34838fd1498Szrj
34938fd1498Szrj if (dra == drb
35038fd1498Szrj || (DR_IS_READ (dra) && DR_IS_READ (drb)))
35138fd1498Szrj return false;
35238fd1498Szrj
35338fd1498Szrj /* We do not have to consider dependences between accesses that belong
35438fd1498Szrj to the same group, unless the stride could be smaller than the
35538fd1498Szrj group size. */
35638fd1498Szrj if (GROUP_FIRST_ELEMENT (stmtinfo_a)
35738fd1498Szrj && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
35838fd1498Szrj && !STMT_VINFO_STRIDED_P (stmtinfo_a))
35938fd1498Szrj return false;
36038fd1498Szrj
36138fd1498Szrj /* Even if we have an anti-dependence then, as the vectorized loop covers at
36238fd1498Szrj least two scalar iterations, there is always also a true dependence.
36338fd1498Szrj As the vectorizer does not re-order loads and stores we can ignore
36438fd1498Szrj the anti-dependence if TBAA can disambiguate both DRs similar to the
36538fd1498Szrj case with known negative distance anti-dependences (positive
36638fd1498Szrj distance anti-dependences would violate TBAA constraints). */
36738fd1498Szrj if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
36838fd1498Szrj || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
36938fd1498Szrj && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
37038fd1498Szrj get_alias_set (DR_REF (drb))))
37138fd1498Szrj return false;
37238fd1498Szrj
37338fd1498Szrj /* Unknown data dependence. */
37438fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
37538fd1498Szrj {
37638fd1498Szrj /* If user asserted safelen consecutive iterations can be
37738fd1498Szrj executed concurrently, assume independence. */
37838fd1498Szrj if (loop->safelen >= 2)
37938fd1498Szrj {
38038fd1498Szrj if ((unsigned int) loop->safelen < *max_vf)
38138fd1498Szrj *max_vf = loop->safelen;
38238fd1498Szrj LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
38338fd1498Szrj return false;
38438fd1498Szrj }
38538fd1498Szrj
38638fd1498Szrj if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
38738fd1498Szrj || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
38838fd1498Szrj {
38938fd1498Szrj if (dump_enabled_p ())
39038fd1498Szrj {
39138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
39238fd1498Szrj "versioning for alias not supported for: "
39338fd1498Szrj "can't determine dependence between ");
39438fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
39538fd1498Szrj DR_REF (dra));
39638fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
39738fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
39838fd1498Szrj DR_REF (drb));
39938fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
40038fd1498Szrj }
40138fd1498Szrj return true;
40238fd1498Szrj }
40338fd1498Szrj
40438fd1498Szrj if (dump_enabled_p ())
40538fd1498Szrj {
40638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
40738fd1498Szrj "versioning for alias required: "
40838fd1498Szrj "can't determine dependence between ");
40938fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
41038fd1498Szrj DR_REF (dra));
41138fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
41238fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
41338fd1498Szrj DR_REF (drb));
41438fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
41538fd1498Szrj }
41638fd1498Szrj
41738fd1498Szrj /* Add to list of ddrs that need to be tested at run-time. */
41838fd1498Szrj return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
41938fd1498Szrj }
42038fd1498Szrj
42138fd1498Szrj /* Known data dependence. */
42238fd1498Szrj if (DDR_NUM_DIST_VECTS (ddr) == 0)
42338fd1498Szrj {
42438fd1498Szrj /* If user asserted safelen consecutive iterations can be
42538fd1498Szrj executed concurrently, assume independence. */
42638fd1498Szrj if (loop->safelen >= 2)
42738fd1498Szrj {
42838fd1498Szrj if ((unsigned int) loop->safelen < *max_vf)
42938fd1498Szrj *max_vf = loop->safelen;
43038fd1498Szrj LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
43138fd1498Szrj return false;
43238fd1498Szrj }
43338fd1498Szrj
43438fd1498Szrj if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
43538fd1498Szrj || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
43638fd1498Szrj {
43738fd1498Szrj if (dump_enabled_p ())
43838fd1498Szrj {
43938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
44038fd1498Szrj "versioning for alias not supported for: "
44138fd1498Szrj "bad dist vector for ");
44238fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
44338fd1498Szrj DR_REF (dra));
44438fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
44538fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
44638fd1498Szrj DR_REF (drb));
44738fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
44838fd1498Szrj }
44938fd1498Szrj return true;
45038fd1498Szrj }
45138fd1498Szrj
45238fd1498Szrj if (dump_enabled_p ())
45338fd1498Szrj {
45438fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
45538fd1498Szrj "versioning for alias required: "
45638fd1498Szrj "bad dist vector for ");
45738fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
45838fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
45938fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
46038fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
46138fd1498Szrj }
46238fd1498Szrj /* Add to list of ddrs that need to be tested at run-time. */
46338fd1498Szrj return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
46438fd1498Szrj }
46538fd1498Szrj
46638fd1498Szrj loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
46738fd1498Szrj
46838fd1498Szrj if (DDR_COULD_BE_INDEPENDENT_P (ddr)
46938fd1498Szrj && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
47038fd1498Szrj loop_depth, max_vf))
47138fd1498Szrj return false;
47238fd1498Szrj
47338fd1498Szrj FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
47438fd1498Szrj {
47538fd1498Szrj int dist = dist_v[loop_depth];
47638fd1498Szrj
47738fd1498Szrj if (dump_enabled_p ())
47838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
47938fd1498Szrj "dependence distance = %d.\n", dist);
48038fd1498Szrj
48138fd1498Szrj if (dist == 0)
48238fd1498Szrj {
48338fd1498Szrj if (dump_enabled_p ())
48438fd1498Szrj {
48538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
48638fd1498Szrj "dependence distance == 0 between ");
48738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
48838fd1498Szrj dump_printf (MSG_NOTE, " and ");
48938fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
49038fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
49138fd1498Szrj }
49238fd1498Szrj
49338fd1498Szrj /* When we perform grouped accesses and perform implicit CSE
49438fd1498Szrj by detecting equal accesses and doing disambiguation with
49538fd1498Szrj runtime alias tests like for
49638fd1498Szrj .. = a[i];
49738fd1498Szrj .. = a[i+1];
49838fd1498Szrj a[i] = ..;
49938fd1498Szrj a[i+1] = ..;
50038fd1498Szrj *p = ..;
50138fd1498Szrj .. = a[i];
50238fd1498Szrj .. = a[i+1];
50338fd1498Szrj where we will end up loading { a[i], a[i+1] } once, make
50438fd1498Szrj sure that inserting group loads before the first load and
50538fd1498Szrj stores after the last store will do the right thing.
50638fd1498Szrj Similar for groups like
50738fd1498Szrj a[i] = ...;
50838fd1498Szrj ... = a[i];
50938fd1498Szrj a[i+1] = ...;
51038fd1498Szrj where loads from the group interleave with the store. */
51138fd1498Szrj if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
51238fd1498Szrj {
51338fd1498Szrj if (dump_enabled_p ())
51438fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
51538fd1498Szrj "READ_WRITE dependence in interleaving.\n");
51638fd1498Szrj return true;
51738fd1498Szrj }
51838fd1498Szrj
51938fd1498Szrj if (loop->safelen < 2)
52038fd1498Szrj {
52138fd1498Szrj tree indicator = dr_zero_step_indicator (dra);
52238fd1498Szrj if (TREE_CODE (indicator) != INTEGER_CST)
52338fd1498Szrj vect_check_nonzero_value (loop_vinfo, indicator);
52438fd1498Szrj else if (integer_zerop (indicator))
52538fd1498Szrj {
52638fd1498Szrj if (dump_enabled_p ())
52738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
52838fd1498Szrj "access also has a zero step\n");
52938fd1498Szrj return true;
53038fd1498Szrj }
53138fd1498Szrj }
53238fd1498Szrj continue;
53338fd1498Szrj }
53438fd1498Szrj
53538fd1498Szrj if (dist > 0 && DDR_REVERSED_P (ddr))
53638fd1498Szrj {
53738fd1498Szrj /* If DDR_REVERSED_P the order of the data-refs in DDR was
53838fd1498Szrj reversed (to make distance vector positive), and the actual
53938fd1498Szrj distance is negative. */
54038fd1498Szrj if (dump_enabled_p ())
54138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
54238fd1498Szrj "dependence distance negative.\n");
54338fd1498Szrj /* Record a negative dependence distance to later limit the
54438fd1498Szrj amount of stmt copying / unrolling we can perform.
54538fd1498Szrj Only need to handle read-after-write dependence. */
54638fd1498Szrj if (DR_IS_READ (drb)
54738fd1498Szrj && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
54838fd1498Szrj || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
54938fd1498Szrj STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
55038fd1498Szrj continue;
55138fd1498Szrj }
55238fd1498Szrj
55338fd1498Szrj unsigned int abs_dist = abs (dist);
55438fd1498Szrj if (abs_dist >= 2 && abs_dist < *max_vf)
55538fd1498Szrj {
55638fd1498Szrj /* The dependence distance requires reduction of the maximal
55738fd1498Szrj vectorization factor. */
55838fd1498Szrj *max_vf = abs (dist);
55938fd1498Szrj if (dump_enabled_p ())
56038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
56138fd1498Szrj "adjusting maximal vectorization factor to %i\n",
56238fd1498Szrj *max_vf);
56338fd1498Szrj }
56438fd1498Szrj
56538fd1498Szrj if (abs_dist >= *max_vf)
56638fd1498Szrj {
56738fd1498Szrj /* Dependence distance does not create dependence, as far as
56838fd1498Szrj vectorization is concerned, in this case. */
56938fd1498Szrj if (dump_enabled_p ())
57038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
57138fd1498Szrj "dependence distance >= VF.\n");
57238fd1498Szrj continue;
57338fd1498Szrj }
57438fd1498Szrj
57538fd1498Szrj if (dump_enabled_p ())
57638fd1498Szrj {
57738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
57838fd1498Szrj "not vectorized, possible dependence "
57938fd1498Szrj "between data-refs ");
58038fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
58138fd1498Szrj dump_printf (MSG_NOTE, " and ");
58238fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
58338fd1498Szrj dump_printf (MSG_NOTE, "\n");
58438fd1498Szrj }
58538fd1498Szrj
58638fd1498Szrj return true;
58738fd1498Szrj }
58838fd1498Szrj
58938fd1498Szrj return false;
59038fd1498Szrj }
59138fd1498Szrj
59238fd1498Szrj /* Function vect_analyze_data_ref_dependences.
59338fd1498Szrj
59438fd1498Szrj Examine all the data references in the loop, and make sure there do not
59538fd1498Szrj exist any data dependences between them. Set *MAX_VF according to
59638fd1498Szrj the maximum vectorization factor the data dependences allow. */
59738fd1498Szrj
59838fd1498Szrj bool
vect_analyze_data_ref_dependences(loop_vec_info loop_vinfo,unsigned int * max_vf)59938fd1498Szrj vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
60038fd1498Szrj unsigned int *max_vf)
60138fd1498Szrj {
60238fd1498Szrj unsigned int i;
60338fd1498Szrj struct data_dependence_relation *ddr;
60438fd1498Szrj
60538fd1498Szrj if (dump_enabled_p ())
60638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
60738fd1498Szrj "=== vect_analyze_data_ref_dependences ===\n");
60838fd1498Szrj
60938fd1498Szrj LOOP_VINFO_DDRS (loop_vinfo)
61038fd1498Szrj .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
61138fd1498Szrj * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
61238fd1498Szrj LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
61338fd1498Szrj /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
61438fd1498Szrj if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
61538fd1498Szrj &LOOP_VINFO_DDRS (loop_vinfo),
61638fd1498Szrj LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
61738fd1498Szrj return false;
61838fd1498Szrj
61938fd1498Szrj /* For epilogues we either have no aliases or alias versioning
62038fd1498Szrj was applied to original loop. Therefore we may just get max_vf
62138fd1498Szrj using VF of original loop. */
62238fd1498Szrj if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
62338fd1498Szrj *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
62438fd1498Szrj else
62538fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
62638fd1498Szrj if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
62738fd1498Szrj return false;
62838fd1498Szrj
62938fd1498Szrj return true;
63038fd1498Szrj }
63138fd1498Szrj
63238fd1498Szrj
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for basic-block (SLP) vectorization.
   Unlike the loop variant there is no run-time alias versioning fallback
   and no vectorization factor to adjust: an unresolvable dependence
   simply makes the instance fail.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  /* A reference cannot conflict with itself.  */
  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  Treated like a known dependence below
     (return true); the only difference is the diagnostic emitted.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
69238fd1498Szrj
69338fd1498Szrj
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  LAST_STORE, when nonnull, is the store all
   stores of the instance are assumed to be sunk to.  Return true if all
   scalar stmts of NODE can be sunk to the insert location.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      /* Walk every stmt between ACCESS and the sink point; only stmts
	 that touch memory can conflict, and for a read only intervening
	 writes matter.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ??? Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      /* At the sink point, check ACCESS against every store of
		 the instance, stopping at the first dependence.  */
	      unsigned i;
	      gimple *store;
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}
76438fd1498Szrj
76538fd1498Szrj
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance INSTANCE and make
   sure there do not exist any data dependences that would prevent its
   stores from being sunk to the vectorized stmt insert location, or its
   loads from being sunk likewise.  Return TRUE if the instance is
   dependence-free.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.
     If the root has no data reference the instance has no store.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
	return false;

      /* Mark stores in this instance and remember the last one.  The
	 visited flag lets the per-node analysis recognize our own
	 stores and delay checking them until the sink point.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store))
      {
	res = false;
	break;
      }

  /* Unset the visited flag so later analyses start from a clean state.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
82038fd1498Szrj
/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  Only the strongest guarantee seen so far for a given base
   address is kept.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
			    innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  /* Replace the recorded entry only if DRB proves a larger alignment
     (or if there was no entry for this base yet).  */
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "recording new base alignment for ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
	  dump_printf (MSG_NOTE, "\n");
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  alignment:    %d\n", drb->base_alignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  misalignment: %d\n", drb->base_misalignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  based on:     ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }
}
85138fd1498Szrj
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    /* Conditional references need not execute, so their alignment
       cannot be used as a region-wide guarantee.  */
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
	gimple *stmt = DR_STMT (dr);
	vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

	/* If DR is nested in the loop that is being vectorized, we can also
	   record the alignment of the base wrt the outer loop.  */
	if (loop && nested_in_vect_loop_p (loop, stmt))
	  {
	    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	    vect_record_base_alignment
	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
	  }
      }
}
88338fd1498Szrj
88438fd1498Szrj /* Return the target alignment for the vectorized form of DR. */
88538fd1498Szrj
88638fd1498Szrj static unsigned int
vect_calculate_target_alignment(struct data_reference * dr)88738fd1498Szrj vect_calculate_target_alignment (struct data_reference *dr)
88838fd1498Szrj {
88938fd1498Szrj gimple *stmt = DR_STMT (dr);
89038fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
89138fd1498Szrj tree vectype = STMT_VINFO_VECTYPE (stmt_info);
89238fd1498Szrj return targetm.vectorize.preferred_vector_alignment (vectype);
89338fd1498Szrj }
89438fd1498Szrj
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   NOTE(review): as written every path below returns true; "can't tell"
   cases simply leave DR_MISALIGNMENT at DR_MISALIGNMENT_UNKNOWN rather
   than failing.

   FOR NOW: No analysis is actually performed. Misalignment is calculated
   only for trivial cases. TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  /* Target alignment is in bits; VECTOR_ALIGNMENT below is in bytes.  */
  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
	                     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      /* Here only STEP * VF (the per-vector-iteration advance) needs to
	 be a multiple of the alignment, not STEP itself.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  /* The base is not sufficiently aligned; try to increase the alignment
     of the underlying decl instead.  */
  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
	  || !vect_can_force_dr_alignment_p (base,
					     vector_alignment * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
	                       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
			   &const_misalignment))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                   "Non-constant misalignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
107738fd1498Szrj
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    /* NOTE(review): unlike the DR case above this reads GROUP_SIZE off
       PEEL_STMT_INFO directly, not off its GROUP_FIRST_ELEMENT —
       presumably DR_PEEL is always a first element; confirm.  */
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
	continue;
      /* Sanity check: when both misalignments are known at compile time
	 they must agree in units of their respective access sizes.  */
      gcc_assert (!known_alignment_for_access_p (dr)
		  || !known_alignment_for_access_p (dr_peel)
		  || (DR_MISALIGNMENT (dr) / dr_size
		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      /* Peeling NPEEL scalar iterations moves DR by NPEEL accesses;
	 the direction follows the sign of DR_STEP.  */
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      /* DR_TARGET_ALIGNMENT is a power of two, so masking reduces the
	 result into [0, target alignment).  */
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
114138fd1498Szrj
114238fd1498Szrj
114338fd1498Szrj /* Function verify_data_ref_alignment
114438fd1498Szrj
114538fd1498Szrj Return TRUE if DR can be handled with respect to alignment. */
114638fd1498Szrj
114738fd1498Szrj static bool
verify_data_ref_alignment(data_reference_p dr)114838fd1498Szrj verify_data_ref_alignment (data_reference_p dr)
114938fd1498Szrj {
115038fd1498Szrj enum dr_alignment_support supportable_dr_alignment
115138fd1498Szrj = vect_supportable_dr_alignment (dr, false);
115238fd1498Szrj if (!supportable_dr_alignment)
115338fd1498Szrj {
115438fd1498Szrj if (dump_enabled_p ())
115538fd1498Szrj {
115638fd1498Szrj if (DR_IS_READ (dr))
115738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
115838fd1498Szrj "not vectorized: unsupported unaligned load.");
115938fd1498Szrj else
116038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
116138fd1498Szrj "not vectorized: unsupported unaligned "
116238fd1498Szrj "store.");
116338fd1498Szrj
116438fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
116538fd1498Szrj DR_REF (dr));
116638fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
116738fd1498Szrj }
116838fd1498Szrj return false;
116938fd1498Szrj }
117038fd1498Szrj
117138fd1498Szrj if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
117238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
117338fd1498Szrj "Vectorizing an unaligned access.\n");
117438fd1498Szrj
117538fd1498Szrj return true;
117638fd1498Szrj }
117738fd1498Szrj
117838fd1498Szrj /* Function vect_verify_datarefs_alignment
117938fd1498Szrj
118038fd1498Szrj Return TRUE if all data references in the loop can be
118138fd1498Szrj handled with respect to alignment. */
118238fd1498Szrj
118338fd1498Szrj bool
vect_verify_datarefs_alignment(loop_vec_info vinfo)118438fd1498Szrj vect_verify_datarefs_alignment (loop_vec_info vinfo)
118538fd1498Szrj {
118638fd1498Szrj vec<data_reference_p> datarefs = vinfo->datarefs;
118738fd1498Szrj struct data_reference *dr;
118838fd1498Szrj unsigned int i;
118938fd1498Szrj
119038fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
119138fd1498Szrj {
119238fd1498Szrj gimple *stmt = DR_STMT (dr);
119338fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
119438fd1498Szrj
119538fd1498Szrj if (!STMT_VINFO_RELEVANT_P (stmt_info))
119638fd1498Szrj continue;
119738fd1498Szrj
119838fd1498Szrj /* For interleaving, only the alignment of the first access matters. */
119938fd1498Szrj if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
120038fd1498Szrj && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
120138fd1498Szrj continue;
120238fd1498Szrj
120338fd1498Szrj /* Strided accesses perform only component accesses, alignment is
120438fd1498Szrj irrelevant for them. */
120538fd1498Szrj if (STMT_VINFO_STRIDED_P (stmt_info)
120638fd1498Szrj && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
120738fd1498Szrj continue;
120838fd1498Szrj
120938fd1498Szrj if (! verify_data_ref_alignment (dr))
121038fd1498Szrj return false;
121138fd1498Szrj }
121238fd1498Szrj
121338fd1498Szrj return true;
121438fd1498Szrj }
121538fd1498Szrj
121638fd1498Szrj /* Given an memory reference EXP return whether its alignment is less
121738fd1498Szrj than its size. */
121838fd1498Szrj
121938fd1498Szrj static bool
not_size_aligned(tree exp)122038fd1498Szrj not_size_aligned (tree exp)
122138fd1498Szrj {
122238fd1498Szrj if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
122338fd1498Szrj return true;
122438fd1498Szrj
122538fd1498Szrj return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
122638fd1498Szrj > get_object_alignment (exp));
122738fd1498Szrj }
122838fd1498Szrj
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      /* The prolog would execute nelements - mis_in_elements scalar
	 iterations; they must cover whole interleaving groups.  */
      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      /* Peeling advances the address by whole elements, so a misalignment
	 that is not a multiple of the element size can never be peeled
	 away.  */
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      /* Unknown misalignment: let the target decide whether peeling can
	 still reach alignment for this (possibly packed) type.  */
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                 "Unknown misalignment, %snaturally aligned\n",
	                 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
129638fd1498Szrj
129738fd1498Szrj
129838fd1498Szrj /* Calculate the cost of the memory access represented by DR. */
129938fd1498Szrj
130038fd1498Szrj static void
vect_get_data_access_cost(struct data_reference * dr,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec)130138fd1498Szrj vect_get_data_access_cost (struct data_reference *dr,
130238fd1498Szrj unsigned int *inside_cost,
130338fd1498Szrj unsigned int *outside_cost,
130438fd1498Szrj stmt_vector_for_cost *body_cost_vec)
130538fd1498Szrj {
130638fd1498Szrj gimple *stmt = DR_STMT (dr);
130738fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
130838fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
130938fd1498Szrj int ncopies;
131038fd1498Szrj
131138fd1498Szrj if (PURE_SLP_STMT (stmt_info))
131238fd1498Szrj ncopies = 1;
131338fd1498Szrj else
131438fd1498Szrj ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
131538fd1498Szrj
131638fd1498Szrj if (DR_IS_READ (dr))
131738fd1498Szrj vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
131838fd1498Szrj NULL, body_cost_vec, false);
131938fd1498Szrj else
132038fd1498Szrj vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
132138fd1498Szrj
132238fd1498Szrj if (dump_enabled_p ())
132338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
132438fd1498Szrj "vect_get_data_access_cost: inside_cost = %d, "
132538fd1498Szrj "outside_cost = %d.\n", *inside_cost, *outside_cost);
132638fd1498Szrj }
132738fd1498Szrj
132838fd1498Szrj
/* One candidate peeling amount.  Peeling NPEEL iterations makes DR
   aligned; COUNT tallies how many data references vote for this NPEEL
   (vect_peeling_hash_insert also adds VECT_MAX_COST as a penalty for
   unsupported alignments under the unlimited cost model).  */

typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

/* A peeling candidate together with its estimated inside- and
   outside-of-loop costs.  */

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
134238fd1498Szrj
134338fd1498Szrj
134438fd1498Szrj /* Peeling hashtable helpers. */
134538fd1498Szrj
/* Hash traits for _vect_peel_info entries: entries are keyed solely on
   their NPEEL value.  Entries are XNEW'd in vect_peeling_hash_insert;
   deriving from free_ptr_hash presumably makes the table free them on
   destruction -- confirm against hash-traits.h.  */
134638fd1498Szrj struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
134738fd1498Szrj {
134838fd1498Szrj static inline hashval_t hash (const _vect_peel_info *);
134938fd1498Szrj static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
135038fd1498Szrj };
135138fd1498Szrj
135238fd1498Szrj inline hashval_t
hash(const _vect_peel_info * peel_info)135338fd1498Szrj peel_info_hasher::hash (const _vect_peel_info *peel_info)
135438fd1498Szrj {
135538fd1498Szrj return (hashval_t) peel_info->npeel;
135638fd1498Szrj }
135738fd1498Szrj
135838fd1498Szrj inline bool
equal(const _vect_peel_info * a,const _vect_peel_info * b)135938fd1498Szrj peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
136038fd1498Szrj {
136138fd1498Szrj return (a->npeel == b->npeel);
136238fd1498Szrj }
136338fd1498Szrj
136438fd1498Szrj
136538fd1498Szrj /* Insert DR into peeling hash table with NPEEL as key. */
136638fd1498Szrj
136738fd1498Szrj static void
vect_peeling_hash_insert(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo,struct data_reference * dr,int npeel)136838fd1498Szrj vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
136938fd1498Szrj loop_vec_info loop_vinfo, struct data_reference *dr,
137038fd1498Szrj int npeel)
137138fd1498Szrj {
137238fd1498Szrj struct _vect_peel_info elem, *slot;
137338fd1498Szrj _vect_peel_info **new_slot;
137438fd1498Szrj bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
137538fd1498Szrj
137638fd1498Szrj elem.npeel = npeel;
137738fd1498Szrj slot = peeling_htab->find (&elem);
137838fd1498Szrj if (slot)
137938fd1498Szrj slot->count++;
138038fd1498Szrj else
138138fd1498Szrj {
138238fd1498Szrj slot = XNEW (struct _vect_peel_info);
138338fd1498Szrj slot->npeel = npeel;
138438fd1498Szrj slot->dr = dr;
138538fd1498Szrj slot->count = 1;
138638fd1498Szrj new_slot = peeling_htab->find_slot (slot, INSERT);
138738fd1498Szrj *new_slot = slot;
138838fd1498Szrj }
138938fd1498Szrj
139038fd1498Szrj if (!supportable_dr_alignment
139138fd1498Szrj && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
139238fd1498Szrj slot->count += VECT_MAX_COST;
139338fd1498Szrj }
139438fd1498Szrj
139538fd1498Szrj
139638fd1498Szrj /* Traverse peeling hash table to find peeling option that aligns maximum
139738fd1498Szrj number of data accesses. */
139838fd1498Szrj
139938fd1498Szrj int
vect_peeling_hash_get_most_frequent(_vect_peel_info ** slot,_vect_peel_extended_info * max)140038fd1498Szrj vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
140138fd1498Szrj _vect_peel_extended_info *max)
140238fd1498Szrj {
140338fd1498Szrj vect_peel_info elem = *slot;
140438fd1498Szrj
140538fd1498Szrj if (elem->count > max->peel_info.count
140638fd1498Szrj || (elem->count == max->peel_info.count
140738fd1498Szrj && max->peel_info.npeel > elem->npeel))
140838fd1498Szrj {
140938fd1498Szrj max->peel_info.npeel = elem->npeel;
141038fd1498Szrj max->peel_info.count = elem->count;
141138fd1498Szrj max->peel_info.dr = elem->dr;
141238fd1498Szrj }
141338fd1498Szrj
141438fd1498Szrj return 1;
141538fd1498Szrj }
141638fd1498Szrj
141738fd1498Szrj /* Get the costs of peeling NPEEL iterations checking data access costs
141838fd1498Szrj for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
141938fd1498Szrj misalignment will be zero after peeling. */
142038fd1498Szrj
142138fd1498Szrj static void
vect_get_peeling_costs_all_drs(vec<data_reference_p> datarefs,struct data_reference * dr0,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec,unsigned int npeel,bool unknown_misalignment)142238fd1498Szrj vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
142338fd1498Szrj struct data_reference *dr0,
142438fd1498Szrj unsigned int *inside_cost,
142538fd1498Szrj unsigned int *outside_cost,
142638fd1498Szrj stmt_vector_for_cost *body_cost_vec,
142738fd1498Szrj unsigned int npeel,
142838fd1498Szrj bool unknown_misalignment)
142938fd1498Szrj {
143038fd1498Szrj unsigned i;
143138fd1498Szrj data_reference *dr;
143238fd1498Szrj
143338fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
143438fd1498Szrj {
143538fd1498Szrj gimple *stmt = DR_STMT (dr);
143638fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
143738fd1498Szrj if (!STMT_VINFO_RELEVANT_P (stmt_info))
143838fd1498Szrj continue;
143938fd1498Szrj
144038fd1498Szrj /* For interleaving, only the alignment of the first access
144138fd1498Szrj matters. */
144238fd1498Szrj if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
144338fd1498Szrj && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
144438fd1498Szrj continue;
144538fd1498Szrj
144638fd1498Szrj /* Strided accesses perform only component accesses, alignment is
144738fd1498Szrj irrelevant for them. */
144838fd1498Szrj if (STMT_VINFO_STRIDED_P (stmt_info)
144938fd1498Szrj && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
145038fd1498Szrj continue;
145138fd1498Szrj
145238fd1498Szrj int save_misalignment;
145338fd1498Szrj save_misalignment = DR_MISALIGNMENT (dr);
145438fd1498Szrj if (npeel == 0)
145538fd1498Szrj ;
145638fd1498Szrj else if (unknown_misalignment && dr == dr0)
145738fd1498Szrj SET_DR_MISALIGNMENT (dr, 0);
145838fd1498Szrj else
145938fd1498Szrj vect_update_misalignment_for_peel (dr, dr0, npeel);
146038fd1498Szrj vect_get_data_access_cost (dr, inside_cost, outside_cost,
146138fd1498Szrj body_cost_vec);
146238fd1498Szrj SET_DR_MISALIGNMENT (dr, save_misalignment);
146338fd1498Szrj }
146438fd1498Szrj }
146538fd1498Szrj
146638fd1498Szrj /* Traverse peeling hash table and calculate cost for each peeling option.
146738fd1498Szrj Find the one with the lowest cost. */
146838fd1498Szrj
146938fd1498Szrj int
vect_peeling_hash_get_lowest_cost(_vect_peel_info ** slot,_vect_peel_extended_info * min)147038fd1498Szrj vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
147138fd1498Szrj _vect_peel_extended_info *min)
147238fd1498Szrj {
147338fd1498Szrj vect_peel_info elem = *slot;
147438fd1498Szrj int dummy;
147538fd1498Szrj unsigned int inside_cost = 0, outside_cost = 0;
147638fd1498Szrj gimple *stmt = DR_STMT (elem->dr);
147738fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
147838fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
147938fd1498Szrj stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
148038fd1498Szrj epilogue_cost_vec;
148138fd1498Szrj
148238fd1498Szrj prologue_cost_vec.create (2);
148338fd1498Szrj body_cost_vec.create (2);
148438fd1498Szrj epilogue_cost_vec.create (2);
148538fd1498Szrj
148638fd1498Szrj vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
148738fd1498Szrj elem->dr, &inside_cost, &outside_cost,
148838fd1498Szrj &body_cost_vec, elem->npeel, false);
148938fd1498Szrj
149038fd1498Szrj body_cost_vec.release ();
149138fd1498Szrj
149238fd1498Szrj outside_cost += vect_get_known_peeling_cost
149338fd1498Szrj (loop_vinfo, elem->npeel, &dummy,
149438fd1498Szrj &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
149538fd1498Szrj &prologue_cost_vec, &epilogue_cost_vec);
149638fd1498Szrj
149738fd1498Szrj /* Prologue and epilogue costs are added to the target model later.
149838fd1498Szrj These costs depend only on the scalar iteration cost, the
149938fd1498Szrj number of peeling iterations finally chosen, and the number of
150038fd1498Szrj misaligned statements. So discard the information found here. */
150138fd1498Szrj prologue_cost_vec.release ();
150238fd1498Szrj epilogue_cost_vec.release ();
150338fd1498Szrj
150438fd1498Szrj if (inside_cost < min->inside_cost
150538fd1498Szrj || (inside_cost == min->inside_cost
150638fd1498Szrj && outside_cost < min->outside_cost))
150738fd1498Szrj {
150838fd1498Szrj min->inside_cost = inside_cost;
150938fd1498Szrj min->outside_cost = outside_cost;
151038fd1498Szrj min->peel_info.dr = elem->dr;
151138fd1498Szrj min->peel_info.npeel = elem->npeel;
151238fd1498Szrj min->peel_info.count = elem->count;
151338fd1498Szrj }
151438fd1498Szrj
151538fd1498Szrj return 1;
151638fd1498Szrj }
151738fd1498Szrj
151838fd1498Szrj
151938fd1498Szrj /* Choose best peeling option by traversing peeling hash table and either
152038fd1498Szrj choosing an option with the lowest cost (if cost model is enabled) or the
152138fd1498Szrj option that aligns as many accesses as possible. */
152238fd1498Szrj
152338fd1498Szrj static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo)152438fd1498Szrj vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
152538fd1498Szrj loop_vec_info loop_vinfo)
152638fd1498Szrj {
152738fd1498Szrj struct _vect_peel_extended_info res;
152838fd1498Szrj
152938fd1498Szrj res.peel_info.dr = NULL;
153038fd1498Szrj
153138fd1498Szrj if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
153238fd1498Szrj {
153338fd1498Szrj res.inside_cost = INT_MAX;
153438fd1498Szrj res.outside_cost = INT_MAX;
153538fd1498Szrj peeling_htab->traverse <_vect_peel_extended_info *,
153638fd1498Szrj vect_peeling_hash_get_lowest_cost> (&res);
153738fd1498Szrj }
153838fd1498Szrj else
153938fd1498Szrj {
154038fd1498Szrj res.peel_info.count = 0;
154138fd1498Szrj peeling_htab->traverse <_vect_peel_extended_info *,
154238fd1498Szrj vect_peeling_hash_get_most_frequent> (&res);
154338fd1498Szrj res.inside_cost = 0;
154438fd1498Szrj res.outside_cost = 0;
154538fd1498Szrj }
154638fd1498Szrj
154738fd1498Szrj return res;
154838fd1498Szrj }
154938fd1498Szrj
155038fd1498Szrj /* Return true if the new peeling NPEEL is supported. */
155138fd1498Szrj
155238fd1498Szrj static bool
vect_peeling_supportable(loop_vec_info loop_vinfo,struct data_reference * dr0,unsigned npeel)155338fd1498Szrj vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
155438fd1498Szrj unsigned npeel)
155538fd1498Szrj {
155638fd1498Szrj unsigned i;
155738fd1498Szrj struct data_reference *dr = NULL;
155838fd1498Szrj vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
155938fd1498Szrj gimple *stmt;
156038fd1498Szrj stmt_vec_info stmt_info;
156138fd1498Szrj enum dr_alignment_support supportable_dr_alignment;
156238fd1498Szrj
156338fd1498Szrj /* Ensure that all data refs can be vectorized after the peel. */
156438fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
156538fd1498Szrj {
156638fd1498Szrj int save_misalignment;
156738fd1498Szrj
156838fd1498Szrj if (dr == dr0)
156938fd1498Szrj continue;
157038fd1498Szrj
157138fd1498Szrj stmt = DR_STMT (dr);
157238fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
157338fd1498Szrj /* For interleaving, only the alignment of the first access
157438fd1498Szrj matters. */
157538fd1498Szrj if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
157638fd1498Szrj && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
157738fd1498Szrj continue;
157838fd1498Szrj
157938fd1498Szrj /* Strided accesses perform only component accesses, alignment is
158038fd1498Szrj irrelevant for them. */
158138fd1498Szrj if (STMT_VINFO_STRIDED_P (stmt_info)
158238fd1498Szrj && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
158338fd1498Szrj continue;
158438fd1498Szrj
158538fd1498Szrj save_misalignment = DR_MISALIGNMENT (dr);
158638fd1498Szrj vect_update_misalignment_for_peel (dr, dr0, npeel);
158738fd1498Szrj supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
158838fd1498Szrj SET_DR_MISALIGNMENT (dr, save_misalignment);
158938fd1498Szrj
159038fd1498Szrj if (!supportable_dr_alignment)
159138fd1498Szrj return false;
159238fd1498Szrj }
159338fd1498Szrj
159438fd1498Szrj return true;
159538fd1498Szrj }
159638fd1498Szrj
159738fd1498Szrj /* Function vect_enhance_data_refs_alignment
159838fd1498Szrj
159938fd1498Szrj This pass will use loop versioning and loop peeling in order to enhance
160038fd1498Szrj the alignment of data references in the loop.
160138fd1498Szrj
160238fd1498Szrj FOR NOW: we assume that whatever versioning/peeling takes place, only the
160338fd1498Szrj original loop is to be vectorized. Any other loops that are created by
160438fd1498Szrj the transformations performed in this pass - are not supposed to be
160538fd1498Szrj vectorized. This restriction will be relaxed.
160638fd1498Szrj
160738fd1498Szrj This pass will require a cost model to guide it whether to apply peeling
160838fd1498Szrj or versioning or a combination of the two. For example, the scheme that
160938fd1498Szrj intel uses when given a loop with several memory accesses, is as follows:
161038fd1498Szrj choose one memory access ('p') whose alignment you want to force by doing
161138fd1498Szrj peeling. Then, either (1) generate a loop in which 'p' is aligned and all
161238fd1498Szrj other accesses are not necessarily aligned, or (2) use loop versioning to
161338fd1498Szrj generate one loop in which all accesses are aligned, and another loop in
161438fd1498Szrj which only 'p' is necessarily aligned.
161538fd1498Szrj
161638fd1498Szrj ("Automatic Intra-Register Vectorization for the Intel Architecture",
161738fd1498Szrj Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
161838fd1498Szrj Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
161938fd1498Szrj
162038fd1498Szrj Devising a cost model is the most critical aspect of this work. It will
162138fd1498Szrj guide us on which access to peel for, whether to use loop versioning, how
162238fd1498Szrj many versions to create, etc. The cost model will probably consist of
162338fd1498Szrj generic considerations as well as target specific considerations (on
162438fd1498Szrj powerpc for example, misaligned stores are more painful than misaligned
162538fd1498Szrj loads).
162638fd1498Szrj
162738fd1498Szrj Here are the general steps involved in alignment enhancements:
162838fd1498Szrj
162938fd1498Szrj -- original loop, before alignment analysis:
163038fd1498Szrj for (i=0; i<N; i++){
163138fd1498Szrj x = q[i]; # DR_MISALIGNMENT(q) = unknown
163238fd1498Szrj p[i] = y; # DR_MISALIGNMENT(p) = unknown
163338fd1498Szrj }
163438fd1498Szrj
163538fd1498Szrj -- After vect_compute_data_refs_alignment:
163638fd1498Szrj for (i=0; i<N; i++){
163738fd1498Szrj x = q[i]; # DR_MISALIGNMENT(q) = 3
163838fd1498Szrj p[i] = y; # DR_MISALIGNMENT(p) = unknown
163938fd1498Szrj }
164038fd1498Szrj
164138fd1498Szrj -- Possibility 1: we do loop versioning:
164238fd1498Szrj if (p is aligned) {
164338fd1498Szrj for (i=0; i<N; i++){ # loop 1A
164438fd1498Szrj x = q[i]; # DR_MISALIGNMENT(q) = 3
164538fd1498Szrj p[i] = y; # DR_MISALIGNMENT(p) = 0
164638fd1498Szrj }
164738fd1498Szrj }
164838fd1498Szrj else {
164938fd1498Szrj for (i=0; i<N; i++){ # loop 1B
165038fd1498Szrj x = q[i]; # DR_MISALIGNMENT(q) = 3
165138fd1498Szrj p[i] = y; # DR_MISALIGNMENT(p) = unaligned
165238fd1498Szrj }
165338fd1498Szrj }
165438fd1498Szrj
165538fd1498Szrj -- Possibility 2: we do loop peeling:
165638fd1498Szrj for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
165738fd1498Szrj x = q[i];
165838fd1498Szrj p[i] = y;
165938fd1498Szrj }
166038fd1498Szrj for (i = 3; i < N; i++){ # loop 2A
166138fd1498Szrj x = q[i]; # DR_MISALIGNMENT(q) = 0
166238fd1498Szrj p[i] = y; # DR_MISALIGNMENT(p) = unknown
166338fd1498Szrj }
166438fd1498Szrj
166538fd1498Szrj -- Possibility 3: combination of loop peeling and versioning:
166638fd1498Szrj for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
166738fd1498Szrj x = q[i];
166838fd1498Szrj p[i] = y;
166938fd1498Szrj }
167038fd1498Szrj if (p is aligned) {
167138fd1498Szrj for (i = 3; i<N; i++){ # loop 3A
167238fd1498Szrj x = q[i]; # DR_MISALIGNMENT(q) = 0
167338fd1498Szrj p[i] = y; # DR_MISALIGNMENT(p) = 0
167438fd1498Szrj }
167538fd1498Szrj }
167638fd1498Szrj else {
167738fd1498Szrj for (i = 3; i<N; i++){ # loop 3B
167838fd1498Szrj x = q[i]; # DR_MISALIGNMENT(q) = 0
167938fd1498Szrj p[i] = y; # DR_MISALIGNMENT(p) = unaligned
168038fd1498Szrj }
168138fd1498Szrj }
168238fd1498Szrj
168338fd1498Szrj These loops are later passed to loop_transform to be vectorized. The
168438fd1498Szrj vectorizer will use the alignment information to guide the transformation
168538fd1498Szrj (whether to generate regular loads/stores, or with special handling for
168638fd1498Szrj misalignment). */
168738fd1498Szrj
168838fd1498Szrj bool
vect_enhance_data_refs_alignment(loop_vec_info loop_vinfo)168938fd1498Szrj vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
169038fd1498Szrj {
169138fd1498Szrj vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
169238fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
169338fd1498Szrj enum dr_alignment_support supportable_dr_alignment;
169438fd1498Szrj struct data_reference *dr0 = NULL, *first_store = NULL;
169538fd1498Szrj struct data_reference *dr;
169638fd1498Szrj unsigned int i, j;
169738fd1498Szrj bool do_peeling = false;
169838fd1498Szrj bool do_versioning = false;
169938fd1498Szrj bool stat;
170038fd1498Szrj gimple *stmt;
170138fd1498Szrj stmt_vec_info stmt_info;
170238fd1498Szrj unsigned int npeel = 0;
170338fd1498Szrj bool one_misalignment_known = false;
170438fd1498Szrj bool one_misalignment_unknown = false;
170538fd1498Szrj bool one_dr_unsupportable = false;
170638fd1498Szrj struct data_reference *unsupportable_dr = NULL;
170738fd1498Szrj poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
170838fd1498Szrj unsigned possible_npeel_number = 1;
170938fd1498Szrj tree vectype;
171038fd1498Szrj unsigned int mis, same_align_drs_max = 0;
171138fd1498Szrj hash_table<peel_info_hasher> peeling_htab (1);
171238fd1498Szrj
171338fd1498Szrj if (dump_enabled_p ())
171438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
171538fd1498Szrj "=== vect_enhance_data_refs_alignment ===\n");
171638fd1498Szrj
171738fd1498Szrj /* Reset data so we can safely be called multiple times. */
171838fd1498Szrj LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
171938fd1498Szrj LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
172038fd1498Szrj
172138fd1498Szrj /* While cost model enhancements are expected in the future, the high level
172238fd1498Szrj view of the code at this time is as follows:
172338fd1498Szrj
172438fd1498Szrj A) If there is a misaligned access then see if peeling to align
172538fd1498Szrj this access can make all data references satisfy
172638fd1498Szrj vect_supportable_dr_alignment. If so, update data structures
172738fd1498Szrj as needed and return true.
172838fd1498Szrj
172938fd1498Szrj B) If peeling wasn't possible and there is a data reference with an
173038fd1498Szrj unknown misalignment that does not satisfy vect_supportable_dr_alignment
173138fd1498Szrj then see if loop versioning checks can be used to make all data
173238fd1498Szrj references satisfy vect_supportable_dr_alignment. If so, update
173338fd1498Szrj data structures as needed and return true.
173438fd1498Szrj
173538fd1498Szrj C) If neither peeling nor versioning were successful then return false if
173638fd1498Szrj any data reference does not satisfy vect_supportable_dr_alignment.
173738fd1498Szrj
173838fd1498Szrj D) Return true (all data references satisfy vect_supportable_dr_alignment).
173938fd1498Szrj
174038fd1498Szrj Note, Possibility 3 above (which is peeling and versioning together) is not
174138fd1498Szrj being done at this time. */
174238fd1498Szrj
174338fd1498Szrj /* (1) Peeling to force alignment. */
174438fd1498Szrj
174538fd1498Szrj /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
174638fd1498Szrj Considerations:
174738fd1498Szrj + How many accesses will become aligned due to the peeling
174838fd1498Szrj - How many accesses will become unaligned due to the peeling,
174938fd1498Szrj and the cost of misaligned accesses.
175038fd1498Szrj - The cost of peeling (the extra runtime checks, the increase
175138fd1498Szrj in code size). */
175238fd1498Szrj
175338fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
175438fd1498Szrj {
175538fd1498Szrj stmt = DR_STMT (dr);
175638fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
175738fd1498Szrj
175838fd1498Szrj if (!STMT_VINFO_RELEVANT_P (stmt_info))
175938fd1498Szrj continue;
176038fd1498Szrj
176138fd1498Szrj /* For interleaving, only the alignment of the first access
176238fd1498Szrj matters. */
176338fd1498Szrj if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
176438fd1498Szrj && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
176538fd1498Szrj continue;
176638fd1498Szrj
176738fd1498Szrj /* For invariant accesses there is nothing to enhance. */
176838fd1498Szrj if (integer_zerop (DR_STEP (dr)))
176938fd1498Szrj continue;
177038fd1498Szrj
177138fd1498Szrj /* Strided accesses perform only component accesses, alignment is
177238fd1498Szrj irrelevant for them. */
177338fd1498Szrj if (STMT_VINFO_STRIDED_P (stmt_info)
177438fd1498Szrj && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
177538fd1498Szrj continue;
177638fd1498Szrj
177738fd1498Szrj supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
177838fd1498Szrj do_peeling = vector_alignment_reachable_p (dr);
177938fd1498Szrj if (do_peeling)
178038fd1498Szrj {
178138fd1498Szrj if (known_alignment_for_access_p (dr))
178238fd1498Szrj {
178338fd1498Szrj unsigned int npeel_tmp = 0;
178438fd1498Szrj bool negative = tree_int_cst_compare (DR_STEP (dr),
178538fd1498Szrj size_zero_node) < 0;
178638fd1498Szrj
178738fd1498Szrj vectype = STMT_VINFO_VECTYPE (stmt_info);
178838fd1498Szrj unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
178938fd1498Szrj unsigned int dr_size = vect_get_scalar_dr_size (dr);
179038fd1498Szrj mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
179138fd1498Szrj if (DR_MISALIGNMENT (dr) != 0)
179238fd1498Szrj npeel_tmp = (mis & (target_align - 1)) / dr_size;
179338fd1498Szrj
179438fd1498Szrj /* For multiple types, it is possible that the bigger type access
179538fd1498Szrj will have more than one peeling option. E.g., a loop with two
179638fd1498Szrj types: one of size (vector size / 4), and the other one of
179738fd1498Szrj size (vector size / 8). Vectorization factor will be 8. If both
179838fd1498Szrj accesses are misaligned by 3, the first one needs one scalar
179938fd1498Szrj iteration to be aligned, and the second one needs 5. But the
180038fd1498Szrj first one will be aligned also by peeling 5 scalar
180138fd1498Szrj iterations, and in that case both accesses will be aligned.
180238fd1498Szrj Hence, except for the immediate peeling amount, we also want
180338fd1498Szrj to try to add full vector size, while we don't exceed
180438fd1498Szrj vectorization factor.
180538fd1498Szrj We do this automatically for cost model, since we calculate
180638fd1498Szrj cost for every peeling option. */
180738fd1498Szrj if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
180838fd1498Szrj {
180938fd1498Szrj poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
181038fd1498Szrj ? vf * GROUP_SIZE (stmt_info) : vf);
181138fd1498Szrj possible_npeel_number
181238fd1498Szrj = vect_get_num_vectors (nscalars, vectype);
181338fd1498Szrj
181438fd1498Szrj /* NPEEL_TMP is 0 when there is no misalignment, but also
181538fd1498Szrj allow peeling NELEMENTS. */
181638fd1498Szrj if (DR_MISALIGNMENT (dr) == 0)
181738fd1498Szrj possible_npeel_number++;
181838fd1498Szrj }
181938fd1498Szrj
182038fd1498Szrj /* Save info about DR in the hash table. Also include peeling
182138fd1498Szrj amounts according to the explanation above. */
182238fd1498Szrj for (j = 0; j < possible_npeel_number; j++)
182338fd1498Szrj {
182438fd1498Szrj vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
182538fd1498Szrj dr, npeel_tmp);
182638fd1498Szrj npeel_tmp += target_align / dr_size;
182738fd1498Szrj }
182838fd1498Szrj
182938fd1498Szrj one_misalignment_known = true;
183038fd1498Szrj }
183138fd1498Szrj else
183238fd1498Szrj {
183338fd1498Szrj /* If we don't know any misalignment values, we prefer
183438fd1498Szrj peeling for data-ref that has the maximum number of data-refs
183538fd1498Szrj with the same alignment, unless the target prefers to align
183638fd1498Szrj stores over loads. */
183738fd1498Szrj unsigned same_align_drs
183838fd1498Szrj = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
183938fd1498Szrj if (!dr0
184038fd1498Szrj || same_align_drs_max < same_align_drs)
184138fd1498Szrj {
184238fd1498Szrj same_align_drs_max = same_align_drs;
184338fd1498Szrj dr0 = dr;
184438fd1498Szrj }
184538fd1498Szrj /* For data-refs with the same number of related
184638fd1498Szrj accesses prefer the one where the misalign
184738fd1498Szrj computation will be invariant in the outermost loop. */
184838fd1498Szrj else if (same_align_drs_max == same_align_drs)
184938fd1498Szrj {
185038fd1498Szrj struct loop *ivloop0, *ivloop;
185138fd1498Szrj ivloop0 = outermost_invariant_loop_for_expr
185238fd1498Szrj (loop, DR_BASE_ADDRESS (dr0));
185338fd1498Szrj ivloop = outermost_invariant_loop_for_expr
185438fd1498Szrj (loop, DR_BASE_ADDRESS (dr));
185538fd1498Szrj if ((ivloop && !ivloop0)
185638fd1498Szrj || (ivloop && ivloop0
185738fd1498Szrj && flow_loop_nested_p (ivloop, ivloop0)))
185838fd1498Szrj dr0 = dr;
185938fd1498Szrj }
186038fd1498Szrj
186138fd1498Szrj one_misalignment_unknown = true;
186238fd1498Szrj
186338fd1498Szrj /* Check for data refs with unsupportable alignment that
186438fd1498Szrj can be peeled. */
186538fd1498Szrj if (!supportable_dr_alignment)
186638fd1498Szrj {
186738fd1498Szrj one_dr_unsupportable = true;
186838fd1498Szrj unsupportable_dr = dr;
186938fd1498Szrj }
187038fd1498Szrj
187138fd1498Szrj if (!first_store && DR_IS_WRITE (dr))
187238fd1498Szrj first_store = dr;
187338fd1498Szrj }
187438fd1498Szrj }
187538fd1498Szrj else
187638fd1498Szrj {
187738fd1498Szrj if (!aligned_access_p (dr))
187838fd1498Szrj {
187938fd1498Szrj if (dump_enabled_p ())
188038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188138fd1498Szrj "vector alignment may not be reachable\n");
188238fd1498Szrj break;
188338fd1498Szrj }
188438fd1498Szrj }
188538fd1498Szrj }
188638fd1498Szrj
188738fd1498Szrj /* Check if we can possibly peel the loop. */
188838fd1498Szrj if (!vect_can_advance_ivs_p (loop_vinfo)
188938fd1498Szrj || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
189038fd1498Szrj || loop->inner)
189138fd1498Szrj do_peeling = false;
189238fd1498Szrj
189338fd1498Szrj struct _vect_peel_extended_info peel_for_known_alignment;
189438fd1498Szrj struct _vect_peel_extended_info peel_for_unknown_alignment;
189538fd1498Szrj struct _vect_peel_extended_info best_peel;
189638fd1498Szrj
189738fd1498Szrj peel_for_unknown_alignment.inside_cost = INT_MAX;
189838fd1498Szrj peel_for_unknown_alignment.outside_cost = INT_MAX;
189938fd1498Szrj peel_for_unknown_alignment.peel_info.count = 0;
190038fd1498Szrj
190138fd1498Szrj if (do_peeling
190238fd1498Szrj && one_misalignment_unknown)
190338fd1498Szrj {
190438fd1498Szrj /* Check if the target requires to prefer stores over loads, i.e., if
190538fd1498Szrj misaligned stores are more expensive than misaligned loads (taking
190638fd1498Szrj drs with same alignment into account). */
190738fd1498Szrj unsigned int load_inside_cost = 0;
190838fd1498Szrj unsigned int load_outside_cost = 0;
190938fd1498Szrj unsigned int store_inside_cost = 0;
191038fd1498Szrj unsigned int store_outside_cost = 0;
191138fd1498Szrj unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
191238fd1498Szrj
191338fd1498Szrj stmt_vector_for_cost dummy;
191438fd1498Szrj dummy.create (2);
191538fd1498Szrj vect_get_peeling_costs_all_drs (datarefs, dr0,
191638fd1498Szrj &load_inside_cost,
191738fd1498Szrj &load_outside_cost,
191838fd1498Szrj &dummy, estimated_npeels, true);
191938fd1498Szrj dummy.release ();
192038fd1498Szrj
192138fd1498Szrj if (first_store)
192238fd1498Szrj {
192338fd1498Szrj dummy.create (2);
192438fd1498Szrj vect_get_peeling_costs_all_drs (datarefs, first_store,
192538fd1498Szrj &store_inside_cost,
192638fd1498Szrj &store_outside_cost,
192738fd1498Szrj &dummy, estimated_npeels, true);
192838fd1498Szrj dummy.release ();
192938fd1498Szrj }
193038fd1498Szrj else
193138fd1498Szrj {
193238fd1498Szrj store_inside_cost = INT_MAX;
193338fd1498Szrj store_outside_cost = INT_MAX;
193438fd1498Szrj }
193538fd1498Szrj
193638fd1498Szrj if (load_inside_cost > store_inside_cost
193738fd1498Szrj || (load_inside_cost == store_inside_cost
193838fd1498Szrj && load_outside_cost > store_outside_cost))
193938fd1498Szrj {
194038fd1498Szrj dr0 = first_store;
194138fd1498Szrj peel_for_unknown_alignment.inside_cost = store_inside_cost;
194238fd1498Szrj peel_for_unknown_alignment.outside_cost = store_outside_cost;
194338fd1498Szrj }
194438fd1498Szrj else
194538fd1498Szrj {
194638fd1498Szrj peel_for_unknown_alignment.inside_cost = load_inside_cost;
194738fd1498Szrj peel_for_unknown_alignment.outside_cost = load_outside_cost;
194838fd1498Szrj }
194938fd1498Szrj
195038fd1498Szrj stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
195138fd1498Szrj prologue_cost_vec.create (2);
195238fd1498Szrj epilogue_cost_vec.create (2);
195338fd1498Szrj
195438fd1498Szrj int dummy2;
195538fd1498Szrj peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
195638fd1498Szrj (loop_vinfo, estimated_npeels, &dummy2,
195738fd1498Szrj &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
195838fd1498Szrj &prologue_cost_vec, &epilogue_cost_vec);
195938fd1498Szrj
196038fd1498Szrj prologue_cost_vec.release ();
196138fd1498Szrj epilogue_cost_vec.release ();
196238fd1498Szrj
196338fd1498Szrj peel_for_unknown_alignment.peel_info.count = 1
196438fd1498Szrj + STMT_VINFO_SAME_ALIGN_REFS
196538fd1498Szrj (vinfo_for_stmt (DR_STMT (dr0))).length ();
196638fd1498Szrj }
196738fd1498Szrj
196838fd1498Szrj peel_for_unknown_alignment.peel_info.npeel = 0;
196938fd1498Szrj peel_for_unknown_alignment.peel_info.dr = dr0;
197038fd1498Szrj
197138fd1498Szrj best_peel = peel_for_unknown_alignment;
197238fd1498Szrj
197338fd1498Szrj peel_for_known_alignment.inside_cost = INT_MAX;
197438fd1498Szrj peel_for_known_alignment.outside_cost = INT_MAX;
197538fd1498Szrj peel_for_known_alignment.peel_info.count = 0;
197638fd1498Szrj peel_for_known_alignment.peel_info.dr = NULL;
197738fd1498Szrj
197838fd1498Szrj if (do_peeling && one_misalignment_known)
197938fd1498Szrj {
198038fd1498Szrj /* Peeling is possible, but there is no data access that is not supported
198138fd1498Szrj unless aligned. So we try to choose the best possible peeling from
198238fd1498Szrj the hash table. */
198338fd1498Szrj peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
198438fd1498Szrj (&peeling_htab, loop_vinfo);
198538fd1498Szrj }
198638fd1498Szrj
198738fd1498Szrj /* Compare costs of peeling for known and unknown alignment. */
198838fd1498Szrj if (peel_for_known_alignment.peel_info.dr != NULL
198938fd1498Szrj && peel_for_unknown_alignment.inside_cost
199038fd1498Szrj >= peel_for_known_alignment.inside_cost)
199138fd1498Szrj {
199238fd1498Szrj best_peel = peel_for_known_alignment;
199338fd1498Szrj
199438fd1498Szrj /* If the best peeling for known alignment has NPEEL == 0, perform no
199538fd1498Szrj peeling at all except if there is an unsupportable dr that we can
199638fd1498Szrj align. */
199738fd1498Szrj if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
199838fd1498Szrj do_peeling = false;
199938fd1498Szrj }
200038fd1498Szrj
200138fd1498Szrj /* If there is an unsupportable data ref, prefer this over all choices so far
200238fd1498Szrj since we'd have to discard a chosen peeling except when it accidentally
200338fd1498Szrj aligned the unsupportable data ref. */
200438fd1498Szrj if (one_dr_unsupportable)
200538fd1498Szrj dr0 = unsupportable_dr;
200638fd1498Szrj else if (do_peeling)
200738fd1498Szrj {
200838fd1498Szrj /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
200938fd1498Szrj TODO: Use nopeel_outside_cost or get rid of it? */
201038fd1498Szrj unsigned nopeel_inside_cost = 0;
201138fd1498Szrj unsigned nopeel_outside_cost = 0;
201238fd1498Szrj
201338fd1498Szrj stmt_vector_for_cost dummy;
201438fd1498Szrj dummy.create (2);
201538fd1498Szrj vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
201638fd1498Szrj &nopeel_outside_cost, &dummy, 0, false);
201738fd1498Szrj dummy.release ();
201838fd1498Szrj
201938fd1498Szrj /* Add epilogue costs. As we do not peel for alignment here, no prologue
202038fd1498Szrj costs will be recorded. */
202138fd1498Szrj stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
202238fd1498Szrj prologue_cost_vec.create (2);
202338fd1498Szrj epilogue_cost_vec.create (2);
202438fd1498Szrj
202538fd1498Szrj int dummy2;
202638fd1498Szrj nopeel_outside_cost += vect_get_known_peeling_cost
202738fd1498Szrj (loop_vinfo, 0, &dummy2,
202838fd1498Szrj &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
202938fd1498Szrj &prologue_cost_vec, &epilogue_cost_vec);
203038fd1498Szrj
203138fd1498Szrj prologue_cost_vec.release ();
203238fd1498Szrj epilogue_cost_vec.release ();
203338fd1498Szrj
203438fd1498Szrj npeel = best_peel.peel_info.npeel;
203538fd1498Szrj dr0 = best_peel.peel_info.dr;
203638fd1498Szrj
203738fd1498Szrj /* If no peeling is not more expensive than the best peeling we
203838fd1498Szrj have so far, don't perform any peeling. */
203938fd1498Szrj if (nopeel_inside_cost <= best_peel.inside_cost)
204038fd1498Szrj do_peeling = false;
204138fd1498Szrj }
204238fd1498Szrj
204338fd1498Szrj if (do_peeling)
204438fd1498Szrj {
204538fd1498Szrj stmt = DR_STMT (dr0);
204638fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
204738fd1498Szrj vectype = STMT_VINFO_VECTYPE (stmt_info);
204838fd1498Szrj
204938fd1498Szrj if (known_alignment_for_access_p (dr0))
205038fd1498Szrj {
205138fd1498Szrj bool negative = tree_int_cst_compare (DR_STEP (dr0),
205238fd1498Szrj size_zero_node) < 0;
205338fd1498Szrj if (!npeel)
205438fd1498Szrj {
205538fd1498Szrj /* Since it's known at compile time, compute the number of
205638fd1498Szrj iterations in the peeled loop (the peeling factor) for use in
205738fd1498Szrj updating DR_MISALIGNMENT values. The peeling factor is the
205838fd1498Szrj vectorization factor minus the misalignment as an element
205938fd1498Szrj count. */
206038fd1498Szrj mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
206138fd1498Szrj unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
206238fd1498Szrj npeel = ((mis & (target_align - 1))
206338fd1498Szrj / vect_get_scalar_dr_size (dr0));
206438fd1498Szrj }
206538fd1498Szrj
206638fd1498Szrj /* For interleaved data access every iteration accesses all the
206738fd1498Szrj members of the group, therefore we divide the number of iterations
206838fd1498Szrj by the group size. */
206938fd1498Szrj stmt_info = vinfo_for_stmt (DR_STMT (dr0));
207038fd1498Szrj if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
207138fd1498Szrj npeel /= GROUP_SIZE (stmt_info);
207238fd1498Szrj
207338fd1498Szrj if (dump_enabled_p ())
207438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
207538fd1498Szrj "Try peeling by %d\n", npeel);
207638fd1498Szrj }
207738fd1498Szrj
207838fd1498Szrj /* Ensure that all datarefs can be vectorized after the peel. */
207938fd1498Szrj if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
208038fd1498Szrj do_peeling = false;
208138fd1498Szrj
208238fd1498Szrj /* Check if all datarefs are supportable and log. */
208338fd1498Szrj if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
208438fd1498Szrj {
208538fd1498Szrj stat = vect_verify_datarefs_alignment (loop_vinfo);
208638fd1498Szrj if (!stat)
208738fd1498Szrj do_peeling = false;
208838fd1498Szrj else
208938fd1498Szrj return stat;
209038fd1498Szrj }
209138fd1498Szrj
209238fd1498Szrj /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
209338fd1498Szrj if (do_peeling)
209438fd1498Szrj {
209538fd1498Szrj unsigned max_allowed_peel
209638fd1498Szrj = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
209738fd1498Szrj if (max_allowed_peel != (unsigned)-1)
209838fd1498Szrj {
209938fd1498Szrj unsigned max_peel = npeel;
210038fd1498Szrj if (max_peel == 0)
210138fd1498Szrj {
210238fd1498Szrj unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
210338fd1498Szrj max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
210438fd1498Szrj }
210538fd1498Szrj if (max_peel > max_allowed_peel)
210638fd1498Szrj {
210738fd1498Szrj do_peeling = false;
210838fd1498Szrj if (dump_enabled_p ())
210938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
211038fd1498Szrj "Disable peeling, max peels reached: %d\n", max_peel);
211138fd1498Szrj }
211238fd1498Szrj }
211338fd1498Szrj }
211438fd1498Szrj
211538fd1498Szrj /* Cost model #2 - if peeling may result in a remaining loop not
211638fd1498Szrj iterating enough to be vectorized then do not peel. Since this
211738fd1498Szrj is a cost heuristic rather than a correctness decision, use the
211838fd1498Szrj most likely runtime value for variable vectorization factors. */
211938fd1498Szrj if (do_peeling
212038fd1498Szrj && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
212138fd1498Szrj {
212238fd1498Szrj unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
212338fd1498Szrj unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
212438fd1498Szrj if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
212538fd1498Szrj < assumed_vf + max_peel)
212638fd1498Szrj do_peeling = false;
212738fd1498Szrj }
212838fd1498Szrj
212938fd1498Szrj if (do_peeling)
213038fd1498Szrj {
213138fd1498Szrj /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
213238fd1498Szrj If the misalignment of DR_i is identical to that of dr0 then set
213338fd1498Szrj DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
213438fd1498Szrj dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
213538fd1498Szrj by the peeling factor times the element size of DR_i (MOD the
213638fd1498Szrj vectorization factor times the size). Otherwise, the
213738fd1498Szrj misalignment of DR_i must be set to unknown. */
213838fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
213938fd1498Szrj if (dr != dr0)
214038fd1498Szrj {
214138fd1498Szrj /* Strided accesses perform only component accesses, alignment
214238fd1498Szrj is irrelevant for them. */
214338fd1498Szrj stmt_info = vinfo_for_stmt (DR_STMT (dr));
214438fd1498Szrj if (STMT_VINFO_STRIDED_P (stmt_info)
214538fd1498Szrj && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
214638fd1498Szrj continue;
214738fd1498Szrj
214838fd1498Szrj vect_update_misalignment_for_peel (dr, dr0, npeel);
214938fd1498Szrj }
215038fd1498Szrj
215138fd1498Szrj LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
215238fd1498Szrj if (npeel)
215338fd1498Szrj LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
215438fd1498Szrj else
215538fd1498Szrj LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
215638fd1498Szrj = DR_MISALIGNMENT (dr0);
215738fd1498Szrj SET_DR_MISALIGNMENT (dr0, 0);
215838fd1498Szrj if (dump_enabled_p ())
215938fd1498Szrj {
216038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
216138fd1498Szrj "Alignment of access forced using peeling.\n");
216238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
216338fd1498Szrj "Peeling for alignment will be applied.\n");
216438fd1498Szrj }
216538fd1498Szrj
216638fd1498Szrj /* The inside-loop cost will be accounted for in vectorizable_load
216738fd1498Szrj and vectorizable_store correctly with adjusted alignments.
216838fd1498Szrj Drop the body_cst_vec on the floor here. */
216938fd1498Szrj stat = vect_verify_datarefs_alignment (loop_vinfo);
217038fd1498Szrj gcc_assert (stat);
217138fd1498Szrj return stat;
217238fd1498Szrj }
217338fd1498Szrj }
217438fd1498Szrj
217538fd1498Szrj /* (2) Versioning to force alignment. */
217638fd1498Szrj
217738fd1498Szrj /* Try versioning if:
217838fd1498Szrj 1) optimize loop for speed
217938fd1498Szrj 2) there is at least one unsupported misaligned data ref with an unknown
218038fd1498Szrj misalignment, and
218138fd1498Szrj 3) all misaligned data refs with a known misalignment are supported, and
218238fd1498Szrj 4) the number of runtime alignment checks is within reason. */
218338fd1498Szrj
218438fd1498Szrj do_versioning =
218538fd1498Szrj optimize_loop_nest_for_speed_p (loop)
218638fd1498Szrj && (!loop->inner); /* FORNOW */
218738fd1498Szrj
218838fd1498Szrj if (do_versioning)
218938fd1498Szrj {
219038fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
219138fd1498Szrj {
219238fd1498Szrj stmt = DR_STMT (dr);
219338fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
219438fd1498Szrj
219538fd1498Szrj /* For interleaving, only the alignment of the first access
219638fd1498Szrj matters. */
219738fd1498Szrj if (aligned_access_p (dr)
219838fd1498Szrj || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
219938fd1498Szrj && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
220038fd1498Szrj continue;
220138fd1498Szrj
220238fd1498Szrj if (STMT_VINFO_STRIDED_P (stmt_info))
220338fd1498Szrj {
220438fd1498Szrj /* Strided loads perform only component accesses, alignment is
220538fd1498Szrj irrelevant for them. */
220638fd1498Szrj if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
220738fd1498Szrj continue;
220838fd1498Szrj do_versioning = false;
220938fd1498Szrj break;
221038fd1498Szrj }
221138fd1498Szrj
221238fd1498Szrj supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
221338fd1498Szrj
221438fd1498Szrj if (!supportable_dr_alignment)
221538fd1498Szrj {
221638fd1498Szrj gimple *stmt;
221738fd1498Szrj int mask;
221838fd1498Szrj tree vectype;
221938fd1498Szrj
222038fd1498Szrj if (known_alignment_for_access_p (dr)
222138fd1498Szrj || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
222238fd1498Szrj >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
222338fd1498Szrj {
222438fd1498Szrj do_versioning = false;
222538fd1498Szrj break;
222638fd1498Szrj }
222738fd1498Szrj
222838fd1498Szrj stmt = DR_STMT (dr);
222938fd1498Szrj vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
223038fd1498Szrj gcc_assert (vectype);
223138fd1498Szrj
223238fd1498Szrj /* At present we don't support versioning for alignment
223338fd1498Szrj with variable VF, since there's no guarantee that the
223438fd1498Szrj VF is a power of two. We could relax this if we added
223538fd1498Szrj a way of enforcing a power-of-two size. */
223638fd1498Szrj unsigned HOST_WIDE_INT size;
223738fd1498Szrj if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
223838fd1498Szrj {
223938fd1498Szrj do_versioning = false;
224038fd1498Szrj break;
224138fd1498Szrj }
224238fd1498Szrj
224338fd1498Szrj /* The rightmost bits of an aligned address must be zeros.
224438fd1498Szrj Construct the mask needed for this test. For example,
224538fd1498Szrj GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
224638fd1498Szrj mask must be 15 = 0xf. */
224738fd1498Szrj mask = size - 1;
224838fd1498Szrj
224938fd1498Szrj /* FORNOW: use the same mask to test all potentially unaligned
225038fd1498Szrj references in the loop. The vectorizer currently supports
225138fd1498Szrj a single vector size, see the reference to
225238fd1498Szrj GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
225338fd1498Szrj vectorization factor is computed. */
225438fd1498Szrj gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
225538fd1498Szrj || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
225638fd1498Szrj LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
225738fd1498Szrj LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
225838fd1498Szrj DR_STMT (dr));
225938fd1498Szrj }
226038fd1498Szrj }
226138fd1498Szrj
226238fd1498Szrj /* Versioning requires at least one misaligned data reference. */
226338fd1498Szrj if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
226438fd1498Szrj do_versioning = false;
226538fd1498Szrj else if (!do_versioning)
226638fd1498Szrj LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
226738fd1498Szrj }
226838fd1498Szrj
226938fd1498Szrj if (do_versioning)
227038fd1498Szrj {
227138fd1498Szrj vec<gimple *> may_misalign_stmts
227238fd1498Szrj = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
227338fd1498Szrj gimple *stmt;
227438fd1498Szrj
227538fd1498Szrj /* It can now be assumed that the data references in the statements
227638fd1498Szrj in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
227738fd1498Szrj of the loop being vectorized. */
227838fd1498Szrj FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
227938fd1498Szrj {
228038fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
228138fd1498Szrj dr = STMT_VINFO_DATA_REF (stmt_info);
228238fd1498Szrj SET_DR_MISALIGNMENT (dr, 0);
228338fd1498Szrj if (dump_enabled_p ())
228438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
228538fd1498Szrj "Alignment of access forced using versioning.\n");
228638fd1498Szrj }
228738fd1498Szrj
228838fd1498Szrj if (dump_enabled_p ())
228938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
229038fd1498Szrj "Versioning for alignment will be applied.\n");
229138fd1498Szrj
229238fd1498Szrj /* Peeling and versioning can't be done together at this time. */
229338fd1498Szrj gcc_assert (! (do_peeling && do_versioning));
229438fd1498Szrj
229538fd1498Szrj stat = vect_verify_datarefs_alignment (loop_vinfo);
229638fd1498Szrj gcc_assert (stat);
229738fd1498Szrj return stat;
229838fd1498Szrj }
229938fd1498Szrj
230038fd1498Szrj /* This point is reached if neither peeling nor versioning is being done. */
230138fd1498Szrj gcc_assert (! (do_peeling || do_versioning));
230238fd1498Szrj
230338fd1498Szrj stat = vect_verify_datarefs_alignment (loop_vinfo);
230438fd1498Szrj return stat;
230538fd1498Szrj }
230638fd1498Szrj
230738fd1498Szrj
230838fd1498Szrj /* Function vect_find_same_alignment_drs.
230938fd1498Szrj
231038fd1498Szrj Update group and alignment relations according to the chosen
231138fd1498Szrj vectorization factor. */
231238fd1498Szrj
231338fd1498Szrj static void
vect_find_same_alignment_drs(struct data_dependence_relation * ddr)231438fd1498Szrj vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
231538fd1498Szrj {
231638fd1498Szrj struct data_reference *dra = DDR_A (ddr);
231738fd1498Szrj struct data_reference *drb = DDR_B (ddr);
231838fd1498Szrj stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
231938fd1498Szrj stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
232038fd1498Szrj
232138fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
232238fd1498Szrj return;
232338fd1498Szrj
232438fd1498Szrj if (dra == drb)
232538fd1498Szrj return;
232638fd1498Szrj
232738fd1498Szrj if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
232838fd1498Szrj || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
232938fd1498Szrj || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
233038fd1498Szrj return;
233138fd1498Szrj
233238fd1498Szrj /* Two references with distance zero have the same alignment. */
233338fd1498Szrj poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
233438fd1498Szrj - wi::to_poly_offset (DR_INIT (drb)));
233538fd1498Szrj if (maybe_ne (diff, 0))
233638fd1498Szrj {
233738fd1498Szrj /* Get the wider of the two alignments. */
233838fd1498Szrj unsigned int align_a = (vect_calculate_target_alignment (dra)
233938fd1498Szrj / BITS_PER_UNIT);
234038fd1498Szrj unsigned int align_b = (vect_calculate_target_alignment (drb)
234138fd1498Szrj / BITS_PER_UNIT);
234238fd1498Szrj unsigned int max_align = MAX (align_a, align_b);
234338fd1498Szrj
234438fd1498Szrj /* Require the gap to be a multiple of the larger vector alignment. */
234538fd1498Szrj if (!multiple_p (diff, max_align))
234638fd1498Szrj return;
234738fd1498Szrj }
234838fd1498Szrj
234938fd1498Szrj STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
235038fd1498Szrj STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
235138fd1498Szrj if (dump_enabled_p ())
235238fd1498Szrj {
235338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
235438fd1498Szrj "accesses have the same alignment: ");
235538fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
235638fd1498Szrj dump_printf (MSG_NOTE, " and ");
235738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
235838fd1498Szrj dump_printf (MSG_NOTE, "\n");
235938fd1498Szrj }
236038fd1498Szrj }
236138fd1498Szrj
236238fd1498Szrj
236338fd1498Szrj /* Function vect_analyze_data_refs_alignment
236438fd1498Szrj
236538fd1498Szrj Analyze the alignment of the data-references in the loop.
236638fd1498Szrj Return FALSE if a data reference is found that cannot be vectorized. */
236738fd1498Szrj
236838fd1498Szrj bool
vect_analyze_data_refs_alignment(loop_vec_info vinfo)236938fd1498Szrj vect_analyze_data_refs_alignment (loop_vec_info vinfo)
237038fd1498Szrj {
237138fd1498Szrj if (dump_enabled_p ())
237238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
237338fd1498Szrj "=== vect_analyze_data_refs_alignment ===\n");
237438fd1498Szrj
237538fd1498Szrj /* Mark groups of data references with same alignment using
237638fd1498Szrj data dependence information. */
237738fd1498Szrj vec<ddr_p> ddrs = vinfo->ddrs;
237838fd1498Szrj struct data_dependence_relation *ddr;
237938fd1498Szrj unsigned int i;
238038fd1498Szrj
238138fd1498Szrj FOR_EACH_VEC_ELT (ddrs, i, ddr)
238238fd1498Szrj vect_find_same_alignment_drs (ddr);
238338fd1498Szrj
238438fd1498Szrj vec<data_reference_p> datarefs = vinfo->datarefs;
238538fd1498Szrj struct data_reference *dr;
238638fd1498Szrj
238738fd1498Szrj vect_record_base_alignments (vinfo);
238838fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
238938fd1498Szrj {
239038fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
239138fd1498Szrj if (STMT_VINFO_VECTORIZABLE (stmt_info)
239238fd1498Szrj && !vect_compute_data_ref_alignment (dr))
239338fd1498Szrj {
239438fd1498Szrj /* Strided accesses perform only component accesses, misalignment
239538fd1498Szrj information is irrelevant for them. */
239638fd1498Szrj if (STMT_VINFO_STRIDED_P (stmt_info)
239738fd1498Szrj && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
239838fd1498Szrj continue;
239938fd1498Szrj
240038fd1498Szrj if (dump_enabled_p ())
240138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
240238fd1498Szrj "not vectorized: can't calculate alignment "
240338fd1498Szrj "for data ref.\n");
240438fd1498Szrj
240538fd1498Szrj return false;
240638fd1498Szrj }
240738fd1498Szrj }
240838fd1498Szrj
240938fd1498Szrj return true;
241038fd1498Szrj }
241138fd1498Szrj
241238fd1498Szrj
241338fd1498Szrj /* Analyze alignment of DRs of stmts in NODE. */
241438fd1498Szrj
241538fd1498Szrj static bool
vect_slp_analyze_and_verify_node_alignment(slp_tree node)241638fd1498Szrj vect_slp_analyze_and_verify_node_alignment (slp_tree node)
241738fd1498Szrj {
241838fd1498Szrj /* We vectorize from the first scalar stmt in the node unless
241938fd1498Szrj the node is permuted in which case we start from the first
242038fd1498Szrj element in the group. */
242138fd1498Szrj gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
242238fd1498Szrj data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
242338fd1498Szrj if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
242438fd1498Szrj first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
242538fd1498Szrj
242638fd1498Szrj data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
242738fd1498Szrj if (! vect_compute_data_ref_alignment (dr)
242838fd1498Szrj /* For creating the data-ref pointer we need alignment of the
242938fd1498Szrj first element anyway. */
243038fd1498Szrj || (dr != first_dr
243138fd1498Szrj && ! vect_compute_data_ref_alignment (first_dr))
243238fd1498Szrj || ! verify_data_ref_alignment (dr))
243338fd1498Szrj {
243438fd1498Szrj if (dump_enabled_p ())
243538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
243638fd1498Szrj "not vectorized: bad data alignment in basic "
243738fd1498Szrj "block.\n");
243838fd1498Szrj return false;
243938fd1498Szrj }
244038fd1498Szrj
244138fd1498Szrj return true;
244238fd1498Szrj }
244338fd1498Szrj
244438fd1498Szrj /* Function vect_slp_analyze_instance_alignment
244538fd1498Szrj
244638fd1498Szrj Analyze the alignment of the data-references in the SLP instance.
244738fd1498Szrj Return FALSE if a data reference is found that cannot be vectorized. */
244838fd1498Szrj
244938fd1498Szrj bool
vect_slp_analyze_and_verify_instance_alignment(slp_instance instance)245038fd1498Szrj vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
245138fd1498Szrj {
245238fd1498Szrj if (dump_enabled_p ())
245338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
245438fd1498Szrj "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
245538fd1498Szrj
245638fd1498Szrj slp_tree node;
245738fd1498Szrj unsigned i;
245838fd1498Szrj FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
245938fd1498Szrj if (! vect_slp_analyze_and_verify_node_alignment (node))
246038fd1498Szrj return false;
246138fd1498Szrj
246238fd1498Szrj node = SLP_INSTANCE_TREE (instance);
246338fd1498Szrj if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
246438fd1498Szrj && ! vect_slp_analyze_and_verify_node_alignment
246538fd1498Szrj (SLP_INSTANCE_TREE (instance)))
246638fd1498Szrj return false;
246738fd1498Szrj
246838fd1498Szrj return true;
246938fd1498Szrj }
247038fd1498Szrj
247138fd1498Szrj
/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.
   Worker for vect_analyze_group_access.

   Returns true if DR's access pattern was classified successfully
   (possibly as strided or unvectorizable), false if analysis failed
   and the caller should dissolve the group.  */

static bool
vect_analyze_group_access_1 (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  /* Size in bytes of one scalar element of the access.  */
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
    {
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of type size.  Otherwise there is
	 a non-element-sized gap at the end of the group which we
	 cannot represent in GROUP_GAP or GROUP_SIZE.
	 ??? As we can handle non-constant step fine here we should
	 simply remove uses of GROUP_GAP between the last and first
	 element and instead rely on DR_STEP.  GROUP_SIZE then would
	 simply not include that gap.  */
      if ((dr_step % type_size) != 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
	                       "Step ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
	      dump_printf (MSG_NOTE,
			   " is not a multiple of the element size for ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return false;
	}
      groupsize = absu_hwi (dr_step) / type_size;
    }
  else
    /* Non-constant step: the group size is not known at compile time.  */
    groupsize = 0;

  /* Not consecutive access is possible only if it is a part of interleaving.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Check if it this DR is a part of interleaving, and is a single
	 element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads. STEP must be a multiple of the type
	 size.  */
      if (DR_IS_READ (dr)
	  && (dr_step % type_size) == 0
	  && groupsize > 0)
	{
	  /* Single-element interleaving: this stmt forms a group by
	     itself; the rest of the step is recorded as GROUP_GAP.  */
	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
	  GROUP_GAP (stmt_info) = groupsize - 1;
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
	                       "Detected single element interleaving ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
	      dump_printf (MSG_NOTE, " step ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                   "not consecutive access ");
	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	}

      if (bb_vinfo)
	{
	  /* Mark the statement as unvectorizable.  */
	  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	  return true;
	}

      /* Loop vectorization can still fall back to strided accesses.  */
      dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
      STMT_VINFO_STRIDED_P (stmt_info) = true;
      return true;
    }

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
    {
      /* First stmt in the interleaving chain. Check the chain.  */
      gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      gimple *prev = stmt;
      HOST_WIDE_INT diff, gaps = 0;

      /* By construction, all group members have INTEGER_CST DR_INITs.  */
      while (next)
        {
          /* Skip same data-refs.  In case that two or more stmts share
             data-ref (supported only for loads), we vectorize only the first
             stmt, and the rest get their vectorized loads from the first
             one.  */
          if (!tree_int_cst_compare (DR_INIT (data_ref),
                                     DR_INIT (STMT_VINFO_DATA_REF (
						   vinfo_for_stmt (next)))))
            {
              if (DR_IS_WRITE (data_ref))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Two store stmts share the same dr.\n");
                  return false;
                }

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Two or more load stmts share the same dr.\n");

              /* For load use the same data-ref load.  */
              GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;

              prev = next;
              next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
              continue;
            }

          prev = next;
          data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));

	  /* All group members have the same STEP by construction.  */
	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size. Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;
	  if (diff != 1)
	    {
	      /* FORNOW: SLP of accesses with gaps is not supported.  */
	      slp_impossible = true;
	      if (DR_IS_WRITE (data_ref))
		{
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "interleaved store with gaps\n");
		  return false;
		}

              gaps += diff - 1;
	    }

	  last_accessed_element += diff;

          /* Store the gap from the previous member of the group. If there is no
             gap in the access, GROUP_GAP is always 1.  */
          GROUP_GAP (vinfo_for_stmt (next)) = diff;

          prev_init = DR_INIT (data_ref);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          /* Count the number of data-refs in the chain.  */
          count++;
        }

      /* Variable-step group: derive the group size from the number of
	 members seen plus the gaps between them.  */
      if (groupsize == 0)
        groupsize = count + gaps;

      /* This could be UINT_MAX but as we are generating code in a very
         inefficient way we have to cap earlier.  See PR78699 for example.  */
      if (groupsize > 4096)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "group is too large\n");
	  return false;
	}

      /* Check that the size of the interleaving is equal to count for stores,
         i.e., that there are no gaps.  */
      if (groupsize != count
	  && !DR_IS_READ (dr))
        {
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "interleaved store with gaps\n");
	  return false;
	}

      /* If there is a gap after the last load in the group it is the
	 difference between the groupsize and the last accessed
	 element.
	 When there is no gap, this difference should be 0.  */
      GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;

      GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Detected interleaving ");
	  if (DR_IS_READ (dr))
	    dump_printf (MSG_NOTE, "load ");
	  else
	    dump_printf (MSG_NOTE, "store ");
	  dump_printf (MSG_NOTE, "of size %u starting with ",
		       (unsigned)groupsize);
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	  if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "There is a gap of %u elements after the group\n",
			     GROUP_GAP (vinfo_for_stmt (stmt)));
	}

      /* SLP: create an SLP data structure for every interleaving group of
	 stores for further analysis in vect_analyse_slp.  */
      if (DR_IS_WRITE (dr) && !slp_impossible)
	{
	  if (loop_vinfo)
	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
	  if (bb_vinfo)
	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
	}
    }

  return true;
}
270938fd1498Szrj
271038fd1498Szrj /* Analyze groups of accesses: check that DR belongs to a group of
271138fd1498Szrj accesses of legal size, step, etc. Detect gaps, single element
271238fd1498Szrj interleaving, and other special cases. Set grouped access info.
271338fd1498Szrj Collect groups of strided stores for further use in SLP analysis. */
271438fd1498Szrj
271538fd1498Szrj static bool
vect_analyze_group_access(struct data_reference * dr)271638fd1498Szrj vect_analyze_group_access (struct data_reference *dr)
271738fd1498Szrj {
271838fd1498Szrj if (!vect_analyze_group_access_1 (dr))
271938fd1498Szrj {
272038fd1498Szrj /* Dissolve the group if present. */
272138fd1498Szrj gimple *next;
272238fd1498Szrj gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
272338fd1498Szrj while (stmt)
272438fd1498Szrj {
272538fd1498Szrj stmt_vec_info vinfo = vinfo_for_stmt (stmt);
272638fd1498Szrj next = GROUP_NEXT_ELEMENT (vinfo);
272738fd1498Szrj GROUP_FIRST_ELEMENT (vinfo) = NULL;
272838fd1498Szrj GROUP_NEXT_ELEMENT (vinfo) = NULL;
272938fd1498Szrj stmt = next;
273038fd1498Szrj }
273138fd1498Szrj return false;
273238fd1498Szrj }
273338fd1498Szrj return true;
273438fd1498Szrj }
273538fd1498Szrj
/* Analyze the access pattern of the data-reference DR.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.

   Returns true if the access pattern is supported for vectorization.
   As a side effect, clears GROUP_FIRST_ELEMENT for accesses recognized
   as non-grouped (zero-step, inner-loop refs, plain consecutive).  */

static bool
vect_analyze_data_ref_access (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;

  /* Gather/scatter accesses don't need a step analysis here.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return true;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* In loop vectorization a missing step means the DR analysis failed.  */
  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data-ref access in loop\n");
      return false;
    }

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
    {
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt))
	return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
	 with pragma omp simd only - it guarantees absence of
	 loop-carried dependencies between inner loop iterations.  */
      if (loop->safelen < 2)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in inner loop of nest\n");
	  return false;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      /* Interleaved accesses are not yet supported within outer-loop
	 vectorization for references in the inner-loop.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "zero step in outer loop.\n");
	  return DR_IS_READ (dr);
	}
    }

  /* Consecutive?  A step equal to the scalar type size (or its negation)
     means a simple unit-stride access, which is never interleaving.  */
  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
	  || (dr_step < 0
	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
	{
	  /* Mark that it is not interleaving.  */
	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
	  return true;
	}
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "grouped access in outer loop.\n");
      return false;
    }


  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
		|| vect_analyze_group_access (dr)));

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (dr);
}
283138fd1498Szrj
283238fd1498Szrj /* Compare two data-references DRA and DRB to group them into chunks
283338fd1498Szrj suitable for grouping. */
283438fd1498Szrj
283538fd1498Szrj static int
dr_group_sort_cmp(const void * dra_,const void * drb_)283638fd1498Szrj dr_group_sort_cmp (const void *dra_, const void *drb_)
283738fd1498Szrj {
283838fd1498Szrj data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
283938fd1498Szrj data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
284038fd1498Szrj int cmp;
284138fd1498Szrj
284238fd1498Szrj /* Stabilize sort. */
284338fd1498Szrj if (dra == drb)
284438fd1498Szrj return 0;
284538fd1498Szrj
284638fd1498Szrj /* DRs in different loops never belong to the same group. */
284738fd1498Szrj loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
284838fd1498Szrj loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
284938fd1498Szrj if (loopa != loopb)
285038fd1498Szrj return loopa->num < loopb->num ? -1 : 1;
285138fd1498Szrj
285238fd1498Szrj /* Ordering of DRs according to base. */
285338fd1498Szrj cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
285438fd1498Szrj DR_BASE_ADDRESS (drb));
285538fd1498Szrj if (cmp != 0)
285638fd1498Szrj return cmp;
285738fd1498Szrj
285838fd1498Szrj /* And according to DR_OFFSET. */
285938fd1498Szrj cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
286038fd1498Szrj if (cmp != 0)
286138fd1498Szrj return cmp;
286238fd1498Szrj
286338fd1498Szrj /* Put reads before writes. */
286438fd1498Szrj if (DR_IS_READ (dra) != DR_IS_READ (drb))
286538fd1498Szrj return DR_IS_READ (dra) ? -1 : 1;
286638fd1498Szrj
286738fd1498Szrj /* Then sort after access size. */
286838fd1498Szrj cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
286938fd1498Szrj TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
287038fd1498Szrj if (cmp != 0)
287138fd1498Szrj return cmp;
287238fd1498Szrj
287338fd1498Szrj /* And after step. */
287438fd1498Szrj cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
287538fd1498Szrj if (cmp != 0)
287638fd1498Szrj return cmp;
287738fd1498Szrj
287838fd1498Szrj /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
287938fd1498Szrj cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
288038fd1498Szrj if (cmp == 0)
288138fd1498Szrj return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
288238fd1498Szrj return cmp;
288338fd1498Szrj }
288438fd1498Szrj
288538fd1498Szrj /* If OP is the result of a conversion, return the unconverted value,
288638fd1498Szrj otherwise return null. */
288738fd1498Szrj
288838fd1498Szrj static tree
strip_conversion(tree op)288938fd1498Szrj strip_conversion (tree op)
289038fd1498Szrj {
289138fd1498Szrj if (TREE_CODE (op) != SSA_NAME)
289238fd1498Szrj return NULL_TREE;
289338fd1498Szrj gimple *stmt = SSA_NAME_DEF_STMT (op);
289438fd1498Szrj if (!is_gimple_assign (stmt)
289538fd1498Szrj || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
289638fd1498Szrj return NULL_TREE;
289738fd1498Szrj return gimple_assign_rhs1 (stmt);
289838fd1498Szrj }
289938fd1498Szrj
290038fd1498Szrj /* Return true if vectorizable_* routines can handle statements STMT1
290138fd1498Szrj and STMT2 being in a single group. */
290238fd1498Szrj
290338fd1498Szrj static bool
can_group_stmts_p(gimple * stmt1,gimple * stmt2)290438fd1498Szrj can_group_stmts_p (gimple *stmt1, gimple *stmt2)
290538fd1498Szrj {
290638fd1498Szrj if (gimple_assign_single_p (stmt1))
290738fd1498Szrj return gimple_assign_single_p (stmt2);
290838fd1498Szrj
290938fd1498Szrj if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
291038fd1498Szrj {
291138fd1498Szrj /* Check for two masked loads or two masked stores. */
291238fd1498Szrj if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
291338fd1498Szrj return false;
291438fd1498Szrj internal_fn ifn = gimple_call_internal_fn (stmt1);
291538fd1498Szrj if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
291638fd1498Szrj return false;
291738fd1498Szrj if (ifn != gimple_call_internal_fn (stmt2))
291838fd1498Szrj return false;
291938fd1498Szrj
292038fd1498Szrj /* Check that the masks are the same. Cope with casts of masks,
292138fd1498Szrj like those created by build_mask_conversion. */
292238fd1498Szrj tree mask1 = gimple_call_arg (stmt1, 2);
292338fd1498Szrj tree mask2 = gimple_call_arg (stmt2, 2);
292438fd1498Szrj if (!operand_equal_p (mask1, mask2, 0))
292538fd1498Szrj {
292638fd1498Szrj mask1 = strip_conversion (mask1);
292738fd1498Szrj if (!mask1)
292838fd1498Szrj return false;
292938fd1498Szrj mask2 = strip_conversion (mask2);
293038fd1498Szrj if (!mask2)
293138fd1498Szrj return false;
293238fd1498Szrj if (!operand_equal_p (mask1, mask2, 0))
293338fd1498Szrj return false;
293438fd1498Szrj }
293538fd1498Szrj return true;
293638fd1498Szrj }
293738fd1498Szrj
293838fd1498Szrj return false;
293938fd1498Szrj }
294038fd1498Szrj
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
	   simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.

   Works in two phases: first sort a copy of the datarefs and scan it
   linearly, linking compatible neighbours into interleaving chains via
   GROUP_FIRST_ELEMENT/GROUP_NEXT_ELEMENT; then run
   vect_analyze_data_ref_access on every dataref.  Returns false (for a
   loop) or marks statements unvectorizable (for a BB) on failure.  */

bool
vect_analyze_data_ref_accesses (vec_info *vinfo)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_ref_accesses ===\n");

  if (datarefs.is_empty ())
    return true;

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<data_reference_p> datarefs_copy = datarefs.copy ();
  datarefs_copy.qsort (dr_group_sort_cmp);

  /* Build the interleaving chains.  Note the inner loop advances the
     shared index I, so the outer loop resumes at the first dataref that
     did not fit the current group.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      data_reference_p dra = datarefs_copy[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
	{
	  ++i;
	  continue;
	}
      for (i = i + 1; i < datarefs_copy.length (); ++i)
	{
	  data_reference_p drb = datarefs_copy[i];
	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	    break;

	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group to be artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  The we can just skip ahead to the next DR here.  */

	  /* DRs in a different loop should not be put into the same
	     interleaving group.  */
	  if (gimple_bb (DR_STMT (dra))->loop_father
	      != gimple_bb (DR_STMT (drb))->loop_father)
	    break;

	  /* Check that the data-refs have same first location (except init)
	     and they are both either store or load (not load and store,
	     not masked loads or stores).  */
	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
					DR_BASE_ADDRESS (drb)) != 0
	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
	      || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
	    break;

	  /* Check that the data-refs have the same constant size.  */
	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
	  if (!tree_fits_uhwi_p (sza)
	      || !tree_fits_uhwi_p (szb)
	      || !tree_int_cst_equal (sza, szb))
	    break;

	  /* Check that the data-refs have the same step.  */
	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
	    break;

	  /* Check the types are compatible.
	     ??? We don't distinguish this during sorting.  */
	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
				   TREE_TYPE (DR_REF (drb))))
	    break;

	  /* Check that the DR_INITs are compile-time constants.  */
	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
	    break;

	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
	  HOST_WIDE_INT init_prev
	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
	  gcc_assert (init_a <= init_b
		      && init_a <= init_prev
		      && init_prev <= init_b);

	  /* Do not place the same access in the interleaving chain twice.  */
	  if (init_b == init_prev)
	    {
	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
			  < gimple_uid (DR_STMT (drb)));
	      /* ??? For now we simply "drop" the later reference which is
	         otherwise the same rather than finishing off this group.
		 In the end we'd want to re-process duplicates forming
		 multiple groups from the refs, likely by just collecting
		 all candidates (including duplicates and split points
		 below) in a vector and then process them together.  */
	      continue;
	    }

	  /* If init_b == init_a + the size of the type * k, we have an
	     interleaving, and DRA is accessed before DRB.  */
	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
	  if (type_size_a == 0
	      || (init_b - init_a) % type_size_a != 0)
	    break;

	  /* If we have a store, the accesses are adjacent.  This splits
	     groups into chunks we support (we don't support vectorization
	     of stores with gaps).  */
	  if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
	    break;

	  /* If the step (if not zero or non-constant) is greater than the
	     difference between data-refs' inits this splits groups into
	     suitable sizes.  */
	  if (tree_fits_shwi_p (DR_STEP (dra)))
	    {
	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
	      if (step != 0 && step <= (init_b - init_a))
		break;
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Detected interleaving ");
	      if (DR_IS_READ (dra))
		dump_printf (MSG_NOTE, "load ");
	      else
		dump_printf (MSG_NOTE, "store ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_NOTE, "\n");
	    }

	  /* Link the found element into the group list.  DRA heads the
	     chain; each accepted DRB is appended after LASTINFO.  */
	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
	    {
	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
	      lastinfo = stmtinfo_a;
	    }
	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
	  lastinfo = stmtinfo_b;
	}
    }

  /* Second phase: analyze each access pattern individually.  */
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
	&& !vect_analyze_data_ref_access (dr))
      {
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: complicated access pattern.\n");

	if (is_a <bb_vec_info> (vinfo))
	  {
	    /* Mark the statement as not vectorizable.  */
	    STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	    continue;
	  }
	else
	  {
	    datarefs_copy.release ();
	    return false;
	  }
      }

  datarefs_copy.release ();
  return true;
}
313138fd1498Szrj
313238fd1498Szrj /* Function vect_vfa_segment_size.
313338fd1498Szrj
313438fd1498Szrj Input:
313538fd1498Szrj DR: The data reference.
313638fd1498Szrj LENGTH_FACTOR: segment length to consider.
313738fd1498Szrj
313838fd1498Szrj Return a value suitable for the dr_with_seg_len::seg_len field.
313938fd1498Szrj This is the "distance travelled" by the pointer from the first
314038fd1498Szrj iteration in the segment to the last. Note that it does not include
314138fd1498Szrj the size of the access; in effect it only describes the first byte. */
314238fd1498Szrj
314338fd1498Szrj static tree
vect_vfa_segment_size(struct data_reference * dr,tree length_factor)314438fd1498Szrj vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
314538fd1498Szrj {
314638fd1498Szrj length_factor = size_binop (MINUS_EXPR,
314738fd1498Szrj fold_convert (sizetype, length_factor),
314838fd1498Szrj size_one_node);
314938fd1498Szrj return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
315038fd1498Szrj length_factor);
315138fd1498Szrj }
315238fd1498Szrj
315338fd1498Szrj /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
315438fd1498Szrj gives the worst-case number of bytes covered by the segment. */
315538fd1498Szrj
315638fd1498Szrj static unsigned HOST_WIDE_INT
vect_vfa_access_size(data_reference * dr)315738fd1498Szrj vect_vfa_access_size (data_reference *dr)
315838fd1498Szrj {
315938fd1498Szrj stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
316038fd1498Szrj tree ref_type = TREE_TYPE (DR_REF (dr));
316138fd1498Szrj unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
316238fd1498Szrj unsigned HOST_WIDE_INT access_size = ref_size;
316338fd1498Szrj if (GROUP_FIRST_ELEMENT (stmt_vinfo))
316438fd1498Szrj {
316538fd1498Szrj gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
316638fd1498Szrj access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
316738fd1498Szrj }
316838fd1498Szrj if (STMT_VINFO_VEC_STMT (stmt_vinfo)
316938fd1498Szrj && (vect_supportable_dr_alignment (dr, false)
317038fd1498Szrj == dr_explicit_realign_optimized))
317138fd1498Szrj {
317238fd1498Szrj /* We might access a full vector's worth. */
317338fd1498Szrj tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
317438fd1498Szrj access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
317538fd1498Szrj }
317638fd1498Szrj return access_size;
317738fd1498Szrj }
317838fd1498Szrj
317938fd1498Szrj /* Get the minimum alignment for all the scalar accesses that DR describes. */
318038fd1498Szrj
318138fd1498Szrj static unsigned int
vect_vfa_align(const data_reference * dr)318238fd1498Szrj vect_vfa_align (const data_reference *dr)
318338fd1498Szrj {
318438fd1498Szrj return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
318538fd1498Szrj }
318638fd1498Szrj
/* Function vect_compile_time_alias.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

static int
vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
			 tree segment_length_a, tree segment_length_b,
			 unsigned HOST_WIDE_INT access_size_a,
			 unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12): with a negative step the segment extends downwards from
     the initial offset, so rebase the range to its low end.  */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a = (offset_a + access_size_a) - const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b = (offset_b + access_size_b) - const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  /* Segment lengths describe only the distance travelled by the pointer;
     add the access size to get the full byte range covered.  */
  const_length_a += access_size_a;
  const_length_b += access_size_b;

  /* Overlap provably happens -> the references alias.  */
  if (ranges_known_overlap_p (offset_a, const_length_a,
			      offset_b, const_length_b))
    return 1;

  /* Overlap provably cannot happen -> no alias.  */
  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
			       offset_b, const_length_b))
    return 0;

  /* Undecidable at compile time.  */
  return -1;
}
323838fd1498Szrj
323938fd1498Szrj /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
324038fd1498Szrj in DDR is >= VF. */
324138fd1498Szrj
324238fd1498Szrj static bool
dependence_distance_ge_vf(data_dependence_relation * ddr,unsigned int loop_depth,poly_uint64 vf)324338fd1498Szrj dependence_distance_ge_vf (data_dependence_relation *ddr,
324438fd1498Szrj unsigned int loop_depth, poly_uint64 vf)
324538fd1498Szrj {
324638fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
324738fd1498Szrj || DDR_NUM_DIST_VECTS (ddr) == 0)
324838fd1498Szrj return false;
324938fd1498Szrj
325038fd1498Szrj /* If the dependence is exact, we should have limited the VF instead. */
325138fd1498Szrj gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
325238fd1498Szrj
325338fd1498Szrj unsigned int i;
325438fd1498Szrj lambda_vector dist_v;
325538fd1498Szrj FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
325638fd1498Szrj {
325738fd1498Szrj HOST_WIDE_INT dist = dist_v[loop_depth];
325838fd1498Szrj if (dist != 0
325938fd1498Szrj && !(dist > 0 && DDR_REVERSED_P (ddr))
326038fd1498Szrj && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
326138fd1498Szrj return false;
326238fd1498Szrj }
326338fd1498Szrj
326438fd1498Szrj if (dump_enabled_p ())
326538fd1498Szrj {
326638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
326738fd1498Szrj "dependence distance between ");
326838fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
326938fd1498Szrj dump_printf (MSG_NOTE, " and ");
327038fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
327138fd1498Szrj dump_printf (MSG_NOTE, " is >= VF\n");
327238fd1498Szrj }
327338fd1498Szrj
327438fd1498Szrj return true;
327538fd1498Szrj }
327638fd1498Szrj
327738fd1498Szrj /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
327838fd1498Szrj
327938fd1498Szrj static void
dump_lower_bound(int dump_kind,const vec_lower_bound & lower_bound)328038fd1498Szrj dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
328138fd1498Szrj {
328238fd1498Szrj dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
328338fd1498Szrj dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
328438fd1498Szrj dump_printf (dump_kind, ") >= ");
328538fd1498Szrj dump_dec (dump_kind, lower_bound.min_value);
328638fd1498Szrj }
328738fd1498Szrj
/* Record that the vectorized loop requires the vec_lower_bound described
   by EXPR, UNSIGNED_P and MIN_VALUE.  If a bound for EXPR already exists,
   merge the two: keep the weaker (signed) interpretation and the larger
   minimum value, so one run-time check covers both requirements.  */

static void
vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
			poly_uint64 min_value)
{
  /* Note: this is a shallow vec copy, so writes to lower_bounds[i]
     below update the vector stored in LOOP_VINFO.  */
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    if (operand_equal_p (lower_bounds[i].expr, expr, 0))
      {
	/* An unsigned check is only valid if both requests allow it.  */
	unsigned_p &= lower_bounds[i].unsigned_p;
	min_value = upper_bound (lower_bounds[i].min_value, min_value);
	if (lower_bounds[i].unsigned_p != unsigned_p
	    || maybe_lt (lower_bounds[i].min_value, min_value))
	  {
	    lower_bounds[i].unsigned_p = unsigned_p;
	    lower_bounds[i].min_value = min_value;
	    if (dump_enabled_p ())
	      {
		dump_printf_loc (MSG_NOTE, vect_location,
				 "updating run-time check to ");
		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
		dump_printf (MSG_NOTE, "\n");
	      }
	  }
	return;
      }

  /* No existing bound for EXPR: record a new one.  */
  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
      dump_lower_bound (MSG_NOTE, lower_bound);
      dump_printf (MSG_NOTE, "\n");
    }
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
}
332638fd1498Szrj
332738fd1498Szrj /* Return true if it's unlikely that the step of the vectorized form of DR
332838fd1498Szrj will span fewer than GAP bytes. */
332938fd1498Szrj
333038fd1498Szrj static bool
vect_small_gap_p(loop_vec_info loop_vinfo,data_reference * dr,poly_int64 gap)333138fd1498Szrj vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
333238fd1498Szrj {
333338fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
333438fd1498Szrj HOST_WIDE_INT count
333538fd1498Szrj = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
333638fd1498Szrj if (GROUP_FIRST_ELEMENT (stmt_info))
333738fd1498Szrj count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
333838fd1498Szrj return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
333938fd1498Szrj }
334038fd1498Szrj
/* Return true if we know that there is no alias between DR_A and DR_B
   when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
   *LOWER_BOUND_OUT to this N.  */

static bool
vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
				poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B: same base, offset and step, with poly-int DR_INITs whose
     order is known at compile time.  */
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
      && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
  return true;
}
337838fd1498Szrj
337938fd1498Szrj /* Function vect_prune_runtime_alias_test_list.
338038fd1498Szrj
338138fd1498Szrj Prune a list of ddrs to be tested at run-time by versioning for alias.
338238fd1498Szrj Merge several alias checks into one if possible.
338338fd1498Szrj Return FALSE if resulting list of ddrs is longer than allowed by
338438fd1498Szrj PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
338538fd1498Szrj
338638fd1498Szrj bool
vect_prune_runtime_alias_test_list(loop_vec_info loop_vinfo)338738fd1498Szrj vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
338838fd1498Szrj {
338938fd1498Szrj typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
339038fd1498Szrj hash_set <tree_pair_hash> compared_objects;
339138fd1498Szrj
339238fd1498Szrj vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
339338fd1498Szrj vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
339438fd1498Szrj = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
339538fd1498Szrj vec<vec_object_pair> &check_unequal_addrs
339638fd1498Szrj = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
339738fd1498Szrj poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
339838fd1498Szrj tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
339938fd1498Szrj
340038fd1498Szrj ddr_p ddr;
340138fd1498Szrj unsigned int i;
340238fd1498Szrj tree length_factor;
340338fd1498Szrj
340438fd1498Szrj if (dump_enabled_p ())
340538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
340638fd1498Szrj "=== vect_prune_runtime_alias_test_list ===\n");
340738fd1498Szrj
340838fd1498Szrj /* Step values are irrelevant for aliasing if the number of vector
340938fd1498Szrj iterations is equal to the number of scalar iterations (which can
341038fd1498Szrj happen for fully-SLP loops). */
341138fd1498Szrj bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
341238fd1498Szrj
341338fd1498Szrj if (!ignore_step_p)
341438fd1498Szrj {
341538fd1498Szrj /* Convert the checks for nonzero steps into bound tests. */
341638fd1498Szrj tree value;
341738fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
341838fd1498Szrj vect_check_lower_bound (loop_vinfo, value, true, 1);
341938fd1498Szrj }
342038fd1498Szrj
342138fd1498Szrj if (may_alias_ddrs.is_empty ())
342238fd1498Szrj return true;
342338fd1498Szrj
342438fd1498Szrj comp_alias_ddrs.create (may_alias_ddrs.length ());
342538fd1498Szrj
342638fd1498Szrj unsigned int loop_depth
342738fd1498Szrj = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
342838fd1498Szrj LOOP_VINFO_LOOP_NEST (loop_vinfo));
342938fd1498Szrj
343038fd1498Szrj /* First, we collect all data ref pairs for aliasing checks. */
343138fd1498Szrj FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
343238fd1498Szrj {
343338fd1498Szrj int comp_res;
343438fd1498Szrj poly_uint64 lower_bound;
343538fd1498Szrj struct data_reference *dr_a, *dr_b;
343638fd1498Szrj gimple *dr_group_first_a, *dr_group_first_b;
343738fd1498Szrj tree segment_length_a, segment_length_b;
343838fd1498Szrj unsigned HOST_WIDE_INT access_size_a, access_size_b;
343938fd1498Szrj unsigned int align_a, align_b;
344038fd1498Szrj gimple *stmt_a, *stmt_b;
344138fd1498Szrj
344238fd1498Szrj /* Ignore the alias if the VF we chose ended up being no greater
344338fd1498Szrj than the dependence distance. */
344438fd1498Szrj if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
344538fd1498Szrj continue;
344638fd1498Szrj
344738fd1498Szrj if (DDR_OBJECT_A (ddr))
344838fd1498Szrj {
/* The dependence is phrased as "objects A and B do not overlap";
   record a runtime address-inequality check instead of a segment
   check, deduplicating via COMPARED_OBJECTS.  */
344938fd1498Szrj vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
345038fd1498Szrj if (!compared_objects.add (new_pair))
345138fd1498Szrj {
345238fd1498Szrj if (dump_enabled_p ())
345338fd1498Szrj {
345438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
345538fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
345638fd1498Szrj dump_printf (MSG_NOTE, " and ");
345738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
345838fd1498Szrj dump_printf (MSG_NOTE, " have different addresses\n");
345938fd1498Szrj }
346038fd1498Szrj LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
346138fd1498Szrj }
346238fd1498Szrj continue;
346338fd1498Szrj }
346438fd1498Szrj
346538fd1498Szrj dr_a = DDR_A (ddr);
346638fd1498Szrj stmt_a = DR_STMT (DDR_A (ddr));
346738fd1498Szrj
346838fd1498Szrj dr_b = DDR_B (ddr);
346938fd1498Szrj stmt_b = DR_STMT (DDR_B (ddr));
347038fd1498Szrj
347138fd1498Szrj /* Skip the pair if inter-iteration dependencies are irrelevant
347238fd1498Szrj and intra-iteration dependencies are guaranteed to be honored. */
347338fd1498Szrj if (ignore_step_p
347438fd1498Szrj && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
347538fd1498Szrj || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
347638fd1498Szrj {
347738fd1498Szrj if (dump_enabled_p ())
347838fd1498Szrj {
347938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
348038fd1498Szrj "no need for alias check between ");
348138fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
348238fd1498Szrj dump_printf (MSG_NOTE, " and ");
348338fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
348438fd1498Szrj dump_printf (MSG_NOTE, " when VF is 1\n");
348538fd1498Szrj }
348638fd1498Szrj continue;
348738fd1498Szrj }
348838fd1498Szrj
348938fd1498Szrj /* See whether we can handle the alias using a bounds check on
349038fd1498Szrj the step, and whether that's likely to be the best approach.
349138fd1498Szrj (It might not be, for example, if the minimum step is much larger
349238fd1498Szrj than the number of bytes handled by one vector iteration.) */
349338fd1498Szrj if (!ignore_step_p
349438fd1498Szrj && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
349538fd1498Szrj && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
349638fd1498Szrj && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
349738fd1498Szrj || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
349838fd1498Szrj {
349938fd1498Szrj bool unsigned_p = dr_known_forward_stride_p (dr_a);
350038fd1498Szrj if (dump_enabled_p ())
350138fd1498Szrj {
350238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
350338fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
350438fd1498Szrj dump_printf (MSG_NOTE, " and ");
350538fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
350638fd1498Szrj dump_printf (MSG_NOTE, " when the step ");
350738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
350838fd1498Szrj dump_printf (MSG_NOTE, " is outside ");
350938fd1498Szrj if (unsigned_p)
351038fd1498Szrj dump_printf (MSG_NOTE, "[0");
351138fd1498Szrj else
351238fd1498Szrj {
351338fd1498Szrj dump_printf (MSG_NOTE, "(");
351438fd1498Szrj dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
351538fd1498Szrj }
351638fd1498Szrj dump_printf (MSG_NOTE, ", ");
351738fd1498Szrj dump_dec (MSG_NOTE, lower_bound);
351838fd1498Szrj dump_printf (MSG_NOTE, ")\n");
351938fd1498Szrj }
352038fd1498Szrj vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
352138fd1498Szrj lower_bound);
352238fd1498Szrj continue;
352338fd1498Szrj }
352438fd1498Szrj
/* If either access is part of an interleaving group, use the group's
   first element as the representative for the alias check.  */
352538fd1498Szrj dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
352638fd1498Szrj if (dr_group_first_a)
352738fd1498Szrj {
352838fd1498Szrj stmt_a = dr_group_first_a;
352938fd1498Szrj dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
353038fd1498Szrj }
353138fd1498Szrj
353238fd1498Szrj dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
353338fd1498Szrj if (dr_group_first_b)
353438fd1498Szrj {
353538fd1498Szrj stmt_b = dr_group_first_b;
353638fd1498Szrj dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
353738fd1498Szrj }
353838fd1498Szrj
/* When step values are irrelevant (VF == 1, see IGNORE_STEP_P above),
   only the accesses themselves need checking, so use zero-length
   segments.  */
353938fd1498Szrj if (ignore_step_p)
354038fd1498Szrj {
354138fd1498Szrj segment_length_a = size_zero_node;
354238fd1498Szrj segment_length_b = size_zero_node;
354338fd1498Szrj }
354438fd1498Szrj else
354538fd1498Szrj {
354638fd1498Szrj if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
354738fd1498Szrj length_factor = scalar_loop_iters;
354838fd1498Szrj else
354938fd1498Szrj length_factor = size_int (vect_factor);
355038fd1498Szrj segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
355138fd1498Szrj segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
355238fd1498Szrj }
355338fd1498Szrj access_size_a = vect_vfa_access_size (dr_a);
355438fd1498Szrj access_size_b = vect_vfa_access_size (dr_b);
355538fd1498Szrj align_a = vect_vfa_align (dr_a);
355638fd1498Szrj align_b = vect_vfa_align (dr_b);
355738fd1498Szrj
/* Compute a stable ordering key for the pair (base address first,
   then offset) so that the recorded pair does not depend on the
   DDR's operand order.  */
355838fd1498Szrj comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
355938fd1498Szrj DR_BASE_ADDRESS (dr_b));
356038fd1498Szrj if (comp_res == 0)
356138fd1498Szrj comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
356238fd1498Szrj DR_OFFSET (dr_b));
356338fd1498Szrj
356438fd1498Szrj /* See whether the alias is known at compilation time. */
356538fd1498Szrj if (comp_res == 0
356638fd1498Szrj && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
356738fd1498Szrj && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
356838fd1498Szrj && poly_int_tree_p (segment_length_a)
356938fd1498Szrj && poly_int_tree_p (segment_length_b))
357038fd1498Szrj {
357138fd1498Szrj int res = vect_compile_time_alias (dr_a, dr_b,
357238fd1498Szrj segment_length_a,
357338fd1498Szrj segment_length_b,
357438fd1498Szrj access_size_a,
357538fd1498Szrj access_size_b);
357638fd1498Szrj if (res >= 0 && dump_enabled_p ())
357738fd1498Szrj {
357838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
357938fd1498Szrj "can tell at compile time that ");
358038fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
358138fd1498Szrj dump_printf (MSG_NOTE, " and ");
358238fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
358338fd1498Szrj if (res == 0)
358438fd1498Szrj dump_printf (MSG_NOTE, " do not alias\n");
358538fd1498Szrj else
358638fd1498Szrj dump_printf (MSG_NOTE, " alias\n");
358738fd1498Szrj }
358838fd1498Szrj
/* res == 0: proven not to alias, no runtime check needed.  */
358938fd1498Szrj if (res == 0)
359038fd1498Szrj continue;
359138fd1498Szrj
/* res == 1: proven to alias, versioning cannot help.  */
359238fd1498Szrj if (res == 1)
359338fd1498Szrj {
359438fd1498Szrj if (dump_enabled_p ())
359538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
359638fd1498Szrj "not vectorized: compilation time alias.\n");
359738fd1498Szrj return false;
359838fd1498Szrj }
359938fd1498Szrj }
360038fd1498Szrj
360138fd1498Szrj dr_with_seg_len_pair_t dr_with_seg_len_pair
360238fd1498Szrj (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
360338fd1498Szrj dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
360438fd1498Szrj
360538fd1498Szrj /* Canonicalize pairs by sorting the two DR members. */
360638fd1498Szrj if (comp_res > 0)
360738fd1498Szrj std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
360838fd1498Szrj
360938fd1498Szrj comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
361038fd1498Szrj }
361138fd1498Szrj
/* Try to merge the collected segment checks into a smaller set.  */
361238fd1498Szrj prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
361338fd1498Szrj
361438fd1498Szrj unsigned int count = (comp_alias_ddrs.length ()
361538fd1498Szrj + check_unequal_addrs.length ());
361638fd1498Szrj
361738fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
361838fd1498Szrj "improved number of alias checks from %d to %d\n",
361938fd1498Szrj may_alias_ddrs.length (), count);
362038fd1498Szrj if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
362138fd1498Szrj {
362238fd1498Szrj if (dump_enabled_p ())
362338fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
362438fd1498Szrj "number of versioning for alias "
362538fd1498Szrj "run-time tests exceeds %d "
362638fd1498Szrj "(--param vect-max-version-for-alias-checks)\n",
362738fd1498Szrj PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
362838fd1498Szrj return false;
362938fd1498Szrj }
363038fd1498Szrj
363138fd1498Szrj return true;
363238fd1498Szrj }
363338fd1498Szrj
363438fd1498Szrj /* Check whether we can use an internal function for a gather load
363538fd1498Szrj or scatter store. READ_P is true for loads and false for stores.
363638fd1498Szrj MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
363738fd1498Szrj the type of the memory elements being loaded or stored. OFFSET_BITS
363838fd1498Szrj is the number of bits in each scalar offset and OFFSET_SIGN is the
363938fd1498Szrj sign of the offset. SCALE is the amount by which the offset should
364038fd1498Szrj be multiplied *after* it has been converted to address width.
364138fd1498Szrj
364238fd1498Szrj Return true if the function is supported, storing the function
364338fd1498Szrj id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
364438fd1498Szrj
364538fd1498Szrj bool
vect_gather_scatter_fn_p(bool read_p,bool masked_p,tree vectype,tree memory_type,unsigned int offset_bits,signop offset_sign,int scale,internal_fn * ifn_out,tree * element_type_out)364638fd1498Szrj vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
364738fd1498Szrj tree memory_type, unsigned int offset_bits,
364838fd1498Szrj signop offset_sign, int scale,
364938fd1498Szrj internal_fn *ifn_out, tree *element_type_out)
365038fd1498Szrj {
365138fd1498Szrj unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
365238fd1498Szrj unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
365338fd1498Szrj if (offset_bits > element_bits)
365438fd1498Szrj /* Internal functions require the offset to be the same width as
365538fd1498Szrj the vector elements. We can extend narrower offsets, but it isn't
365638fd1498Szrj safe to truncate wider offsets. */
365738fd1498Szrj return false;
365838fd1498Szrj
365938fd1498Szrj if (element_bits != memory_bits)
366038fd1498Szrj /* For now the vector elements must be the same width as the
366138fd1498Szrj memory elements. */
366238fd1498Szrj return false;
366338fd1498Szrj
366438fd1498Szrj /* Work out which function we need. */
366538fd1498Szrj internal_fn ifn;
366638fd1498Szrj if (read_p)
366738fd1498Szrj ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
366838fd1498Szrj else
366938fd1498Szrj ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
367038fd1498Szrj
367138fd1498Szrj /* Test whether the target supports this combination. */
367238fd1498Szrj if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
367338fd1498Szrj offset_sign, scale))
367438fd1498Szrj return false;
367538fd1498Szrj
367638fd1498Szrj *ifn_out = ifn;
367738fd1498Szrj *element_type_out = TREE_TYPE (vectype);
367838fd1498Szrj return true;
367938fd1498Szrj }
368038fd1498Szrj
368138fd1498Szrj /* CALL is a call to an internal gather load or scatter store function.
368238fd1498Szrj Describe the operation in INFO. */
368338fd1498Szrj
368438fd1498Szrj static void
vect_describe_gather_scatter_call(gcall * call,gather_scatter_info * info)368538fd1498Szrj vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
368638fd1498Szrj {
368738fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (call);
368838fd1498Szrj tree vectype = STMT_VINFO_VECTYPE (stmt_info);
368938fd1498Szrj data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
369038fd1498Szrj
369138fd1498Szrj info->ifn = gimple_call_internal_fn (call);
369238fd1498Szrj info->decl = NULL_TREE;
369338fd1498Szrj info->base = gimple_call_arg (call, 0);
369438fd1498Szrj info->offset = gimple_call_arg (call, 1);
369538fd1498Szrj info->offset_dt = vect_unknown_def_type;
369638fd1498Szrj info->offset_vectype = NULL_TREE;
369738fd1498Szrj info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
369838fd1498Szrj info->element_type = TREE_TYPE (vectype);
369938fd1498Szrj info->memory_type = TREE_TYPE (DR_REF (dr));
370038fd1498Szrj }
370138fd1498Szrj
370238fd1498Szrj /* Return true if a non-affine read or write in STMT is suitable for a
370338fd1498Szrj gather load or scatter store. Describe the operation in *INFO if so. */
370438fd1498Szrj
370538fd1498Szrj bool
vect_check_gather_scatter(gimple * stmt,loop_vec_info loop_vinfo,gather_scatter_info * info)370638fd1498Szrj vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
370738fd1498Szrj gather_scatter_info *info)
370838fd1498Szrj {
370938fd1498Szrj HOST_WIDE_INT scale = 1;
371038fd1498Szrj poly_int64 pbitpos, pbitsize;
371138fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
371238fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
371338fd1498Szrj struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
371438fd1498Szrj tree offtype = NULL_TREE;
371538fd1498Szrj tree decl = NULL_TREE, base, off;
371638fd1498Szrj tree vectype = STMT_VINFO_VECTYPE (stmt_info);
371738fd1498Szrj tree memory_type = TREE_TYPE (DR_REF (dr));
371838fd1498Szrj machine_mode pmode;
371938fd1498Szrj int punsignedp, reversep, pvolatilep = 0;
372038fd1498Szrj internal_fn ifn;
372138fd1498Szrj tree element_type;
372238fd1498Szrj bool masked_p = false;
372338fd1498Szrj
372438fd1498Szrj /* See whether this is already a call to a gather/scatter internal function.
372538fd1498Szrj If not, see whether it's a masked load or store. */
372638fd1498Szrj gcall *call = dyn_cast <gcall *> (stmt);
372738fd1498Szrj if (call && gimple_call_internal_p (call))
372838fd1498Szrj {
372938fd1498Szrj ifn = gimple_call_internal_fn (stmt);
373038fd1498Szrj if (internal_gather_scatter_fn_p (ifn))
373138fd1498Szrj {
373238fd1498Szrj vect_describe_gather_scatter_call (call, info);
373338fd1498Szrj return true;
373438fd1498Szrj }
373538fd1498Szrj masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
373638fd1498Szrj }
373738fd1498Szrj
373838fd1498Szrj /* True if we should aim to use internal functions rather than
373938fd1498Szrj built-in functions. */
374038fd1498Szrj bool use_ifn_p = (DR_IS_READ (dr)
374138fd1498Szrj ? supports_vec_gather_load_p ()
374238fd1498Szrj : supports_vec_scatter_store_p ());
374338fd1498Szrj
374438fd1498Szrj base = DR_REF (dr);
374538fd1498Szrj /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
374638fd1498Szrj see if we can use the def stmt of the address. */
374738fd1498Szrj if (masked_p
374838fd1498Szrj && TREE_CODE (base) == MEM_REF
374938fd1498Szrj && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
375038fd1498Szrj && integer_zerop (TREE_OPERAND (base, 1))
375138fd1498Szrj && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
375238fd1498Szrj {
375338fd1498Szrj gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
375438fd1498Szrj if (is_gimple_assign (def_stmt)
375538fd1498Szrj && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
375638fd1498Szrj base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
375738fd1498Szrj }
375838fd1498Szrj
375938fd1498Szrj /* The gather and scatter builtins need address of the form
376038fd1498Szrj loop_invariant + vector * {1, 2, 4, 8}
376138fd1498Szrj or
376238fd1498Szrj loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
376338fd1498Szrj Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
376438fd1498Szrj of loop invariants/SSA_NAMEs defined in the loop, with casts,
376538fd1498Szrj multiplications and additions in it. To get a vector, we need
376638fd1498Szrj a single SSA_NAME that will be defined in the loop and will
376738fd1498Szrj contain everything that is not loop invariant and that can be
376838fd1498Szrj vectorized. The following code attempts to find such a preexisting
376938fd1498Szrj SSA_NAME OFF and put the loop invariants into a tree BASE
377038fd1498Szrj that can be gimplified before the loop. */
377138fd1498Szrj base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
377238fd1498Szrj &punsignedp, &reversep, &pvolatilep);
377338fd1498Szrj gcc_assert (base && !reversep);
377438fd1498Szrj poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
377538fd1498Szrj
377638fd1498Szrj if (TREE_CODE (base) == MEM_REF)
377738fd1498Szrj {
/* Fold any constant MEM_REF offset into OFF before stripping the
   MEM_REF down to its address operand.  */
377838fd1498Szrj if (!integer_zerop (TREE_OPERAND (base, 1)))
377938fd1498Szrj {
378038fd1498Szrj if (off == NULL_TREE)
378138fd1498Szrj off = wide_int_to_tree (sizetype, mem_ref_offset (base));
378238fd1498Szrj else
378338fd1498Szrj off = size_binop (PLUS_EXPR, off,
378438fd1498Szrj fold_convert (sizetype, TREE_OPERAND (base, 1)));
378538fd1498Szrj }
378638fd1498Szrj base = TREE_OPERAND (base, 0);
378738fd1498Szrj }
378838fd1498Szrj else
378938fd1498Szrj base = build_fold_addr_expr (base);
379038fd1498Szrj
379138fd1498Szrj if (off == NULL_TREE)
379238fd1498Szrj off = size_zero_node;
379338fd1498Szrj
379438fd1498Szrj /* If base is not loop invariant, either off is 0, then we start with just
379538fd1498Szrj the constant offset in the loop invariant BASE and continue with base
379638fd1498Szrj as OFF, otherwise give up.
379738fd1498Szrj We could handle that case by gimplifying the addition of base + off
379838fd1498Szrj into some SSA_NAME and use that as off, but for now punt. */
379938fd1498Szrj if (!expr_invariant_in_loop_p (loop, base))
380038fd1498Szrj {
380138fd1498Szrj if (!integer_zerop (off))
380238fd1498Szrj return false;
380338fd1498Szrj off = base;
380438fd1498Szrj base = size_int (pbytepos);
380538fd1498Szrj }
380638fd1498Szrj /* Otherwise put base + constant offset into the loop invariant BASE
380738fd1498Szrj and continue with OFF. */
380838fd1498Szrj else
380938fd1498Szrj {
381038fd1498Szrj base = fold_convert (sizetype, base);
381138fd1498Szrj base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
381238fd1498Szrj }
381338fd1498Szrj
381438fd1498Szrj /* OFF at this point may be either a SSA_NAME or some tree expression
381538fd1498Szrj from get_inner_reference. Try to peel off loop invariants from it
381638fd1498Szrj into BASE as long as possible. */
381738fd1498Szrj STRIP_NOPS (off);
381838fd1498Szrj while (offtype == NULL_TREE)
381938fd1498Szrj {
382038fd1498Szrj enum tree_code code;
382138fd1498Szrj tree op0, op1, add = NULL_TREE;
382238fd1498Szrj
/* Decompose OFF into an operation CODE on OP0/OP1, either by looking
   at its defining assignment (for SSA names) or at the expression
   itself.  */
382338fd1498Szrj if (TREE_CODE (off) == SSA_NAME)
382438fd1498Szrj {
382538fd1498Szrj gimple *def_stmt = SSA_NAME_DEF_STMT (off);
382638fd1498Szrj
382738fd1498Szrj if (expr_invariant_in_loop_p (loop, off))
382838fd1498Szrj return false;
382938fd1498Szrj
383038fd1498Szrj if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
383138fd1498Szrj break;
383238fd1498Szrj
383338fd1498Szrj op0 = gimple_assign_rhs1 (def_stmt);
383438fd1498Szrj code = gimple_assign_rhs_code (def_stmt);
383538fd1498Szrj op1 = gimple_assign_rhs2 (def_stmt);
383638fd1498Szrj }
383738fd1498Szrj else
383838fd1498Szrj {
383938fd1498Szrj if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
384038fd1498Szrj return false;
384138fd1498Szrj code = TREE_CODE (off);
384238fd1498Szrj extract_ops_from_tree (off, &code, &op0, &op1);
384338fd1498Szrj }
384438fd1498Szrj switch (code)
384538fd1498Szrj {
384638fd1498Szrj case POINTER_PLUS_EXPR:
384738fd1498Szrj case PLUS_EXPR:
384838fd1498Szrj if (expr_invariant_in_loop_p (loop, op0))
384938fd1498Szrj {
385038fd1498Szrj add = op0;
385138fd1498Szrj off = op1;
/* Accumulate the invariant addend into BASE, scaled by any
   multiplication already peeled off below.  */
385238fd1498Szrj do_add:
385338fd1498Szrj add = fold_convert (sizetype, add);
385438fd1498Szrj if (scale != 1)
385538fd1498Szrj add = size_binop (MULT_EXPR, add, size_int (scale));
385638fd1498Szrj base = size_binop (PLUS_EXPR, base, add);
385738fd1498Szrj continue;
385838fd1498Szrj }
385938fd1498Szrj if (expr_invariant_in_loop_p (loop, op1))
386038fd1498Szrj {
386138fd1498Szrj add = op1;
386238fd1498Szrj off = op0;
386338fd1498Szrj goto do_add;
386438fd1498Szrj }
386538fd1498Szrj break;
386638fd1498Szrj case MINUS_EXPR:
386738fd1498Szrj if (expr_invariant_in_loop_p (loop, op1))
386838fd1498Szrj {
386938fd1498Szrj add = fold_convert (sizetype, op1);
387038fd1498Szrj add = size_binop (MINUS_EXPR, size_zero_node, add);
387138fd1498Szrj off = op0;
387238fd1498Szrj goto do_add;
387338fd1498Szrj }
387438fd1498Szrj break;
387538fd1498Szrj case MULT_EXPR:
387638fd1498Szrj if (scale == 1 && tree_fits_shwi_p (op1))
387738fd1498Szrj {
387838fd1498Szrj int new_scale = tree_to_shwi (op1);
387938fd1498Szrj /* Only treat this as a scaling operation if the target
388038fd1498Szrj supports it. */
388138fd1498Szrj if (use_ifn_p
388238fd1498Szrj && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
388338fd1498Szrj vectype, memory_type, 1,
388438fd1498Szrj TYPE_SIGN (TREE_TYPE (op0)),
388538fd1498Szrj new_scale, &ifn,
388638fd1498Szrj &element_type))
388738fd1498Szrj break;
388838fd1498Szrj scale = new_scale;
388938fd1498Szrj off = op0;
389038fd1498Szrj continue;
389138fd1498Szrj }
389238fd1498Szrj break;
/* The definition is a plain copy; continue from the copied value.  */
389338fd1498Szrj case SSA_NAME:
389438fd1498Szrj off = op0;
389538fd1498Szrj continue;
389638fd1498Szrj CASE_CONVERT:
389738fd1498Szrj if (!POINTER_TYPE_P (TREE_TYPE (op0))
389838fd1498Szrj && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
389938fd1498Szrj break;
/* A same-precision conversion never changes the value; look
   through it.  */
390038fd1498Szrj if (TYPE_PRECISION (TREE_TYPE (op0))
390138fd1498Szrj == TYPE_PRECISION (TREE_TYPE (off)))
390238fd1498Szrj {
390338fd1498Szrj off = op0;
390438fd1498Szrj continue;
390538fd1498Szrj }
390638fd1498Szrj
390738fd1498Szrj /* The internal functions need the offset to be the same width
390838fd1498Szrj as the elements of VECTYPE. Don't include operations that
390938fd1498Szrj cast the offset from that width to a different width. */
391038fd1498Szrj if (use_ifn_p
391138fd1498Szrj && (int_size_in_bytes (TREE_TYPE (vectype))
391238fd1498Szrj == int_size_in_bytes (TREE_TYPE (off))))
391338fd1498Szrj break;
391438fd1498Szrj
/* A widening conversion: remember the narrower type as the
   offset type and keep peeling the narrower value.  */
391538fd1498Szrj if (TYPE_PRECISION (TREE_TYPE (op0))
391638fd1498Szrj < TYPE_PRECISION (TREE_TYPE (off)))
391738fd1498Szrj {
391838fd1498Szrj off = op0;
391938fd1498Szrj offtype = TREE_TYPE (off);
392038fd1498Szrj STRIP_NOPS (off);
392138fd1498Szrj continue;
392238fd1498Szrj }
392338fd1498Szrj break;
392438fd1498Szrj default:
392538fd1498Szrj break;
392638fd1498Szrj }
392738fd1498Szrj break;
392838fd1498Szrj }
392938fd1498Szrj
393038fd1498Szrj /* If at the end OFF still isn't a SSA_NAME or isn't
393138fd1498Szrj defined in the loop, punt. */
393238fd1498Szrj if (TREE_CODE (off) != SSA_NAME
393338fd1498Szrj || expr_invariant_in_loop_p (loop, off))
393438fd1498Szrj return false;
393538fd1498Szrj
/* If no widening conversion determined the offset type above,
   use OFF's own type.  */
393638fd1498Szrj if (offtype == NULL_TREE)
393738fd1498Szrj offtype = TREE_TYPE (off);
393838fd1498Szrj
393938fd1498Szrj if (use_ifn_p)
394038fd1498Szrj {
394138fd1498Szrj if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
394238fd1498Szrj memory_type, TYPE_PRECISION (offtype),
394338fd1498Szrj TYPE_SIGN (offtype), scale, &ifn,
394438fd1498Szrj &element_type))
394538fd1498Szrj return false;
394638fd1498Szrj }
394738fd1498Szrj else
394838fd1498Szrj {
/* Fall back to target-specific gather/scatter builtins.  */
394938fd1498Szrj if (DR_IS_READ (dr))
395038fd1498Szrj {
395138fd1498Szrj if (targetm.vectorize.builtin_gather)
395238fd1498Szrj decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
395338fd1498Szrj }
395438fd1498Szrj else
395538fd1498Szrj {
395638fd1498Szrj if (targetm.vectorize.builtin_scatter)
395738fd1498Szrj decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
395838fd1498Szrj }
395938fd1498Szrj
396038fd1498Szrj if (!decl)
396138fd1498Szrj return false;
396238fd1498Szrj
396338fd1498Szrj ifn = IFN_LAST;
396438fd1498Szrj element_type = TREE_TYPE (vectype);
396538fd1498Szrj }
396638fd1498Szrj
/* Success: describe the decomposed operation for the caller.  */
396738fd1498Szrj info->ifn = ifn;
396838fd1498Szrj info->decl = decl;
396938fd1498Szrj info->base = base;
397038fd1498Szrj info->offset = off;
397138fd1498Szrj info->offset_dt = vect_unknown_def_type;
397238fd1498Szrj info->offset_vectype = NULL_TREE;
397338fd1498Szrj info->scale = scale;
397438fd1498Szrj info->element_type = element_type;
397538fd1498Szrj info->memory_type = memory_type;
397638fd1498Szrj return true;
397738fd1498Szrj }
397838fd1498Szrj
397938fd1498Szrj /* Function vect_analyze_data_refs.
398038fd1498Szrj
398138fd1498Szrj Find all the data references in the loop or basic block.
398238fd1498Szrj
398338fd1498Szrj The general structure of the analysis of data refs in the vectorizer is as
398438fd1498Szrj follows:
398538fd1498Szrj 1- vect_analyze_data_refs(loop/bb): call
398638fd1498Szrj compute_data_dependences_for_loop/bb to find and analyze all data-refs
398738fd1498Szrj in the loop/bb and their dependences.
398838fd1498Szrj 2- vect_analyze_dependences(): apply dependence testing using ddrs.
398938fd1498Szrj 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
399038fd1498Szrj 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
399138fd1498Szrj
399238fd1498Szrj */
399338fd1498Szrj
399438fd1498Szrj bool
vect_analyze_data_refs(vec_info * vinfo,poly_uint64 * min_vf)399538fd1498Szrj vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
399638fd1498Szrj {
399738fd1498Szrj struct loop *loop = NULL;
399838fd1498Szrj unsigned int i;
399938fd1498Szrj struct data_reference *dr;
400038fd1498Szrj tree scalar_type;
400138fd1498Szrj
400238fd1498Szrj if (dump_enabled_p ())
400338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
400438fd1498Szrj "=== vect_analyze_data_refs ===\n");
400538fd1498Szrj
400638fd1498Szrj if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
400738fd1498Szrj loop = LOOP_VINFO_LOOP (loop_vinfo);
400838fd1498Szrj
400938fd1498Szrj /* Go through the data-refs, check that the analysis succeeded. Update
401038fd1498Szrj pointer from stmt_vec_info struct to DR and vectype. */
401138fd1498Szrj
401238fd1498Szrj vec<data_reference_p> datarefs = vinfo->datarefs;
401338fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
401438fd1498Szrj {
401538fd1498Szrj gimple *stmt;
401638fd1498Szrj stmt_vec_info stmt_info;
401738fd1498Szrj tree base, offset, init;
401838fd1498Szrj enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
401938fd1498Szrj bool simd_lane_access = false;
402038fd1498Szrj poly_uint64 vf;
402138fd1498Szrj
402238fd1498Szrj again:
402338fd1498Szrj if (!dr || !DR_REF (dr))
402438fd1498Szrj {
402538fd1498Szrj if (dump_enabled_p ())
402638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
402738fd1498Szrj "not vectorized: unhandled data-ref\n");
402838fd1498Szrj return false;
402938fd1498Szrj }
403038fd1498Szrj
403138fd1498Szrj stmt = DR_STMT (dr);
403238fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
403338fd1498Szrj
403438fd1498Szrj /* Discard clobbers from the dataref vector. We will remove
403538fd1498Szrj clobber stmts during vectorization. */
403638fd1498Szrj if (gimple_clobber_p (stmt))
403738fd1498Szrj {
403838fd1498Szrj free_data_ref (dr);
403938fd1498Szrj if (i == datarefs.length () - 1)
404038fd1498Szrj {
404138fd1498Szrj datarefs.pop ();
404238fd1498Szrj break;
404338fd1498Szrj }
404438fd1498Szrj datarefs.ordered_remove (i);
404538fd1498Szrj dr = datarefs[i];
404638fd1498Szrj goto again;
404738fd1498Szrj }
404838fd1498Szrj
404938fd1498Szrj /* Check that analysis of the data-ref succeeded. */
405038fd1498Szrj if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
405138fd1498Szrj || !DR_STEP (dr))
405238fd1498Szrj {
405338fd1498Szrj bool maybe_gather
405438fd1498Szrj = DR_IS_READ (dr)
405538fd1498Szrj && !TREE_THIS_VOLATILE (DR_REF (dr))
405638fd1498Szrj && (targetm.vectorize.builtin_gather != NULL
405738fd1498Szrj || supports_vec_gather_load_p ());
405838fd1498Szrj bool maybe_scatter
405938fd1498Szrj = DR_IS_WRITE (dr)
406038fd1498Szrj && !TREE_THIS_VOLATILE (DR_REF (dr))
406138fd1498Szrj && (targetm.vectorize.builtin_scatter != NULL
406238fd1498Szrj || supports_vec_scatter_store_p ());
406338fd1498Szrj bool maybe_simd_lane_access
406438fd1498Szrj = is_a <loop_vec_info> (vinfo) && loop->simduid;
406538fd1498Szrj
406638fd1498Szrj /* If target supports vector gather loads or scatter stores, or if
406738fd1498Szrj this might be a SIMD lane access, see if they can't be used. */
406838fd1498Szrj if (is_a <loop_vec_info> (vinfo)
406938fd1498Szrj && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
407038fd1498Szrj && !nested_in_vect_loop_p (loop, stmt))
407138fd1498Szrj {
407238fd1498Szrj struct data_reference *newdr
407338fd1498Szrj = create_data_ref (NULL, loop_containing_stmt (stmt),
407438fd1498Szrj DR_REF (dr), stmt, !maybe_scatter,
407538fd1498Szrj DR_IS_CONDITIONAL_IN_STMT (dr));
407638fd1498Szrj gcc_assert (newdr != NULL && DR_REF (newdr));
407738fd1498Szrj if (DR_BASE_ADDRESS (newdr)
407838fd1498Szrj && DR_OFFSET (newdr)
407938fd1498Szrj && DR_INIT (newdr)
408038fd1498Szrj && DR_STEP (newdr)
408138fd1498Szrj && integer_zerop (DR_STEP (newdr)))
408238fd1498Szrj {
408338fd1498Szrj if (maybe_simd_lane_access)
408438fd1498Szrj {
408538fd1498Szrj tree off = DR_OFFSET (newdr);
408638fd1498Szrj STRIP_NOPS (off);
408738fd1498Szrj if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
408838fd1498Szrj && TREE_CODE (off) == MULT_EXPR
408938fd1498Szrj && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
409038fd1498Szrj {
409138fd1498Szrj tree step = TREE_OPERAND (off, 1);
409238fd1498Szrj off = TREE_OPERAND (off, 0);
409338fd1498Szrj STRIP_NOPS (off);
409438fd1498Szrj if (CONVERT_EXPR_P (off)
409538fd1498Szrj && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
409638fd1498Szrj 0)))
409738fd1498Szrj < TYPE_PRECISION (TREE_TYPE (off)))
409838fd1498Szrj off = TREE_OPERAND (off, 0);
409938fd1498Szrj if (TREE_CODE (off) == SSA_NAME)
410038fd1498Szrj {
410138fd1498Szrj gimple *def = SSA_NAME_DEF_STMT (off);
410238fd1498Szrj tree reft = TREE_TYPE (DR_REF (newdr));
410338fd1498Szrj if (is_gimple_call (def)
410438fd1498Szrj && gimple_call_internal_p (def)
410538fd1498Szrj && (gimple_call_internal_fn (def)
410638fd1498Szrj == IFN_GOMP_SIMD_LANE))
410738fd1498Szrj {
410838fd1498Szrj tree arg = gimple_call_arg (def, 0);
410938fd1498Szrj gcc_assert (TREE_CODE (arg) == SSA_NAME);
411038fd1498Szrj arg = SSA_NAME_VAR (arg);
411138fd1498Szrj if (arg == loop->simduid
411238fd1498Szrj /* For now. */
411338fd1498Szrj && tree_int_cst_equal
411438fd1498Szrj (TYPE_SIZE_UNIT (reft),
411538fd1498Szrj step))
411638fd1498Szrj {
411738fd1498Szrj DR_OFFSET (newdr) = ssize_int (0);
411838fd1498Szrj DR_STEP (newdr) = step;
411938fd1498Szrj DR_OFFSET_ALIGNMENT (newdr)
412038fd1498Szrj = BIGGEST_ALIGNMENT;
412138fd1498Szrj DR_STEP_ALIGNMENT (newdr)
412238fd1498Szrj = highest_pow2_factor (step);
412338fd1498Szrj dr = newdr;
412438fd1498Szrj simd_lane_access = true;
412538fd1498Szrj }
412638fd1498Szrj }
412738fd1498Szrj }
412838fd1498Szrj }
412938fd1498Szrj }
413038fd1498Szrj if (!simd_lane_access && (maybe_gather || maybe_scatter))
413138fd1498Szrj {
413238fd1498Szrj dr = newdr;
413338fd1498Szrj if (maybe_gather)
413438fd1498Szrj gatherscatter = GATHER;
413538fd1498Szrj else
413638fd1498Szrj gatherscatter = SCATTER;
413738fd1498Szrj }
413838fd1498Szrj }
413938fd1498Szrj if (gatherscatter == SG_NONE && !simd_lane_access)
414038fd1498Szrj free_data_ref (newdr);
414138fd1498Szrj }
414238fd1498Szrj
414338fd1498Szrj if (gatherscatter == SG_NONE && !simd_lane_access)
414438fd1498Szrj {
414538fd1498Szrj if (dump_enabled_p ())
414638fd1498Szrj {
414738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414838fd1498Szrj "not vectorized: data ref analysis "
414938fd1498Szrj "failed ");
415038fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
415138fd1498Szrj }
415238fd1498Szrj
415338fd1498Szrj if (is_a <bb_vec_info> (vinfo))
415438fd1498Szrj break;
415538fd1498Szrj
415638fd1498Szrj return false;
415738fd1498Szrj }
415838fd1498Szrj }
415938fd1498Szrj
416038fd1498Szrj if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
416138fd1498Szrj {
416238fd1498Szrj if (dump_enabled_p ())
416338fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416438fd1498Szrj "not vectorized: base addr of dr is a "
416538fd1498Szrj "constant\n");
416638fd1498Szrj
416738fd1498Szrj if (is_a <bb_vec_info> (vinfo))
416838fd1498Szrj break;
416938fd1498Szrj
417038fd1498Szrj if (gatherscatter != SG_NONE || simd_lane_access)
417138fd1498Szrj free_data_ref (dr);
417238fd1498Szrj return false;
417338fd1498Szrj }
417438fd1498Szrj
417538fd1498Szrj if (TREE_THIS_VOLATILE (DR_REF (dr)))
417638fd1498Szrj {
417738fd1498Szrj if (dump_enabled_p ())
417838fd1498Szrj {
417938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418038fd1498Szrj "not vectorized: volatile type ");
418138fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
418238fd1498Szrj }
418338fd1498Szrj
418438fd1498Szrj if (is_a <bb_vec_info> (vinfo))
418538fd1498Szrj break;
418638fd1498Szrj
418738fd1498Szrj return false;
418838fd1498Szrj }
418938fd1498Szrj
419038fd1498Szrj if (stmt_can_throw_internal (stmt))
419138fd1498Szrj {
419238fd1498Szrj if (dump_enabled_p ())
419338fd1498Szrj {
419438fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419538fd1498Szrj "not vectorized: statement can throw an "
419638fd1498Szrj "exception ");
419738fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
419838fd1498Szrj }
419938fd1498Szrj
420038fd1498Szrj if (is_a <bb_vec_info> (vinfo))
420138fd1498Szrj break;
420238fd1498Szrj
420338fd1498Szrj if (gatherscatter != SG_NONE || simd_lane_access)
420438fd1498Szrj free_data_ref (dr);
420538fd1498Szrj return false;
420638fd1498Szrj }
420738fd1498Szrj
420838fd1498Szrj if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
420938fd1498Szrj && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
421038fd1498Szrj {
421138fd1498Szrj if (dump_enabled_p ())
421238fd1498Szrj {
421338fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421438fd1498Szrj "not vectorized: statement is bitfield "
421538fd1498Szrj "access ");
421638fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
421738fd1498Szrj }
421838fd1498Szrj
421938fd1498Szrj if (is_a <bb_vec_info> (vinfo))
422038fd1498Szrj break;
422138fd1498Szrj
422238fd1498Szrj if (gatherscatter != SG_NONE || simd_lane_access)
422338fd1498Szrj free_data_ref (dr);
422438fd1498Szrj return false;
422538fd1498Szrj }
422638fd1498Szrj
422738fd1498Szrj base = unshare_expr (DR_BASE_ADDRESS (dr));
422838fd1498Szrj offset = unshare_expr (DR_OFFSET (dr));
422938fd1498Szrj init = unshare_expr (DR_INIT (dr));
423038fd1498Szrj
423138fd1498Szrj if (is_gimple_call (stmt)
423238fd1498Szrj && (!gimple_call_internal_p (stmt)
423338fd1498Szrj || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
423438fd1498Szrj && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
423538fd1498Szrj {
423638fd1498Szrj if (dump_enabled_p ())
423738fd1498Szrj {
423838fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
423938fd1498Szrj "not vectorized: dr in a call ");
424038fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
424138fd1498Szrj }
424238fd1498Szrj
424338fd1498Szrj if (is_a <bb_vec_info> (vinfo))
424438fd1498Szrj break;
424538fd1498Szrj
424638fd1498Szrj if (gatherscatter != SG_NONE || simd_lane_access)
424738fd1498Szrj free_data_ref (dr);
424838fd1498Szrj return false;
424938fd1498Szrj }
425038fd1498Szrj
425138fd1498Szrj /* Update DR field in stmt_vec_info struct. */
425238fd1498Szrj
425338fd1498Szrj /* If the dataref is in an inner-loop of the loop that is considered for
425438fd1498Szrj for vectorization, we also want to analyze the access relative to
425538fd1498Szrj the outer-loop (DR contains information only relative to the
425638fd1498Szrj inner-most enclosing loop). We do that by building a reference to the
425738fd1498Szrj first location accessed by the inner-loop, and analyze it relative to
425838fd1498Szrj the outer-loop. */
425938fd1498Szrj if (loop && nested_in_vect_loop_p (loop, stmt))
426038fd1498Szrj {
426138fd1498Szrj /* Build a reference to the first location accessed by the
426238fd1498Szrj inner loop: *(BASE + INIT + OFFSET). By construction,
426338fd1498Szrj this address must be invariant in the inner loop, so we
426438fd1498Szrj can consider it as being used in the outer loop. */
426538fd1498Szrj tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
426638fd1498Szrj init, offset);
426738fd1498Szrj tree init_addr = fold_build_pointer_plus (base, init_offset);
426838fd1498Szrj tree init_ref = build_fold_indirect_ref (init_addr);
426938fd1498Szrj
427038fd1498Szrj if (dump_enabled_p ())
427138fd1498Szrj {
427238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
427338fd1498Szrj "analyze in outer loop: ");
427438fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
427538fd1498Szrj dump_printf (MSG_NOTE, "\n");
427638fd1498Szrj }
427738fd1498Szrj
427838fd1498Szrj if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
427938fd1498Szrj init_ref, loop))
428038fd1498Szrj /* dr_analyze_innermost already explained the failure. */
428138fd1498Szrj return false;
428238fd1498Szrj
428338fd1498Szrj if (dump_enabled_p ())
428438fd1498Szrj {
428538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
428638fd1498Szrj "\touter base_address: ");
428738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM,
428838fd1498Szrj STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
428938fd1498Szrj dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
429038fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM,
429138fd1498Szrj STMT_VINFO_DR_OFFSET (stmt_info));
429238fd1498Szrj dump_printf (MSG_NOTE,
429338fd1498Szrj "\n\touter constant offset from base address: ");
429438fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM,
429538fd1498Szrj STMT_VINFO_DR_INIT (stmt_info));
429638fd1498Szrj dump_printf (MSG_NOTE, "\n\touter step: ");
429738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM,
429838fd1498Szrj STMT_VINFO_DR_STEP (stmt_info));
429938fd1498Szrj dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
430038fd1498Szrj STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
430138fd1498Szrj dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
430238fd1498Szrj STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
430338fd1498Szrj dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
430438fd1498Szrj STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
430538fd1498Szrj dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
430638fd1498Szrj STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
430738fd1498Szrj }
430838fd1498Szrj }
430938fd1498Szrj
431038fd1498Szrj if (STMT_VINFO_DATA_REF (stmt_info))
431138fd1498Szrj {
431238fd1498Szrj if (dump_enabled_p ())
431338fd1498Szrj {
431438fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431538fd1498Szrj "not vectorized: more than one data ref "
431638fd1498Szrj "in stmt: ");
431738fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
431838fd1498Szrj }
431938fd1498Szrj
432038fd1498Szrj if (is_a <bb_vec_info> (vinfo))
432138fd1498Szrj break;
432238fd1498Szrj
432338fd1498Szrj if (gatherscatter != SG_NONE || simd_lane_access)
432438fd1498Szrj free_data_ref (dr);
432538fd1498Szrj return false;
432638fd1498Szrj }
432738fd1498Szrj
432838fd1498Szrj STMT_VINFO_DATA_REF (stmt_info) = dr;
432938fd1498Szrj if (simd_lane_access)
433038fd1498Szrj {
433138fd1498Szrj STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
433238fd1498Szrj free_data_ref (datarefs[i]);
433338fd1498Szrj datarefs[i] = dr;
433438fd1498Szrj }
433538fd1498Szrj
433638fd1498Szrj if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
433738fd1498Szrj && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
433838fd1498Szrj && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
433938fd1498Szrj {
434038fd1498Szrj if (dump_enabled_p ())
434138fd1498Szrj {
434238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
434338fd1498Szrj "not vectorized: base object not addressable "
434438fd1498Szrj "for stmt: ");
434538fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
434638fd1498Szrj }
434738fd1498Szrj if (is_a <bb_vec_info> (vinfo))
434838fd1498Szrj {
434938fd1498Szrj /* In BB vectorization the ref can still participate
435038fd1498Szrj in dependence analysis, we just can't vectorize it. */
435138fd1498Szrj STMT_VINFO_VECTORIZABLE (stmt_info) = false;
435238fd1498Szrj continue;
435338fd1498Szrj }
435438fd1498Szrj return false;
435538fd1498Szrj }
435638fd1498Szrj
435738fd1498Szrj /* Set vectype for STMT. */
435838fd1498Szrj scalar_type = TREE_TYPE (DR_REF (dr));
435938fd1498Szrj STMT_VINFO_VECTYPE (stmt_info)
436038fd1498Szrj = get_vectype_for_scalar_type (scalar_type);
436138fd1498Szrj if (!STMT_VINFO_VECTYPE (stmt_info))
436238fd1498Szrj {
436338fd1498Szrj if (dump_enabled_p ())
436438fd1498Szrj {
436538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436638fd1498Szrj "not vectorized: no vectype for stmt: ");
436738fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
436838fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
436938fd1498Szrj dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
437038fd1498Szrj scalar_type);
437138fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
437238fd1498Szrj }
437338fd1498Szrj
437438fd1498Szrj if (is_a <bb_vec_info> (vinfo))
437538fd1498Szrj {
437638fd1498Szrj /* No vector type is fine, the ref can still participate
437738fd1498Szrj in dependence analysis, we just can't vectorize it. */
437838fd1498Szrj STMT_VINFO_VECTORIZABLE (stmt_info) = false;
437938fd1498Szrj continue;
438038fd1498Szrj }
438138fd1498Szrj
438238fd1498Szrj if (gatherscatter != SG_NONE || simd_lane_access)
438338fd1498Szrj {
438438fd1498Szrj STMT_VINFO_DATA_REF (stmt_info) = NULL;
438538fd1498Szrj if (gatherscatter != SG_NONE)
438638fd1498Szrj free_data_ref (dr);
438738fd1498Szrj }
438838fd1498Szrj return false;
438938fd1498Szrj }
439038fd1498Szrj else
439138fd1498Szrj {
439238fd1498Szrj if (dump_enabled_p ())
439338fd1498Szrj {
439438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
439538fd1498Szrj "got vectype for stmt: ");
439638fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
439738fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM,
439838fd1498Szrj STMT_VINFO_VECTYPE (stmt_info));
439938fd1498Szrj dump_printf (MSG_NOTE, "\n");
440038fd1498Szrj }
440138fd1498Szrj }
440238fd1498Szrj
440338fd1498Szrj /* Adjust the minimal vectorization factor according to the
440438fd1498Szrj vector type. */
440538fd1498Szrj vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
440638fd1498Szrj *min_vf = upper_bound (*min_vf, vf);
440738fd1498Szrj
440838fd1498Szrj if (gatherscatter != SG_NONE)
440938fd1498Szrj {
441038fd1498Szrj gather_scatter_info gs_info;
441138fd1498Szrj if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
441238fd1498Szrj &gs_info)
441338fd1498Szrj || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
441438fd1498Szrj {
441538fd1498Szrj STMT_VINFO_DATA_REF (stmt_info) = NULL;
441638fd1498Szrj free_data_ref (dr);
441738fd1498Szrj if (dump_enabled_p ())
441838fd1498Szrj {
441938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442038fd1498Szrj (gatherscatter == GATHER) ?
442138fd1498Szrj "not vectorized: not suitable for gather "
442238fd1498Szrj "load " :
442338fd1498Szrj "not vectorized: not suitable for scatter "
442438fd1498Szrj "store ");
442538fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
442638fd1498Szrj }
442738fd1498Szrj return false;
442838fd1498Szrj }
442938fd1498Szrj
443038fd1498Szrj free_data_ref (datarefs[i]);
443138fd1498Szrj datarefs[i] = dr;
443238fd1498Szrj STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
443338fd1498Szrj }
443438fd1498Szrj
443538fd1498Szrj else if (is_a <loop_vec_info> (vinfo)
443638fd1498Szrj && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
443738fd1498Szrj {
443838fd1498Szrj if (nested_in_vect_loop_p (loop, stmt))
443938fd1498Szrj {
444038fd1498Szrj if (dump_enabled_p ())
444138fd1498Szrj {
444238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444338fd1498Szrj "not vectorized: not suitable for strided "
444438fd1498Szrj "load ");
444538fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
444638fd1498Szrj }
444738fd1498Szrj return false;
444838fd1498Szrj }
444938fd1498Szrj STMT_VINFO_STRIDED_P (stmt_info) = true;
445038fd1498Szrj }
445138fd1498Szrj }
445238fd1498Szrj
445338fd1498Szrj /* If we stopped analysis at the first dataref we could not analyze
445438fd1498Szrj when trying to vectorize a basic-block mark the rest of the datarefs
445538fd1498Szrj as not vectorizable and truncate the vector of datarefs. That
445638fd1498Szrj avoids spending useless time in analyzing their dependence. */
445738fd1498Szrj if (i != datarefs.length ())
445838fd1498Szrj {
445938fd1498Szrj gcc_assert (is_a <bb_vec_info> (vinfo));
446038fd1498Szrj for (unsigned j = i; j < datarefs.length (); ++j)
446138fd1498Szrj {
446238fd1498Szrj data_reference_p dr = datarefs[j];
446338fd1498Szrj STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
446438fd1498Szrj free_data_ref (dr);
446538fd1498Szrj }
446638fd1498Szrj datarefs.truncate (i);
446738fd1498Szrj }
446838fd1498Szrj
446938fd1498Szrj return true;
447038fd1498Szrj }
447138fd1498Szrj
447238fd1498Szrj
447338fd1498Szrj /* Function vect_get_new_vect_var.
447438fd1498Szrj
447538fd1498Szrj Returns a name for a new variable. The current naming scheme appends the
447638fd1498Szrj prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
447738fd1498Szrj the name of vectorizer generated variables, and appends that to NAME if
447838fd1498Szrj provided. */
447938fd1498Szrj
448038fd1498Szrj tree
vect_get_new_vect_var(tree type,enum vect_var_kind var_kind,const char * name)448138fd1498Szrj vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
448238fd1498Szrj {
448338fd1498Szrj const char *prefix;
448438fd1498Szrj tree new_vect_var;
448538fd1498Szrj
448638fd1498Szrj switch (var_kind)
448738fd1498Szrj {
448838fd1498Szrj case vect_simple_var:
448938fd1498Szrj prefix = "vect";
449038fd1498Szrj break;
449138fd1498Szrj case vect_scalar_var:
449238fd1498Szrj prefix = "stmp";
449338fd1498Szrj break;
449438fd1498Szrj case vect_mask_var:
449538fd1498Szrj prefix = "mask";
449638fd1498Szrj break;
449738fd1498Szrj case vect_pointer_var:
449838fd1498Szrj prefix = "vectp";
449938fd1498Szrj break;
450038fd1498Szrj default:
450138fd1498Szrj gcc_unreachable ();
450238fd1498Szrj }
450338fd1498Szrj
450438fd1498Szrj if (name)
450538fd1498Szrj {
450638fd1498Szrj char* tmp = concat (prefix, "_", name, NULL);
450738fd1498Szrj new_vect_var = create_tmp_reg (type, tmp);
450838fd1498Szrj free (tmp);
450938fd1498Szrj }
451038fd1498Szrj else
451138fd1498Szrj new_vect_var = create_tmp_reg (type, prefix);
451238fd1498Szrj
451338fd1498Szrj return new_vect_var;
451438fd1498Szrj }
451538fd1498Szrj
451638fd1498Szrj /* Like vect_get_new_vect_var but return an SSA name. */
451738fd1498Szrj
451838fd1498Szrj tree
vect_get_new_ssa_name(tree type,enum vect_var_kind var_kind,const char * name)451938fd1498Szrj vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
452038fd1498Szrj {
452138fd1498Szrj const char *prefix;
452238fd1498Szrj tree new_vect_var;
452338fd1498Szrj
452438fd1498Szrj switch (var_kind)
452538fd1498Szrj {
452638fd1498Szrj case vect_simple_var:
452738fd1498Szrj prefix = "vect";
452838fd1498Szrj break;
452938fd1498Szrj case vect_scalar_var:
453038fd1498Szrj prefix = "stmp";
453138fd1498Szrj break;
453238fd1498Szrj case vect_pointer_var:
453338fd1498Szrj prefix = "vectp";
453438fd1498Szrj break;
453538fd1498Szrj default:
453638fd1498Szrj gcc_unreachable ();
453738fd1498Szrj }
453838fd1498Szrj
453938fd1498Szrj if (name)
454038fd1498Szrj {
454138fd1498Szrj char* tmp = concat (prefix, "_", name, NULL);
454238fd1498Szrj new_vect_var = make_temp_ssa_name (type, NULL, tmp);
454338fd1498Szrj free (tmp);
454438fd1498Szrj }
454538fd1498Szrj else
454638fd1498Szrj new_vect_var = make_temp_ssa_name (type, NULL, prefix);
454738fd1498Szrj
454838fd1498Szrj return new_vect_var;
454938fd1498Szrj }
455038fd1498Szrj
455138fd1498Szrj /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
455238fd1498Szrj
455338fd1498Szrj static void
vect_duplicate_ssa_name_ptr_info(tree name,data_reference * dr)455438fd1498Szrj vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
455538fd1498Szrj {
455638fd1498Szrj duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
455738fd1498Szrj int misalign = DR_MISALIGNMENT (dr);
455838fd1498Szrj if (misalign == DR_MISALIGNMENT_UNKNOWN)
455938fd1498Szrj mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
456038fd1498Szrj else
456138fd1498Szrj set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
456238fd1498Szrj DR_TARGET_ALIGNMENT (dr), misalign);
456338fd1498Szrj }
456438fd1498Szrj
456538fd1498Szrj /* Function vect_create_addr_base_for_vector_ref.
456638fd1498Szrj
456738fd1498Szrj Create an expression that computes the address of the first memory location
456838fd1498Szrj that will be accessed for a data reference.
456938fd1498Szrj
457038fd1498Szrj Input:
457138fd1498Szrj STMT: The statement containing the data reference.
457238fd1498Szrj NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
457338fd1498Szrj OFFSET: Optional. If supplied, it is be added to the initial address.
457438fd1498Szrj LOOP: Specify relative to which loop-nest should the address be computed.
457538fd1498Szrj For example, when the dataref is in an inner-loop nested in an
457638fd1498Szrj outer-loop that is now being vectorized, LOOP can be either the
457738fd1498Szrj outer-loop, or the inner-loop. The first memory location accessed
457838fd1498Szrj by the following dataref ('in' points to short):
457938fd1498Szrj
458038fd1498Szrj for (i=0; i<N; i++)
458138fd1498Szrj for (j=0; j<M; j++)
458238fd1498Szrj s += in[i+j]
458338fd1498Szrj
458438fd1498Szrj is as follows:
458538fd1498Szrj if LOOP=i_loop: &in (relative to i_loop)
458638fd1498Szrj if LOOP=j_loop: &in+i*2B (relative to j_loop)
458738fd1498Szrj BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
458838fd1498Szrj initial address. Unlike OFFSET, which is number of elements to
458938fd1498Szrj be added, BYTE_OFFSET is measured in bytes.
459038fd1498Szrj
459138fd1498Szrj Output:
459238fd1498Szrj 1. Return an SSA_NAME whose value is the address of the memory location of
459338fd1498Szrj the first vector of the data reference.
459438fd1498Szrj 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
459538fd1498Szrj these statement(s) which define the returned SSA_NAME.
459638fd1498Szrj
459738fd1498Szrj FORNOW: We are only handling array accesses with step 1. */
459838fd1498Szrj
459938fd1498Szrj tree
vect_create_addr_base_for_vector_ref(gimple * stmt,gimple_seq * new_stmt_list,tree offset,tree byte_offset)460038fd1498Szrj vect_create_addr_base_for_vector_ref (gimple *stmt,
460138fd1498Szrj gimple_seq *new_stmt_list,
460238fd1498Szrj tree offset,
460338fd1498Szrj tree byte_offset)
460438fd1498Szrj {
460538fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
460638fd1498Szrj struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
460738fd1498Szrj const char *base_name;
460838fd1498Szrj tree addr_base;
460938fd1498Szrj tree dest;
461038fd1498Szrj gimple_seq seq = NULL;
461138fd1498Szrj tree vect_ptr_type;
461238fd1498Szrj tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
461338fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
461438fd1498Szrj innermost_loop_behavior *drb = vect_dr_behavior (dr);
461538fd1498Szrj
461638fd1498Szrj tree data_ref_base = unshare_expr (drb->base_address);
461738fd1498Szrj tree base_offset = unshare_expr (drb->offset);
461838fd1498Szrj tree init = unshare_expr (drb->init);
461938fd1498Szrj
462038fd1498Szrj if (loop_vinfo)
462138fd1498Szrj base_name = get_name (data_ref_base);
462238fd1498Szrj else
462338fd1498Szrj {
462438fd1498Szrj base_offset = ssize_int (0);
462538fd1498Szrj init = ssize_int (0);
462638fd1498Szrj base_name = get_name (DR_REF (dr));
462738fd1498Szrj }
462838fd1498Szrj
462938fd1498Szrj /* Create base_offset */
463038fd1498Szrj base_offset = size_binop (PLUS_EXPR,
463138fd1498Szrj fold_convert (sizetype, base_offset),
463238fd1498Szrj fold_convert (sizetype, init));
463338fd1498Szrj
463438fd1498Szrj if (offset)
463538fd1498Szrj {
463638fd1498Szrj offset = fold_build2 (MULT_EXPR, sizetype,
463738fd1498Szrj fold_convert (sizetype, offset), step);
463838fd1498Szrj base_offset = fold_build2 (PLUS_EXPR, sizetype,
463938fd1498Szrj base_offset, offset);
464038fd1498Szrj }
464138fd1498Szrj if (byte_offset)
464238fd1498Szrj {
464338fd1498Szrj byte_offset = fold_convert (sizetype, byte_offset);
464438fd1498Szrj base_offset = fold_build2 (PLUS_EXPR, sizetype,
464538fd1498Szrj base_offset, byte_offset);
464638fd1498Szrj }
464738fd1498Szrj
464838fd1498Szrj /* base + base_offset */
464938fd1498Szrj if (loop_vinfo)
465038fd1498Szrj addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
465138fd1498Szrj else
465238fd1498Szrj {
465338fd1498Szrj addr_base = build1 (ADDR_EXPR,
465438fd1498Szrj build_pointer_type (TREE_TYPE (DR_REF (dr))),
465538fd1498Szrj unshare_expr (DR_REF (dr)));
465638fd1498Szrj }
465738fd1498Szrj
465838fd1498Szrj vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
465938fd1498Szrj dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
466038fd1498Szrj addr_base = force_gimple_operand (addr_base, &seq, true, dest);
466138fd1498Szrj gimple_seq_add_seq (new_stmt_list, seq);
466238fd1498Szrj
466338fd1498Szrj if (DR_PTR_INFO (dr)
466438fd1498Szrj && TREE_CODE (addr_base) == SSA_NAME
466538fd1498Szrj && !SSA_NAME_PTR_INFO (addr_base))
466638fd1498Szrj {
466738fd1498Szrj vect_duplicate_ssa_name_ptr_info (addr_base, dr);
466838fd1498Szrj if (offset || byte_offset)
466938fd1498Szrj mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
467038fd1498Szrj }
467138fd1498Szrj
467238fd1498Szrj if (dump_enabled_p ())
467338fd1498Szrj {
467438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "created ");
467538fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
467638fd1498Szrj dump_printf (MSG_NOTE, "\n");
467738fd1498Szrj }
467838fd1498Szrj
467938fd1498Szrj return addr_base;
468038fd1498Szrj }
468138fd1498Szrj
468238fd1498Szrj
468338fd1498Szrj /* Function vect_create_data_ref_ptr.
468438fd1498Szrj
468538fd1498Szrj Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
468638fd1498Szrj location accessed in the loop by STMT, along with the def-use update
468738fd1498Szrj chain to appropriately advance the pointer through the loop iterations.
468838fd1498Szrj Also set aliasing information for the pointer. This pointer is used by
468938fd1498Szrj the callers to this function to create a memory reference expression for
469038fd1498Szrj vector load/store access.
469138fd1498Szrj
469238fd1498Szrj Input:
469338fd1498Szrj 1. STMT: a stmt that references memory. Expected to be of the form
469438fd1498Szrj GIMPLE_ASSIGN <name, data-ref> or
469538fd1498Szrj GIMPLE_ASSIGN <data-ref, name>.
469638fd1498Szrj 2. AGGR_TYPE: the type of the reference, which should be either a vector
469738fd1498Szrj or an array.
469838fd1498Szrj 3. AT_LOOP: the loop where the vector memref is to be created.
469938fd1498Szrj 4. OFFSET (optional): an offset to be added to the initial address accessed
470038fd1498Szrj by the data-ref in STMT.
470138fd1498Szrj 5. BSI: location where the new stmts are to be placed if there is no loop
470238fd1498Szrj 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
470338fd1498Szrj pointing to the initial address.
470438fd1498Szrj 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
470538fd1498Szrj to the initial address accessed by the data-ref in STMT. This is
470638fd1498Szrj similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
470738fd1498Szrj in bytes.
470838fd1498Szrj 8. IV_STEP (optional, defaults to NULL): the amount that should be added
470938fd1498Szrj to the IV during each iteration of the loop. NULL says to move
471038fd1498Szrj by one copy of AGGR_TYPE up or down, depending on the step of the
471138fd1498Szrj data reference.
471238fd1498Szrj
471338fd1498Szrj Output:
471438fd1498Szrj 1. Declare a new ptr to vector_type, and have it point to the base of the
471538fd1498Szrj data reference (initial addressed accessed by the data reference).
471638fd1498Szrj For example, for vector of type V8HI, the following code is generated:
471738fd1498Szrj
471838fd1498Szrj v8hi *ap;
471938fd1498Szrj ap = (v8hi *)initial_address;
472038fd1498Szrj
472138fd1498Szrj if OFFSET is not supplied:
472238fd1498Szrj initial_address = &a[init];
472338fd1498Szrj if OFFSET is supplied:
472438fd1498Szrj initial_address = &a[init + OFFSET];
472538fd1498Szrj if BYTE_OFFSET is supplied:
472638fd1498Szrj initial_address = &a[init] + BYTE_OFFSET;
472738fd1498Szrj
472838fd1498Szrj Return the initial_address in INITIAL_ADDRESS.
472938fd1498Szrj
473038fd1498Szrj 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
473138fd1498Szrj update the pointer in each iteration of the loop.
473238fd1498Szrj
473338fd1498Szrj Return the increment stmt that updates the pointer in PTR_INCR.
473438fd1498Szrj
473538fd1498Szrj 3. Set INV_P to true if the access pattern of the data reference in the
473638fd1498Szrj vectorized loop is invariant. Set it to false otherwise.
473738fd1498Szrj
473838fd1498Szrj 4. Return the pointer. */
473938fd1498Szrj
474038fd1498Szrj tree
vect_create_data_ref_ptr(gimple * stmt,tree aggr_type,struct loop * at_loop,tree offset,tree * initial_address,gimple_stmt_iterator * gsi,gimple ** ptr_incr,bool only_init,bool * inv_p,tree byte_offset,tree iv_step)474138fd1498Szrj vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
474238fd1498Szrj tree offset, tree *initial_address,
474338fd1498Szrj gimple_stmt_iterator *gsi, gimple **ptr_incr,
474438fd1498Szrj bool only_init, bool *inv_p, tree byte_offset,
474538fd1498Szrj tree iv_step)
474638fd1498Szrj {
474738fd1498Szrj const char *base_name;
474838fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
474938fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
475038fd1498Szrj struct loop *loop = NULL;
475138fd1498Szrj bool nested_in_vect_loop = false;
475238fd1498Szrj struct loop *containing_loop = NULL;
475338fd1498Szrj tree aggr_ptr_type;
475438fd1498Szrj tree aggr_ptr;
475538fd1498Szrj tree new_temp;
475638fd1498Szrj gimple_seq new_stmt_list = NULL;
475738fd1498Szrj edge pe = NULL;
475838fd1498Szrj basic_block new_bb;
475938fd1498Szrj tree aggr_ptr_init;
476038fd1498Szrj struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
476138fd1498Szrj tree aptr;
476238fd1498Szrj gimple_stmt_iterator incr_gsi;
476338fd1498Szrj bool insert_after;
476438fd1498Szrj tree indx_before_incr, indx_after_incr;
476538fd1498Szrj gimple *incr;
476638fd1498Szrj tree step;
476738fd1498Szrj bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
476838fd1498Szrj
476938fd1498Szrj gcc_assert (iv_step != NULL_TREE
477038fd1498Szrj || TREE_CODE (aggr_type) == ARRAY_TYPE
477138fd1498Szrj || TREE_CODE (aggr_type) == VECTOR_TYPE);
477238fd1498Szrj
477338fd1498Szrj if (loop_vinfo)
477438fd1498Szrj {
477538fd1498Szrj loop = LOOP_VINFO_LOOP (loop_vinfo);
477638fd1498Szrj nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
477738fd1498Szrj containing_loop = (gimple_bb (stmt))->loop_father;
477838fd1498Szrj pe = loop_preheader_edge (loop);
477938fd1498Szrj }
478038fd1498Szrj else
478138fd1498Szrj {
478238fd1498Szrj gcc_assert (bb_vinfo);
478338fd1498Szrj only_init = true;
478438fd1498Szrj *ptr_incr = NULL;
478538fd1498Szrj }
478638fd1498Szrj
478738fd1498Szrj /* Check the step (evolution) of the load in LOOP, and record
478838fd1498Szrj whether it's invariant. */
478938fd1498Szrj step = vect_dr_behavior (dr)->step;
479038fd1498Szrj if (integer_zerop (step))
479138fd1498Szrj *inv_p = true;
479238fd1498Szrj else
479338fd1498Szrj *inv_p = false;
479438fd1498Szrj
479538fd1498Szrj /* Create an expression for the first address accessed by this load
479638fd1498Szrj in LOOP. */
479738fd1498Szrj base_name = get_name (DR_BASE_ADDRESS (dr));
479838fd1498Szrj
479938fd1498Szrj if (dump_enabled_p ())
480038fd1498Szrj {
480138fd1498Szrj tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
480238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
480338fd1498Szrj "create %s-pointer variable to type: ",
480438fd1498Szrj get_tree_code_name (TREE_CODE (aggr_type)));
480538fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
480638fd1498Szrj if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
480738fd1498Szrj dump_printf (MSG_NOTE, " vectorizing an array ref: ");
480838fd1498Szrj else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
480938fd1498Szrj dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
481038fd1498Szrj else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
481138fd1498Szrj dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
481238fd1498Szrj else
481338fd1498Szrj dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
481438fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
481538fd1498Szrj dump_printf (MSG_NOTE, "\n");
481638fd1498Szrj }
481738fd1498Szrj
481838fd1498Szrj /* (1) Create the new aggregate-pointer variable.
481938fd1498Szrj Vector and array types inherit the alias set of their component
482038fd1498Szrj type by default so we need to use a ref-all pointer if the data
482138fd1498Szrj reference does not conflict with the created aggregated data
482238fd1498Szrj reference because it is not addressable. */
482338fd1498Szrj bool need_ref_all = false;
482438fd1498Szrj if (!alias_sets_conflict_p (get_alias_set (aggr_type),
482538fd1498Szrj get_alias_set (DR_REF (dr))))
482638fd1498Szrj need_ref_all = true;
482738fd1498Szrj /* Likewise for any of the data references in the stmt group. */
482838fd1498Szrj else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
482938fd1498Szrj {
483038fd1498Szrj gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
483138fd1498Szrj do
483238fd1498Szrj {
483338fd1498Szrj stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
483438fd1498Szrj struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
483538fd1498Szrj if (!alias_sets_conflict_p (get_alias_set (aggr_type),
483638fd1498Szrj get_alias_set (DR_REF (sdr))))
483738fd1498Szrj {
483838fd1498Szrj need_ref_all = true;
483938fd1498Szrj break;
484038fd1498Szrj }
484138fd1498Szrj orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
484238fd1498Szrj }
484338fd1498Szrj while (orig_stmt);
484438fd1498Szrj }
484538fd1498Szrj aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
484638fd1498Szrj need_ref_all);
484738fd1498Szrj aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
484838fd1498Szrj
484938fd1498Szrj
485038fd1498Szrj /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
485138fd1498Szrj vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
485238fd1498Szrj def-use update cycles for the pointer: one relative to the outer-loop
485338fd1498Szrj (LOOP), which is what steps (3) and (4) below do. The other is relative
485438fd1498Szrj to the inner-loop (which is the inner-most loop containing the dataref),
485538fd1498Szrj and this is done be step (5) below.
485638fd1498Szrj
485738fd1498Szrj When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
485838fd1498Szrj inner-most loop, and so steps (3),(4) work the same, and step (5) is
485938fd1498Szrj redundant. Steps (3),(4) create the following:
486038fd1498Szrj
486138fd1498Szrj vp0 = &base_addr;
486238fd1498Szrj LOOP: vp1 = phi(vp0,vp2)
486338fd1498Szrj ...
486438fd1498Szrj ...
486538fd1498Szrj vp2 = vp1 + step
486638fd1498Szrj goto LOOP
486738fd1498Szrj
486838fd1498Szrj If there is an inner-loop nested in loop, then step (5) will also be
486938fd1498Szrj applied, and an additional update in the inner-loop will be created:
487038fd1498Szrj
487138fd1498Szrj vp0 = &base_addr;
487238fd1498Szrj LOOP: vp1 = phi(vp0,vp2)
487338fd1498Szrj ...
487438fd1498Szrj inner: vp3 = phi(vp1,vp4)
487538fd1498Szrj vp4 = vp3 + inner_step
487638fd1498Szrj if () goto inner
487738fd1498Szrj ...
487838fd1498Szrj vp2 = vp1 + step
487938fd1498Szrj if () goto LOOP */
488038fd1498Szrj
488138fd1498Szrj /* (2) Calculate the initial address of the aggregate-pointer, and set
488238fd1498Szrj the aggregate-pointer to point to it before the loop. */
488338fd1498Szrj
488438fd1498Szrj /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
488538fd1498Szrj
488638fd1498Szrj new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
488738fd1498Szrj offset, byte_offset);
488838fd1498Szrj if (new_stmt_list)
488938fd1498Szrj {
489038fd1498Szrj if (pe)
489138fd1498Szrj {
489238fd1498Szrj new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
489338fd1498Szrj gcc_assert (!new_bb);
489438fd1498Szrj }
489538fd1498Szrj else
489638fd1498Szrj gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
489738fd1498Szrj }
489838fd1498Szrj
489938fd1498Szrj *initial_address = new_temp;
490038fd1498Szrj aggr_ptr_init = new_temp;
490138fd1498Szrj
490238fd1498Szrj /* (3) Handle the updating of the aggregate-pointer inside the loop.
490338fd1498Szrj This is needed when ONLY_INIT is false, and also when AT_LOOP is the
490438fd1498Szrj inner-loop nested in LOOP (during outer-loop vectorization). */
490538fd1498Szrj
490638fd1498Szrj /* No update in loop is required. */
490738fd1498Szrj if (only_init && (!loop_vinfo || at_loop == loop))
490838fd1498Szrj aptr = aggr_ptr_init;
490938fd1498Szrj else
491038fd1498Szrj {
491138fd1498Szrj if (iv_step == NULL_TREE)
491238fd1498Szrj {
491338fd1498Szrj /* The step of the aggregate pointer is the type size. */
491438fd1498Szrj iv_step = TYPE_SIZE_UNIT (aggr_type);
491538fd1498Szrj /* One exception to the above is when the scalar step of the load in
491638fd1498Szrj LOOP is zero. In this case the step here is also zero. */
491738fd1498Szrj if (*inv_p)
491838fd1498Szrj iv_step = size_zero_node;
491938fd1498Szrj else if (tree_int_cst_sgn (step) == -1)
492038fd1498Szrj iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
492138fd1498Szrj }
492238fd1498Szrj
492338fd1498Szrj standard_iv_increment_position (loop, &incr_gsi, &insert_after);
492438fd1498Szrj
492538fd1498Szrj create_iv (aggr_ptr_init,
492638fd1498Szrj fold_convert (aggr_ptr_type, iv_step),
492738fd1498Szrj aggr_ptr, loop, &incr_gsi, insert_after,
492838fd1498Szrj &indx_before_incr, &indx_after_incr);
492938fd1498Szrj incr = gsi_stmt (incr_gsi);
493038fd1498Szrj set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
493138fd1498Szrj
493238fd1498Szrj /* Copy the points-to information if it exists. */
493338fd1498Szrj if (DR_PTR_INFO (dr))
493438fd1498Szrj {
493538fd1498Szrj vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
493638fd1498Szrj vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
493738fd1498Szrj }
493838fd1498Szrj if (ptr_incr)
493938fd1498Szrj *ptr_incr = incr;
494038fd1498Szrj
494138fd1498Szrj aptr = indx_before_incr;
494238fd1498Szrj }
494338fd1498Szrj
494438fd1498Szrj if (!nested_in_vect_loop || only_init)
494538fd1498Szrj return aptr;
494638fd1498Szrj
494738fd1498Szrj
494838fd1498Szrj /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
494938fd1498Szrj nested in LOOP, if exists. */
495038fd1498Szrj
495138fd1498Szrj gcc_assert (nested_in_vect_loop);
495238fd1498Szrj if (!only_init)
495338fd1498Szrj {
495438fd1498Szrj standard_iv_increment_position (containing_loop, &incr_gsi,
495538fd1498Szrj &insert_after);
495638fd1498Szrj create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
495738fd1498Szrj containing_loop, &incr_gsi, insert_after, &indx_before_incr,
495838fd1498Szrj &indx_after_incr);
495938fd1498Szrj incr = gsi_stmt (incr_gsi);
496038fd1498Szrj set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
496138fd1498Szrj
496238fd1498Szrj /* Copy the points-to information if it exists. */
496338fd1498Szrj if (DR_PTR_INFO (dr))
496438fd1498Szrj {
496538fd1498Szrj vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
496638fd1498Szrj vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
496738fd1498Szrj }
496838fd1498Szrj if (ptr_incr)
496938fd1498Szrj *ptr_incr = incr;
497038fd1498Szrj
497138fd1498Szrj return indx_before_incr;
497238fd1498Szrj }
497338fd1498Szrj else
497438fd1498Szrj gcc_unreachable ();
497538fd1498Szrj }
497638fd1498Szrj
497738fd1498Szrj
497838fd1498Szrj /* Function bump_vector_ptr
497938fd1498Szrj
498038fd1498Szrj Increment a pointer (to a vector type) by vector-size. If requested,
498138fd1498Szrj i.e. if PTR-INCR is given, then also connect the new increment stmt
498238fd1498Szrj to the existing def-use update-chain of the pointer, by modifying
498338fd1498Szrj the PTR_INCR as illustrated below:
498438fd1498Szrj
498538fd1498Szrj The pointer def-use update-chain before this function:
498638fd1498Szrj DATAREF_PTR = phi (p_0, p_2)
498738fd1498Szrj ....
498838fd1498Szrj PTR_INCR: p_2 = DATAREF_PTR + step
498938fd1498Szrj
499038fd1498Szrj The pointer def-use update-chain after this function:
499138fd1498Szrj DATAREF_PTR = phi (p_0, p_2)
499238fd1498Szrj ....
499338fd1498Szrj NEW_DATAREF_PTR = DATAREF_PTR + BUMP
499438fd1498Szrj ....
499538fd1498Szrj PTR_INCR: p_2 = NEW_DATAREF_PTR + step
499638fd1498Szrj
499738fd1498Szrj Input:
499838fd1498Szrj DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
499938fd1498Szrj in the loop.
500038fd1498Szrj PTR_INCR - optional. The stmt that updates the pointer in each iteration of
500138fd1498Szrj the loop. The increment amount across iterations is expected
500238fd1498Szrj to be vector_size.
500338fd1498Szrj BSI - location where the new update stmt is to be placed.
500438fd1498Szrj STMT - the original scalar memory-access stmt that is being vectorized.
500538fd1498Szrj BUMP - optional. The offset by which to bump the pointer. If not given,
500638fd1498Szrj the offset is assumed to be vector_size.
500738fd1498Szrj
500838fd1498Szrj Output: Return NEW_DATAREF_PTR as illustrated above.
500938fd1498Szrj
501038fd1498Szrj */
501138fd1498Szrj
501238fd1498Szrj tree
bump_vector_ptr(tree dataref_ptr,gimple * ptr_incr,gimple_stmt_iterator * gsi,gimple * stmt,tree bump)501338fd1498Szrj bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
501438fd1498Szrj gimple *stmt, tree bump)
501538fd1498Szrj {
501638fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
501738fd1498Szrj struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
501838fd1498Szrj tree vectype = STMT_VINFO_VECTYPE (stmt_info);
501938fd1498Szrj tree update = TYPE_SIZE_UNIT (vectype);
502038fd1498Szrj gassign *incr_stmt;
502138fd1498Szrj ssa_op_iter iter;
502238fd1498Szrj use_operand_p use_p;
502338fd1498Szrj tree new_dataref_ptr;
502438fd1498Szrj
502538fd1498Szrj if (bump)
502638fd1498Szrj update = bump;
502738fd1498Szrj
502838fd1498Szrj if (TREE_CODE (dataref_ptr) == SSA_NAME)
502938fd1498Szrj new_dataref_ptr = copy_ssa_name (dataref_ptr);
503038fd1498Szrj else
503138fd1498Szrj new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
503238fd1498Szrj incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
503338fd1498Szrj dataref_ptr, update);
503438fd1498Szrj vect_finish_stmt_generation (stmt, incr_stmt, gsi);
503538fd1498Szrj
503638fd1498Szrj /* Copy the points-to information if it exists. */
503738fd1498Szrj if (DR_PTR_INFO (dr))
503838fd1498Szrj {
503938fd1498Szrj duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
504038fd1498Szrj mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
504138fd1498Szrj }
504238fd1498Szrj
504338fd1498Szrj if (!ptr_incr)
504438fd1498Szrj return new_dataref_ptr;
504538fd1498Szrj
504638fd1498Szrj /* Update the vector-pointer's cross-iteration increment. */
504738fd1498Szrj FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
504838fd1498Szrj {
504938fd1498Szrj tree use = USE_FROM_PTR (use_p);
505038fd1498Szrj
505138fd1498Szrj if (use == dataref_ptr)
505238fd1498Szrj SET_USE (use_p, new_dataref_ptr);
505338fd1498Szrj else
505438fd1498Szrj gcc_assert (operand_equal_p (use, update, 0));
505538fd1498Szrj }
505638fd1498Szrj
505738fd1498Szrj return new_dataref_ptr;
505838fd1498Szrj }
505938fd1498Szrj
506038fd1498Szrj
506138fd1498Szrj /* Copy memory reference info such as base/clique from the SRC reference
506238fd1498Szrj to the DEST MEM_REF. */
506338fd1498Szrj
506438fd1498Szrj void
vect_copy_ref_info(tree dest,tree src)506538fd1498Szrj vect_copy_ref_info (tree dest, tree src)
506638fd1498Szrj {
506738fd1498Szrj if (TREE_CODE (dest) != MEM_REF)
506838fd1498Szrj return;
506938fd1498Szrj
507038fd1498Szrj tree src_base = src;
507138fd1498Szrj while (handled_component_p (src_base))
507238fd1498Szrj src_base = TREE_OPERAND (src_base, 0);
507338fd1498Szrj if (TREE_CODE (src_base) != MEM_REF
507438fd1498Szrj && TREE_CODE (src_base) != TARGET_MEM_REF)
507538fd1498Szrj return;
507638fd1498Szrj
507738fd1498Szrj MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
507838fd1498Szrj MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
507938fd1498Szrj }
508038fd1498Szrj
508138fd1498Szrj
508238fd1498Szrj /* Function vect_create_destination_var.
508338fd1498Szrj
508438fd1498Szrj Create a new temporary of type VECTYPE. */
508538fd1498Szrj
508638fd1498Szrj tree
vect_create_destination_var(tree scalar_dest,tree vectype)508738fd1498Szrj vect_create_destination_var (tree scalar_dest, tree vectype)
508838fd1498Szrj {
508938fd1498Szrj tree vec_dest;
509038fd1498Szrj const char *name;
509138fd1498Szrj char *new_name;
509238fd1498Szrj tree type;
509338fd1498Szrj enum vect_var_kind kind;
509438fd1498Szrj
509538fd1498Szrj kind = vectype
509638fd1498Szrj ? VECTOR_BOOLEAN_TYPE_P (vectype)
509738fd1498Szrj ? vect_mask_var
509838fd1498Szrj : vect_simple_var
509938fd1498Szrj : vect_scalar_var;
510038fd1498Szrj type = vectype ? vectype : TREE_TYPE (scalar_dest);
510138fd1498Szrj
510238fd1498Szrj gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
510338fd1498Szrj
510438fd1498Szrj name = get_name (scalar_dest);
510538fd1498Szrj if (name)
510638fd1498Szrj new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
510738fd1498Szrj else
510838fd1498Szrj new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
510938fd1498Szrj vec_dest = vect_get_new_vect_var (type, kind, new_name);
511038fd1498Szrj free (new_name);
511138fd1498Szrj
511238fd1498Szrj return vec_dest;
511338fd1498Szrj }
511438fd1498Szrj
511538fd1498Szrj /* Function vect_grouped_store_supported.
511638fd1498Szrj
511738fd1498Szrj Returns TRUE if interleave high and interleave low permutations
511838fd1498Szrj are supported, and FALSE otherwise. */
511938fd1498Szrj
512038fd1498Szrj bool
vect_grouped_store_supported(tree vectype,unsigned HOST_WIDE_INT count)512138fd1498Szrj vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
512238fd1498Szrj {
512338fd1498Szrj machine_mode mode = TYPE_MODE (vectype);
512438fd1498Szrj
512538fd1498Szrj /* vect_permute_store_chain requires the group size to be equal to 3 or
512638fd1498Szrj be a power of two. */
512738fd1498Szrj if (count != 3 && exact_log2 (count) == -1)
512838fd1498Szrj {
512938fd1498Szrj if (dump_enabled_p ())
513038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
513138fd1498Szrj "the size of the group of accesses"
513238fd1498Szrj " is not a power of 2 or not eqaul to 3\n");
513338fd1498Szrj return false;
513438fd1498Szrj }
513538fd1498Szrj
513638fd1498Szrj /* Check that the permutation is supported. */
513738fd1498Szrj if (VECTOR_MODE_P (mode))
513838fd1498Szrj {
513938fd1498Szrj unsigned int i;
514038fd1498Szrj if (count == 3)
514138fd1498Szrj {
514238fd1498Szrj unsigned int j0 = 0, j1 = 0, j2 = 0;
514338fd1498Szrj unsigned int i, j;
514438fd1498Szrj
514538fd1498Szrj unsigned int nelt;
514638fd1498Szrj if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
514738fd1498Szrj {
514838fd1498Szrj if (dump_enabled_p ())
514938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
515038fd1498Szrj "cannot handle groups of 3 stores for"
515138fd1498Szrj " variable-length vectors\n");
515238fd1498Szrj return false;
515338fd1498Szrj }
515438fd1498Szrj
515538fd1498Szrj vec_perm_builder sel (nelt, nelt, 1);
515638fd1498Szrj sel.quick_grow (nelt);
515738fd1498Szrj vec_perm_indices indices;
515838fd1498Szrj for (j = 0; j < 3; j++)
515938fd1498Szrj {
516038fd1498Szrj int nelt0 = ((3 - j) * nelt) % 3;
516138fd1498Szrj int nelt1 = ((3 - j) * nelt + 1) % 3;
516238fd1498Szrj int nelt2 = ((3 - j) * nelt + 2) % 3;
516338fd1498Szrj for (i = 0; i < nelt; i++)
516438fd1498Szrj {
516538fd1498Szrj if (3 * i + nelt0 < nelt)
516638fd1498Szrj sel[3 * i + nelt0] = j0++;
516738fd1498Szrj if (3 * i + nelt1 < nelt)
516838fd1498Szrj sel[3 * i + nelt1] = nelt + j1++;
516938fd1498Szrj if (3 * i + nelt2 < nelt)
517038fd1498Szrj sel[3 * i + nelt2] = 0;
517138fd1498Szrj }
517238fd1498Szrj indices.new_vector (sel, 2, nelt);
517338fd1498Szrj if (!can_vec_perm_const_p (mode, indices))
517438fd1498Szrj {
517538fd1498Szrj if (dump_enabled_p ())
517638fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION,
517738fd1498Szrj "permutation op not supported by target.\n");
517838fd1498Szrj return false;
517938fd1498Szrj }
518038fd1498Szrj
518138fd1498Szrj for (i = 0; i < nelt; i++)
518238fd1498Szrj {
518338fd1498Szrj if (3 * i + nelt0 < nelt)
518438fd1498Szrj sel[3 * i + nelt0] = 3 * i + nelt0;
518538fd1498Szrj if (3 * i + nelt1 < nelt)
518638fd1498Szrj sel[3 * i + nelt1] = 3 * i + nelt1;
518738fd1498Szrj if (3 * i + nelt2 < nelt)
518838fd1498Szrj sel[3 * i + nelt2] = nelt + j2++;
518938fd1498Szrj }
519038fd1498Szrj indices.new_vector (sel, 2, nelt);
519138fd1498Szrj if (!can_vec_perm_const_p (mode, indices))
519238fd1498Szrj {
519338fd1498Szrj if (dump_enabled_p ())
519438fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION,
519538fd1498Szrj "permutation op not supported by target.\n");
519638fd1498Szrj return false;
519738fd1498Szrj }
519838fd1498Szrj }
519938fd1498Szrj return true;
520038fd1498Szrj }
520138fd1498Szrj else
520238fd1498Szrj {
520338fd1498Szrj /* If length is not equal to 3 then only power of 2 is supported. */
520438fd1498Szrj gcc_assert (pow2p_hwi (count));
520538fd1498Szrj poly_uint64 nelt = GET_MODE_NUNITS (mode);
520638fd1498Szrj
520738fd1498Szrj /* The encoding has 2 interleaved stepped patterns. */
520838fd1498Szrj vec_perm_builder sel (nelt, 2, 3);
520938fd1498Szrj sel.quick_grow (6);
521038fd1498Szrj for (i = 0; i < 3; i++)
521138fd1498Szrj {
521238fd1498Szrj sel[i * 2] = i;
521338fd1498Szrj sel[i * 2 + 1] = i + nelt;
521438fd1498Szrj }
521538fd1498Szrj vec_perm_indices indices (sel, 2, nelt);
521638fd1498Szrj if (can_vec_perm_const_p (mode, indices))
521738fd1498Szrj {
521838fd1498Szrj for (i = 0; i < 6; i++)
521938fd1498Szrj sel[i] += exact_div (nelt, 2);
522038fd1498Szrj indices.new_vector (sel, 2, nelt);
522138fd1498Szrj if (can_vec_perm_const_p (mode, indices))
522238fd1498Szrj return true;
522338fd1498Szrj }
522438fd1498Szrj }
522538fd1498Szrj }
522638fd1498Szrj
522738fd1498Szrj if (dump_enabled_p ())
522838fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION,
522938fd1498Szrj "permutaion op not supported by target.\n");
523038fd1498Szrj return false;
523138fd1498Szrj }
523238fd1498Szrj
523338fd1498Szrj
523438fd1498Szrj /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
523538fd1498Szrj type VECTYPE. MASKED_P says whether the masked form is needed. */
523638fd1498Szrj
523738fd1498Szrj bool
vect_store_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)523838fd1498Szrj vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
523938fd1498Szrj bool masked_p)
524038fd1498Szrj {
524138fd1498Szrj if (masked_p)
524238fd1498Szrj return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
524338fd1498Szrj vec_mask_store_lanes_optab,
524438fd1498Szrj vectype, count);
524538fd1498Szrj else
524638fd1498Szrj return vect_lanes_optab_supported_p ("vec_store_lanes",
524738fd1498Szrj vec_store_lanes_optab,
524838fd1498Szrj vectype, count);
524938fd1498Szrj }
525038fd1498Szrj
525138fd1498Szrj
525238fd1498Szrj /* Function vect_permute_store_chain.
525338fd1498Szrj
525438fd1498Szrj Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
525538fd1498Szrj a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
525638fd1498Szrj the data correctly for the stores. Return the final references for stores
525738fd1498Szrj in RESULT_CHAIN.
525838fd1498Szrj
525938fd1498Szrj E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
526038fd1498Szrj The input is 4 vectors each containing 8 elements. We assign a number to
526138fd1498Szrj each element, the input sequence is:
526238fd1498Szrj
526338fd1498Szrj 1st vec: 0 1 2 3 4 5 6 7
526438fd1498Szrj 2nd vec: 8 9 10 11 12 13 14 15
526538fd1498Szrj 3rd vec: 16 17 18 19 20 21 22 23
526638fd1498Szrj 4th vec: 24 25 26 27 28 29 30 31
526738fd1498Szrj
526838fd1498Szrj The output sequence should be:
526938fd1498Szrj
527038fd1498Szrj 1st vec: 0 8 16 24 1 9 17 25
527138fd1498Szrj 2nd vec: 2 10 18 26 3 11 19 27
   3rd vec:   4 12 20 28  5 13 21 29
527338fd1498Szrj 4th vec: 6 14 22 30 7 15 23 31
527438fd1498Szrj
527538fd1498Szrj i.e., we interleave the contents of the four vectors in their order.
527638fd1498Szrj
527738fd1498Szrj We use interleave_high/low instructions to create such output. The input of
527838fd1498Szrj each interleave_high/low operation is two vectors:
527938fd1498Szrj 1st vec 2nd vec
528038fd1498Szrj 0 1 2 3 4 5 6 7
528138fd1498Szrj the even elements of the result vector are obtained left-to-right from the
528238fd1498Szrj high/low elements of the first vector. The odd elements of the result are
528338fd1498Szrj obtained left-to-right from the high/low elements of the second vector.
528438fd1498Szrj The output of interleave_high will be: 0 4 1 5
528538fd1498Szrj and of interleave_low: 2 6 3 7
528638fd1498Szrj
528738fd1498Szrj
528838fd1498Szrj The permutation is done in log LENGTH stages. In each stage interleave_high
528938fd1498Szrj and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
529038fd1498Szrj where the first argument is taken from the first half of DR_CHAIN and the
529138fd1498Szrj second argument from it's second half.
529238fd1498Szrj In our example,
529338fd1498Szrj
529438fd1498Szrj I1: interleave_high (1st vec, 3rd vec)
529538fd1498Szrj I2: interleave_low (1st vec, 3rd vec)
529638fd1498Szrj I3: interleave_high (2nd vec, 4th vec)
529738fd1498Szrj I4: interleave_low (2nd vec, 4th vec)
529838fd1498Szrj
529938fd1498Szrj The output for the first stage is:
530038fd1498Szrj
530138fd1498Szrj I1: 0 16 1 17 2 18 3 19
530238fd1498Szrj I2: 4 20 5 21 6 22 7 23
530338fd1498Szrj I3: 8 24 9 25 10 26 11 27
530438fd1498Szrj I4: 12 28 13 29 14 30 15 31
530538fd1498Szrj
530638fd1498Szrj The output of the second stage, i.e. the final result is:
530738fd1498Szrj
530838fd1498Szrj I1: 0 8 16 24 1 9 17 25
530938fd1498Szrj I2: 2 10 18 26 3 11 19 27
   I3:  4 12 20 28  5 13 21 29
531138fd1498Szrj I4: 6 14 22 30 7 15 23 31. */
531238fd1498Szrj
void
vect_permute_store_chain (vec<tree> dr_chain,
			  unsigned int length,
			  gimple *stmt,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, j, n, log_length = exact_log2 (length);

  /* Start RESULT_CHAIN as a copy of DR_CHAIN; it is overwritten below.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* Group size 3: each output vector is produced by a pair of
	 VEC_PERM_EXPRs, the first interleaving dr_chain[0]/dr_chain[1]
	 (with don't-care gaps), the second blending in dr_chain[2].  */
      /* vect_grouped_store_supported ensures that this is constant.  */
      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      /* Running element counters; they persist across the three rounds so
	 consecutive rounds pick up where the previous one stopped.  */
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (j = 0; j < 3; j++)
	{
	  /* Rotate which slot of each output triple comes from which
	     input for round J.  */
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  /* Low mask: interleave elements of the first two vectors,
	     leaving the third slot of each triple as a placeholder (0).  */
	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  /* High mask: keep the just-interleaved elements in place and
	     fill the remaining slots from the third input vector.  */
	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
	vec_perm_indices indices (sel, 2, nelt);
	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	/* The low mask is the high mask shifted by half a vector.  */
	for (i = 0; i < 6; i++)
	  sel[i] += exact_div (nelt, 2);
	indices.new_vector (sel, 2, nelt);
	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	/* log2(LENGTH) stages; each stage interleaves the first half of
	   the chain with the second half, pairwise.  */
	for (i = 0, n = log_length; i < n; i++)
	  {
	    for (j = 0; j < length/2; j++)
	      {
		vect1 = dr_chain[j];
		vect2 = dr_chain[j+length/2];

		/* Create interleaving stmt:
		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
							...}>  */
		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_high);
		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
		(*result_chain)[2*j] = high;

		/* Create interleaving stmt:
		   low = VEC_PERM_EXPR <vect1, vect2,
					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
					 ...}>  */
		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
						 vect2, perm_mask_low);
		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
		(*result_chain)[2*j+1] = low;
	      }
	    /* Feed this stage's output back in as the next stage's input.  */
	    memcpy (dr_chain.address (), result_chain->address (),
		    length * sizeof (tree));
	  }
    }
}
544938fd1498Szrj
545038fd1498Szrj /* Function vect_setup_realignment
545138fd1498Szrj
545238fd1498Szrj This function is called when vectorizing an unaligned load using
545338fd1498Szrj the dr_explicit_realign[_optimized] scheme.
545438fd1498Szrj This function generates the following code at the loop prolog:
545538fd1498Szrj
545638fd1498Szrj p = initial_addr;
545738fd1498Szrj x msq_init = *(floor(p)); # prolog load
545838fd1498Szrj realignment_token = call target_builtin;
545938fd1498Szrj loop:
546038fd1498Szrj x msq = phi (msq_init, ---)
546138fd1498Szrj
546238fd1498Szrj The stmts marked with x are generated only for the case of
546338fd1498Szrj dr_explicit_realign_optimized.
546438fd1498Szrj
546538fd1498Szrj The code above sets up a new (vector) pointer, pointing to the first
546638fd1498Szrj location accessed by STMT, and a "floor-aligned" load using that pointer.
546738fd1498Szrj It also generates code to compute the "realignment-token" (if the relevant
546838fd1498Szrj target hook was defined), and creates a phi-node at the loop-header bb
546938fd1498Szrj whose arguments are the result of the prolog-load (created by this
547038fd1498Szrj function) and the result of a load that takes place in the loop (to be
547138fd1498Szrj created by the caller to this function).
547238fd1498Szrj
547338fd1498Szrj For the case of dr_explicit_realign_optimized:
547438fd1498Szrj The caller to this function uses the phi-result (msq) to create the
547538fd1498Szrj realignment code inside the loop, and sets up the missing phi argument,
547638fd1498Szrj as follows:
547738fd1498Szrj loop:
547838fd1498Szrj msq = phi (msq_init, lsq)
547938fd1498Szrj lsq = *(floor(p')); # load in loop
548038fd1498Szrj result = realign_load (msq, lsq, realignment_token);
548138fd1498Szrj
548238fd1498Szrj For the case of dr_explicit_realign:
548338fd1498Szrj loop:
548438fd1498Szrj msq = *(floor(p)); # load in loop
548538fd1498Szrj p' = p + (VS-1);
548638fd1498Szrj lsq = *(floor(p')); # load in loop
548738fd1498Szrj result = realign_load (msq, lsq, realignment_token);
548838fd1498Szrj
548938fd1498Szrj Input:
549038fd1498Szrj STMT - (scalar) load stmt to be vectorized. This load accesses
549138fd1498Szrj a memory location that may be unaligned.
549238fd1498Szrj BSI - place where new code is to be inserted.
549338fd1498Szrj ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
549438fd1498Szrj is used.
549538fd1498Szrj
549638fd1498Szrj Output:
549738fd1498Szrj REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
549838fd1498Szrj target hook, if defined.
549938fd1498Szrj Return value - the result of the loop-header phi node. */
550038fd1498Szrj
tree
vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
			tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			struct loop **at_loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  /* LOOP and PE stay NULL when there is no loop_vinfo (basic-block
     vectorization).  */
  struct loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  /* Result of the prolog load; becomes the preheader phi argument.  */
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool inv_p;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }


  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      /* A zero outer-loop step means the access is invariant in LOOP, so
	 the initial load can go in LOOP's preheader; otherwise it must go
	 in the inner loop's preheader.  */
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
				      NULL_TREE, &init_addr, NULL, &inc,
				      true, &inv_p);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      unsigned int align = DR_TARGET_ALIGNMENT (dr);
      /* Floor-align the pointer: PTR & -ALIGN clears the low bits.  */
      new_stmt = gimple_build_assign
	(new_temp, BIT_AND_EXPR, ptr,
	 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      /* Emit the load on the preheader edge when we have one, otherwise
	 right before GSI.  */
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
         the value of the parameter and no global variables are touched
         which makes the builtin a "const" function.  Requiring the
         builtin to have the "const" attribute makes it unnecessary
         to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  /* For dr_explicit_realign the caller emits both loads inside the loop
     (see the function header comment), so no phi node is needed here.  */
  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  /* Only the preheader argument is added here; the latch argument (LSQ)
     is filled in by the caller, as described in the header comment.  */
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
571838fd1498Szrj
571938fd1498Szrj
572038fd1498Szrj /* Function vect_grouped_load_supported.
572138fd1498Szrj
572238fd1498Szrj COUNT is the size of the load group (the number of statements plus the
572338fd1498Szrj number of gaps). SINGLE_ELEMENT_P is true if there is actually
572438fd1498Szrj only one statement, with a gap of COUNT - 1.
572538fd1498Szrj
572638fd1498Szrj Returns true if a suitable permute exists. */
572738fd1498Szrj
bool
vect_grouped_load_supported (tree vectype, bool single_element_p,
			     unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory,
     see PR65518).  */
  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "single-element interleaving not supported "
			 "for not adjacent vector loads\n");
      return false;
    }

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j;
      if (count == 3)
	{
	  /* Groups of 3 need a constant element count: the masks below
	     enumerate every element explicitly.  */
	  unsigned int nelt;
	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "cannot handle groups of 3 loads for"
				 " variable-length vectors\n");
	      return false;
	    }

	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  unsigned int k;
	  /* Probe the same pair of masks that vect_permute_load_chain
	     would build for each of the three output vectors K.  */
	  for (k = 0; k < 3; k++)
	    {
	      /* First mask: select elements K, 3+K, 6+K, ... from the
		 first two input vectors (out-of-range lanes use 0; they
		 are overwritten by the second permute).  */
	      for (i = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = 3 * i + k;
		else
		  sel[i] = 0;
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	      /* Second mask: keep the lanes selected above and complete
		 the remainder from a third vector.  */
	      for (i = 0, j = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = i;
		else
		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	    }
	  return true;
	}
      else
	{
	  /* If length is not equal to 3 then only power of 2 is supported.  */
	  gcc_assert (pow2p_hwi (count));
	  poly_uint64 nelt = GET_MODE_NUNITS (mode);

	  /* The encoding has a single stepped pattern.  */
	  vec_perm_builder sel (nelt, 1, 3);
	  sel.quick_grow (3);
	  /* Extract-even mask {0, 2, 4, ...}.  */
	  for (i = 0; i < 3; i++)
	    sel[i] = i * 2;
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, indices))
	    {
	      /* Extract-odd mask {1, 3, 5, ...}.  */
	      for (i = 0; i < 3; i++)
		sel[i] = i * 2 + 1;
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, indices))
		return true;
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "extract even/odd not supported by target\n");
  return false;
}
583938fd1498Szrj
584038fd1498Szrj /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
584138fd1498Szrj type VECTYPE. MASKED_P says whether the masked form is needed. */
584238fd1498Szrj
584338fd1498Szrj bool
vect_load_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)584438fd1498Szrj vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
584538fd1498Szrj bool masked_p)
584638fd1498Szrj {
584738fd1498Szrj if (masked_p)
584838fd1498Szrj return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
584938fd1498Szrj vec_mask_load_lanes_optab,
585038fd1498Szrj vectype, count);
585138fd1498Szrj else
585238fd1498Szrj return vect_lanes_optab_supported_p ("vec_load_lanes",
585338fd1498Szrj vec_load_lanes_optab,
585438fd1498Szrj vectype, count);
585538fd1498Szrj }
585638fd1498Szrj
585738fd1498Szrj /* Function vect_permute_load_chain.
585838fd1498Szrj
585938fd1498Szrj Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
586038fd1498Szrj a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
586138fd1498Szrj the input data correctly. Return the final references for loads in
586238fd1498Szrj RESULT_CHAIN.
586338fd1498Szrj
586438fd1498Szrj E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
586538fd1498Szrj The input is 4 vectors each containing 8 elements. We assign a number to each
586638fd1498Szrj element, the input sequence is:
586738fd1498Szrj
586838fd1498Szrj 1st vec: 0 1 2 3 4 5 6 7
586938fd1498Szrj 2nd vec: 8 9 10 11 12 13 14 15
587038fd1498Szrj 3rd vec: 16 17 18 19 20 21 22 23
587138fd1498Szrj 4th vec: 24 25 26 27 28 29 30 31
587238fd1498Szrj
587338fd1498Szrj The output sequence should be:
587438fd1498Szrj
587538fd1498Szrj 1st vec: 0 4 8 12 16 20 24 28
587638fd1498Szrj 2nd vec: 1 5 9 13 17 21 25 29
587738fd1498Szrj 3rd vec: 2 6 10 14 18 22 26 30
587838fd1498Szrj 4th vec: 3 7 11 15 19 23 27 31
587938fd1498Szrj
588038fd1498Szrj i.e., the first output vector should contain the first elements of each
588138fd1498Szrj interleaving group, etc.
588238fd1498Szrj
588338fd1498Szrj We use extract_even/odd instructions to create such output. The input of
588438fd1498Szrj each extract_even/odd operation is two vectors
588538fd1498Szrj 1st vec 2nd vec
588638fd1498Szrj 0 1 2 3 4 5 6 7
588738fd1498Szrj
588838fd1498Szrj and the output is the vector of extracted even/odd elements. The output of
588938fd1498Szrj extract_even will be: 0 2 4 6
589038fd1498Szrj and of extract_odd: 1 3 5 7
589138fd1498Szrj
589238fd1498Szrj
589338fd1498Szrj The permutation is done in log LENGTH stages. In each stage extract_even
589438fd1498Szrj and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
589538fd1498Szrj their order. In our example,
589638fd1498Szrj
589738fd1498Szrj E1: extract_even (1st vec, 2nd vec)
589838fd1498Szrj E2: extract_odd (1st vec, 2nd vec)
589938fd1498Szrj E3: extract_even (3rd vec, 4th vec)
590038fd1498Szrj E4: extract_odd (3rd vec, 4th vec)
590138fd1498Szrj
590238fd1498Szrj The output for the first stage will be:
590338fd1498Szrj
590438fd1498Szrj E1: 0 2 4 6 8 10 12 14
590538fd1498Szrj E2: 1 3 5 7 9 11 13 15
590638fd1498Szrj E3: 16 18 20 22 24 26 28 30
590738fd1498Szrj E4: 17 19 21 23 25 27 29 31
590838fd1498Szrj
590938fd1498Szrj In order to proceed and create the correct sequence for the next stage (or
591038fd1498Szrj for the correct output, if the second stage is the last one, as in our
591138fd1498Szrj example), we first put the output of extract_even operation and then the
591238fd1498Szrj output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
591338fd1498Szrj The input for the second stage is:
591438fd1498Szrj
591538fd1498Szrj 1st vec (E1): 0 2 4 6 8 10 12 14
591638fd1498Szrj 2nd vec (E3): 16 18 20 22 24 26 28 30
591738fd1498Szrj 3rd vec (E2): 1 3 5 7 9 11 13 15
591838fd1498Szrj 4th vec (E4): 17 19 21 23 25 27 29 31
591938fd1498Szrj
592038fd1498Szrj The output of the second stage:
592138fd1498Szrj
592238fd1498Szrj E1: 0 4 8 12 16 20 24 28
592338fd1498Szrj E2: 2 6 10 14 18 22 26 30
592438fd1498Szrj E3: 1 5 9 13 17 21 25 29
592538fd1498Szrj E4: 3 7 11 15 19 23 27 31
592638fd1498Szrj
592738fd1498Szrj And RESULT_CHAIN after reordering:
592838fd1498Szrj
592938fd1498Szrj 1st vec (E1): 0 4 8 12 16 20 24 28
593038fd1498Szrj 2nd vec (E3): 1 5 9 13 17 21 25 29
593138fd1498Szrj 3rd vec (E2): 2 6 10 14 18 22 26 30
593238fd1498Szrj 4th vec (E4): 3 7 11 15 19 23 27 31. */
593338fd1498Szrj
static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple *stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);

  /* Start RESULT_CHAIN as a copy of DR_CHAIN; entries are overwritten
     with the permuted vectors below.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      /* Build output vector K from two permutes over the three inputs.  */
      for (k = 0; k < 3; k++)
	{
	  /* Low mask: elements K, 3+K, 6+K, ... from the first two
	     vectors; out-of-range lanes get 0 and are replaced by the
	     high permute below.  */
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  /* High mask: keep the lanes already chosen, fill the rest from
	     the third input vector.  */
	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							     ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							      ...}>  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      /* Extract-even mask {0, 2, 4, ...}.  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      /* Extract-odd mask {1, 3, 5, ...}.  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      /* log2(LENGTH) stages of pairwise even/odd extraction; see the
	 function header comment for a worked example.  */
      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  /* Feed this stage's output back in as the next stage's input.  */
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
604938fd1498Szrj
605038fd1498Szrj /* Function vect_shift_permute_load_chain.
605138fd1498Szrj
605238fd1498Szrj Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
605338fd1498Szrj sequence of stmts to reorder the input data accordingly.
605438fd1498Szrj Return the final references for loads in RESULT_CHAIN.
   Return true if succeeded, false otherwise.
605638fd1498Szrj
605738fd1498Szrj E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
605838fd1498Szrj The input is 3 vectors each containing 8 elements. We assign a
605938fd1498Szrj number to each element, the input sequence is:
606038fd1498Szrj
606138fd1498Szrj 1st vec: 0 1 2 3 4 5 6 7
606238fd1498Szrj 2nd vec: 8 9 10 11 12 13 14 15
606338fd1498Szrj 3rd vec: 16 17 18 19 20 21 22 23
606438fd1498Szrj
606538fd1498Szrj The output sequence should be:
606638fd1498Szrj
606738fd1498Szrj 1st vec: 0 3 6 9 12 15 18 21
606838fd1498Szrj 2nd vec: 1 4 7 10 13 16 19 22
606938fd1498Szrj 3rd vec: 2 5 8 11 14 17 20 23
607038fd1498Szrj
607138fd1498Szrj We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
607238fd1498Szrj
607338fd1498Szrj First we shuffle all 3 vectors to get correct elements order:
607438fd1498Szrj
607538fd1498Szrj 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
607638fd1498Szrj 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
607738fd1498Szrj 3rd vec: (16 19 22) (17 20 23) (18 21)
607838fd1498Szrj
607938fd1498Szrj Next we unite and shift vector 3 times:
608038fd1498Szrj
608138fd1498Szrj 1st step:
608238fd1498Szrj shift right by 6 the concatenation of:
608338fd1498Szrj "1st vec" and "2nd vec"
608438fd1498Szrj ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
608538fd1498Szrj "2nd vec" and "3rd vec"
608638fd1498Szrj ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
608738fd1498Szrj "3rd vec" and "1st vec"
608838fd1498Szrj (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
608938fd1498Szrj | New vectors |
609038fd1498Szrj
609138fd1498Szrj So that now new vectors are:
609238fd1498Szrj
609338fd1498Szrj 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
609438fd1498Szrj 2nd vec: (10 13) (16 19 22) (17 20 23)
609538fd1498Szrj 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
609638fd1498Szrj
609738fd1498Szrj 2nd step:
609838fd1498Szrj shift right by 5 the concatenation of:
609938fd1498Szrj "1st vec" and "3rd vec"
610038fd1498Szrj ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
610138fd1498Szrj "2nd vec" and "1st vec"
610238fd1498Szrj (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
610338fd1498Szrj "3rd vec" and "2nd vec"
610438fd1498Szrj (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
610538fd1498Szrj | New vectors |
610638fd1498Szrj
610738fd1498Szrj So that now new vectors are:
610838fd1498Szrj
610938fd1498Szrj 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
611038fd1498Szrj 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
611138fd1498Szrj 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
611238fd1498Szrj
611338fd1498Szrj 3rd step:
611438fd1498Szrj shift right by 5 the concatenation of:
611538fd1498Szrj "1st vec" and "1st vec"
611638fd1498Szrj ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
611738fd1498Szrj shift right by 3 the concatenation of:
611838fd1498Szrj "2nd vec" and "2nd vec"
611938fd1498Szrj (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
612038fd1498Szrj | New vectors |
612138fd1498Szrj
612238fd1498Szrj So that now all vectors are READY:
612338fd1498Szrj 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
612438fd1498Szrj 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
612538fd1498Szrj 3rd vec: ( 1 4 7) (10 13) (16 19 22)
612638fd1498Szrj
612738fd1498Szrj This algorithm is faster than one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
612938fd1498Szrj This is usually so.
613038fd1498Szrj 2. The TARGET machine can't execute vector instructions in parallel.
613138fd1498Szrj This is because each step of the algorithm depends on previous.
613238fd1498Szrj The algorithm in vect_permute_load_chain is much more parallel.
613338fd1498Szrj
613438fd1498Szrj The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
613538fd1498Szrj */
613638fd1498Szrj
static bool
vect_shift_permute_load_chain (vec<tree> dr_chain,
			       unsigned int length,
			       gimple *stmt,
			       gimple_stmt_iterator *gsi,
			       vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  /* Both the number of vector elements and the vectorization factor
     must be compile-time constants for this strategy.  */
  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.  */
    return false;

  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  /* Seed RESULT_CHAIN with the input vectors; entries are overwritten
     as the permuted vectors are generated below.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  /* Case 1: power-of-two group size.  Each pass splits every pair of
     vectors into its even and odd elements, LOG_LENGTH passes total.  */
  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      /* Mask taking the even elements, then the odd elements, of one
	 vector: for nelt == 8 it is { 0 2 4 6 1 3 5 7 }.  */
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      /* Mirror mask: odd elements first, then even elements.  */
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
	sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
	sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "select is not supported by target\n");
	  return false;
	}
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j + 1];

	      /* De-interleave each input of the pair in place.  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, first_vect,
					       perm2_mask1);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[0] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       second_vect, second_vect,
					       perm2_mask2);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[1] = data_ref;

	      /* High halves of the pair form the second-half output.  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], shift1_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2 + length/2] = data_ref;

	      /* Low halves form the first-half output.  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], select_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;
	    }
	  /* Feed this pass's output back in as the next pass's input.  */
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
      return true;
    }
  /* Case 2: group size of exactly 3.  */
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * k + (l % 3) >= nelt)
	    {
	      k = 0;
	      l += (3 - (nelt % 3));
	    }
	  sel[i] = 3 * k + (l % 3);
	  k++;
	}
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Reorder each input vector with PERM3_MASK.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   dr_chain[k], dr_chain[k],
					   perm3_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      /* First shift pass across adjacent reordered vectors.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect[k % 3], vect[(k + 1) % 3],
					   shift1_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect_shift[k] = data_ref;
	}

      /* Second shift pass combining the first-pass results.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect_shift[(4 - k) % 3],
					   vect_shift[(3 - k) % 3],
					   shift2_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      /* Final single-vector shifts put the remaining outputs in place.  */
      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
				       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
				       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  /* Neither supported shape (power-of-two or 3) applies.  */
  return false;
}
639938fd1498Szrj
640038fd1498Szrj /* Function vect_transform_grouped_load.
640138fd1498Szrj
640238fd1498Szrj Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
640338fd1498Szrj to perform their permutation and ascribe the result vectorized statements to
640438fd1498Szrj the scalar statements.
640538fd1498Szrj */
640638fd1498Szrj
640738fd1498Szrj void
vect_transform_grouped_load(gimple * stmt,vec<tree> dr_chain,int size,gimple_stmt_iterator * gsi)640838fd1498Szrj vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
640938fd1498Szrj gimple_stmt_iterator *gsi)
641038fd1498Szrj {
641138fd1498Szrj machine_mode mode;
641238fd1498Szrj vec<tree> result_chain = vNULL;
641338fd1498Szrj
641438fd1498Szrj /* DR_CHAIN contains input data-refs that are a part of the interleaving.
641538fd1498Szrj RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
641638fd1498Szrj vectors, that are ready for vector computation. */
641738fd1498Szrj result_chain.create (size);
641838fd1498Szrj
641938fd1498Szrj /* If reassociation width for vector type is 2 or greater target machine can
642038fd1498Szrj execute 2 or more vector instructions in parallel. Otherwise try to
642138fd1498Szrj get chain for loads group using vect_shift_permute_load_chain. */
642238fd1498Szrj mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
642338fd1498Szrj if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
642438fd1498Szrj || pow2p_hwi (size)
642538fd1498Szrj || !vect_shift_permute_load_chain (dr_chain, size, stmt,
642638fd1498Szrj gsi, &result_chain))
642738fd1498Szrj vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
642838fd1498Szrj vect_record_grouped_load_vectors (stmt, result_chain);
642938fd1498Szrj result_chain.release ();
643038fd1498Szrj }
643138fd1498Szrj
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      /* Fewer group members than result vectors: nothing left to record.  */
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      /* Record TMP_DATA_REF's defining statement for NEXT_STMT and for any
	 following statements that share the same data reference.  */
      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT.  */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  /* Walk to the end of the RELATED_STMT chain and append
		     NEW_STMT there.  */
		  gimple *prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple *rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}
650538fd1498Szrj
650638fd1498Szrj /* Function vect_force_dr_alignment_p.
650738fd1498Szrj
650838fd1498Szrj Returns whether the alignment of a DECL can be forced to be aligned
650938fd1498Szrj on ALIGNMENT bit boundary. */
651038fd1498Szrj
651138fd1498Szrj bool
vect_can_force_dr_alignment_p(const_tree decl,unsigned int alignment)651238fd1498Szrj vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
651338fd1498Szrj {
651438fd1498Szrj if (!VAR_P (decl))
651538fd1498Szrj return false;
651638fd1498Szrj
651738fd1498Szrj if (decl_in_symtab_p (decl)
651838fd1498Szrj && !symtab_node::get (decl)->can_increase_alignment_p ())
651938fd1498Szrj return false;
652038fd1498Szrj
652138fd1498Szrj if (TREE_STATIC (decl))
652238fd1498Szrj return (alignment <= MAX_OFILE_ALIGNMENT);
652338fd1498Szrj else
652438fd1498Szrj return (alignment <= MAX_STACK_ALIGNMENT);
652538fd1498Szrj }
652638fd1498Szrj
652738fd1498Szrj
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  /* Aligned accesses are trivially fine unless the caller explicitly
     asks us to evaluate them as if they were not.  */
  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that is will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      /* Prefer the explicit realignment scheme when the target supports
	 vec_realign_load and provides a mask for it (or needs none).  */
      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			      * GROUP_SIZE (vinfo_for_stmt
					    (GROUP_FIRST_ELEMENT (stmt_info))),
			      TYPE_VECTOR_SUBPARTS (vectype)))
	    ;
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
  else
    {
      /* Stores: only the target's generic misalignment support applies.  */
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

     if (targetm.vectorize.support_vector_misalignment
	   (mode, type, DR_MISALIGNMENT (dr), is_packed))
	return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
6678