138fd1498Szrj /* Loop Vectorization
238fd1498Szrj Copyright (C) 2003-2018 Free Software Foundation, Inc.
338fd1498Szrj Contributed by Dorit Naishlos <dorit@il.ibm.com> and
438fd1498Szrj Ira Rosen <irar@il.ibm.com>
538fd1498Szrj
638fd1498Szrj This file is part of GCC.
738fd1498Szrj
838fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
938fd1498Szrj the terms of the GNU General Public License as published by the Free
1038fd1498Szrj Software Foundation; either version 3, or (at your option) any later
1138fd1498Szrj version.
1238fd1498Szrj
1338fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
1438fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
1538fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1638fd1498Szrj for more details.
1738fd1498Szrj
1838fd1498Szrj You should have received a copy of the GNU General Public License
1938fd1498Szrj along with GCC; see the file COPYING3. If not see
2038fd1498Szrj <http://www.gnu.org/licenses/>. */
2138fd1498Szrj
2238fd1498Szrj #include "config.h"
2338fd1498Szrj #include "system.h"
2438fd1498Szrj #include "coretypes.h"
2538fd1498Szrj #include "backend.h"
2638fd1498Szrj #include "target.h"
2738fd1498Szrj #include "rtl.h"
2838fd1498Szrj #include "tree.h"
2938fd1498Szrj #include "gimple.h"
3038fd1498Szrj #include "cfghooks.h"
3138fd1498Szrj #include "tree-pass.h"
3238fd1498Szrj #include "ssa.h"
3338fd1498Szrj #include "optabs-tree.h"
3438fd1498Szrj #include "diagnostic-core.h"
3538fd1498Szrj #include "fold-const.h"
3638fd1498Szrj #include "stor-layout.h"
3738fd1498Szrj #include "cfganal.h"
3838fd1498Szrj #include "gimplify.h"
3938fd1498Szrj #include "gimple-iterator.h"
4038fd1498Szrj #include "gimplify-me.h"
4138fd1498Szrj #include "tree-ssa-loop-ivopts.h"
4238fd1498Szrj #include "tree-ssa-loop-manip.h"
4338fd1498Szrj #include "tree-ssa-loop-niter.h"
4438fd1498Szrj #include "tree-ssa-loop.h"
4538fd1498Szrj #include "cfgloop.h"
4638fd1498Szrj #include "params.h"
4738fd1498Szrj #include "tree-scalar-evolution.h"
4838fd1498Szrj #include "tree-vectorizer.h"
4938fd1498Szrj #include "gimple-fold.h"
5038fd1498Szrj #include "cgraph.h"
5138fd1498Szrj #include "tree-cfg.h"
5238fd1498Szrj #include "tree-if-conv.h"
5338fd1498Szrj #include "internal-fn.h"
5438fd1498Szrj #include "tree-vector-builder.h"
5538fd1498Szrj #include "vec-perm-indices.h"
5638fd1498Szrj #include "tree-eh.h"
5738fd1498Szrj
5838fd1498Szrj /* Loop Vectorization Pass.
5938fd1498Szrj
6038fd1498Szrj This pass tries to vectorize loops.
6138fd1498Szrj
6238fd1498Szrj For example, the vectorizer transforms the following simple loop:
6338fd1498Szrj
6438fd1498Szrj short a[N]; short b[N]; short c[N]; int i;
6538fd1498Szrj
6638fd1498Szrj for (i=0; i<N; i++){
6738fd1498Szrj a[i] = b[i] + c[i];
6838fd1498Szrj }
6938fd1498Szrj
7038fd1498Szrj as if it was manually vectorized by rewriting the source code into:
7138fd1498Szrj
7238fd1498Szrj typedef int __attribute__((mode(V8HI))) v8hi;
7338fd1498Szrj short a[N]; short b[N]; short c[N]; int i;
7438fd1498Szrj v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
7538fd1498Szrj v8hi va, vb, vc;
7638fd1498Szrj
7738fd1498Szrj for (i=0; i<N/8; i++){
7838fd1498Szrj vb = pb[i];
7938fd1498Szrj vc = pc[i];
8038fd1498Szrj va = vb + vc;
8138fd1498Szrj pa[i] = va;
8238fd1498Szrj }
8338fd1498Szrj
8438fd1498Szrj The main entry to this pass is vectorize_loops(), in which
8538fd1498Szrj the vectorizer applies a set of analyses on a given set of loops,
8638fd1498Szrj followed by the actual vectorization transformation for the loops that
8738fd1498Szrj had successfully passed the analysis phase.
8838fd1498Szrj Throughout this pass we make a distinction between two types of
8938fd1498Szrj data: scalars (which are represented by SSA_NAMES), and memory references
9038fd1498Szrj ("data-refs"). These two types of data require different handling both
9138fd1498Szrj during analysis and transformation. The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
9438fd1498Szrj accesses are required to have a simple (consecutive) access pattern.
9538fd1498Szrj
9638fd1498Szrj Analysis phase:
9738fd1498Szrj ===============
9838fd1498Szrj The driver for the analysis phase is vect_analyze_loop().
9938fd1498Szrj It applies a set of analyses, some of which rely on the scalar evolution
10038fd1498Szrj analyzer (scev) developed by Sebastian Pop.
10138fd1498Szrj
10238fd1498Szrj During the analysis phase the vectorizer records some information
10338fd1498Szrj per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
10438fd1498Szrj loop, as well as general information about the loop as a whole, which is
10538fd1498Szrj recorded in a "loop_vec_info" struct attached to each loop.
10638fd1498Szrj
10738fd1498Szrj Transformation phase:
10838fd1498Szrj =====================
10938fd1498Szrj The loop transformation phase scans all the stmts in the loop, and
11038fd1498Szrj creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
11138fd1498Szrj the loop that needs to be vectorized. It inserts the vector code sequence
11238fd1498Szrj just before the scalar stmt S, and records a pointer to the vector code
11338fd1498Szrj in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
11438fd1498Szrj attached to S). This pointer will be used for the vectorization of following
11538fd1498Szrj stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
11638fd1498Szrj otherwise, we rely on dead code elimination for removing it.
11738fd1498Szrj
11838fd1498Szrj For example, say stmt S1 was vectorized into stmt VS1:
11938fd1498Szrj
12038fd1498Szrj VS1: vb = px[i];
12138fd1498Szrj S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
12238fd1498Szrj S2: a = b;
12338fd1498Szrj
12438fd1498Szrj To vectorize stmt S2, the vectorizer first finds the stmt that defines
12538fd1498Szrj the operand 'b' (S1), and gets the relevant vector def 'vb' from the
12638fd1498Szrj vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
12738fd1498Szrj resulting sequence would be:
12838fd1498Szrj
12938fd1498Szrj VS1: vb = px[i];
13038fd1498Szrj S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
13138fd1498Szrj VS2: va = vb;
13238fd1498Szrj S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
13338fd1498Szrj
13438fd1498Szrj Operands that are not SSA_NAMEs, are data-refs that appear in
13538fd1498Szrj load/store operations (like 'x[i]' in S1), and are handled differently.
13638fd1498Szrj
13738fd1498Szrj Target modeling:
13838fd1498Szrj =================
13938fd1498Szrj Currently the only target specific information that is used is the
14038fd1498Szrj size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
14138fd1498Szrj Targets that can support different sizes of vectors, for now will need
14238fd1498Szrj to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
14338fd1498Szrj flexibility will be added in the future.
14438fd1498Szrj
14538fd1498Szrj Since we only vectorize operations which vector form can be
14638fd1498Szrj expressed using existing tree codes, to verify that an operation is
14738fd1498Szrj supported, the vectorizer checks the relevant optab at the relevant
14838fd1498Szrj machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
14938fd1498Szrj the value found is CODE_FOR_nothing, then there's no target support, and
15038fd1498Szrj we can't vectorize the stmt.
15138fd1498Szrj
15238fd1498Szrj For additional information on this project see:
15338fd1498Szrj http://gcc.gnu.org/projects/tree-ssa/vectorization.html
15438fd1498Szrj */
15538fd1498Szrj
15638fd1498Szrj static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
15738fd1498Szrj
15838fd1498Szrj /* Function vect_determine_vectorization_factor
15938fd1498Szrj
16038fd1498Szrj Determine the vectorization factor (VF). VF is the number of data elements
16138fd1498Szrj that are operated upon in parallel in a single iteration of the vectorized
16238fd1498Szrj loop. For example, when vectorizing a loop that operates on 4byte elements,
16338fd1498Szrj on a target with vector size (VS) 16byte, the VF is set to 4, since 4
16438fd1498Szrj elements can fit in a single vector register.
16538fd1498Szrj
16638fd1498Szrj We currently support vectorization of loops in which all types operated upon
16738fd1498Szrj are of the same size. Therefore this function currently sets VF according to
16838fd1498Szrj the size of the types operated upon, and fails if there are multiple sizes
16938fd1498Szrj in the loop.
17038fd1498Szrj
17138fd1498Szrj VF is also the factor by which the loop iterations are strip-mined, e.g.:
17238fd1498Szrj original loop:
17338fd1498Szrj for (i=0; i<N; i++){
17438fd1498Szrj a[i] = b[i] + c[i];
17538fd1498Szrj }
17638fd1498Szrj
17738fd1498Szrj vectorized loop:
17838fd1498Szrj for (i=0; i<N; i+=VF){
17938fd1498Szrj a[i:VF] = b[i:VF] + c[i:VF];
18038fd1498Szrj }
18138fd1498Szrj */
18238fd1498Szrj
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  /* Running maximum of nunits over all vector types seen so far; this
     becomes the loop's vectorization factor.  */
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;
  HOST_WIDE_INT dummy;
  gimple *stmt, *pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;
  bool bool_result;
  /* Statements producing boolean masks.  Their vectype cannot be chosen
     from the scalar type alone, so it is deferred and computed from the
     operands' vectypes after the main walk below.  */
  auto_vec<stmt_vec_info> mask_producers;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===\n");

  /* Walk every basic block of the loop.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      /* First examine the PHIs: a relevant/live PHI contributes the
	 vectype of its result to the VF computation.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = vinfo_for_stmt (phi);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
	    }

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      /* PHIs should not have had a vectype assigned yet.  */
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "get vectype for scalar type: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		  dump_printf (MSG_NOTE, "\n");
		}

	      vectype = get_vectype_for_scalar_type (scalar_type);
	      if (!vectype)
		{
		  /* No target vector type for this scalar type: the loop
		     cannot be vectorized.  */
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: unsupported "
				       "data-type ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 scalar_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
		  dump_printf (MSG_NOTE, "\n");
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      /* Fold this vectype's unit count into the running VF.  */
	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      /* Now the non-PHI statements.  The iterator is advanced manually
	 because pattern statements and their def sequences are analyzed
	 "in place of" the original statement: while a pattern (def) stmt
	 is being processed, SI stays put.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si) || analyze_pattern_stmt;)
	{
	  tree vf_vectype;

	  /* Either continue with the pending pattern stmt or pick up the
	     next statement from the block.  */
	  if (analyze_pattern_stmt)
	    stmt = pattern_stmt;
	  else
	    stmt = gsi_stmt (si);

	  stmt_info = vinfo_for_stmt (stmt);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "==> examining statement: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	    }

	  gcc_assert (stmt_info);

	  /* Skip stmts which do not need to be vectorized.  */
	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
	       && !STMT_VINFO_LIVE_P (stmt_info))
	      || gimple_clobber_p (stmt))
	    {
	      /* An irrelevant stmt may still have a relevant/live pattern
		 stmt attached; if so, analyze that instead.  */
	      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
		  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
		  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
		      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
		{
		  stmt = pattern_stmt;
		  stmt_info = vinfo_for_stmt (pattern_stmt);
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_NOTE, vect_location,
				       "==> examining pattern statement: ");
		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
		    }
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
		  gsi_next (&si);
		  continue;
		}
	    }
	  /* A relevant stmt with a relevant/live pattern stmt: remember to
	     analyze the pattern stmt on the next iteration (SI is not
	     advanced until the pattern side is done).  */
	  else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
		   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
		   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
		       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
	    analyze_pattern_stmt = true;

	  /* If a pattern statement has def stmts, analyze them too.  */
	  if (is_pattern_stmt_p (stmt_info))
	    {
	      if (pattern_def_seq == NULL)
		{
		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		  pattern_def_si = gsi_start (pattern_def_seq);
		}
	      else if (!gsi_end_p (pattern_def_si))
		gsi_next (&pattern_def_si);
	      if (pattern_def_seq != NULL)
		{
		  gimple *pattern_def_stmt = NULL;
		  stmt_vec_info pattern_def_stmt_info = NULL;

		  /* Find the next relevant/live def stmt in the sequence.  */
		  while (!gsi_end_p (pattern_def_si))
		    {
		      pattern_def_stmt = gsi_stmt (pattern_def_si);
		      pattern_def_stmt_info
			= vinfo_for_stmt (pattern_def_stmt);
		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
			break;
		      gsi_next (&pattern_def_si);
		    }

		  if (!gsi_end_p (pattern_def_si))
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_NOTE, vect_location,
					   "==> examining pattern def stmt: ");
			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
					    pattern_def_stmt, 0);
			}

		      /* Analyze the def stmt in place of the pattern stmt
			 this iteration.  */
		      stmt = pattern_def_stmt;
		      stmt_info = pattern_def_stmt_info;
		    }
		  else
		    {
		      /* Def sequence exhausted; back to normal stmts.  */
		      pattern_def_si = gsi_none ();
		      analyze_pattern_stmt = false;
		    }
		}
	      else
		analyze_pattern_stmt = false;
	    }

	  if (gimple_get_lhs (stmt) == NULL_TREE
	      /* MASK_STORE has no lhs, but is ok.  */
	      && (!is_gimple_call (stmt)
		  || !gimple_call_internal_p (stmt)
		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
	    {
	      if (is_gimple_call (stmt))
		{
		  /* Ignore calls with no lhs.  These must be calls to
		     #pragma omp simd functions, and what vectorization factor
		     it really needs can't be determined until
		     vectorizable_simd_clone_call.  */
		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
		    {
		      pattern_def_seq = NULL;
		      gsi_next (&si);
		    }
		  continue;
		}
	      /* Any other stmt without an lhs cannot be vectorized.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: irregular stmt.");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
				    0);
		}
	      return false;
	    }

	  /* Pre-existing vector code in the loop is not handled.  */
	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: vector stmt in loop:");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      return false;
	    }

	  bool_result = false;

	  if (STMT_VINFO_VECTYPE (stmt_info))
	    {
	      /* The only case when a vectype had been already set is for stmts
		 that contain a dataref, or for "pattern-stmts" (stmts
		 generated by the vectorizer to represent/replace a certain
		 idiom).  */
	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
			  || is_pattern_stmt_p (stmt_info)
			  || !gsi_end_p (pattern_def_si));
	      vectype = STMT_VINFO_VECTYPE (stmt_info);
	    }
	  else
	    {
	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
	      /* For MASK_STORE the stored value (arg 3) determines the
		 vectype; otherwise the lhs does.  */
	      if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
	      else
		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));

	      /* Bool ops don't participate in vectorization factor
		 computation.  For comparison use compared types to
		 compute a factor.  */
	      if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
		  && is_gimple_assign (stmt)
		  && gimple_assign_rhs_code (stmt) != COND_EXPR)
		{
		  /* Defer the vectype choice for mask producers.  */
		  if (STMT_VINFO_RELEVANT_P (stmt_info)
		      || STMT_VINFO_LIVE_P (stmt_info))
		    mask_producers.safe_push (stmt_info);
		  bool_result = true;

		  if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
		      == tcc_comparison
		      && !VECT_SCALAR_BOOLEAN_TYPE_P
			    (TREE_TYPE (gimple_assign_rhs1 (stmt))))
		    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
		  else
		    {
		      /* Pure boolean operation: nothing to add to the VF;
			 advance the iterator and go on.  */
		      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
			{
			  pattern_def_seq = NULL;
			  gsi_next (&si);
			}
		      continue;
		    }
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "get vectype for scalar type: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		  dump_printf (MSG_NOTE, "\n");
		}
	      vectype = get_vectype_for_scalar_type (scalar_type);
	      if (!vectype)
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: unsupported "
				       "data-type ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 scalar_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}

	      /* Mask producers keep their vectype unset here; it is filled
		 in by the deferred loop below.  */
	      if (!bool_result)
		STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
		  dump_printf (MSG_NOTE, "\n");
		}
	    }

	  /* Don't try to compute the VF from scalar types if the stmt
	     produces a boolean vector; use the result vectype instead.  */
	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
	    vf_vectype = vectype;
	  else
	    {
	      /* The vectorization factor is according to the smallest
		 scalar type (or the largest vector size, but we only
		 support one vector size per loop).  */
	      if (!bool_result)
		scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
							     &dummy);
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "get vectype for scalar type: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		  dump_printf (MSG_NOTE, "\n");
		}
	      vf_vectype = get_vectype_for_scalar_type (scalar_type);
	    }
	  if (!vf_vectype)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: unsupported data-type ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				     scalar_type);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return false;
	    }

	  /* Only one vector size per loop is supported: the result vectype
	     and the VF vectype must occupy the same number of bytes.  */
	  if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
			GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: different sized vector "
				   "types in statement, ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				     vectype);
		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				     vf_vectype);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return false;
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
	      dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
	      dump_printf (MSG_NOTE, "\n");
	    }

	  vect_update_max_nunits (&vectorization_factor, vf_vectype);

	  /* Only advance SI when neither a pattern stmt nor a pattern def
	     stmt is still pending for the current statement.  */
	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
	    {
	      pattern_def_seq = NULL;
	      gsi_next (&si);
	    }
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  /* A VF of 1 means nothing was actually vectorizable.  */
  if (known_le (vectorization_factor, 1U))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unsupported data-type\n");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  /* Second pass: now that the VF is known, choose mask types for the
     deferred mask-producing statements.  */
  for (i = 0; i < mask_producers.length (); i++)
    {
      tree mask_type = NULL;

      stmt = STMT_VINFO_STMT (mask_producers[i]);

      if (is_gimple_assign (stmt)
	  && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
	  && !VECT_SCALAR_BOOLEAN_TYPE_P
		(TREE_TYPE (gimple_assign_rhs1 (stmt))))
	{
	  /* Comparison of non-boolean operands: derive the mask type from
	     the compared scalar type.  */
	  scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
	  mask_type = get_mask_type_for_scalar_type (scalar_type);

	  if (!mask_type)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "not vectorized: unsupported mask\n");
	      return false;
	    }
	}
      else
	{
	  /* Otherwise derive the mask type from the operands' vectypes,
	     requiring them all to agree.  */
	  tree rhs;
	  ssa_op_iter iter;
	  gimple *def_stmt;
	  enum vect_def_type dt;

	  FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
	    {
	      if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
				       &def_stmt, &dt, &vectype))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: can't compute mask type "
				       "for statement, ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
					0);
		    }
		  return false;
		}

	      /* No vectype probably means external definition.
		 Allow it in case there is another operand which
		 allows to determine mask type.  */
	      if (!vectype)
		continue;

	      if (!mask_type)
		mask_type = vectype;
	      else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
				 TYPE_VECTOR_SUBPARTS (vectype)))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: different sized masks "
				       "types in statement, ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 mask_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 vectype);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	      else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
		       != VECTOR_BOOLEAN_TYPE_P (vectype))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: mixed mask and "
				       "nonmask vector types in statement, ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 mask_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 vectype);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	    }

	  /* We may compare boolean value loaded as vector of integers.
	     Fix mask_type in such case.  */
	  if (mask_type
	      && !VECTOR_BOOLEAN_TYPE_P (mask_type)
	      && gimple_code (stmt) == GIMPLE_ASSIGN
	      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
	    mask_type = build_same_sized_truth_vector_type (mask_type);
	}

      /* No mask_type should mean loop invariant predicate.
	 This is probably a subject for optimization in
	 if-conversion.  */
      if (!mask_type)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: can't compute mask type "
			       "for statement, ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
				0);
	    }
	  return false;
	}

      STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
    }

  return true;
}
70938fd1498Szrj
71038fd1498Szrj
71138fd1498Szrj /* Function vect_is_simple_iv_evolution.
71238fd1498Szrj
71338fd1498Szrj FORNOW: A simple evolution of an induction variables in the loop is
71438fd1498Szrj considered a polynomial evolution. */
71538fd1498Szrj
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
			     tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  /* A simple IV is {init, +, step}: the evolution part is the step and
     the initial condition gives the value on loop entry.  Unshare the
     init expression since callers may embed it in new trees.  */
  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
      dump_printf (MSG_NOTE, ", init: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Note: *INIT and *STEP are set even when we return false below, so
     callers may inspect them regardless of the result.  */
  *init = init_expr;
  *step = step_expr;

  /* Accept the step if it is:
     - an integer constant,
     - an SSA name defined outside the loop whose type is integral
       (or float, when -fassociative-math permits reordering), or
     - a real constant, again only under -fassociative-math.
     Anything else makes the evolution unusable for vectorization.  */
  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
76838fd1498Szrj
76938fd1498Szrj /* Function vect_analyze_scalar_cycles_1.
77038fd1498Szrj
77138fd1498Szrj Examine the cross iteration def-use cycles of scalar variables
77238fd1498Szrj in LOOP. LOOP_VINFO represents the loop that is now being
77338fd1498Szrj considered for vectorization (can be LOOP, or an outer-loop
77438fd1498Szrj enclosing LOOP). */
77538fd1498Szrj
77638fd1498Szrj static void
vect_analyze_scalar_cycles_1(loop_vec_info loop_vinfo,struct loop * loop)77738fd1498Szrj vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
77838fd1498Szrj {
77938fd1498Szrj basic_block bb = loop->header;
78038fd1498Szrj tree init, step;
78138fd1498Szrj auto_vec<gimple *, 64> worklist;
78238fd1498Szrj gphi_iterator gsi;
78338fd1498Szrj bool double_reduc;
78438fd1498Szrj
78538fd1498Szrj if (dump_enabled_p ())
78638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
78738fd1498Szrj "=== vect_analyze_scalar_cycles ===\n");
78838fd1498Szrj
78938fd1498Szrj /* First - identify all inductions. Reduction detection assumes that all the
79038fd1498Szrj inductions have been identified, therefore, this order must not be
79138fd1498Szrj changed. */
79238fd1498Szrj for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
79338fd1498Szrj {
79438fd1498Szrj gphi *phi = gsi.phi ();
79538fd1498Szrj tree access_fn = NULL;
79638fd1498Szrj tree def = PHI_RESULT (phi);
79738fd1498Szrj stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
79838fd1498Szrj
79938fd1498Szrj if (dump_enabled_p ())
80038fd1498Szrj {
80138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
80238fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
80338fd1498Szrj }
80438fd1498Szrj
80538fd1498Szrj /* Skip virtual phi's. The data dependences that are associated with
80638fd1498Szrj virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
80738fd1498Szrj if (virtual_operand_p (def))
80838fd1498Szrj continue;
80938fd1498Szrj
81038fd1498Szrj STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
81138fd1498Szrj
81238fd1498Szrj /* Analyze the evolution function. */
81338fd1498Szrj access_fn = analyze_scalar_evolution (loop, def);
81438fd1498Szrj if (access_fn)
81538fd1498Szrj {
81638fd1498Szrj STRIP_NOPS (access_fn);
81738fd1498Szrj if (dump_enabled_p ())
81838fd1498Szrj {
81938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
82038fd1498Szrj "Access function of PHI: ");
82138fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
82238fd1498Szrj dump_printf (MSG_NOTE, "\n");
82338fd1498Szrj }
82438fd1498Szrj STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
82538fd1498Szrj = initial_condition_in_loop_num (access_fn, loop->num);
82638fd1498Szrj STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
82738fd1498Szrj = evolution_part_in_loop_num (access_fn, loop->num);
82838fd1498Szrj }
82938fd1498Szrj
83038fd1498Szrj if (!access_fn
83138fd1498Szrj || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
83238fd1498Szrj || (LOOP_VINFO_LOOP (loop_vinfo) != loop
83338fd1498Szrj && TREE_CODE (step) != INTEGER_CST))
83438fd1498Szrj {
83538fd1498Szrj worklist.safe_push (phi);
83638fd1498Szrj continue;
83738fd1498Szrj }
83838fd1498Szrj
83938fd1498Szrj gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
84038fd1498Szrj != NULL_TREE);
84138fd1498Szrj gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
84238fd1498Szrj
84338fd1498Szrj if (dump_enabled_p ())
84438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
84538fd1498Szrj STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
84638fd1498Szrj }
84738fd1498Szrj
84838fd1498Szrj
84938fd1498Szrj /* Second - identify all reductions and nested cycles. */
85038fd1498Szrj while (worklist.length () > 0)
85138fd1498Szrj {
85238fd1498Szrj gimple *phi = worklist.pop ();
85338fd1498Szrj tree def = PHI_RESULT (phi);
85438fd1498Szrj stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
85538fd1498Szrj gimple *reduc_stmt;
85638fd1498Szrj
85738fd1498Szrj if (dump_enabled_p ())
85838fd1498Szrj {
85938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
86038fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
86138fd1498Szrj }
86238fd1498Szrj
86338fd1498Szrj gcc_assert (!virtual_operand_p (def)
86438fd1498Szrj && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
86538fd1498Szrj
86638fd1498Szrj reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
86738fd1498Szrj &double_reduc, false);
86838fd1498Szrj if (reduc_stmt)
86938fd1498Szrj {
87038fd1498Szrj if (double_reduc)
87138fd1498Szrj {
87238fd1498Szrj if (dump_enabled_p ())
87338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
87438fd1498Szrj "Detected double reduction.\n");
87538fd1498Szrj
87638fd1498Szrj STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
87738fd1498Szrj STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
87838fd1498Szrj vect_double_reduction_def;
87938fd1498Szrj }
88038fd1498Szrj else
88138fd1498Szrj {
88238fd1498Szrj if (loop != LOOP_VINFO_LOOP (loop_vinfo))
88338fd1498Szrj {
88438fd1498Szrj if (dump_enabled_p ())
88538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
88638fd1498Szrj "Detected vectorizable nested cycle.\n");
88738fd1498Szrj
88838fd1498Szrj STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
88938fd1498Szrj STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
89038fd1498Szrj vect_nested_cycle;
89138fd1498Szrj }
89238fd1498Szrj else
89338fd1498Szrj {
89438fd1498Szrj if (dump_enabled_p ())
89538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
89638fd1498Szrj "Detected reduction.\n");
89738fd1498Szrj
89838fd1498Szrj STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
89938fd1498Szrj STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
90038fd1498Szrj vect_reduction_def;
90138fd1498Szrj /* Store the reduction cycles for possible vectorization in
90238fd1498Szrj loop-aware SLP if it was not detected as reduction
90338fd1498Szrj chain. */
90438fd1498Szrj if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
90538fd1498Szrj LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
90638fd1498Szrj }
90738fd1498Szrj }
90838fd1498Szrj }
90938fd1498Szrj else
91038fd1498Szrj if (dump_enabled_p ())
91138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
91238fd1498Szrj "Unknown def-use cycle pattern.\n");
91338fd1498Szrj }
91438fd1498Szrj }
91538fd1498Szrj
91638fd1498Szrj
91738fd1498Szrj /* Function vect_analyze_scalar_cycles.
91838fd1498Szrj
91938fd1498Szrj Examine the cross iteration def-use cycles of scalar variables, by
92038fd1498Szrj analyzing the loop-header PHIs of scalar variables. Classify each
92138fd1498Szrj cycle as one of the following: invariant, induction, reduction, unknown.
92238fd1498Szrj We do that for the loop represented by LOOP_VINFO, and also to its
92338fd1498Szrj inner-loop, if exists.
92438fd1498Szrj Examples for scalar cycles:
92538fd1498Szrj
92638fd1498Szrj Example1: reduction:
92738fd1498Szrj
92838fd1498Szrj loop1:
92938fd1498Szrj for (i=0; i<N; i++)
93038fd1498Szrj sum += a[i];
93138fd1498Szrj
93238fd1498Szrj Example2: induction:
93338fd1498Szrj
93438fd1498Szrj loop2:
93538fd1498Szrj for (i=0; i<N; i++)
93638fd1498Szrj a[i] = i; */
93738fd1498Szrj
93838fd1498Szrj static void
vect_analyze_scalar_cycles(loop_vec_info loop_vinfo)93938fd1498Szrj vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
94038fd1498Szrj {
94138fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
94238fd1498Szrj
94338fd1498Szrj vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
94438fd1498Szrj
94538fd1498Szrj /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
94638fd1498Szrj Reductions in such inner-loop therefore have different properties than
94738fd1498Szrj the reductions in the nest that gets vectorized:
94838fd1498Szrj 1. When vectorized, they are executed in the same order as in the original
94938fd1498Szrj scalar loop, so we can't change the order of computation when
95038fd1498Szrj vectorizing them.
95138fd1498Szrj 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
95238fd1498Szrj current checks are too strict. */
95338fd1498Szrj
95438fd1498Szrj if (loop->inner)
95538fd1498Szrj vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
95638fd1498Szrj }
95738fd1498Szrj
95838fd1498Szrj /* Transfer group and reduction information from STMT to its pattern stmt. */
95938fd1498Szrj
static void
vect_fixup_reduc_chain (gimple *stmt)
{
  /* The pattern statement standing in for the chain head.  */
  gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
  gimple *stmtp;
  /* Callers guarantee STMT heads a reduction chain and FIRSTP does not
     yet belong to one.  */
  gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
	      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
  GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
  /* Walk the original chain, rebuilding the FIRST/NEXT links on the
     corresponding pattern statements.  */
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
      stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      if (stmt)
	GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
	  = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
    }
  while (stmt);
  /* After the loop STMTP is the pattern stmt of the chain's last element;
     mark it as the reduction def.  */
  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
}
98038fd1498Szrj
98138fd1498Szrj /* Fixup scalar cycles that now have their stmts detected as patterns. */
98238fd1498Szrj
98338fd1498Szrj static void
vect_fixup_scalar_cycles_with_patterns(loop_vec_info loop_vinfo)98438fd1498Szrj vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
98538fd1498Szrj {
98638fd1498Szrj gimple *first;
98738fd1498Szrj unsigned i;
98838fd1498Szrj
98938fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
99038fd1498Szrj if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
99138fd1498Szrj {
99238fd1498Szrj gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
99338fd1498Szrj while (next)
99438fd1498Szrj {
99538fd1498Szrj if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
99638fd1498Szrj break;
99738fd1498Szrj next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
99838fd1498Szrj }
99938fd1498Szrj /* If not all stmt in the chain are patterns try to handle
100038fd1498Szrj the chain without patterns. */
100138fd1498Szrj if (! next)
100238fd1498Szrj {
100338fd1498Szrj vect_fixup_reduc_chain (first);
100438fd1498Szrj LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
100538fd1498Szrj = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
100638fd1498Szrj }
100738fd1498Szrj }
100838fd1498Szrj }
100938fd1498Szrj
101038fd1498Szrj /* Function vect_get_loop_niters.
101138fd1498Szrj
101238fd1498Szrj Determine how many iterations the loop is executed and place it
101338fd1498Szrj in NUMBER_OF_ITERATIONS. Place the number of latch iterations
101438fd1498Szrj in NUMBER_OF_ITERATIONSM1. Place the condition under which the
101538fd1498Szrj niter information holds in ASSUMPTIONS.
101638fd1498Szrj
101738fd1498Szrj Return the loop exit condition. */
101838fd1498Szrj
101938fd1498Szrj
102038fd1498Szrj static gcond *
vect_get_loop_niters(struct loop * loop,tree * assumptions,tree * number_of_iterations,tree * number_of_iterationsm1)102138fd1498Szrj vect_get_loop_niters (struct loop *loop, tree *assumptions,
102238fd1498Szrj tree *number_of_iterations, tree *number_of_iterationsm1)
102338fd1498Szrj {
102438fd1498Szrj edge exit = single_exit (loop);
102538fd1498Szrj struct tree_niter_desc niter_desc;
102638fd1498Szrj tree niter_assumptions, niter, may_be_zero;
102738fd1498Szrj gcond *cond = get_loop_exit_condition (loop);
102838fd1498Szrj
102938fd1498Szrj *assumptions = boolean_true_node;
103038fd1498Szrj *number_of_iterationsm1 = chrec_dont_know;
103138fd1498Szrj *number_of_iterations = chrec_dont_know;
103238fd1498Szrj if (dump_enabled_p ())
103338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
103438fd1498Szrj "=== get_loop_niters ===\n");
103538fd1498Szrj
103638fd1498Szrj if (!exit)
103738fd1498Szrj return cond;
103838fd1498Szrj
103938fd1498Szrj niter = chrec_dont_know;
104038fd1498Szrj may_be_zero = NULL_TREE;
104138fd1498Szrj niter_assumptions = boolean_true_node;
104238fd1498Szrj if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
104338fd1498Szrj || chrec_contains_undetermined (niter_desc.niter))
104438fd1498Szrj return cond;
104538fd1498Szrj
104638fd1498Szrj niter_assumptions = niter_desc.assumptions;
104738fd1498Szrj may_be_zero = niter_desc.may_be_zero;
104838fd1498Szrj niter = niter_desc.niter;
104938fd1498Szrj
105038fd1498Szrj if (may_be_zero && integer_zerop (may_be_zero))
105138fd1498Szrj may_be_zero = NULL_TREE;
105238fd1498Szrj
105338fd1498Szrj if (may_be_zero)
105438fd1498Szrj {
105538fd1498Szrj if (COMPARISON_CLASS_P (may_be_zero))
105638fd1498Szrj {
105738fd1498Szrj /* Try to combine may_be_zero with assumptions, this can simplify
105838fd1498Szrj computation of niter expression. */
105938fd1498Szrj if (niter_assumptions && !integer_nonzerop (niter_assumptions))
106038fd1498Szrj niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
106138fd1498Szrj niter_assumptions,
106238fd1498Szrj fold_build1 (TRUTH_NOT_EXPR,
106338fd1498Szrj boolean_type_node,
106438fd1498Szrj may_be_zero));
106538fd1498Szrj else
106638fd1498Szrj niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
106738fd1498Szrj build_int_cst (TREE_TYPE (niter), 0),
106838fd1498Szrj rewrite_to_non_trapping_overflow (niter));
106938fd1498Szrj
107038fd1498Szrj may_be_zero = NULL_TREE;
107138fd1498Szrj }
107238fd1498Szrj else if (integer_nonzerop (may_be_zero))
107338fd1498Szrj {
107438fd1498Szrj *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
107538fd1498Szrj *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
107638fd1498Szrj return cond;
107738fd1498Szrj }
107838fd1498Szrj else
107938fd1498Szrj return cond;
108038fd1498Szrj }
108138fd1498Szrj
108238fd1498Szrj *assumptions = niter_assumptions;
108338fd1498Szrj *number_of_iterationsm1 = niter;
108438fd1498Szrj
108538fd1498Szrj /* We want the number of loop header executions which is the number
108638fd1498Szrj of latch executions plus one.
108738fd1498Szrj ??? For UINT_MAX latch executions this number overflows to zero
108838fd1498Szrj for loops like do { n++; } while (n != 0); */
108938fd1498Szrj if (niter && !chrec_contains_undetermined (niter))
109038fd1498Szrj niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
109138fd1498Szrj build_int_cst (TREE_TYPE (niter), 1));
109238fd1498Szrj *number_of_iterations = niter;
109338fd1498Szrj
109438fd1498Szrj return cond;
109538fd1498Szrj }
109638fd1498Szrj
109738fd1498Szrj /* Function bb_in_loop_p
109838fd1498Szrj
109938fd1498Szrj Used as predicate for dfs order traversal of the loop bbs. */
110038fd1498Szrj
110138fd1498Szrj static bool
bb_in_loop_p(const_basic_block bb,const void * data)110238fd1498Szrj bb_in_loop_p (const_basic_block bb, const void *data)
110338fd1498Szrj {
110438fd1498Szrj const struct loop *const loop = (const struct loop *)data;
110538fd1498Szrj if (flow_bb_inside_loop_p (loop, bb))
110638fd1498Szrj return true;
110738fd1498Szrj return false;
110838fd1498Szrj }
110938fd1498Szrj
111038fd1498Szrj
111138fd1498Szrj /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
111238fd1498Szrj stmt_vec_info structs for all the stmts in LOOP_IN. */
111338fd1498Szrj
_loop_vec_info::_loop_vec_info (struct loop *loop_in)
  : vec_info (vec_info::loop, init_cost (loop_in)),
    loop (loop_in),
    /* BBS is filled in below by dfs_enumerate_from.  */
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    mask_compare_type (NULL_TREE),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    /* Created lazily in cse_and_gimplify_to_preheader.  */
    ivexpr_map (NULL),
    slp_unrolling_factor (1),
    single_scalar_iteration_cost (0),
    vectorizable (false),
    can_fully_mask_p (true),
    fully_masked_p (false),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    operands_swapped (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop (NULL),
    orig_loop_info (NULL)
{
  /* Create/Update stmt_info for all stmts in the loop.  */
  basic_block *body = get_loop_body (loop);
  for (unsigned int i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = body[i];
      gimple_stmt_iterator si;

      /* Reset uids and attach a fresh stmt_vec_info to every phi...  */
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
	}

      /* ...and to every ordinary statement.  */
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
	}
    }
  free (body);

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);
}
117738fd1498Szrj
117838fd1498Szrj /* Free all levels of MASKS. */
117938fd1498Szrj
118038fd1498Szrj void
release_vec_loop_masks(vec_loop_masks * masks)118138fd1498Szrj release_vec_loop_masks (vec_loop_masks *masks)
118238fd1498Szrj {
118338fd1498Szrj rgroup_masks *rgm;
118438fd1498Szrj unsigned int i;
118538fd1498Szrj FOR_EACH_VEC_ELT (*masks, i, rgm)
118638fd1498Szrj rgm->masks.release ();
118738fd1498Szrj masks->release ();
118838fd1498Szrj }
118938fd1498Szrj
119038fd1498Szrj /* Free all memory used by the _loop_vec_info, as well as all the
119138fd1498Szrj stmt_vec_info structs of all the stmts in the loop. */
119238fd1498Szrj
_loop_vec_info::~_loop_vec_info ()
{
  int nbbs;
  gimple_stmt_iterator si;
  int j;

  nbbs = loop->num_nodes;
  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
	  gimple *stmt = gsi_stmt (si);

	  /* We may have broken canonical form by moving a constant
	     into RHS1 of a commutative op.  Fix such occurrences.  */
	  if (operands_swapped && is_gimple_assign (stmt))
	    {
	      enum tree_code code = gimple_assign_rhs_code (stmt);

	      /* Commutative ops: move the constant back to RHS2.  */
	      if ((code == PLUS_EXPR
		   || code == POINTER_PLUS_EXPR
		   || code == MULT_EXPR)
		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
		swap_ssa_operands (stmt,
				   gimple_assign_rhs1_ptr (stmt),
				   gimple_assign_rhs2_ptr (stmt));
	      else if (code == COND_EXPR
		       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
		{
		  tree cond_expr = gimple_assign_rhs1 (stmt);
		  enum tree_code cond_code = TREE_CODE (cond_expr);

		  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		    {
		      /* Undo a swapped COND_EXPR by inverting the
			 comparison (NaN-aware) and swapping the arms;
			 skip if the inversion is not representable.  */
		      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
								  0));
		      cond_code = invert_tree_comparison (cond_code,
							  honor_nans);
		      if (cond_code != ERROR_MARK)
			{
			  TREE_SET_CODE (cond_expr, cond_code);
			  swap_ssa_operands (stmt,
					     gimple_assign_rhs2_ptr (stmt),
					     gimple_assign_rhs3_ptr (stmt));
			}
		    }
		}
	    }

	  /* Free stmt_vec_info.  */
	  free_stmt_vec_info (stmt);
	  gsi_next (&si);
        }
    }

  free (bbs);

  release_vec_loop_masks (&masks);
  delete ivexpr_map;

  /* Detach this loop_vec_info from the loop.  */
  loop->aux = NULL;
}
125938fd1498Szrj
126038fd1498Szrj /* Return an invariant or register for EXPR and emit necessary
126138fd1498Szrj computations in the LOOP_VINFO loop preheader. */
126238fd1498Szrj
126338fd1498Szrj tree
cse_and_gimplify_to_preheader(loop_vec_info loop_vinfo,tree expr)126438fd1498Szrj cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
126538fd1498Szrj {
126638fd1498Szrj if (is_gimple_reg (expr)
126738fd1498Szrj || is_gimple_min_invariant (expr))
126838fd1498Szrj return expr;
126938fd1498Szrj
127038fd1498Szrj if (! loop_vinfo->ivexpr_map)
127138fd1498Szrj loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
127238fd1498Szrj tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
127338fd1498Szrj if (! cached)
127438fd1498Szrj {
127538fd1498Szrj gimple_seq stmts = NULL;
127638fd1498Szrj cached = force_gimple_operand (unshare_expr (expr),
127738fd1498Szrj &stmts, true, NULL_TREE);
127838fd1498Szrj if (stmts)
127938fd1498Szrj {
128038fd1498Szrj edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
128138fd1498Szrj gsi_insert_seq_on_edge_immediate (e, stmts);
128238fd1498Szrj }
128338fd1498Szrj }
128438fd1498Szrj return cached;
128538fd1498Szrj }
128638fd1498Szrj
128738fd1498Szrj /* Return true if we can use CMP_TYPE as the comparison type to produce
128838fd1498Szrj all masks required to mask LOOP_VINFO. */
128938fd1498Szrj
129038fd1498Szrj static bool
can_produce_all_loop_masks_p(loop_vec_info loop_vinfo,tree cmp_type)129138fd1498Szrj can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
129238fd1498Szrj {
129338fd1498Szrj rgroup_masks *rgm;
129438fd1498Szrj unsigned int i;
129538fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
129638fd1498Szrj if (rgm->mask_type != NULL_TREE
129738fd1498Szrj && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
129838fd1498Szrj cmp_type, rgm->mask_type,
129938fd1498Szrj OPTIMIZE_FOR_SPEED))
130038fd1498Szrj return false;
130138fd1498Szrj return true;
130238fd1498Szrj }
130338fd1498Szrj
130438fd1498Szrj /* Calculate the maximum number of scalars per iteration for every
130538fd1498Szrj rgroup in LOOP_VINFO. */
130638fd1498Szrj
130738fd1498Szrj static unsigned int
vect_get_max_nscalars_per_iter(loop_vec_info loop_vinfo)130838fd1498Szrj vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
130938fd1498Szrj {
131038fd1498Szrj unsigned int res = 1;
131138fd1498Szrj unsigned int i;
131238fd1498Szrj rgroup_masks *rgm;
131338fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
131438fd1498Szrj res = MAX (res, rgm->max_nscalars_per_iter);
131538fd1498Szrj return res;
131638fd1498Szrj }
131738fd1498Szrj
131838fd1498Szrj /* Each statement in LOOP_VINFO can be masked where necessary. Check
131938fd1498Szrj whether we can actually generate the masks required. Return true if so,
132038fd1498Szrj storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
132138fd1498Szrj
static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int min_ni_width;

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Account for rgroup masks, in which each bit is replicated N times.  */
  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width = wi::min_precision (max_ni, UNSIGNED);

  /* Find a scalar mode for which WHILE_ULT is supported.  Iterate over
     all integer modes wide enough to hold the limit and pick the last
     acceptable one up to Pmode.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 it's often better to continue until we hit Pmode, since the
		 operands to the WHILE are more likely to be reusable in
		 address calculations.  */
	      cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  /* No usable comparison type means full masking is not possible.  */
  if (!cmp_type)
    return false;

  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
  return true;
}
138038fd1498Szrj
138138fd1498Szrj /* Calculate the cost of one scalar iteration of the loop. */
138238fd1498Szrj static void
vect_compute_single_scalar_iteration_cost(loop_vec_info loop_vinfo)138338fd1498Szrj vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
138438fd1498Szrj {
138538fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
138638fd1498Szrj basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
138738fd1498Szrj int nbbs = loop->num_nodes, factor;
138838fd1498Szrj int innerloop_iters, i;
138938fd1498Szrj
139038fd1498Szrj /* Gather costs for statements in the scalar loop. */
139138fd1498Szrj
139238fd1498Szrj /* FORNOW. */
139338fd1498Szrj innerloop_iters = 1;
139438fd1498Szrj if (loop->inner)
139538fd1498Szrj innerloop_iters = 50; /* FIXME */
139638fd1498Szrj
139738fd1498Szrj for (i = 0; i < nbbs; i++)
139838fd1498Szrj {
139938fd1498Szrj gimple_stmt_iterator si;
140038fd1498Szrj basic_block bb = bbs[i];
140138fd1498Szrj
140238fd1498Szrj if (bb->loop_father == loop->inner)
140338fd1498Szrj factor = innerloop_iters;
140438fd1498Szrj else
140538fd1498Szrj factor = 1;
140638fd1498Szrj
140738fd1498Szrj for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
140838fd1498Szrj {
140938fd1498Szrj gimple *stmt = gsi_stmt (si);
141038fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
141138fd1498Szrj
141238fd1498Szrj if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
141338fd1498Szrj continue;
141438fd1498Szrj
141538fd1498Szrj /* Skip stmts that are not vectorized inside the loop. */
141638fd1498Szrj if (stmt_info
141738fd1498Szrj && !STMT_VINFO_RELEVANT_P (stmt_info)
141838fd1498Szrj && (!STMT_VINFO_LIVE_P (stmt_info)
141938fd1498Szrj || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
142038fd1498Szrj && !STMT_VINFO_IN_PATTERN_P (stmt_info))
142138fd1498Szrj continue;
142238fd1498Szrj
142338fd1498Szrj vect_cost_for_stmt kind;
142438fd1498Szrj if (STMT_VINFO_DATA_REF (stmt_info))
142538fd1498Szrj {
142638fd1498Szrj if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
142738fd1498Szrj kind = scalar_load;
142838fd1498Szrj else
142938fd1498Szrj kind = scalar_store;
143038fd1498Szrj }
143138fd1498Szrj else
143238fd1498Szrj kind = scalar_stmt;
143338fd1498Szrj
143438fd1498Szrj record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
143538fd1498Szrj factor, kind, stmt_info, 0, vect_prologue);
143638fd1498Szrj }
143738fd1498Szrj }
143838fd1498Szrj
143938fd1498Szrj /* Now accumulate cost. */
144038fd1498Szrj void *target_cost_data = init_cost (loop);
144138fd1498Szrj stmt_info_for_cost *si;
144238fd1498Szrj int j;
144338fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
144438fd1498Szrj j, si)
144538fd1498Szrj {
144638fd1498Szrj struct _stmt_vec_info *stmt_info
144738fd1498Szrj = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
144838fd1498Szrj (void) add_stmt_cost (target_cost_data, si->count,
144938fd1498Szrj si->kind, stmt_info, si->misalign,
145038fd1498Szrj vect_body);
145138fd1498Szrj }
145238fd1498Szrj unsigned dummy, body_cost = 0;
145338fd1498Szrj finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
145438fd1498Szrj destroy_cost_data (target_cost_data);
145538fd1498Szrj LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
145638fd1498Szrj }
145738fd1498Szrj
145838fd1498Szrj
145938fd1498Szrj /* Function vect_analyze_loop_form_1.
146038fd1498Szrj
146138fd1498Szrj Verify that certain CFG restrictions hold, including:
146238fd1498Szrj - the loop has a pre-header
146338fd1498Szrj - the loop has a single entry and exit
146438fd1498Szrj - the loop exit condition is simple enough
146538fd1498Szrj - the number of iterations can be analyzed, i.e, a countable loop. The
146638fd1498Szrj niter could be analyzed under some assumptions. */
146738fd1498Szrj
146838fd1498Szrj bool
vect_analyze_loop_form_1(struct loop * loop,gcond ** loop_cond,tree * assumptions,tree * number_of_iterationsm1,tree * number_of_iterations,gcond ** inner_loop_cond)146938fd1498Szrj vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
147038fd1498Szrj tree *assumptions, tree *number_of_iterationsm1,
147138fd1498Szrj tree *number_of_iterations, gcond **inner_loop_cond)
147238fd1498Szrj {
147338fd1498Szrj if (dump_enabled_p ())
147438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
147538fd1498Szrj "=== vect_analyze_loop_form ===\n");
147638fd1498Szrj
147738fd1498Szrj /* Different restrictions apply when we are considering an inner-most loop,
147838fd1498Szrj vs. an outer (nested) loop.
147938fd1498Szrj (FORNOW. May want to relax some of these restrictions in the future). */
148038fd1498Szrj
148138fd1498Szrj if (!loop->inner)
148238fd1498Szrj {
148338fd1498Szrj /* Inner-most loop. We currently require that the number of BBs is
148438fd1498Szrj exactly 2 (the header and latch). Vectorizable inner-most loops
148538fd1498Szrj look like this:
148638fd1498Szrj
148738fd1498Szrj (pre-header)
148838fd1498Szrj |
148938fd1498Szrj header <--------+
149038fd1498Szrj | | |
149138fd1498Szrj | +--> latch --+
149238fd1498Szrj |
149338fd1498Szrj (exit-bb) */
149438fd1498Szrj
149538fd1498Szrj if (loop->num_nodes != 2)
149638fd1498Szrj {
149738fd1498Szrj if (dump_enabled_p ())
149838fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
149938fd1498Szrj "not vectorized: control flow in loop.\n");
150038fd1498Szrj return false;
150138fd1498Szrj }
150238fd1498Szrj
150338fd1498Szrj if (empty_block_p (loop->header))
150438fd1498Szrj {
150538fd1498Szrj if (dump_enabled_p ())
150638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
150738fd1498Szrj "not vectorized: empty loop.\n");
150838fd1498Szrj return false;
150938fd1498Szrj }
151038fd1498Szrj }
151138fd1498Szrj else
151238fd1498Szrj {
151338fd1498Szrj struct loop *innerloop = loop->inner;
151438fd1498Szrj edge entryedge;
151538fd1498Szrj
151638fd1498Szrj /* Nested loop. We currently require that the loop is doubly-nested,
151738fd1498Szrj contains a single inner loop, and the number of BBs is exactly 5.
151838fd1498Szrj Vectorizable outer-loops look like this:
151938fd1498Szrj
152038fd1498Szrj (pre-header)
152138fd1498Szrj |
152238fd1498Szrj header <---+
152338fd1498Szrj | |
152438fd1498Szrj inner-loop |
152538fd1498Szrj | |
152638fd1498Szrj tail ------+
152738fd1498Szrj |
152838fd1498Szrj (exit-bb)
152938fd1498Szrj
153038fd1498Szrj The inner-loop has the properties expected of inner-most loops
153138fd1498Szrj as described above. */
153238fd1498Szrj
153338fd1498Szrj if ((loop->inner)->inner || (loop->inner)->next)
153438fd1498Szrj {
153538fd1498Szrj if (dump_enabled_p ())
153638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
153738fd1498Szrj "not vectorized: multiple nested loops.\n");
153838fd1498Szrj return false;
153938fd1498Szrj }
154038fd1498Szrj
154138fd1498Szrj if (loop->num_nodes != 5)
154238fd1498Szrj {
154338fd1498Szrj if (dump_enabled_p ())
154438fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
154538fd1498Szrj "not vectorized: control flow in loop.\n");
154638fd1498Szrj return false;
154738fd1498Szrj }
154838fd1498Szrj
154938fd1498Szrj entryedge = loop_preheader_edge (innerloop);
155038fd1498Szrj if (entryedge->src != loop->header
155138fd1498Szrj || !single_exit (innerloop)
155238fd1498Szrj || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
155338fd1498Szrj {
155438fd1498Szrj if (dump_enabled_p ())
155538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
155638fd1498Szrj "not vectorized: unsupported outerloop form.\n");
155738fd1498Szrj return false;
155838fd1498Szrj }
155938fd1498Szrj
156038fd1498Szrj /* Analyze the inner-loop. */
156138fd1498Szrj tree inner_niterm1, inner_niter, inner_assumptions;
156238fd1498Szrj if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
156338fd1498Szrj &inner_assumptions, &inner_niterm1,
156438fd1498Szrj &inner_niter, NULL)
156538fd1498Szrj /* Don't support analyzing niter under assumptions for inner
156638fd1498Szrj loop. */
156738fd1498Szrj || !integer_onep (inner_assumptions))
156838fd1498Szrj {
156938fd1498Szrj if (dump_enabled_p ())
157038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
157138fd1498Szrj "not vectorized: Bad inner loop.\n");
157238fd1498Szrj return false;
157338fd1498Szrj }
157438fd1498Szrj
157538fd1498Szrj if (!expr_invariant_in_loop_p (loop, inner_niter))
157638fd1498Szrj {
157738fd1498Szrj if (dump_enabled_p ())
157838fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
157938fd1498Szrj "not vectorized: inner-loop count not"
158038fd1498Szrj " invariant.\n");
158138fd1498Szrj return false;
158238fd1498Szrj }
158338fd1498Szrj
158438fd1498Szrj if (dump_enabled_p ())
158538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
158638fd1498Szrj "Considering outer-loop vectorization.\n");
158738fd1498Szrj }
158838fd1498Szrj
158938fd1498Szrj if (!single_exit (loop)
159038fd1498Szrj || EDGE_COUNT (loop->header->preds) != 2)
159138fd1498Szrj {
159238fd1498Szrj if (dump_enabled_p ())
159338fd1498Szrj {
159438fd1498Szrj if (!single_exit (loop))
159538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
159638fd1498Szrj "not vectorized: multiple exits.\n");
159738fd1498Szrj else if (EDGE_COUNT (loop->header->preds) != 2)
159838fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
159938fd1498Szrj "not vectorized: too many incoming edges.\n");
160038fd1498Szrj }
160138fd1498Szrj return false;
160238fd1498Szrj }
160338fd1498Szrj
160438fd1498Szrj /* We assume that the loop exit condition is at the end of the loop. i.e,
160538fd1498Szrj that the loop is represented as a do-while (with a proper if-guard
160638fd1498Szrj before the loop if needed), where the loop header contains all the
160738fd1498Szrj executable statements, and the latch is empty. */
160838fd1498Szrj if (!empty_block_p (loop->latch)
160938fd1498Szrj || !gimple_seq_empty_p (phi_nodes (loop->latch)))
161038fd1498Szrj {
161138fd1498Szrj if (dump_enabled_p ())
161238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
161338fd1498Szrj "not vectorized: latch block not empty.\n");
161438fd1498Szrj return false;
161538fd1498Szrj }
161638fd1498Szrj
161738fd1498Szrj /* Make sure the exit is not abnormal. */
161838fd1498Szrj edge e = single_exit (loop);
161938fd1498Szrj if (e->flags & EDGE_ABNORMAL)
162038fd1498Szrj {
162138fd1498Szrj if (dump_enabled_p ())
162238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
162338fd1498Szrj "not vectorized: abnormal loop exit edge.\n");
162438fd1498Szrj return false;
162538fd1498Szrj }
162638fd1498Szrj
162738fd1498Szrj *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
162838fd1498Szrj number_of_iterationsm1);
162938fd1498Szrj if (!*loop_cond)
163038fd1498Szrj {
163138fd1498Szrj if (dump_enabled_p ())
163238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
163338fd1498Szrj "not vectorized: complicated exit condition.\n");
163438fd1498Szrj return false;
163538fd1498Szrj }
163638fd1498Szrj
163738fd1498Szrj if (integer_zerop (*assumptions)
163838fd1498Szrj || !*number_of_iterations
163938fd1498Szrj || chrec_contains_undetermined (*number_of_iterations))
164038fd1498Szrj {
164138fd1498Szrj if (dump_enabled_p ())
164238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
164338fd1498Szrj "not vectorized: number of iterations cannot be "
164438fd1498Szrj "computed.\n");
164538fd1498Szrj return false;
164638fd1498Szrj }
164738fd1498Szrj
164838fd1498Szrj if (integer_zerop (*number_of_iterations))
164938fd1498Szrj {
165038fd1498Szrj if (dump_enabled_p ())
165138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
165238fd1498Szrj "not vectorized: number of iterations = 0.\n");
165338fd1498Szrj return false;
165438fd1498Szrj }
165538fd1498Szrj
165638fd1498Szrj return true;
165738fd1498Szrj }
165838fd1498Szrj
165938fd1498Szrj /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
166038fd1498Szrj
166138fd1498Szrj loop_vec_info
vect_analyze_loop_form(struct loop * loop)166238fd1498Szrj vect_analyze_loop_form (struct loop *loop)
166338fd1498Szrj {
166438fd1498Szrj tree assumptions, number_of_iterations, number_of_iterationsm1;
166538fd1498Szrj gcond *loop_cond, *inner_loop_cond = NULL;
166638fd1498Szrj
166738fd1498Szrj if (! vect_analyze_loop_form_1 (loop, &loop_cond,
166838fd1498Szrj &assumptions, &number_of_iterationsm1,
166938fd1498Szrj &number_of_iterations, &inner_loop_cond))
167038fd1498Szrj return NULL;
167138fd1498Szrj
167238fd1498Szrj loop_vec_info loop_vinfo = new _loop_vec_info (loop);
167338fd1498Szrj LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
167438fd1498Szrj LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
167538fd1498Szrj LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
167638fd1498Szrj if (!integer_onep (assumptions))
167738fd1498Szrj {
167838fd1498Szrj /* We consider to vectorize this loop by versioning it under
167938fd1498Szrj some assumptions. In order to do this, we need to clear
168038fd1498Szrj existing information computed by scev and niter analyzer. */
168138fd1498Szrj scev_reset_htab ();
168238fd1498Szrj free_numbers_of_iterations_estimates (loop);
168338fd1498Szrj /* Also set flag for this loop so that following scev and niter
168438fd1498Szrj analysis are done under the assumptions. */
168538fd1498Szrj loop_constraint_set (loop, LOOP_C_FINITE);
168638fd1498Szrj /* Also record the assumptions for versioning. */
168738fd1498Szrj LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
168838fd1498Szrj }
168938fd1498Szrj
169038fd1498Szrj if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
169138fd1498Szrj {
169238fd1498Szrj if (dump_enabled_p ())
169338fd1498Szrj {
169438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
169538fd1498Szrj "Symbolic number of iterations is ");
169638fd1498Szrj dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
169738fd1498Szrj dump_printf (MSG_NOTE, "\n");
169838fd1498Szrj }
169938fd1498Szrj }
170038fd1498Szrj
170138fd1498Szrj STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
170238fd1498Szrj if (inner_loop_cond)
170338fd1498Szrj STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
170438fd1498Szrj = loop_exit_ctrl_vec_info_type;
170538fd1498Szrj
170638fd1498Szrj gcc_assert (!loop->aux);
170738fd1498Szrj loop->aux = loop_vinfo;
170838fd1498Szrj return loop_vinfo;
170938fd1498Szrj }
171038fd1498Szrj
171138fd1498Szrj
171238fd1498Szrj
171338fd1498Szrj /* Scan the loop stmts and dependent on whether there are any (non-)SLP
171438fd1498Szrj statements update the vectorization factor. */
171538fd1498Szrj
171638fd1498Szrj static void
vect_update_vf_for_slp(loop_vec_info loop_vinfo)171738fd1498Szrj vect_update_vf_for_slp (loop_vec_info loop_vinfo)
171838fd1498Szrj {
171938fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
172038fd1498Szrj basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
172138fd1498Szrj int nbbs = loop->num_nodes;
172238fd1498Szrj poly_uint64 vectorization_factor;
172338fd1498Szrj int i;
172438fd1498Szrj
172538fd1498Szrj if (dump_enabled_p ())
172638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
172738fd1498Szrj "=== vect_update_vf_for_slp ===\n");
172838fd1498Szrj
172938fd1498Szrj vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
173038fd1498Szrj gcc_assert (known_ne (vectorization_factor, 0U));
173138fd1498Szrj
173238fd1498Szrj /* If all the stmts in the loop can be SLPed, we perform only SLP, and
173338fd1498Szrj vectorization factor of the loop is the unrolling factor required by
173438fd1498Szrj the SLP instances. If that unrolling factor is 1, we say, that we
173538fd1498Szrj perform pure SLP on loop - cross iteration parallelism is not
173638fd1498Szrj exploited. */
173738fd1498Szrj bool only_slp_in_loop = true;
173838fd1498Szrj for (i = 0; i < nbbs; i++)
173938fd1498Szrj {
174038fd1498Szrj basic_block bb = bbs[i];
174138fd1498Szrj for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
174238fd1498Szrj gsi_next (&si))
174338fd1498Szrj {
174438fd1498Szrj gimple *stmt = gsi_stmt (si);
174538fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
174638fd1498Szrj if (STMT_VINFO_IN_PATTERN_P (stmt_info)
174738fd1498Szrj && STMT_VINFO_RELATED_STMT (stmt_info))
174838fd1498Szrj {
174938fd1498Szrj stmt = STMT_VINFO_RELATED_STMT (stmt_info);
175038fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
175138fd1498Szrj }
175238fd1498Szrj if ((STMT_VINFO_RELEVANT_P (stmt_info)
175338fd1498Szrj || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
175438fd1498Szrj && !PURE_SLP_STMT (stmt_info))
175538fd1498Szrj /* STMT needs both SLP and loop-based vectorization. */
175638fd1498Szrj only_slp_in_loop = false;
175738fd1498Szrj }
175838fd1498Szrj }
175938fd1498Szrj
176038fd1498Szrj if (only_slp_in_loop)
176138fd1498Szrj {
176238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
176338fd1498Szrj "Loop contains only SLP stmts\n");
176438fd1498Szrj vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
176538fd1498Szrj }
176638fd1498Szrj else
176738fd1498Szrj {
176838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
176938fd1498Szrj "Loop contains SLP and non-SLP stmts\n");
177038fd1498Szrj /* Both the vectorization factor and unroll factor have the form
177138fd1498Szrj current_vector_size * X for some rational X, so they must have
177238fd1498Szrj a common multiple. */
177338fd1498Szrj vectorization_factor
177438fd1498Szrj = force_common_multiple (vectorization_factor,
177538fd1498Szrj LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
177638fd1498Szrj }
177738fd1498Szrj
177838fd1498Szrj LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
177938fd1498Szrj if (dump_enabled_p ())
178038fd1498Szrj {
178138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
178238fd1498Szrj "Updating vectorization factor to ");
178338fd1498Szrj dump_dec (MSG_NOTE, vectorization_factor);
178438fd1498Szrj dump_printf (MSG_NOTE, ".\n");
178538fd1498Szrj }
178638fd1498Szrj }
178738fd1498Szrj
178838fd1498Szrj /* Return true if STMT_INFO describes a double reduction phi and if
178938fd1498Szrj the other phi in the reduction is also relevant for vectorization.
179038fd1498Szrj This rejects cases such as:
179138fd1498Szrj
179238fd1498Szrj outer1:
179338fd1498Szrj x_1 = PHI <x_3(outer2), ...>;
179438fd1498Szrj ...
179538fd1498Szrj
179638fd1498Szrj inner:
179738fd1498Szrj x_2 = ...;
179838fd1498Szrj ...
179938fd1498Szrj
180038fd1498Szrj outer2:
180138fd1498Szrj x_3 = PHI <x_2(inner)>;
180238fd1498Szrj
180338fd1498Szrj if nothing in x_2 or elsewhere makes x_1 relevant. */
180438fd1498Szrj
180538fd1498Szrj static bool
vect_active_double_reduction_p(stmt_vec_info stmt_info)180638fd1498Szrj vect_active_double_reduction_p (stmt_vec_info stmt_info)
180738fd1498Szrj {
180838fd1498Szrj if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
180938fd1498Szrj return false;
181038fd1498Szrj
181138fd1498Szrj gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
181238fd1498Szrj return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
181338fd1498Szrj }
181438fd1498Szrj
181538fd1498Szrj /* Function vect_analyze_loop_operations.
181638fd1498Szrj
181738fd1498Szrj Scan the loop stmts and make sure they are all vectorizable. */
181838fd1498Szrj
181938fd1498Szrj static bool
vect_analyze_loop_operations(loop_vec_info loop_vinfo)182038fd1498Szrj vect_analyze_loop_operations (loop_vec_info loop_vinfo)
182138fd1498Szrj {
182238fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
182338fd1498Szrj basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
182438fd1498Szrj int nbbs = loop->num_nodes;
182538fd1498Szrj int i;
182638fd1498Szrj stmt_vec_info stmt_info;
182738fd1498Szrj bool need_to_vectorize = false;
182838fd1498Szrj bool ok;
182938fd1498Szrj
183038fd1498Szrj if (dump_enabled_p ())
183138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
183238fd1498Szrj "=== vect_analyze_loop_operations ===\n");
183338fd1498Szrj
183438fd1498Szrj for (i = 0; i < nbbs; i++)
183538fd1498Szrj {
183638fd1498Szrj basic_block bb = bbs[i];
183738fd1498Szrj
183838fd1498Szrj for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
183938fd1498Szrj gsi_next (&si))
184038fd1498Szrj {
184138fd1498Szrj gphi *phi = si.phi ();
184238fd1498Szrj ok = true;
184338fd1498Szrj
184438fd1498Szrj stmt_info = vinfo_for_stmt (phi);
184538fd1498Szrj if (dump_enabled_p ())
184638fd1498Szrj {
184738fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
184838fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
184938fd1498Szrj }
185038fd1498Szrj if (virtual_operand_p (gimple_phi_result (phi)))
185138fd1498Szrj continue;
185238fd1498Szrj
185338fd1498Szrj /* Inner-loop loop-closed exit phi in outer-loop vectorization
185438fd1498Szrj (i.e., a phi in the tail of the outer-loop). */
185538fd1498Szrj if (! is_loop_header_bb_p (bb))
185638fd1498Szrj {
185738fd1498Szrj /* FORNOW: we currently don't support the case that these phis
185838fd1498Szrj are not used in the outerloop (unless it is double reduction,
185938fd1498Szrj i.e., this phi is vect_reduction_def), cause this case
186038fd1498Szrj requires to actually do something here. */
186138fd1498Szrj if (STMT_VINFO_LIVE_P (stmt_info)
186238fd1498Szrj && !vect_active_double_reduction_p (stmt_info))
186338fd1498Szrj {
186438fd1498Szrj if (dump_enabled_p ())
186538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
186638fd1498Szrj "Unsupported loop-closed phi in "
186738fd1498Szrj "outer-loop.\n");
186838fd1498Szrj return false;
186938fd1498Szrj }
187038fd1498Szrj
187138fd1498Szrj /* If PHI is used in the outer loop, we check that its operand
187238fd1498Szrj is defined in the inner loop. */
187338fd1498Szrj if (STMT_VINFO_RELEVANT_P (stmt_info))
187438fd1498Szrj {
187538fd1498Szrj tree phi_op;
187638fd1498Szrj gimple *op_def_stmt;
187738fd1498Szrj
187838fd1498Szrj if (gimple_phi_num_args (phi) != 1)
187938fd1498Szrj return false;
188038fd1498Szrj
188138fd1498Szrj phi_op = PHI_ARG_DEF (phi, 0);
188238fd1498Szrj if (TREE_CODE (phi_op) != SSA_NAME)
188338fd1498Szrj return false;
188438fd1498Szrj
188538fd1498Szrj op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
188638fd1498Szrj if (gimple_nop_p (op_def_stmt)
188738fd1498Szrj || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
188838fd1498Szrj || !vinfo_for_stmt (op_def_stmt))
188938fd1498Szrj return false;
189038fd1498Szrj
189138fd1498Szrj if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
189238fd1498Szrj != vect_used_in_outer
189338fd1498Szrj && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
189438fd1498Szrj != vect_used_in_outer_by_reduction)
189538fd1498Szrj return false;
189638fd1498Szrj }
189738fd1498Szrj
189838fd1498Szrj continue;
189938fd1498Szrj }
190038fd1498Szrj
190138fd1498Szrj gcc_assert (stmt_info);
190238fd1498Szrj
190338fd1498Szrj if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
190438fd1498Szrj || STMT_VINFO_LIVE_P (stmt_info))
190538fd1498Szrj && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
190638fd1498Szrj {
190738fd1498Szrj /* A scalar-dependence cycle that we don't support. */
190838fd1498Szrj if (dump_enabled_p ())
190938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
191038fd1498Szrj "not vectorized: scalar dependence cycle.\n");
191138fd1498Szrj return false;
191238fd1498Szrj }
191338fd1498Szrj
191438fd1498Szrj if (STMT_VINFO_RELEVANT_P (stmt_info))
191538fd1498Szrj {
191638fd1498Szrj need_to_vectorize = true;
191738fd1498Szrj if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
191838fd1498Szrj && ! PURE_SLP_STMT (stmt_info))
191938fd1498Szrj ok = vectorizable_induction (phi, NULL, NULL, NULL);
192038fd1498Szrj else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
192138fd1498Szrj || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
192238fd1498Szrj && ! PURE_SLP_STMT (stmt_info))
192338fd1498Szrj ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
192438fd1498Szrj }
192538fd1498Szrj
192638fd1498Szrj /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
192738fd1498Szrj if (ok
192838fd1498Szrj && STMT_VINFO_LIVE_P (stmt_info)
192938fd1498Szrj && !PURE_SLP_STMT (stmt_info))
193038fd1498Szrj ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
193138fd1498Szrj
193238fd1498Szrj if (!ok)
193338fd1498Szrj {
193438fd1498Szrj if (dump_enabled_p ())
193538fd1498Szrj {
193638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
193738fd1498Szrj "not vectorized: relevant phi not "
193838fd1498Szrj "supported: ");
193938fd1498Szrj dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
194038fd1498Szrj }
194138fd1498Szrj return false;
194238fd1498Szrj }
194338fd1498Szrj }
194438fd1498Szrj
194538fd1498Szrj for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
194638fd1498Szrj gsi_next (&si))
194738fd1498Szrj {
194838fd1498Szrj gimple *stmt = gsi_stmt (si);
194938fd1498Szrj if (!gimple_clobber_p (stmt)
195038fd1498Szrj && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
195138fd1498Szrj return false;
195238fd1498Szrj }
195338fd1498Szrj } /* bbs */
195438fd1498Szrj
195538fd1498Szrj /* All operations in the loop are either irrelevant (deal with loop
195638fd1498Szrj control, or dead), or only used outside the loop and can be moved
195738fd1498Szrj out of the loop (e.g. invariants, inductions). The loop can be
195838fd1498Szrj optimized away by scalar optimizations. We're better off not
195938fd1498Szrj touching this loop. */
196038fd1498Szrj if (!need_to_vectorize)
196138fd1498Szrj {
196238fd1498Szrj if (dump_enabled_p ())
196338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
196438fd1498Szrj "All the computation can be taken out of the loop.\n");
196538fd1498Szrj if (dump_enabled_p ())
196638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
196738fd1498Szrj "not vectorized: redundant loop. no profit to "
196838fd1498Szrj "vectorize.\n");
196938fd1498Szrj return false;
197038fd1498Szrj }
197138fd1498Szrj
197238fd1498Szrj return true;
197338fd1498Szrj }
197438fd1498Szrj
197538fd1498Szrj /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
197638fd1498Szrj is worthwhile to vectorize. Return 1 if definitely yes, 0 if
197738fd1498Szrj definitely no, or -1 if it's worth retrying. */
197838fd1498Szrj
197938fd1498Szrj static int
vect_analyze_loop_costing(loop_vec_info loop_vinfo)198038fd1498Szrj vect_analyze_loop_costing (loop_vec_info loop_vinfo)
198138fd1498Szrj {
198238fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
198338fd1498Szrj unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
198438fd1498Szrj
198538fd1498Szrj /* Only fully-masked loops can have iteration counts less than the
198638fd1498Szrj vectorization factor. */
198738fd1498Szrj if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
198838fd1498Szrj {
198938fd1498Szrj HOST_WIDE_INT max_niter;
199038fd1498Szrj
199138fd1498Szrj if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
199238fd1498Szrj max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
199338fd1498Szrj else
199438fd1498Szrj max_niter = max_stmt_executions_int (loop);
199538fd1498Szrj
199638fd1498Szrj if (max_niter != -1
199738fd1498Szrj && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
199838fd1498Szrj {
199938fd1498Szrj if (dump_enabled_p ())
200038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
200138fd1498Szrj "not vectorized: iteration count smaller than "
200238fd1498Szrj "vectorization factor.\n");
200338fd1498Szrj return 0;
200438fd1498Szrj }
200538fd1498Szrj }
200638fd1498Szrj
200738fd1498Szrj int min_profitable_iters, min_profitable_estimate;
200838fd1498Szrj vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
200938fd1498Szrj &min_profitable_estimate);
201038fd1498Szrj
201138fd1498Szrj if (min_profitable_iters < 0)
201238fd1498Szrj {
201338fd1498Szrj if (dump_enabled_p ())
201438fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201538fd1498Szrj "not vectorized: vectorization not profitable.\n");
201638fd1498Szrj if (dump_enabled_p ())
201738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201838fd1498Szrj "not vectorized: vector version will never be "
201938fd1498Szrj "profitable.\n");
202038fd1498Szrj return -1;
202138fd1498Szrj }
202238fd1498Szrj
202338fd1498Szrj int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
202438fd1498Szrj * assumed_vf);
202538fd1498Szrj
202638fd1498Szrj /* Use the cost model only if it is more conservative than user specified
202738fd1498Szrj threshold. */
202838fd1498Szrj unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
202938fd1498Szrj min_profitable_iters);
203038fd1498Szrj
203138fd1498Szrj LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
203238fd1498Szrj
203338fd1498Szrj if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
203438fd1498Szrj && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
203538fd1498Szrj {
203638fd1498Szrj if (dump_enabled_p ())
203738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
203838fd1498Szrj "not vectorized: vectorization not profitable.\n");
203938fd1498Szrj if (dump_enabled_p ())
204038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
204138fd1498Szrj "not vectorized: iteration count smaller than user "
204238fd1498Szrj "specified loop bound parameter or minimum profitable "
204338fd1498Szrj "iterations (whichever is more conservative).\n");
204438fd1498Szrj return 0;
204538fd1498Szrj }
204638fd1498Szrj
204738fd1498Szrj HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
204838fd1498Szrj if (estimated_niter == -1)
204938fd1498Szrj estimated_niter = likely_max_stmt_executions_int (loop);
205038fd1498Szrj if (estimated_niter != -1
205138fd1498Szrj && ((unsigned HOST_WIDE_INT) estimated_niter
205238fd1498Szrj < MAX (th, (unsigned) min_profitable_estimate)))
205338fd1498Szrj {
205438fd1498Szrj if (dump_enabled_p ())
205538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
205638fd1498Szrj "not vectorized: estimated iteration count too "
205738fd1498Szrj "small.\n");
205838fd1498Szrj if (dump_enabled_p ())
205938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
206038fd1498Szrj "not vectorized: estimated iteration count smaller "
206138fd1498Szrj "than specified loop bound parameter or minimum "
206238fd1498Szrj "profitable iterations (whichever is more "
206338fd1498Szrj "conservative).\n");
206438fd1498Szrj return -1;
206538fd1498Szrj }
206638fd1498Szrj
206738fd1498Szrj return 1;
206838fd1498Szrj }
206938fd1498Szrj
207038fd1498Szrj
207138fd1498Szrj /* Function vect_analyze_loop_2.
207238fd1498Szrj
207338fd1498Szrj Apply a set of analyses on LOOP, and create a loop_vec_info struct
207438fd1498Szrj for it. The different analyses will record information in the
207538fd1498Szrj loop_vec_info struct. */
207638fd1498Szrj static bool
vect_analyze_loop_2(loop_vec_info loop_vinfo,bool & fatal)207738fd1498Szrj vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
207838fd1498Szrj {
207938fd1498Szrj bool ok;
208038fd1498Szrj int res;
208138fd1498Szrj unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
208238fd1498Szrj poly_uint64 min_vf = 2;
208338fd1498Szrj unsigned int n_stmts = 0;
208438fd1498Szrj
208538fd1498Szrj /* The first group of checks is independent of the vector size. */
208638fd1498Szrj fatal = true;
208738fd1498Szrj
208838fd1498Szrj /* Find all data references in the loop (which correspond to vdefs/vuses)
208938fd1498Szrj and analyze their evolution in the loop. */
209038fd1498Szrj
209138fd1498Szrj basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
209238fd1498Szrj
209338fd1498Szrj loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
209438fd1498Szrj if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
209538fd1498Szrj {
209638fd1498Szrj if (dump_enabled_p ())
209738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
209838fd1498Szrj "not vectorized: loop nest containing two "
209938fd1498Szrj "or more consecutive inner loops cannot be "
210038fd1498Szrj "vectorized\n");
210138fd1498Szrj return false;
210238fd1498Szrj }
210338fd1498Szrj
210438fd1498Szrj for (unsigned i = 0; i < loop->num_nodes; i++)
210538fd1498Szrj for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
210638fd1498Szrj !gsi_end_p (gsi); gsi_next (&gsi))
210738fd1498Szrj {
210838fd1498Szrj gimple *stmt = gsi_stmt (gsi);
210938fd1498Szrj if (is_gimple_debug (stmt))
211038fd1498Szrj continue;
211138fd1498Szrj ++n_stmts;
211238fd1498Szrj if (!find_data_references_in_stmt (loop, stmt,
211338fd1498Szrj &LOOP_VINFO_DATAREFS (loop_vinfo)))
211438fd1498Szrj {
211538fd1498Szrj if (is_gimple_call (stmt) && loop->safelen)
211638fd1498Szrj {
211738fd1498Szrj tree fndecl = gimple_call_fndecl (stmt), op;
211838fd1498Szrj if (fndecl != NULL_TREE)
211938fd1498Szrj {
212038fd1498Szrj cgraph_node *node = cgraph_node::get (fndecl);
212138fd1498Szrj if (node != NULL && node->simd_clones != NULL)
212238fd1498Szrj {
212338fd1498Szrj unsigned int j, n = gimple_call_num_args (stmt);
212438fd1498Szrj for (j = 0; j < n; j++)
212538fd1498Szrj {
212638fd1498Szrj op = gimple_call_arg (stmt, j);
212738fd1498Szrj if (DECL_P (op)
212838fd1498Szrj || (REFERENCE_CLASS_P (op)
212938fd1498Szrj && get_base_address (op)))
213038fd1498Szrj break;
213138fd1498Szrj }
213238fd1498Szrj op = gimple_call_lhs (stmt);
213338fd1498Szrj /* Ignore #pragma omp declare simd functions
213438fd1498Szrj if they don't have data references in the
213538fd1498Szrj call stmt itself. */
213638fd1498Szrj if (j == n
213738fd1498Szrj && !(op
213838fd1498Szrj && (DECL_P (op)
213938fd1498Szrj || (REFERENCE_CLASS_P (op)
214038fd1498Szrj && get_base_address (op)))))
214138fd1498Szrj continue;
214238fd1498Szrj }
214338fd1498Szrj }
214438fd1498Szrj }
214538fd1498Szrj if (dump_enabled_p ())
214638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
214738fd1498Szrj "not vectorized: loop contains function "
214838fd1498Szrj "calls or data references that cannot "
214938fd1498Szrj "be analyzed\n");
215038fd1498Szrj return false;
215138fd1498Szrj }
215238fd1498Szrj }
215338fd1498Szrj
215438fd1498Szrj /* Analyze the data references and also adjust the minimal
215538fd1498Szrj vectorization factor according to the loads and stores. */
215638fd1498Szrj
215738fd1498Szrj ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
215838fd1498Szrj if (!ok)
215938fd1498Szrj {
216038fd1498Szrj if (dump_enabled_p ())
216138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
216238fd1498Szrj "bad data references.\n");
216338fd1498Szrj return false;
216438fd1498Szrj }
216538fd1498Szrj
216638fd1498Szrj /* Classify all cross-iteration scalar data-flow cycles.
216738fd1498Szrj Cross-iteration cycles caused by virtual phis are analyzed separately. */
216838fd1498Szrj vect_analyze_scalar_cycles (loop_vinfo);
216938fd1498Szrj
217038fd1498Szrj vect_pattern_recog (loop_vinfo);
217138fd1498Szrj
217238fd1498Szrj vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
217338fd1498Szrj
217438fd1498Szrj /* Analyze the access patterns of the data-refs in the loop (consecutive,
217538fd1498Szrj complex, etc.). FORNOW: Only handle consecutive access pattern. */
217638fd1498Szrj
217738fd1498Szrj ok = vect_analyze_data_ref_accesses (loop_vinfo);
217838fd1498Szrj if (!ok)
217938fd1498Szrj {
218038fd1498Szrj if (dump_enabled_p ())
218138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
218238fd1498Szrj "bad data access.\n");
218338fd1498Szrj return false;
218438fd1498Szrj }
218538fd1498Szrj
218638fd1498Szrj /* Data-flow analysis to detect stmts that do not need to be vectorized. */
218738fd1498Szrj
218838fd1498Szrj ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
218938fd1498Szrj if (!ok)
219038fd1498Szrj {
219138fd1498Szrj if (dump_enabled_p ())
219238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
219338fd1498Szrj "unexpected pattern.\n");
219438fd1498Szrj return false;
219538fd1498Szrj }
219638fd1498Szrj
219738fd1498Szrj /* While the rest of the analysis below depends on it in some way. */
219838fd1498Szrj fatal = false;
219938fd1498Szrj
220038fd1498Szrj /* Analyze data dependences between the data-refs in the loop
220138fd1498Szrj and adjust the maximum vectorization factor according to
220238fd1498Szrj the dependences.
220338fd1498Szrj FORNOW: fail at the first data dependence that we encounter. */
220438fd1498Szrj
220538fd1498Szrj ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
220638fd1498Szrj if (!ok
220738fd1498Szrj || (max_vf != MAX_VECTORIZATION_FACTOR
220838fd1498Szrj && maybe_lt (max_vf, min_vf)))
220938fd1498Szrj {
221038fd1498Szrj if (dump_enabled_p ())
221138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
221238fd1498Szrj "bad data dependence.\n");
221338fd1498Szrj return false;
221438fd1498Szrj }
221538fd1498Szrj LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
221638fd1498Szrj
221738fd1498Szrj ok = vect_determine_vectorization_factor (loop_vinfo);
221838fd1498Szrj if (!ok)
221938fd1498Szrj {
222038fd1498Szrj if (dump_enabled_p ())
222138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
222238fd1498Szrj "can't determine vectorization factor.\n");
222338fd1498Szrj return false;
222438fd1498Szrj }
222538fd1498Szrj if (max_vf != MAX_VECTORIZATION_FACTOR
222638fd1498Szrj && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
222738fd1498Szrj {
222838fd1498Szrj if (dump_enabled_p ())
222938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
223038fd1498Szrj "bad data dependence.\n");
223138fd1498Szrj return false;
223238fd1498Szrj }
223338fd1498Szrj
223438fd1498Szrj /* Compute the scalar iteration cost. */
223538fd1498Szrj vect_compute_single_scalar_iteration_cost (loop_vinfo);
223638fd1498Szrj
223738fd1498Szrj poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
223838fd1498Szrj unsigned th;
223938fd1498Szrj
224038fd1498Szrj /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
224138fd1498Szrj ok = vect_analyze_slp (loop_vinfo, n_stmts);
224238fd1498Szrj if (!ok)
224338fd1498Szrj return false;
224438fd1498Szrj
224538fd1498Szrj /* If there are any SLP instances mark them as pure_slp. */
224638fd1498Szrj bool slp = vect_make_slp_decision (loop_vinfo);
224738fd1498Szrj if (slp)
224838fd1498Szrj {
224938fd1498Szrj /* Find stmts that need to be both vectorized and SLPed. */
225038fd1498Szrj vect_detect_hybrid_slp (loop_vinfo);
225138fd1498Szrj
225238fd1498Szrj /* Update the vectorization factor based on the SLP decision. */
225338fd1498Szrj vect_update_vf_for_slp (loop_vinfo);
225438fd1498Szrj }
225538fd1498Szrj
225638fd1498Szrj bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
225738fd1498Szrj
225838fd1498Szrj /* We don't expect to have to roll back to anything other than an empty
225938fd1498Szrj set of rgroups. */
226038fd1498Szrj gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
226138fd1498Szrj
226238fd1498Szrj /* This is the point where we can re-start analysis with SLP forced off. */
226338fd1498Szrj start_over:
226438fd1498Szrj
226538fd1498Szrj /* Now the vectorization factor is final. */
226638fd1498Szrj poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
226738fd1498Szrj gcc_assert (known_ne (vectorization_factor, 0U));
226838fd1498Szrj
226938fd1498Szrj if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
227038fd1498Szrj {
227138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
227238fd1498Szrj "vectorization_factor = ");
227338fd1498Szrj dump_dec (MSG_NOTE, vectorization_factor);
227438fd1498Szrj dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
227538fd1498Szrj LOOP_VINFO_INT_NITERS (loop_vinfo));
227638fd1498Szrj }
227738fd1498Szrj
227838fd1498Szrj HOST_WIDE_INT max_niter
227938fd1498Szrj = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
228038fd1498Szrj
228138fd1498Szrj /* Analyze the alignment of the data-refs in the loop.
228238fd1498Szrj Fail if a data reference is found that cannot be vectorized. */
228338fd1498Szrj
228438fd1498Szrj ok = vect_analyze_data_refs_alignment (loop_vinfo);
228538fd1498Szrj if (!ok)
228638fd1498Szrj {
228738fd1498Szrj if (dump_enabled_p ())
228838fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228938fd1498Szrj "bad data alignment.\n");
229038fd1498Szrj return false;
229138fd1498Szrj }
229238fd1498Szrj
229338fd1498Szrj /* Prune the list of ddrs to be tested at run-time by versioning for alias.
229438fd1498Szrj It is important to call pruning after vect_analyze_data_ref_accesses,
229538fd1498Szrj since we use grouping information gathered by interleaving analysis. */
229638fd1498Szrj ok = vect_prune_runtime_alias_test_list (loop_vinfo);
229738fd1498Szrj if (!ok)
229838fd1498Szrj return false;
229938fd1498Szrj
230038fd1498Szrj   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
230138fd1498Szrj vectorization. */
230238fd1498Szrj if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
230338fd1498Szrj {
230438fd1498Szrj /* This pass will decide on using loop versioning and/or loop peeling in
230538fd1498Szrj order to enhance the alignment of data references in the loop. */
230638fd1498Szrj ok = vect_enhance_data_refs_alignment (loop_vinfo);
230738fd1498Szrj if (!ok)
230838fd1498Szrj {
230938fd1498Szrj if (dump_enabled_p ())
231038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
231138fd1498Szrj "bad data alignment.\n");
231238fd1498Szrj return false;
231338fd1498Szrj }
231438fd1498Szrj }
231538fd1498Szrj
231638fd1498Szrj if (slp)
231738fd1498Szrj {
231838fd1498Szrj /* Analyze operations in the SLP instances. Note this may
231938fd1498Szrj remove unsupported SLP instances which makes the above
232038fd1498Szrj SLP kind detection invalid. */
232138fd1498Szrj unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
232238fd1498Szrj vect_slp_analyze_operations (loop_vinfo);
232338fd1498Szrj if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
232438fd1498Szrj goto again;
232538fd1498Szrj }
232638fd1498Szrj
232738fd1498Szrj /* Scan all the remaining operations in the loop that are not subject
232838fd1498Szrj to SLP and make sure they are vectorizable. */
232938fd1498Szrj ok = vect_analyze_loop_operations (loop_vinfo);
233038fd1498Szrj if (!ok)
233138fd1498Szrj {
233238fd1498Szrj if (dump_enabled_p ())
233338fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
233438fd1498Szrj "bad operation or unsupported loop bound.\n");
233538fd1498Szrj return false;
233638fd1498Szrj }
233738fd1498Szrj
233838fd1498Szrj /* Decide whether to use a fully-masked loop for this vectorization
233938fd1498Szrj factor. */
234038fd1498Szrj LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
234138fd1498Szrj = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
234238fd1498Szrj && vect_verify_full_masking (loop_vinfo));
234338fd1498Szrj if (dump_enabled_p ())
234438fd1498Szrj {
234538fd1498Szrj if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
234638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
234738fd1498Szrj "using a fully-masked loop.\n");
234838fd1498Szrj else
234938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
235038fd1498Szrj "not using a fully-masked loop.\n");
235138fd1498Szrj }
235238fd1498Szrj
235338fd1498Szrj /* If epilog loop is required because of data accesses with gaps,
235438fd1498Szrj one additional iteration needs to be peeled. Check if there is
235538fd1498Szrj enough iterations for vectorization. */
235638fd1498Szrj if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
235738fd1498Szrj && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
235838fd1498Szrj && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
235938fd1498Szrj {
236038fd1498Szrj poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
236138fd1498Szrj tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
236238fd1498Szrj
236338fd1498Szrj if (known_lt (wi::to_widest (scalar_niters), vf))
236438fd1498Szrj {
236538fd1498Szrj if (dump_enabled_p ())
236638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
236738fd1498Szrj "loop has no enough iterations to support"
236838fd1498Szrj " peeling for gaps.\n");
236938fd1498Szrj return false;
237038fd1498Szrj }
237138fd1498Szrj }
237238fd1498Szrj
237338fd1498Szrj /* Check the costings of the loop make vectorizing worthwhile. */
237438fd1498Szrj res = vect_analyze_loop_costing (loop_vinfo);
237538fd1498Szrj if (res < 0)
237638fd1498Szrj goto again;
237738fd1498Szrj if (!res)
237838fd1498Szrj {
237938fd1498Szrj if (dump_enabled_p ())
238038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
238138fd1498Szrj "Loop costings not worthwhile.\n");
238238fd1498Szrj return false;
238338fd1498Szrj }
238438fd1498Szrj
238538fd1498Szrj /* Decide whether we need to create an epilogue loop to handle
238638fd1498Szrj remaining scalar iterations. */
238738fd1498Szrj th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
238838fd1498Szrj
238938fd1498Szrj unsigned HOST_WIDE_INT const_vf;
239038fd1498Szrj if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
239138fd1498Szrj /* The main loop handles all iterations. */
239238fd1498Szrj LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
239338fd1498Szrj else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394*58e805e6Szrj && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
239538fd1498Szrj {
2396*58e805e6Szrj /* Work out the (constant) number of iterations that need to be
2397*58e805e6Szrj peeled for reasons other than niters. */
2398*58e805e6Szrj unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2399*58e805e6Szrj if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2400*58e805e6Szrj peel_niter += 1;
2401*58e805e6Szrj if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
240238fd1498Szrj LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
240338fd1498Szrj LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
240438fd1498Szrj }
240538fd1498Szrj else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2406*58e805e6Szrj /* ??? When peeling for gaps but not alignment, we could
2407*58e805e6Szrj try to check whether the (variable) niters is known to be
2408*58e805e6Szrj VF * N + 1. That's something of a niche case though. */
2409*58e805e6Szrj || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
241038fd1498Szrj || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
241138fd1498Szrj || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
241238fd1498Szrj < (unsigned) exact_log2 (const_vf))
241338fd1498Szrj /* In case of versioning, check if the maximum number of
241438fd1498Szrj iterations is greater than th. If they are identical,
241538fd1498Szrj the epilogue is unnecessary. */
241638fd1498Szrj && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
241738fd1498Szrj || ((unsigned HOST_WIDE_INT) max_niter
241838fd1498Szrj > (th / const_vf) * const_vf))))
241938fd1498Szrj LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
242038fd1498Szrj
242138fd1498Szrj /* If an epilogue loop is required make sure we can create one. */
242238fd1498Szrj if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
242338fd1498Szrj || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
242438fd1498Szrj {
242538fd1498Szrj if (dump_enabled_p ())
242638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
242738fd1498Szrj if (!vect_can_advance_ivs_p (loop_vinfo)
242838fd1498Szrj || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
242938fd1498Szrj single_exit (LOOP_VINFO_LOOP
243038fd1498Szrj (loop_vinfo))))
243138fd1498Szrj {
243238fd1498Szrj if (dump_enabled_p ())
243338fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
243438fd1498Szrj "not vectorized: can't create required "
243538fd1498Szrj "epilog loop\n");
243638fd1498Szrj goto again;
243738fd1498Szrj }
243838fd1498Szrj }
243938fd1498Szrj
244038fd1498Szrj /* During peeling, we need to check if number of loop iterations is
244138fd1498Szrj enough for both peeled prolog loop and vector loop. This check
244238fd1498Szrj can be merged along with threshold check of loop versioning, so
244338fd1498Szrj increase threshold for this case if necessary. */
244438fd1498Szrj if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
244538fd1498Szrj {
244638fd1498Szrj poly_uint64 niters_th = 0;
244738fd1498Szrj
244838fd1498Szrj if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
244938fd1498Szrj {
245038fd1498Szrj /* Niters for peeled prolog loop. */
245138fd1498Szrj if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
245238fd1498Szrj {
245338fd1498Szrj struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
245438fd1498Szrj tree vectype
245538fd1498Szrj = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
245638fd1498Szrj niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
245738fd1498Szrj }
245838fd1498Szrj else
245938fd1498Szrj niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
246038fd1498Szrj }
246138fd1498Szrj
246238fd1498Szrj /* Niters for at least one iteration of vectorized loop. */
246338fd1498Szrj if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
246438fd1498Szrj niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
246538fd1498Szrj /* One additional iteration because of peeling for gap. */
246638fd1498Szrj if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
246738fd1498Szrj niters_th += 1;
246838fd1498Szrj LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
246938fd1498Szrj }
247038fd1498Szrj
247138fd1498Szrj gcc_assert (known_eq (vectorization_factor,
247238fd1498Szrj LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
247338fd1498Szrj
247438fd1498Szrj /* Ok to vectorize! */
247538fd1498Szrj return true;
247638fd1498Szrj
247738fd1498Szrj again:
247838fd1498Szrj /* Try again with SLP forced off but if we didn't do any SLP there is
247938fd1498Szrj no point in re-trying. */
248038fd1498Szrj if (!slp)
248138fd1498Szrj return false;
248238fd1498Szrj
248338fd1498Szrj /* If there are reduction chains re-trying will fail anyway. */
248438fd1498Szrj if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
248538fd1498Szrj return false;
248638fd1498Szrj
248738fd1498Szrj /* Likewise if the grouped loads or stores in the SLP cannot be handled
248838fd1498Szrj via interleaving or lane instructions. */
248938fd1498Szrj slp_instance instance;
249038fd1498Szrj slp_tree node;
249138fd1498Szrj unsigned i, j;
249238fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
249338fd1498Szrj {
249438fd1498Szrj stmt_vec_info vinfo;
249538fd1498Szrj vinfo = vinfo_for_stmt
249638fd1498Szrj (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
249738fd1498Szrj if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
249838fd1498Szrj continue;
249938fd1498Szrj vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
250038fd1498Szrj unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
250138fd1498Szrj tree vectype = STMT_VINFO_VECTYPE (vinfo);
250238fd1498Szrj if (! vect_store_lanes_supported (vectype, size, false)
250338fd1498Szrj && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
250438fd1498Szrj && ! vect_grouped_store_supported (vectype, size))
250538fd1498Szrj return false;
250638fd1498Szrj FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
250738fd1498Szrj {
250838fd1498Szrj vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
250938fd1498Szrj vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
251038fd1498Szrj bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
251138fd1498Szrj size = STMT_VINFO_GROUP_SIZE (vinfo);
251238fd1498Szrj vectype = STMT_VINFO_VECTYPE (vinfo);
251338fd1498Szrj if (! vect_load_lanes_supported (vectype, size, false)
251438fd1498Szrj && ! vect_grouped_load_supported (vectype, single_element_p,
251538fd1498Szrj size))
251638fd1498Szrj return false;
251738fd1498Szrj }
251838fd1498Szrj }
251938fd1498Szrj
252038fd1498Szrj if (dump_enabled_p ())
252138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
252238fd1498Szrj "re-trying with SLP disabled\n");
252338fd1498Szrj
252438fd1498Szrj /* Roll back state appropriately. No SLP this time. */
252538fd1498Szrj slp = false;
252638fd1498Szrj /* Restore vectorization factor as it were without SLP. */
252738fd1498Szrj LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
252838fd1498Szrj /* Free the SLP instances. */
252938fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
253038fd1498Szrj vect_free_slp_instance (instance);
253138fd1498Szrj LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
253238fd1498Szrj /* Reset SLP type to loop_vect on all stmts. */
253338fd1498Szrj for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
253438fd1498Szrj {
253538fd1498Szrj basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
253638fd1498Szrj for (gimple_stmt_iterator si = gsi_start_phis (bb);
253738fd1498Szrj !gsi_end_p (si); gsi_next (&si))
253838fd1498Szrj {
253938fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
254038fd1498Szrj STMT_SLP_TYPE (stmt_info) = loop_vect;
254138fd1498Szrj }
254238fd1498Szrj for (gimple_stmt_iterator si = gsi_start_bb (bb);
254338fd1498Szrj !gsi_end_p (si); gsi_next (&si))
254438fd1498Szrj {
254538fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
254638fd1498Szrj STMT_SLP_TYPE (stmt_info) = loop_vect;
254738fd1498Szrj if (STMT_VINFO_IN_PATTERN_P (stmt_info))
254838fd1498Szrj {
254938fd1498Szrj stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
255038fd1498Szrj STMT_SLP_TYPE (stmt_info) = loop_vect;
255138fd1498Szrj for (gimple_stmt_iterator pi
255238fd1498Szrj = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
255338fd1498Szrj !gsi_end_p (pi); gsi_next (&pi))
255438fd1498Szrj {
255538fd1498Szrj gimple *pstmt = gsi_stmt (pi);
255638fd1498Szrj STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
255738fd1498Szrj }
255838fd1498Szrj }
255938fd1498Szrj }
256038fd1498Szrj }
256138fd1498Szrj /* Free optimized alias test DDRS. */
256238fd1498Szrj LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
256338fd1498Szrj LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
256438fd1498Szrj LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
256538fd1498Szrj /* Reset target cost data. */
256638fd1498Szrj destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
256738fd1498Szrj LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
256838fd1498Szrj = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
256938fd1498Szrj /* Reset accumulated rgroup information. */
257038fd1498Szrj release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
257138fd1498Szrj /* Reset assorted flags. */
257238fd1498Szrj LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
257338fd1498Szrj LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
257438fd1498Szrj LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
257538fd1498Szrj LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
257638fd1498Szrj LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
257738fd1498Szrj
257838fd1498Szrj goto start_over;
257938fd1498Szrj }
258038fd1498Szrj
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
   be vectorized.

   Returns the loop_vec_info on success (ownership passes to the caller)
   or NULL if the loop cannot be vectorized at any candidate vector
   size.  */
loop_vec_info
vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
{
  loop_vec_info loop_vinfo;
  auto_vector_sizes vector_sizes;

  /* Autodetect first vector size we try.  Zero means "let the target
     pick"; the candidate sizes to retry with come from the target via
     the autovectorize_vector_sizes hook.  */
  current_vector_size = 0;
  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
  unsigned int next_size = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "===== analyze_loop_nest =====\n");

  /* If an enclosing outer loop was already analyzed as vectorizable,
     do not try to vectorize this inner loop on its own.  */
  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "outer-loop already vectorized.\n");
      return NULL;
    }

  poly_uint64 autodetected_vector_size = 0;
  /* Re-run the full analysis with successively larger vector sizes
     until it succeeds, a fatal failure is reported, or the candidate
     list is exhausted.  */
  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
      loop_vinfo = vect_analyze_loop_form (loop);
      if (!loop_vinfo)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "bad loop form.\n");
	  return NULL;
	}

      /* Set to true by vect_analyze_loop_2 when retrying with a
	 different vector size cannot possibly help.  */
      bool fatal = false;

      if (orig_loop_vinfo)
	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;

      if (vect_analyze_loop_2 (loop_vinfo, fatal))
	{
	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

	  return loop_vinfo;
	}

      /* Analysis failed for this vector size; discard its state.  */
      delete loop_vinfo;

      /* On the first iteration record the size that was autodetected
	 (current_vector_size is a global the analysis may have set),
	 so the same size is not pointlessly retried below.  */
      if (next_size == 0)
	autodetected_vector_size = current_vector_size;

      if (next_size < vector_sizes.length ()
	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
	next_size += 1;

      if (fatal
	  || next_size == vector_sizes.length ()
	  || known_eq (current_vector_size, 0U))
	return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = vector_sizes[next_size++];
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "***** Re-trying analysis with "
			   "vector size ");
	  dump_dec (MSG_NOTE, current_vector_size);
	  dump_printf (MSG_NOTE, "\n");
	}
    }
}
266338fd1498Szrj
266438fd1498Szrj /* Return true if there is an in-order reduction function for CODE, storing
266538fd1498Szrj it in *REDUC_FN if so. */
266638fd1498Szrj
266738fd1498Szrj static bool
fold_left_reduction_fn(tree_code code,internal_fn * reduc_fn)266838fd1498Szrj fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
266938fd1498Szrj {
267038fd1498Szrj switch (code)
267138fd1498Szrj {
267238fd1498Szrj case PLUS_EXPR:
267338fd1498Szrj *reduc_fn = IFN_FOLD_LEFT_PLUS;
267438fd1498Szrj return true;
267538fd1498Szrj
267638fd1498Szrj default:
267738fd1498Szrj return false;
267838fd1498Szrj }
267938fd1498Szrj }
268038fd1498Szrj
268138fd1498Szrj /* Function reduction_fn_for_scalar_code
268238fd1498Szrj
268338fd1498Szrj Input:
268438fd1498Szrj CODE - tree_code of a reduction operations.
268538fd1498Szrj
268638fd1498Szrj Output:
268738fd1498Szrj REDUC_FN - the corresponding internal function to be used to reduce the
268838fd1498Szrj vector of partial results into a single scalar result, or IFN_LAST
268938fd1498Szrj if the operation is a supported reduction operation, but does not have
269038fd1498Szrj such an internal function.
269138fd1498Szrj
269238fd1498Szrj Return FALSE if CODE currently cannot be vectorized as reduction. */
269338fd1498Szrj
269438fd1498Szrj static bool
reduction_fn_for_scalar_code(enum tree_code code,internal_fn * reduc_fn)269538fd1498Szrj reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
269638fd1498Szrj {
269738fd1498Szrj switch (code)
269838fd1498Szrj {
269938fd1498Szrj case MAX_EXPR:
270038fd1498Szrj *reduc_fn = IFN_REDUC_MAX;
270138fd1498Szrj return true;
270238fd1498Szrj
270338fd1498Szrj case MIN_EXPR:
270438fd1498Szrj *reduc_fn = IFN_REDUC_MIN;
270538fd1498Szrj return true;
270638fd1498Szrj
270738fd1498Szrj case PLUS_EXPR:
270838fd1498Szrj *reduc_fn = IFN_REDUC_PLUS;
270938fd1498Szrj return true;
271038fd1498Szrj
271138fd1498Szrj case BIT_AND_EXPR:
271238fd1498Szrj *reduc_fn = IFN_REDUC_AND;
271338fd1498Szrj return true;
271438fd1498Szrj
271538fd1498Szrj case BIT_IOR_EXPR:
271638fd1498Szrj *reduc_fn = IFN_REDUC_IOR;
271738fd1498Szrj return true;
271838fd1498Szrj
271938fd1498Szrj case BIT_XOR_EXPR:
272038fd1498Szrj *reduc_fn = IFN_REDUC_XOR;
272138fd1498Szrj return true;
272238fd1498Szrj
272338fd1498Szrj case MULT_EXPR:
272438fd1498Szrj case MINUS_EXPR:
272538fd1498Szrj *reduc_fn = IFN_LAST;
272638fd1498Szrj return true;
272738fd1498Szrj
272838fd1498Szrj default:
272938fd1498Szrj return false;
273038fd1498Szrj }
273138fd1498Szrj }
273238fd1498Szrj
273338fd1498Szrj /* If there is a neutral value X such that SLP reduction NODE would not
273438fd1498Szrj be affected by the introduction of additional X elements, return that X,
273538fd1498Szrj otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
273638fd1498Szrj is true if the SLP statements perform a single reduction, false if each
273738fd1498Szrj statement performs an independent reduction. */
273838fd1498Szrj
273938fd1498Szrj static tree
neutral_op_for_slp_reduction(slp_tree slp_node,tree_code code,bool reduc_chain)274038fd1498Szrj neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
274138fd1498Szrj bool reduc_chain)
274238fd1498Szrj {
274338fd1498Szrj vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
274438fd1498Szrj gimple *stmt = stmts[0];
274538fd1498Szrj stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
274638fd1498Szrj tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
274738fd1498Szrj tree scalar_type = TREE_TYPE (vector_type);
274838fd1498Szrj struct loop *loop = gimple_bb (stmt)->loop_father;
274938fd1498Szrj gcc_assert (loop);
275038fd1498Szrj
275138fd1498Szrj switch (code)
275238fd1498Szrj {
275338fd1498Szrj case WIDEN_SUM_EXPR:
275438fd1498Szrj case DOT_PROD_EXPR:
275538fd1498Szrj case SAD_EXPR:
275638fd1498Szrj case PLUS_EXPR:
275738fd1498Szrj case MINUS_EXPR:
275838fd1498Szrj case BIT_IOR_EXPR:
275938fd1498Szrj case BIT_XOR_EXPR:
276038fd1498Szrj return build_zero_cst (scalar_type);
276138fd1498Szrj
276238fd1498Szrj case MULT_EXPR:
276338fd1498Szrj return build_one_cst (scalar_type);
276438fd1498Szrj
276538fd1498Szrj case BIT_AND_EXPR:
276638fd1498Szrj return build_all_ones_cst (scalar_type);
276738fd1498Szrj
276838fd1498Szrj case MAX_EXPR:
276938fd1498Szrj case MIN_EXPR:
277038fd1498Szrj /* For MIN/MAX the initial values are neutral. A reduction chain
277138fd1498Szrj has only a single initial value, so that value is neutral for
277238fd1498Szrj all statements. */
277338fd1498Szrj if (reduc_chain)
277438fd1498Szrj return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
277538fd1498Szrj return NULL_TREE;
277638fd1498Szrj
277738fd1498Szrj default:
277838fd1498Szrj return NULL_TREE;
277938fd1498Szrj }
278038fd1498Szrj }
278138fd1498Szrj
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  MSG_TYPE selects the dump kind
   (e.g. MSG_NOTE or MSG_MISSED_OPTIMIZATION) for both pieces of output.  */

static void
report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  /* Print MSG at the current vectorizer location, then the statement
     itself in compact (TDF_SLIM) form.  */
  dump_printf_loc (msg_type, vect_location, "%s", msg);
  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
}
279138fd1498Szrj
279238fd1498Szrj
279338fd1498Szrj /* Detect SLP reduction of the form:
279438fd1498Szrj
279538fd1498Szrj #a1 = phi <a5, a0>
279638fd1498Szrj a2 = operation (a1)
279738fd1498Szrj a3 = operation (a2)
279838fd1498Szrj a4 = operation (a3)
279938fd1498Szrj a5 = operation (a4)
280038fd1498Szrj
280138fd1498Szrj #a = phi <a5>
280238fd1498Szrj
280338fd1498Szrj PHI is the reduction phi node (#a1 = phi <a5, a0> above)
280438fd1498Szrj FIRST_STMT is the first reduction stmt in the chain
280538fd1498Szrj (a2 = operation (a1)).
280638fd1498Szrj
280738fd1498Szrj Return TRUE if a reduction chain was detected. */
280838fd1498Szrj
280938fd1498Szrj static bool
vect_is_slp_reduction(loop_vec_info loop_info,gimple * phi,gimple * first_stmt)281038fd1498Szrj vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
281138fd1498Szrj gimple *first_stmt)
281238fd1498Szrj {
281338fd1498Szrj struct loop *loop = (gimple_bb (phi))->loop_father;
281438fd1498Szrj struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
281538fd1498Szrj enum tree_code code;
2816*58e805e6Szrj gimple *loop_use_stmt = NULL;
2817*58e805e6Szrj stmt_vec_info use_stmt_info;
281838fd1498Szrj tree lhs;
281938fd1498Szrj imm_use_iterator imm_iter;
282038fd1498Szrj use_operand_p use_p;
282138fd1498Szrj int nloop_uses, size = 0, n_out_of_loop_uses;
282238fd1498Szrj bool found = false;
282338fd1498Szrj
282438fd1498Szrj if (loop != vect_loop)
282538fd1498Szrj return false;
282638fd1498Szrj
2827*58e805e6Szrj auto_vec<stmt_vec_info, 8> reduc_chain;
282838fd1498Szrj lhs = PHI_RESULT (phi);
282938fd1498Szrj code = gimple_assign_rhs_code (first_stmt);
283038fd1498Szrj while (1)
283138fd1498Szrj {
283238fd1498Szrj nloop_uses = 0;
283338fd1498Szrj n_out_of_loop_uses = 0;
283438fd1498Szrj FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
283538fd1498Szrj {
283638fd1498Szrj gimple *use_stmt = USE_STMT (use_p);
283738fd1498Szrj if (is_gimple_debug (use_stmt))
283838fd1498Szrj continue;
283938fd1498Szrj
284038fd1498Szrj /* Check if we got back to the reduction phi. */
284138fd1498Szrj if (use_stmt == phi)
284238fd1498Szrj {
284338fd1498Szrj loop_use_stmt = use_stmt;
284438fd1498Szrj found = true;
284538fd1498Szrj break;
284638fd1498Szrj }
284738fd1498Szrj
284838fd1498Szrj if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
284938fd1498Szrj {
285038fd1498Szrj loop_use_stmt = use_stmt;
285138fd1498Szrj nloop_uses++;
285238fd1498Szrj }
285338fd1498Szrj else
285438fd1498Szrj n_out_of_loop_uses++;
285538fd1498Szrj
285638fd1498Szrj /* There are can be either a single use in the loop or two uses in
285738fd1498Szrj phi nodes. */
285838fd1498Szrj if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
285938fd1498Szrj return false;
286038fd1498Szrj }
286138fd1498Szrj
286238fd1498Szrj if (found)
286338fd1498Szrj break;
286438fd1498Szrj
286538fd1498Szrj /* We reached a statement with no loop uses. */
286638fd1498Szrj if (nloop_uses == 0)
286738fd1498Szrj return false;
286838fd1498Szrj
286938fd1498Szrj /* This is a loop exit phi, and we haven't reached the reduction phi. */
287038fd1498Szrj if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
287138fd1498Szrj return false;
287238fd1498Szrj
287338fd1498Szrj if (!is_gimple_assign (loop_use_stmt)
287438fd1498Szrj || code != gimple_assign_rhs_code (loop_use_stmt)
287538fd1498Szrj || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
287638fd1498Szrj return false;
287738fd1498Szrj
287838fd1498Szrj /* Insert USE_STMT into reduction chain. */
287938fd1498Szrj use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2880*58e805e6Szrj reduc_chain.safe_push (use_stmt_info);
288138fd1498Szrj
288238fd1498Szrj lhs = gimple_assign_lhs (loop_use_stmt);
288338fd1498Szrj size++;
288438fd1498Szrj }
288538fd1498Szrj
288638fd1498Szrj if (!found || loop_use_stmt != phi || size < 2)
288738fd1498Szrj return false;
288838fd1498Szrj
288938fd1498Szrj /* Swap the operands, if needed, to make the reduction operand be the second
289038fd1498Szrj operand. */
289138fd1498Szrj lhs = PHI_RESULT (phi);
2892*58e805e6Szrj for (unsigned i = 0; i < reduc_chain.length (); ++i)
289338fd1498Szrj {
2894*58e805e6Szrj gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
289538fd1498Szrj if (gimple_assign_rhs2 (next_stmt) == lhs)
289638fd1498Szrj {
289738fd1498Szrj tree op = gimple_assign_rhs1 (next_stmt);
289838fd1498Szrj gimple *def_stmt = NULL;
289938fd1498Szrj
290038fd1498Szrj if (TREE_CODE (op) == SSA_NAME)
290138fd1498Szrj def_stmt = SSA_NAME_DEF_STMT (op);
290238fd1498Szrj
290338fd1498Szrj /* Check that the other def is either defined in the loop
290438fd1498Szrj ("vect_internal_def"), or it's an induction (defined by a
290538fd1498Szrj loop-header phi-node). */
290638fd1498Szrj if (def_stmt
290738fd1498Szrj && gimple_bb (def_stmt)
290838fd1498Szrj && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
290938fd1498Szrj && (is_gimple_assign (def_stmt)
291038fd1498Szrj || is_gimple_call (def_stmt)
291138fd1498Szrj || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
291238fd1498Szrj == vect_induction_def
291338fd1498Szrj || (gimple_code (def_stmt) == GIMPLE_PHI
291438fd1498Szrj && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
291538fd1498Szrj == vect_internal_def
291638fd1498Szrj && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
291738fd1498Szrj {
291838fd1498Szrj lhs = gimple_assign_lhs (next_stmt);
291938fd1498Szrj continue;
292038fd1498Szrj }
292138fd1498Szrj
292238fd1498Szrj return false;
292338fd1498Szrj }
292438fd1498Szrj else
292538fd1498Szrj {
292638fd1498Szrj tree op = gimple_assign_rhs2 (next_stmt);
292738fd1498Szrj gimple *def_stmt = NULL;
292838fd1498Szrj
292938fd1498Szrj if (TREE_CODE (op) == SSA_NAME)
293038fd1498Szrj def_stmt = SSA_NAME_DEF_STMT (op);
293138fd1498Szrj
293238fd1498Szrj /* Check that the other def is either defined in the loop
293338fd1498Szrj ("vect_internal_def"), or it's an induction (defined by a
293438fd1498Szrj loop-header phi-node). */
293538fd1498Szrj if (def_stmt
293638fd1498Szrj && gimple_bb (def_stmt)
293738fd1498Szrj && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
293838fd1498Szrj && (is_gimple_assign (def_stmt)
293938fd1498Szrj || is_gimple_call (def_stmt)
294038fd1498Szrj || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
294138fd1498Szrj == vect_induction_def
294238fd1498Szrj || (gimple_code (def_stmt) == GIMPLE_PHI
294338fd1498Szrj && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
294438fd1498Szrj == vect_internal_def
294538fd1498Szrj && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
294638fd1498Szrj {
294738fd1498Szrj if (dump_enabled_p ())
294838fd1498Szrj {
294938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
295038fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
295138fd1498Szrj }
295238fd1498Szrj
295338fd1498Szrj swap_ssa_operands (next_stmt,
295438fd1498Szrj gimple_assign_rhs1_ptr (next_stmt),
295538fd1498Szrj gimple_assign_rhs2_ptr (next_stmt));
295638fd1498Szrj update_stmt (next_stmt);
295738fd1498Szrj
295838fd1498Szrj if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
295938fd1498Szrj LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
296038fd1498Szrj }
296138fd1498Szrj else
296238fd1498Szrj return false;
296338fd1498Szrj }
296438fd1498Szrj
296538fd1498Szrj lhs = gimple_assign_lhs (next_stmt);
296638fd1498Szrj }
296738fd1498Szrj
2968*58e805e6Szrj /* Build up the actual chain. */
2969*58e805e6Szrj for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2970*58e805e6Szrj {
2971*58e805e6Szrj GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt;
2972*58e805e6Szrj GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt;
2973*58e805e6Szrj }
2974*58e805e6Szrj GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt;
2975*58e805e6Szrj GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2976*58e805e6Szrj
297738fd1498Szrj /* Save the chain for further analysis in SLP detection. */
2978*58e805e6Szrj LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt);
2979*58e805e6Szrj GROUP_SIZE (reduc_chain[0]) = size;
298038fd1498Szrj
298138fd1498Szrj return true;
298238fd1498Szrj }
298338fd1498Szrj
298438fd1498Szrj /* Return true if we need an in-order reduction for operation CODE
298538fd1498Szrj on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
298638fd1498Szrj overflow must wrap. */
298738fd1498Szrj
298838fd1498Szrj static bool
needs_fold_left_reduction_p(tree type,tree_code code,bool need_wrapping_integral_overflow)298938fd1498Szrj needs_fold_left_reduction_p (tree type, tree_code code,
299038fd1498Szrj bool need_wrapping_integral_overflow)
299138fd1498Szrj {
299238fd1498Szrj /* CHECKME: check for !flag_finite_math_only too? */
299338fd1498Szrj if (SCALAR_FLOAT_TYPE_P (type))
299438fd1498Szrj switch (code)
299538fd1498Szrj {
299638fd1498Szrj case MIN_EXPR:
299738fd1498Szrj case MAX_EXPR:
299838fd1498Szrj return false;
299938fd1498Szrj
300038fd1498Szrj default:
300138fd1498Szrj return !flag_associative_math;
300238fd1498Szrj }
300338fd1498Szrj
300438fd1498Szrj if (INTEGRAL_TYPE_P (type))
300538fd1498Szrj {
300638fd1498Szrj if (!operation_no_trapping_overflow (type, code))
300738fd1498Szrj return true;
300838fd1498Szrj if (need_wrapping_integral_overflow
300938fd1498Szrj && !TYPE_OVERFLOW_WRAPS (type)
301038fd1498Szrj && operation_can_overflow (code))
301138fd1498Szrj return true;
301238fd1498Szrj return false;
301338fd1498Szrj }
301438fd1498Szrj
301538fd1498Szrj if (SAT_FIXED_POINT_TYPE_P (type))
301638fd1498Szrj return true;
301738fd1498Szrj
301838fd1498Szrj return false;
301938fd1498Szrj }
302038fd1498Szrj
302138fd1498Szrj /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
302238fd1498Szrj reduction operation CODE has a handled computation expression. */
302338fd1498Szrj
302438fd1498Szrj bool
check_reduction_path(location_t loc,loop_p loop,gphi * phi,tree loop_arg,enum tree_code code)302538fd1498Szrj check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
302638fd1498Szrj 		      enum tree_code code)
302738fd1498Szrj {
  /* PATH is the explicit DFS stack: each entry is the iterator state of
     a statement together with the use operand currently followed.  */
302838fd1498Szrj   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
302938fd1498Szrj   auto_bitmap visited;
303038fd1498Szrj   tree lookfor = PHI_RESULT (phi);
303138fd1498Szrj   ssa_op_iter curri;
  /* Position CURR on PHI's LOOP_ARG operand.  */
303238fd1498Szrj   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
303338fd1498Szrj   while (USE_FROM_PTR (curr) != loop_arg)
303438fd1498Szrj     curr = op_iter_next_use (&curri);
  /* Mark this iterator frame exhausted so that backtracking (the pop
     loop below) will not walk PHI's other arguments.  */
303538fd1498Szrj   curri.i = curri.numops;
  /* Depth-first search along use-def chains until the PHI result
     LOOKFOR is reached again, i.e. a cycle through the loop is found.  */
303638fd1498Szrj   do
303738fd1498Szrj     {
303838fd1498Szrj       path.safe_push (std::make_pair (curri, curr));
303938fd1498Szrj       tree use = USE_FROM_PTR (curr);
304038fd1498Szrj       if (use == lookfor)
304138fd1498Szrj 	break;
304238fd1498Szrj       gimple *def = SSA_NAME_DEF_STMT (use);
304338fd1498Szrj       if (gimple_nop_p (def)
304438fd1498Szrj 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
304538fd1498Szrj 	{
304638fd1498Szrj pop:
	  /* Backtrack: advance the most recent frame that still has an
	     unvisited SSA operand; give up when the stack runs dry.  */
304738fd1498Szrj 	  do
304838fd1498Szrj 	    {
304938fd1498Szrj 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
305038fd1498Szrj 	      curri = x.first;
305138fd1498Szrj 	      curr = x.second;
305238fd1498Szrj 	      do
305338fd1498Szrj 		curr = op_iter_next_use (&curri);
305438fd1498Szrj 	      /* Skip already visited or non-SSA operands (from iterating
305538fd1498Szrj 	         over PHI args).  */
305638fd1498Szrj 	      while (curr != NULL_USE_OPERAND_P
305738fd1498Szrj 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
305838fd1498Szrj 			 || ! bitmap_set_bit (visited,
305938fd1498Szrj 					      SSA_NAME_VERSION
306038fd1498Szrj 					        (USE_FROM_PTR (curr)))));
306138fd1498Szrj 	    }
306238fd1498Szrj 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
306338fd1498Szrj 	  if (curr == NULL_USE_OPERAND_P)
306438fd1498Szrj 	    break;
306538fd1498Szrj 	}
306638fd1498Szrj       else
306738fd1498Szrj 	{
	  /* Descend into DEF: start iterating its uses, again skipping
	     non-SSA and already-visited operands.  */
306838fd1498Szrj 	  if (gimple_code (def) == GIMPLE_PHI)
306938fd1498Szrj 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
307038fd1498Szrj 	  else
307138fd1498Szrj 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
307238fd1498Szrj 	  while (curr != NULL_USE_OPERAND_P
307338fd1498Szrj 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
307438fd1498Szrj 		     || ! bitmap_set_bit (visited,
307538fd1498Szrj 					  SSA_NAME_VERSION
307638fd1498Szrj 					    (USE_FROM_PTR (curr)))))
307738fd1498Szrj 	    curr = op_iter_next_use (&curri);
307838fd1498Szrj 	  if (curr == NULL_USE_OPERAND_P)
307938fd1498Szrj 	    goto pop;
308038fd1498Szrj 	}
308138fd1498Szrj     }
308238fd1498Szrj   while (1);
308338fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
308438fd1498Szrj     {
308538fd1498Szrj       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
308638fd1498Szrj       unsigned i;
308738fd1498Szrj       std::pair<ssa_op_iter, use_operand_p> *x;
308838fd1498Szrj       FOR_EACH_VEC_ELT (path, i, x)
308938fd1498Szrj 	{
309038fd1498Szrj 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
309138fd1498Szrj 	  dump_printf (MSG_NOTE, " ");
309238fd1498Szrj 	}
309338fd1498Szrj       dump_printf (MSG_NOTE, "\n");
309438fd1498Szrj     }
309538fd1498Szrj
309638fd1498Szrj   /* Check whether the reduction path detected is valid.  */
309738fd1498Szrj   bool fail = path.length () == 0;
309838fd1498Szrj   bool neg = false;
  /* Each intermediate value must be used exactly once and be computed by
     an assignment with the reduction code; the only exception is a
     MINUS_EXPR feeding a PLUS_EXPR reduction, for which we track the
     accumulated sign below.  */
309938fd1498Szrj   for (unsigned i = 1; i < path.length (); ++i)
310038fd1498Szrj     {
310138fd1498Szrj       gimple *use_stmt = USE_STMT (path[i].second);
310238fd1498Szrj       tree op = USE_FROM_PTR (path[i].second);
310338fd1498Szrj       if (! has_single_use (op)
310438fd1498Szrj 	  || ! is_gimple_assign (use_stmt))
310538fd1498Szrj 	{
310638fd1498Szrj 	  fail = true;
310738fd1498Szrj 	  break;
310838fd1498Szrj 	}
310938fd1498Szrj       if (gimple_assign_rhs_code (use_stmt) != code)
311038fd1498Szrj 	{
311138fd1498Szrj 	  if (code == PLUS_EXPR
311238fd1498Szrj 	      && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
311338fd1498Szrj 	    {
311438fd1498Szrj 	      /* Track whether we negate the reduction value each iteration.  */
311538fd1498Szrj 	      if (gimple_assign_rhs2 (use_stmt) == op)
311638fd1498Szrj 		neg = ! neg;
311738fd1498Szrj 	    }
311838fd1498Szrj 	  else
311938fd1498Szrj 	    {
312038fd1498Szrj 	      fail = true;
312138fd1498Szrj 	      break;
312238fd1498Szrj 	    }
312338fd1498Szrj 	}
312438fd1498Szrj     }
  /* A net negation of the reduction value per iteration is not a valid
     reduction either.  */
312538fd1498Szrj   return ! fail && ! neg;
312638fd1498Szrj }
312738fd1498Szrj
312838fd1498Szrj
312938fd1498Szrj /* Function vect_is_simple_reduction
313038fd1498Szrj
313138fd1498Szrj (1) Detect a cross-iteration def-use cycle that represents a simple
313238fd1498Szrj reduction computation. We look for the following pattern:
313338fd1498Szrj
313438fd1498Szrj loop_header:
313538fd1498Szrj a1 = phi < a0, a2 >
313638fd1498Szrj a3 = ...
313738fd1498Szrj a2 = operation (a3, a1)
313838fd1498Szrj
313938fd1498Szrj or
314038fd1498Szrj
314138fd1498Szrj a3 = ...
314238fd1498Szrj loop_header:
314338fd1498Szrj a1 = phi < a0, a2 >
314438fd1498Szrj a2 = operation (a3, a1)
314538fd1498Szrj
314638fd1498Szrj such that:
314738fd1498Szrj 1. operation is commutative and associative and it is safe to
314838fd1498Szrj change the order of the computation
314938fd1498Szrj 2. no uses for a2 in the loop (a2 is used out of the loop)
315038fd1498Szrj 3. no uses of a1 in the loop besides the reduction operation
315138fd1498Szrj 4. no uses of a1 outside the loop.
315238fd1498Szrj
315338fd1498Szrj Conditions 1,4 are tested here.
315438fd1498Szrj Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
315538fd1498Szrj
315638fd1498Szrj (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
315738fd1498Szrj nested cycles.
315838fd1498Szrj
315938fd1498Szrj (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
316038fd1498Szrj reductions:
316138fd1498Szrj
316238fd1498Szrj a1 = phi < a0, a2 >
316338fd1498Szrj inner loop (def of a3)
316438fd1498Szrj a2 = phi < a3 >
316538fd1498Szrj
316638fd1498Szrj    (4) Detect condition expressions, i.e.:
316738fd1498Szrj for (int i = 0; i < N; i++)
316838fd1498Szrj if (a[i] < val)
316938fd1498Szrj ret_val = a[i];
317038fd1498Szrj
317138fd1498Szrj */
317238fd1498Szrj
317338fd1498Szrj static gimple *
vect_is_simple_reduction(loop_vec_info loop_info,gimple * phi,bool * double_reduc,bool need_wrapping_integral_overflow,enum vect_reduction_type * v_reduc_type)317438fd1498Szrj vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
317538fd1498Szrj 			  bool *double_reduc,
317638fd1498Szrj 			  bool need_wrapping_integral_overflow,
317738fd1498Szrj 			  enum vect_reduction_type *v_reduc_type)
317838fd1498Szrj {
317938fd1498Szrj   struct loop *loop = (gimple_bb (phi))->loop_father;
318038fd1498Szrj   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
318138fd1498Szrj   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
318238fd1498Szrj   enum tree_code orig_code, code;
318338fd1498Szrj   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
318438fd1498Szrj   tree type;
318538fd1498Szrj   int nloop_uses;
318638fd1498Szrj   tree name;
318738fd1498Szrj   imm_use_iterator imm_iter;
318838fd1498Szrj   use_operand_p use_p;
318938fd1498Szrj   bool phi_def;
319038fd1498Szrj
319138fd1498Szrj   *double_reduc = false;
319238fd1498Szrj   *v_reduc_type = TREE_CODE_REDUCTION;
319338fd1498Szrj
319438fd1498Szrj   tree phi_name = PHI_RESULT (phi);
319538fd1498Szrj   /* ???  If there are no uses of the PHI result the inner loop reduction
319638fd1498Szrj      won't be detected as possibly double-reduction by vectorizable_reduction
319738fd1498Szrj      because that tries to walk the PHI arg from the preheader edge which
319838fd1498Szrj      can be constant.  See PR60382.  */
319938fd1498Szrj   if (has_zero_uses (phi_name))
320038fd1498Szrj     return NULL;
  /* Conditions 3/4 of the function comment: the PHI result must have no
     uses outside the loop and at most one (non-debug) use inside it.  */
320138fd1498Szrj   nloop_uses = 0;
320238fd1498Szrj   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
320338fd1498Szrj     {
320438fd1498Szrj       gimple *use_stmt = USE_STMT (use_p);
320538fd1498Szrj       if (is_gimple_debug (use_stmt))
320638fd1498Szrj 	continue;
320738fd1498Szrj
320838fd1498Szrj       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
320938fd1498Szrj         {
321038fd1498Szrj 	  if (dump_enabled_p ())
321138fd1498Szrj 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
321238fd1498Szrj 			     "intermediate value used outside loop.\n");
321338fd1498Szrj
321438fd1498Szrj 	  return NULL;
321538fd1498Szrj         }
321638fd1498Szrj
321738fd1498Szrj       nloop_uses++;
321838fd1498Szrj       if (nloop_uses > 1)
321938fd1498Szrj         {
322038fd1498Szrj 	  if (dump_enabled_p ())
322138fd1498Szrj 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
322238fd1498Szrj 			     "reduction value used in loop.\n");
322338fd1498Szrj           return NULL;
322438fd1498Szrj         }
322538fd1498Szrj
322638fd1498Szrj       phi_use_stmt = use_stmt;
322738fd1498Szrj     }
322838fd1498Szrj
  /* The candidate reduction value is the PHI argument flowing in over
     the latch edge.  */
322938fd1498Szrj   edge latch_e = loop_latch_edge (loop);
323038fd1498Szrj   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
323138fd1498Szrj   if (TREE_CODE (loop_arg) != SSA_NAME)
323238fd1498Szrj     {
323338fd1498Szrj       if (dump_enabled_p ())
323438fd1498Szrj 	{
323538fd1498Szrj 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
323638fd1498Szrj 			   "reduction: not ssa_name: ");
323738fd1498Szrj 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
323838fd1498Szrj 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
323938fd1498Szrj 	}
324038fd1498Szrj       return NULL;
324138fd1498Szrj     }
324238fd1498Szrj
324338fd1498Szrj   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
324438fd1498Szrj   if (is_gimple_assign (def_stmt))
324538fd1498Szrj     {
324638fd1498Szrj       name = gimple_assign_lhs (def_stmt);
324738fd1498Szrj       phi_def = false;
324838fd1498Szrj     }
324938fd1498Szrj   else if (gimple_code (def_stmt) == GIMPLE_PHI)
325038fd1498Szrj     {
325138fd1498Szrj       name = PHI_RESULT (def_stmt);
325238fd1498Szrj       phi_def = true;
325338fd1498Szrj     }
325438fd1498Szrj   else
325538fd1498Szrj     {
325638fd1498Szrj       if (dump_enabled_p ())
325738fd1498Szrj 	{
325838fd1498Szrj 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
325938fd1498Szrj 			   "reduction: unhandled reduction operation: ");
326038fd1498Szrj 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
326138fd1498Szrj 	}
326238fd1498Szrj       return NULL;
326338fd1498Szrj     }
326438fd1498Szrj
326538fd1498Szrj   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
326638fd1498Szrj     return NULL;
326738fd1498Szrj
  /* Likewise NAME (the latch definition's result) may have at most one
     use inside the loop; out-of-loop uses must be loop-closed PHIs and
     are collected for the nested-loop check below.  */
326838fd1498Szrj   nloop_uses = 0;
326938fd1498Szrj   auto_vec<gphi *, 3> lcphis;
327038fd1498Szrj   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
327138fd1498Szrj     {
327238fd1498Szrj       gimple *use_stmt = USE_STMT (use_p);
327338fd1498Szrj       if (is_gimple_debug (use_stmt))
327438fd1498Szrj 	continue;
327538fd1498Szrj       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
327638fd1498Szrj 	nloop_uses++;
327738fd1498Szrj       else
327838fd1498Szrj 	/* We can have more than one loop-closed PHI.  */
327938fd1498Szrj 	lcphis.safe_push (as_a <gphi *> (use_stmt));
328038fd1498Szrj       if (nloop_uses > 1)
328138fd1498Szrj 	{
328238fd1498Szrj 	  if (dump_enabled_p ())
328338fd1498Szrj 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
328438fd1498Szrj 			     "reduction used in loop.\n");
328538fd1498Szrj 	  return NULL;
328638fd1498Szrj 	}
328738fd1498Szrj     }
328838fd1498Szrj
328938fd1498Szrj   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
329038fd1498Szrj      defined in the inner loop.  */
329138fd1498Szrj   if (phi_def)
329238fd1498Szrj     {
329338fd1498Szrj       op1 = PHI_ARG_DEF (def_stmt, 0);
329438fd1498Szrj
329538fd1498Szrj       if (gimple_phi_num_args (def_stmt) != 1
329638fd1498Szrj           || TREE_CODE (op1) != SSA_NAME)
329738fd1498Szrj         {
329838fd1498Szrj           if (dump_enabled_p ())
329938fd1498Szrj 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
330038fd1498Szrj 			     "unsupported phi node definition.\n");
330138fd1498Szrj
330238fd1498Szrj           return NULL;
330338fd1498Szrj 	}
330438fd1498Szrj
330538fd1498Szrj       def1 = SSA_NAME_DEF_STMT (op1);
330638fd1498Szrj       if (gimple_bb (def1)
330738fd1498Szrj 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
330838fd1498Szrj           && loop->inner
330938fd1498Szrj           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
331038fd1498Szrj           && is_gimple_assign (def1)
331138fd1498Szrj 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
331238fd1498Szrj         {
331338fd1498Szrj           if (dump_enabled_p ())
331438fd1498Szrj             report_vect_op (MSG_NOTE, def_stmt,
331538fd1498Szrj 			    "detected double reduction: ");
331638fd1498Szrj
331738fd1498Szrj           *double_reduc = true;
331838fd1498Szrj           return def_stmt;
331938fd1498Szrj         }
332038fd1498Szrj
332138fd1498Szrj       return NULL;
332238fd1498Szrj     }
332338fd1498Szrj
332438fd1498Szrj   /* If we are vectorizing an inner reduction we are executing that
332538fd1498Szrj      in the original order only in case we are not dealing with a
332638fd1498Szrj      double reduction.  */
332738fd1498Szrj   bool check_reduction = true;
332838fd1498Szrj   if (flow_loop_nested_p (vect_loop, loop))
332938fd1498Szrj     {
333038fd1498Szrj       gphi *lcphi;
333138fd1498Szrj       unsigned i;
333238fd1498Szrj       check_reduction = false;
333338fd1498Szrj       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
333438fd1498Szrj 	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
333538fd1498Szrj 	  {
333638fd1498Szrj 	    gimple *use_stmt = USE_STMT (use_p);
333738fd1498Szrj 	    if (is_gimple_debug (use_stmt))
333838fd1498Szrj 	      continue;
333938fd1498Szrj 	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
334038fd1498Szrj 	      check_reduction = true;
334138fd1498Szrj 	  }
334238fd1498Szrj     }
334338fd1498Szrj
334438fd1498Szrj   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
334538fd1498Szrj   code = orig_code = gimple_assign_rhs_code (def_stmt);
334638fd1498Szrj
334738fd1498Szrj   /* We can handle "res -= x[i]", which is non-associative by
334838fd1498Szrj      simply rewriting this into "res += -x[i]".  Avoid changing
334938fd1498Szrj      gimple instruction for the first simple tests and only do this
335038fd1498Szrj      if we're allowed to change code at all.  */
335138fd1498Szrj   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
335238fd1498Szrj     code = PLUS_EXPR;
335338fd1498Szrj
335438fd1498Szrj   if (code == COND_EXPR)
335538fd1498Szrj     {
335638fd1498Szrj       if (! nested_in_vect_loop)
335738fd1498Szrj 	*v_reduc_type = COND_REDUCTION;
335838fd1498Szrj
335938fd1498Szrj       op3 = gimple_assign_rhs1 (def_stmt);
336038fd1498Szrj       if (COMPARISON_CLASS_P (op3))
336138fd1498Szrj         {
336238fd1498Szrj           op4 = TREE_OPERAND (op3, 1);
336338fd1498Szrj           op3 = TREE_OPERAND (op3, 0);
336438fd1498Szrj         }
336538fd1498Szrj       if (op3 == phi_name || op4 == phi_name)
336638fd1498Szrj 	{
336738fd1498Szrj 	  if (dump_enabled_p ())
336838fd1498Szrj 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
336938fd1498Szrj 			    "reduction: condition depends on previous"
337038fd1498Szrj 			    " iteration: ");
337138fd1498Szrj 	  return NULL;
337238fd1498Szrj 	}
337338fd1498Szrj
337438fd1498Szrj       op1 = gimple_assign_rhs2 (def_stmt);
337538fd1498Szrj       op2 = gimple_assign_rhs3 (def_stmt);
337638fd1498Szrj     }
337738fd1498Szrj   else if (!commutative_tree_code (code) || !associative_tree_code (code))
337838fd1498Szrj     {
337938fd1498Szrj       if (dump_enabled_p ())
338038fd1498Szrj 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
338138fd1498Szrj 			"reduction: not commutative/associative: ");
338238fd1498Szrj       return NULL;
338338fd1498Szrj     }
338438fd1498Szrj   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
338538fd1498Szrj     {
338638fd1498Szrj       op1 = gimple_assign_rhs1 (def_stmt);
338738fd1498Szrj       op2 = gimple_assign_rhs2 (def_stmt);
338838fd1498Szrj     }
338938fd1498Szrj   else
339038fd1498Szrj     {
339138fd1498Szrj       if (dump_enabled_p ())
339238fd1498Szrj 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
339338fd1498Szrj 			"reduction: not handled operation: ");
339438fd1498Szrj       return NULL;
339538fd1498Szrj     }
339638fd1498Szrj
339738fd1498Szrj   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
339838fd1498Szrj     {
339938fd1498Szrj       if (dump_enabled_p ())
340038fd1498Szrj 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
340138fd1498Szrj 			"reduction: both uses not ssa_names: ");
340238fd1498Szrj
340338fd1498Szrj       return NULL;
340438fd1498Szrj     }
340538fd1498Szrj
  /* All operands of the reduction statement must have the same type as
     its result; otherwise this is a multi-type reduction we do not
     handle here.  */
340638fd1498Szrj   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
340738fd1498Szrj   if ((TREE_CODE (op1) == SSA_NAME
340838fd1498Szrj        && !types_compatible_p (type,TREE_TYPE (op1)))
340938fd1498Szrj       || (TREE_CODE (op2) == SSA_NAME
341038fd1498Szrj           && !types_compatible_p (type, TREE_TYPE (op2)))
341138fd1498Szrj       || (op3 && TREE_CODE (op3) == SSA_NAME
341238fd1498Szrj           && !types_compatible_p (type, TREE_TYPE (op3)))
341338fd1498Szrj       || (op4 && TREE_CODE (op4) == SSA_NAME
341438fd1498Szrj           && !types_compatible_p (type, TREE_TYPE (op4))))
341538fd1498Szrj     {
341638fd1498Szrj       if (dump_enabled_p ())
341738fd1498Szrj         {
341838fd1498Szrj           dump_printf_loc (MSG_NOTE, vect_location,
341938fd1498Szrj 			   "reduction: multiple types: operation type: ");
342038fd1498Szrj           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
342138fd1498Szrj           dump_printf (MSG_NOTE, ", operands types: ");
342238fd1498Szrj           dump_generic_expr (MSG_NOTE, TDF_SLIM,
342338fd1498Szrj 			     TREE_TYPE (op1));
342438fd1498Szrj           dump_printf (MSG_NOTE, ",");
342538fd1498Szrj           dump_generic_expr (MSG_NOTE, TDF_SLIM,
342638fd1498Szrj 			     TREE_TYPE (op2));
342738fd1498Szrj           if (op3)
342838fd1498Szrj             {
342938fd1498Szrj               dump_printf (MSG_NOTE, ",");
343038fd1498Szrj               dump_generic_expr (MSG_NOTE, TDF_SLIM,
343138fd1498Szrj 				 TREE_TYPE (op3));
343238fd1498Szrj             }
343338fd1498Szrj
343438fd1498Szrj           if (op4)
343538fd1498Szrj             {
343638fd1498Szrj               dump_printf (MSG_NOTE, ",");
343738fd1498Szrj               dump_generic_expr (MSG_NOTE, TDF_SLIM,
343838fd1498Szrj 				 TREE_TYPE (op4));
343938fd1498Szrj             }
344038fd1498Szrj           dump_printf (MSG_NOTE, "\n");
344138fd1498Szrj         }
344238fd1498Szrj
344338fd1498Szrj       return NULL;
344438fd1498Szrj     }
344538fd1498Szrj
344638fd1498Szrj   /* Check whether it's ok to change the order of the computation.
344738fd1498Szrj      Generally, when vectorizing a reduction we change the order of the
344838fd1498Szrj      computation.  This may change the behavior of the program in some
344938fd1498Szrj      cases, so we need to check that this is ok.  One exception is when
345038fd1498Szrj      vectorizing an outer-loop: the inner-loop is executed sequentially,
345138fd1498Szrj      and therefore vectorizing reductions in the inner-loop during
345238fd1498Szrj      outer-loop vectorization is safe.  */
345338fd1498Szrj   if (check_reduction
345438fd1498Szrj       && *v_reduc_type == TREE_CODE_REDUCTION
345538fd1498Szrj       && needs_fold_left_reduction_p (type, code,
345638fd1498Szrj 				      need_wrapping_integral_overflow))
345738fd1498Szrj     *v_reduc_type = FOLD_LEFT_REDUCTION;
345838fd1498Szrj
345938fd1498Szrj   /* Reduction is safe.  We're dealing with one of the following:
346038fd1498Szrj      1) integer arithmetic and no trapv
346138fd1498Szrj      2) floating point arithmetic, and special flags permit this optimization
346238fd1498Szrj      3) nested cycle (i.e., outer loop vectorization).  */
346338fd1498Szrj   if (TREE_CODE (op1) == SSA_NAME)
346438fd1498Szrj     def1 = SSA_NAME_DEF_STMT (op1);
346538fd1498Szrj
346638fd1498Szrj   if (TREE_CODE (op2) == SSA_NAME)
346738fd1498Szrj     def2 = SSA_NAME_DEF_STMT (op2);
346838fd1498Szrj
346938fd1498Szrj   if (code != COND_EXPR
347038fd1498Szrj       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
347138fd1498Szrj     {
347238fd1498Szrj       if (dump_enabled_p ())
347338fd1498Szrj 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
347438fd1498Szrj       return NULL;
347538fd1498Szrj     }
347638fd1498Szrj
347738fd1498Szrj   /* Check that one def is the reduction def, defined by PHI,
347838fd1498Szrj      the other def is either defined in the loop ("vect_internal_def"),
347938fd1498Szrj      or it's an induction (defined by a loop-header phi-node).  */
348038fd1498Szrj
348138fd1498Szrj   if (def2 && def2 == phi
348238fd1498Szrj       && (code == COND_EXPR
348338fd1498Szrj 	  || !def1 || gimple_nop_p (def1)
348438fd1498Szrj 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
348538fd1498Szrj           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
348638fd1498Szrj               && (is_gimple_assign (def1)
348738fd1498Szrj 		  || is_gimple_call (def1)
348838fd1498Szrj   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
348938fd1498Szrj                       == vect_induction_def
349038fd1498Szrj    	          || (gimple_code (def1) == GIMPLE_PHI
349138fd1498Szrj 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
349238fd1498Szrj                           == vect_internal_def
349338fd1498Szrj  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
349438fd1498Szrj     {
349538fd1498Szrj       if (dump_enabled_p ())
349638fd1498Szrj 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
349738fd1498Szrj       return def_stmt;
349838fd1498Szrj     }
349938fd1498Szrj
  /* Mirror image of the case above: the reduction PHI feeds the first
     operand, so try to swap operands to canonicalize the reduction
     variable into the second position.  */
350038fd1498Szrj   if (def1 && def1 == phi
350138fd1498Szrj       && (code == COND_EXPR
350238fd1498Szrj 	  || !def2 || gimple_nop_p (def2)
350338fd1498Szrj 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
350438fd1498Szrj 	  || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
350538fd1498Szrj 	      && (is_gimple_assign (def2)
350638fd1498Szrj 		  || is_gimple_call (def2)
350738fd1498Szrj 		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
350838fd1498Szrj 		       == vect_induction_def
350938fd1498Szrj 		  || (gimple_code (def2) == GIMPLE_PHI
351038fd1498Szrj 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
351138fd1498Szrj 			   == vect_internal_def
351238fd1498Szrj 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
351338fd1498Szrj     {
351438fd1498Szrj       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
351538fd1498Szrj 	{
351638fd1498Szrj 	  /* Check if we can swap operands (just for simplicity - so that
351738fd1498Szrj 	     the rest of the code can assume that the reduction variable
351838fd1498Szrj 	     is always the last (second) argument).  */
351938fd1498Szrj 	  if (code == COND_EXPR)
352038fd1498Szrj 	    {
352138fd1498Szrj 	      /* Swap cond_expr by inverting the condition.  */
352238fd1498Szrj 	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
352338fd1498Szrj 	      enum tree_code invert_code = ERROR_MARK;
352438fd1498Szrj 	      enum tree_code cond_code = TREE_CODE (cond_expr);
352538fd1498Szrj
352638fd1498Szrj 	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
352738fd1498Szrj 		{
352838fd1498Szrj 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
352938fd1498Szrj 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
353038fd1498Szrj 		}
353138fd1498Szrj 	      if (invert_code != ERROR_MARK)
353238fd1498Szrj 		{
353338fd1498Szrj 		  TREE_SET_CODE (cond_expr, invert_code);
353438fd1498Szrj 		  swap_ssa_operands (def_stmt,
353538fd1498Szrj 				     gimple_assign_rhs2_ptr (def_stmt),
353638fd1498Szrj 				     gimple_assign_rhs3_ptr (def_stmt));
353738fd1498Szrj 		}
353838fd1498Szrj 	      else
353938fd1498Szrj 		{
354038fd1498Szrj 		  if (dump_enabled_p ())
354138fd1498Szrj 		    report_vect_op (MSG_NOTE, def_stmt,
354238fd1498Szrj 				    "detected reduction: cannot swap operands "
354338fd1498Szrj 				    "for cond_expr");
354438fd1498Szrj 		  return NULL;
354538fd1498Szrj 		}
354638fd1498Szrj 	    }
354738fd1498Szrj 	  else
354838fd1498Szrj 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
354938fd1498Szrj 			       gimple_assign_rhs2_ptr (def_stmt));
355038fd1498Szrj
355138fd1498Szrj 	  if (dump_enabled_p ())
355238fd1498Szrj 	    report_vect_op (MSG_NOTE, def_stmt,
355338fd1498Szrj 			    "detected reduction: need to swap operands: ");
355438fd1498Szrj
355538fd1498Szrj 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
355638fd1498Szrj 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
355738fd1498Szrj         }
355838fd1498Szrj       else
355938fd1498Szrj         {
356038fd1498Szrj           if (dump_enabled_p ())
356138fd1498Szrj             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
356238fd1498Szrj         }
356338fd1498Szrj
356438fd1498Szrj       return def_stmt;
356538fd1498Szrj     }
356638fd1498Szrj
356738fd1498Szrj   /* Try to find SLP reduction chain.  */
356838fd1498Szrj   if (! nested_in_vect_loop
356938fd1498Szrj       && code != COND_EXPR
357038fd1498Szrj       && orig_code != MINUS_EXPR
357138fd1498Szrj       && vect_is_slp_reduction (loop_info, phi, def_stmt))
357238fd1498Szrj     {
357338fd1498Szrj       if (dump_enabled_p ())
357438fd1498Szrj         report_vect_op (MSG_NOTE, def_stmt,
357538fd1498Szrj 			"reduction: detected reduction chain: ");
357638fd1498Szrj
357738fd1498Szrj       return def_stmt;
357838fd1498Szrj     }
357938fd1498Szrj
358038fd1498Szrj   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
358138fd1498Szrj   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
358238fd1498Szrj   while (first)
358338fd1498Szrj     {
358438fd1498Szrj       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
358538fd1498Szrj       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
358638fd1498Szrj       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
358738fd1498Szrj       first = next;
358838fd1498Szrj     }
358938fd1498Szrj
359038fd1498Szrj   /* Look for the expression computing loop_arg from loop PHI result.  */
359138fd1498Szrj   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
359238fd1498Szrj 			    code))
359338fd1498Szrj     return def_stmt;
359438fd1498Szrj
359538fd1498Szrj   if (dump_enabled_p ())
359638fd1498Szrj     {
359738fd1498Szrj       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
359838fd1498Szrj 		      "reduction: unknown pattern: ");
359938fd1498Szrj     }
360038fd1498Szrj
360138fd1498Szrj   return NULL;
360238fd1498Szrj }
360338fd1498Szrj
360438fd1498Szrj /* Wrapper around vect_is_simple_reduction, which will modify code
360538fd1498Szrj in-place if it enables detection of more reductions. Arguments
360638fd1498Szrj as there. */
360738fd1498Szrj
360838fd1498Szrj gimple *
vect_force_simple_reduction(loop_vec_info loop_info,gimple * phi,bool * double_reduc,bool need_wrapping_integral_overflow)360938fd1498Szrj vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
361038fd1498Szrj bool *double_reduc,
361138fd1498Szrj bool need_wrapping_integral_overflow)
361238fd1498Szrj {
361338fd1498Szrj enum vect_reduction_type v_reduc_type;
361438fd1498Szrj gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
361538fd1498Szrj need_wrapping_integral_overflow,
361638fd1498Szrj &v_reduc_type);
361738fd1498Szrj if (def)
361838fd1498Szrj {
361938fd1498Szrj stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
362038fd1498Szrj STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
362138fd1498Szrj STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
362238fd1498Szrj reduc_def_info = vinfo_for_stmt (def);
362338fd1498Szrj STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
362438fd1498Szrj STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
362538fd1498Szrj }
362638fd1498Szrj return def;
362738fd1498Szrj }
362838fd1498Szrj
362938fd1498Szrj /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
363038fd1498Szrj int
vect_get_known_peeling_cost(loop_vec_info loop_vinfo,int peel_iters_prologue,int * peel_iters_epilogue,stmt_vector_for_cost * scalar_cost_vec,stmt_vector_for_cost * prologue_cost_vec,stmt_vector_for_cost * epilogue_cost_vec)363138fd1498Szrj vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
363238fd1498Szrj int *peel_iters_epilogue,
363338fd1498Szrj stmt_vector_for_cost *scalar_cost_vec,
363438fd1498Szrj stmt_vector_for_cost *prologue_cost_vec,
363538fd1498Szrj stmt_vector_for_cost *epilogue_cost_vec)
363638fd1498Szrj {
363738fd1498Szrj int retval = 0;
363838fd1498Szrj int assumed_vf = vect_vf_for_cost (loop_vinfo);
363938fd1498Szrj
364038fd1498Szrj if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
364138fd1498Szrj {
364238fd1498Szrj *peel_iters_epilogue = assumed_vf / 2;
364338fd1498Szrj if (dump_enabled_p ())
364438fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
364538fd1498Szrj "cost model: epilogue peel iters set to vf/2 "
364638fd1498Szrj "because loop iterations are unknown .\n");
364738fd1498Szrj
364838fd1498Szrj /* If peeled iterations are known but number of scalar loop
364938fd1498Szrj iterations are unknown, count a taken branch per peeled loop. */
365038fd1498Szrj retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
365138fd1498Szrj NULL, 0, vect_prologue);
365238fd1498Szrj retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
365338fd1498Szrj NULL, 0, vect_epilogue);
365438fd1498Szrj }
365538fd1498Szrj else
365638fd1498Szrj {
365738fd1498Szrj int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
365838fd1498Szrj peel_iters_prologue = niters < peel_iters_prologue ?
365938fd1498Szrj niters : peel_iters_prologue;
366038fd1498Szrj *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
366138fd1498Szrj /* If we need to peel for gaps, but no peeling is required, we have to
366238fd1498Szrj peel VF iterations. */
366338fd1498Szrj if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
366438fd1498Szrj *peel_iters_epilogue = assumed_vf;
366538fd1498Szrj }
366638fd1498Szrj
366738fd1498Szrj stmt_info_for_cost *si;
366838fd1498Szrj int j;
366938fd1498Szrj if (peel_iters_prologue)
367038fd1498Szrj FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
367138fd1498Szrj {
367238fd1498Szrj stmt_vec_info stmt_info
367338fd1498Szrj = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
367438fd1498Szrj retval += record_stmt_cost (prologue_cost_vec,
367538fd1498Szrj si->count * peel_iters_prologue,
367638fd1498Szrj si->kind, stmt_info, si->misalign,
367738fd1498Szrj vect_prologue);
367838fd1498Szrj }
367938fd1498Szrj if (*peel_iters_epilogue)
368038fd1498Szrj FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
368138fd1498Szrj {
368238fd1498Szrj stmt_vec_info stmt_info
368338fd1498Szrj = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
368438fd1498Szrj retval += record_stmt_cost (epilogue_cost_vec,
368538fd1498Szrj si->count * *peel_iters_epilogue,
368638fd1498Szrj si->kind, stmt_info, si->misalign,
368738fd1498Szrj vect_epilogue);
368838fd1498Szrj }
368938fd1498Szrj
369038fd1498Szrj return retval;
369138fd1498Szrj }
369238fd1498Szrj
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
   of iterations for vectorization.  -1 value means loop vectorization
   is not profitable.  This returned value may be used for dynamic
   profitability check.

   *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
   for static check against estimated number of iterations.  */

static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
				    int *ret_min_profitable_niters,
				    int *ret_min_profitable_estimate)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  /* Vectorization factor used for costing.  */
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  /* Prologue peel count for alignment; negative means it is not known
     at compile time (see the npeel < 0 branch below).  */
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      if (len)
	/* Count LEN - 1 ANDs and LEN comparisons.  */
	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
			      NULL, 0, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      if (len)
	{
	  /* Count LEN - 1 ANDs and LEN comparisons.  */
	  unsigned int nstmts = len * 2 - 1;
	  /* +1 for each bias that needs adding.  */
	  for (unsigned int i = 0; i < len; ++i)
	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
	      nstmts += 1;
	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
				NULL, 0, vect_prologue);
	}
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning niters.\n");
    }

  /* Any versioning at all implies one taken branch on the version test.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
			  vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost
    = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      peel_iters_prologue = 0;
      peel_iters_epilogue = 0;

      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	{
	  /* We need to peel exactly one iteration.  Charge one scalar
	     iteration's worth of statements to the epilogue.  */
	  peel_iters_epilogue += 1;
	  stmt_info_for_cost *si;
	  int j;
	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    j, si)
	    {
	      struct _stmt_vec_info *stmt_info
		= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	      (void) add_stmt_cost (target_cost_data, si->count,
				    si->kind, stmt_info, si->misalign,
				    vect_epilogue);
	    }
	}
    }
  else if (npeel < 0)
    {
      peel_iters_prologue = assumed_vf / 2;
      dump_printf (MSG_NOTE, "cost model: "
		   "prologue peel iters set to vf/2.\n");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
	 unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
      dump_printf (MSG_NOTE, "cost model: "
		   "epilogue peel iters set to vf/2 because "
		   "peeling for alignment is unknown.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
	 branch per peeled loop.  Even if scalar loop iterations are known,
	 vector iterations are not known since peeled prologue iterations are
	 not known.  Hence guards remain the same.  */
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			    NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
			    NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			    NULL, 0, vect_epilogue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
			    NULL, 0, vect_epilogue);
      stmt_info_for_cost *si;
      int j;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (target_cost_data,
				si->count * peel_iters_prologue,
				si->kind, stmt_info, si->misalign,
				vect_prologue);
	  (void) add_stmt_cost (target_cost_data,
				si->count * peel_iters_epilogue,
				si->kind, stmt_info, si->misalign,
				vect_epilogue);
	}
    }
  else
    {
      /* Peel counts are known; let vect_get_known_peeling_cost fill the
	 two cost vectors and then transfer them to the target cost data.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      stmt_info_for_cost *si;
      int j;
      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);
      peel_iters_prologue = npeel;

      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
					  &peel_iters_epilogue,
					  &LOOP_VINFO_SCALAR_ITERATION_COST
					    (loop_vinfo),
					  &prologue_cost_vec,
					  &epilogue_cost_vec);

      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_prologue);
	}

      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_epilogue);
	}

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
	 jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
	 prologue = scalar_iters
       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit
       vector code:
	 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
	   jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
	{
	  /* Cost model check occurs at prologue generation.  */
	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
	      + vect_get_stmt_cost (cond_branch_not_taken);
	  /* Cost model check occurs at epilogue generation.  */
	  else
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
	}
    }

  /* Complete the target-specific cost calculations.  */
  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  /* Outside cost is everything emitted before and after the vector body.  */
  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
		   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
		   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
		   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
		   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
		   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
		   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
		   peel_iters_epilogue);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
     SOC = scalar outside cost for run time cost model check.  */

  if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
    {
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
			      * assumed_vf
			      - vec_inside_cost * peel_iters_prologue
			      - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
	min_profitable_iters = 0;
      else
	{
	  min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
				   - vec_inside_cost);

	  /* The integer division truncated; if the truncated value does
	     not yet satisfy the profitability inequality, one more
	     iteration is needed.  */
	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
	      <= (((int) vec_inside_cost * min_profitable_iters)
		  + (((int) vec_outside_cost - scalar_outside_cost)
		     * assumed_vf)))
	    min_profitable_iters++;
	}
    }
  /* vector version will never be profitable.  */
  else
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
		    "did not happen for a simd loop");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cost model: the vector iteration cost = %d "
			 "divided by the scalar iteration cost = %d "
			 "is greater or equal to the vectorization factor = %d"
			 ".\n",
			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  dump_printf (MSG_NOTE,
	       "  Calculated minimum iters for profitability: %d\n",
	       min_profitable_iters);

  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Runtime profitability threshold = %d\n",
		     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
				 * assumed_vf
				 - vec_inside_cost * peel_iters_prologue
				 - vec_inside_cost * peel_iters_epilogue)
				 / ((scalar_single_iter_cost * assumed_vf)
				   - vec_inside_cost);
    }
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Static estimate profitability threshold = %d\n",
		     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
410138fd1498Szrj
410238fd1498Szrj /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
410338fd1498Szrj vector elements (not bits) for a vector with NELT elements. */
410438fd1498Szrj static void
calc_vec_perm_mask_for_shift(unsigned int offset,unsigned int nelt,vec_perm_builder * sel)410538fd1498Szrj calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
410638fd1498Szrj vec_perm_builder *sel)
410738fd1498Szrj {
410838fd1498Szrj /* The encoding is a single stepped pattern. Any wrap-around is handled
410938fd1498Szrj by vec_perm_indices. */
411038fd1498Szrj sel->new_vector (nelt, 1, 3);
411138fd1498Szrj for (unsigned int i = 0; i < 3; i++)
411238fd1498Szrj sel->quick_push (i + offset);
411338fd1498Szrj }
411438fd1498Szrj
411538fd1498Szrj /* Checks whether the target supports whole-vector shifts for vectors of mode
411638fd1498Szrj MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
411738fd1498Szrj it supports vec_perm_const with masks for all necessary shift amounts. */
411838fd1498Szrj static bool
have_whole_vector_shift(machine_mode mode)411938fd1498Szrj have_whole_vector_shift (machine_mode mode)
412038fd1498Szrj {
412138fd1498Szrj if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
412238fd1498Szrj return true;
412338fd1498Szrj
412438fd1498Szrj /* Variable-length vectors should be handled via the optab. */
412538fd1498Szrj unsigned int nelt;
412638fd1498Szrj if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
412738fd1498Szrj return false;
412838fd1498Szrj
412938fd1498Szrj vec_perm_builder sel;
413038fd1498Szrj vec_perm_indices indices;
413138fd1498Szrj for (unsigned int i = nelt / 2; i >= 1; i /= 2)
413238fd1498Szrj {
413338fd1498Szrj calc_vec_perm_mask_for_shift (i, nelt, &sel);
413438fd1498Szrj indices.new_vector (sel, 2, nelt);
413538fd1498Szrj if (!can_vec_perm_const_p (mode, indices, false))
413638fd1498Szrj return false;
413738fd1498Szrj }
413838fd1498Szrj return true;
413938fd1498Szrj }
414038fd1498Szrj
414138fd1498Szrj /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
414238fd1498Szrj functions. Design better to avoid maintenance issues. */
414338fd1498Szrj
414438fd1498Szrj /* Function vect_model_reduction_cost.
414538fd1498Szrj
414638fd1498Szrj Models cost for a reduction operation, including the vector ops
414738fd1498Szrj generated within the strip-mine loop, the initial definition before
414838fd1498Szrj the loop, and the epilogue code that must be generated. */
414938fd1498Szrj
415038fd1498Szrj static void
vect_model_reduction_cost(stmt_vec_info stmt_info,internal_fn reduc_fn,int ncopies)415138fd1498Szrj vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
415238fd1498Szrj int ncopies)
415338fd1498Szrj {
415438fd1498Szrj int prologue_cost = 0, epilogue_cost = 0, inside_cost;
415538fd1498Szrj enum tree_code code;
415638fd1498Szrj optab optab;
415738fd1498Szrj tree vectype;
415838fd1498Szrj gimple *orig_stmt;
415938fd1498Szrj machine_mode mode;
416038fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
416138fd1498Szrj struct loop *loop = NULL;
416238fd1498Szrj void *target_cost_data;
416338fd1498Szrj
416438fd1498Szrj if (loop_vinfo)
416538fd1498Szrj {
416638fd1498Szrj loop = LOOP_VINFO_LOOP (loop_vinfo);
416738fd1498Szrj target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
416838fd1498Szrj }
416938fd1498Szrj else
417038fd1498Szrj target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
417138fd1498Szrj
417238fd1498Szrj /* Condition reductions generate two reductions in the loop. */
417338fd1498Szrj vect_reduction_type reduction_type
417438fd1498Szrj = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
417538fd1498Szrj if (reduction_type == COND_REDUCTION)
417638fd1498Szrj ncopies *= 2;
417738fd1498Szrj
417838fd1498Szrj vectype = STMT_VINFO_VECTYPE (stmt_info);
417938fd1498Szrj mode = TYPE_MODE (vectype);
418038fd1498Szrj orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
418138fd1498Szrj
418238fd1498Szrj if (!orig_stmt)
418338fd1498Szrj orig_stmt = STMT_VINFO_STMT (stmt_info);
418438fd1498Szrj
418538fd1498Szrj code = gimple_assign_rhs_code (orig_stmt);
418638fd1498Szrj
418738fd1498Szrj if (reduction_type == EXTRACT_LAST_REDUCTION
418838fd1498Szrj || reduction_type == FOLD_LEFT_REDUCTION)
418938fd1498Szrj {
419038fd1498Szrj /* No extra instructions needed in the prologue. */
419138fd1498Szrj prologue_cost = 0;
419238fd1498Szrj
419338fd1498Szrj if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
419438fd1498Szrj /* Count one reduction-like operation per vector. */
419538fd1498Szrj inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
419638fd1498Szrj stmt_info, 0, vect_body);
419738fd1498Szrj else
419838fd1498Szrj {
419938fd1498Szrj /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
420038fd1498Szrj unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
420138fd1498Szrj inside_cost = add_stmt_cost (target_cost_data, nelements,
420238fd1498Szrj vec_to_scalar, stmt_info, 0,
420338fd1498Szrj vect_body);
420438fd1498Szrj inside_cost += add_stmt_cost (target_cost_data, nelements,
420538fd1498Szrj scalar_stmt, stmt_info, 0,
420638fd1498Szrj vect_body);
420738fd1498Szrj }
420838fd1498Szrj }
420938fd1498Szrj else
421038fd1498Szrj {
421138fd1498Szrj /* Add in cost for initial definition.
421238fd1498Szrj For cond reduction we have four vectors: initial index, step,
421338fd1498Szrj initial result of the data reduction, initial value of the index
421438fd1498Szrj reduction. */
421538fd1498Szrj int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
421638fd1498Szrj prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
421738fd1498Szrj scalar_to_vec, stmt_info, 0,
421838fd1498Szrj vect_prologue);
421938fd1498Szrj
422038fd1498Szrj /* Cost of reduction op inside loop. */
422138fd1498Szrj inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
422238fd1498Szrj stmt_info, 0, vect_body);
422338fd1498Szrj }
422438fd1498Szrj
422538fd1498Szrj /* Determine cost of epilogue code.
422638fd1498Szrj
422738fd1498Szrj We have a reduction operator that will reduce the vector in one statement.
422838fd1498Szrj Also requires scalar extract. */
422938fd1498Szrj
423038fd1498Szrj if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
423138fd1498Szrj {
423238fd1498Szrj if (reduc_fn != IFN_LAST)
423338fd1498Szrj {
423438fd1498Szrj if (reduction_type == COND_REDUCTION)
423538fd1498Szrj {
423638fd1498Szrj /* An EQ stmt and an COND_EXPR stmt. */
423738fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data, 2,
423838fd1498Szrj vector_stmt, stmt_info, 0,
423938fd1498Szrj vect_epilogue);
424038fd1498Szrj /* Reduction of the max index and a reduction of the found
424138fd1498Szrj values. */
424238fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data, 2,
424338fd1498Szrj vec_to_scalar, stmt_info, 0,
424438fd1498Szrj vect_epilogue);
424538fd1498Szrj /* A broadcast of the max value. */
424638fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data, 1,
424738fd1498Szrj scalar_to_vec, stmt_info, 0,
424838fd1498Szrj vect_epilogue);
424938fd1498Szrj }
425038fd1498Szrj else
425138fd1498Szrj {
425238fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
425338fd1498Szrj stmt_info, 0, vect_epilogue);
425438fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data, 1,
425538fd1498Szrj vec_to_scalar, stmt_info, 0,
425638fd1498Szrj vect_epilogue);
425738fd1498Szrj }
425838fd1498Szrj }
425938fd1498Szrj else if (reduction_type == COND_REDUCTION)
426038fd1498Szrj {
426138fd1498Szrj unsigned estimated_nunits = vect_nunits_for_cost (vectype);
426238fd1498Szrj /* Extraction of scalar elements. */
426338fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data,
426438fd1498Szrj 2 * estimated_nunits,
426538fd1498Szrj vec_to_scalar, stmt_info, 0,
426638fd1498Szrj vect_epilogue);
426738fd1498Szrj /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
426838fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data,
426938fd1498Szrj 2 * estimated_nunits - 3,
427038fd1498Szrj scalar_stmt, stmt_info, 0,
427138fd1498Szrj vect_epilogue);
427238fd1498Szrj }
427338fd1498Szrj else if (reduction_type == EXTRACT_LAST_REDUCTION
427438fd1498Szrj || reduction_type == FOLD_LEFT_REDUCTION)
427538fd1498Szrj /* No extra instructions need in the epilogue. */
427638fd1498Szrj ;
427738fd1498Szrj else
427838fd1498Szrj {
427938fd1498Szrj int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
428038fd1498Szrj tree bitsize =
428138fd1498Szrj TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
428238fd1498Szrj int element_bitsize = tree_to_uhwi (bitsize);
428338fd1498Szrj int nelements = vec_size_in_bits / element_bitsize;
428438fd1498Szrj
428538fd1498Szrj if (code == COND_EXPR)
428638fd1498Szrj code = MAX_EXPR;
428738fd1498Szrj
428838fd1498Szrj optab = optab_for_tree_code (code, vectype, optab_default);
428938fd1498Szrj
429038fd1498Szrj /* We have a whole vector shift available. */
429138fd1498Szrj if (optab != unknown_optab
429238fd1498Szrj && VECTOR_MODE_P (mode)
429338fd1498Szrj && optab_handler (optab, mode) != CODE_FOR_nothing
429438fd1498Szrj && have_whole_vector_shift (mode))
429538fd1498Szrj {
429638fd1498Szrj /* Final reduction via vector shifts and the reduction operator.
429738fd1498Szrj Also requires scalar extract. */
429838fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data,
429938fd1498Szrj exact_log2 (nelements) * 2,
430038fd1498Szrj vector_stmt, stmt_info, 0,
430138fd1498Szrj vect_epilogue);
430238fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data, 1,
430338fd1498Szrj vec_to_scalar, stmt_info, 0,
430438fd1498Szrj vect_epilogue);
430538fd1498Szrj }
430638fd1498Szrj else
430738fd1498Szrj /* Use extracts and reduction op for final reduction. For N
430838fd1498Szrj elements, we have N extracts and N-1 reduction ops. */
430938fd1498Szrj epilogue_cost += add_stmt_cost (target_cost_data,
431038fd1498Szrj nelements + nelements - 1,
431138fd1498Szrj vector_stmt, stmt_info, 0,
431238fd1498Szrj vect_epilogue);
431338fd1498Szrj }
431438fd1498Szrj }
431538fd1498Szrj
431638fd1498Szrj if (dump_enabled_p ())
431738fd1498Szrj dump_printf (MSG_NOTE,
431838fd1498Szrj "vect_model_reduction_cost: inside_cost = %d, "
431938fd1498Szrj "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
432038fd1498Szrj prologue_cost, epilogue_cost);
432138fd1498Szrj }
432238fd1498Szrj
432338fd1498Szrj
432438fd1498Szrj /* Function vect_model_induction_cost.
432538fd1498Szrj
432638fd1498Szrj Models cost for induction operations. */
432738fd1498Szrj
432838fd1498Szrj static void
vect_model_induction_cost(stmt_vec_info stmt_info,int ncopies)432938fd1498Szrj vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
433038fd1498Szrj {
433138fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
433238fd1498Szrj void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
433338fd1498Szrj unsigned inside_cost, prologue_cost;
433438fd1498Szrj
433538fd1498Szrj if (PURE_SLP_STMT (stmt_info))
433638fd1498Szrj return;
433738fd1498Szrj
433838fd1498Szrj /* loop cost for vec_loop. */
433938fd1498Szrj inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
434038fd1498Szrj stmt_info, 0, vect_body);
434138fd1498Szrj
434238fd1498Szrj /* prologue cost for vec_init and vec_step. */
434338fd1498Szrj prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
434438fd1498Szrj stmt_info, 0, vect_prologue);
434538fd1498Szrj
434638fd1498Szrj if (dump_enabled_p ())
434738fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
434838fd1498Szrj "vect_model_induction_cost: inside_cost = %d, "
434938fd1498Szrj "prologue_cost = %d .\n", inside_cost, prologue_cost);
435038fd1498Szrj }
435138fd1498Szrj
435238fd1498Szrj
435338fd1498Szrj
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
	of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
	performs. This vector will be used as the initial value of the
	vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add/bit or/xor:    [0,0,...,0,0]
     mult/bit and:      [1,1,...,1,1]
     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add/bit or/xor:    [init_val,0,0,...,0]
     mult/bit and:      [init_val,1,1,...,1]
     min/max/cond_expr: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries), if
   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.

   A cost model should help decide between these two schemes.  */

tree
get_initial_def_for_reduction (gimple *stmt, tree init_val,
			       tree *adjustment_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree def_for_init;
  tree init_def;
  bool nested_in_vect_loop = false;
  REAL_VALUE_TYPE real_init_val = dconst0;
  int int_init_val = 0;
  gimple *def_stmt = NULL;
  /* Statements built here are emitted on the loop preheader edge below.  */
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  /* Only pointer, integral and scalar float reductions are handled.  */
  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
	      || SCALAR_FLOAT_TYPE_P (scalar_type));

  if (nested_in_vect_loop_p (loop, stmt))
    nested_in_vect_loop = true;
  else
    gcc_assert (loop == (gimple_bb (stmt))->loop_father);

  /* In case of double reduction we only create a vector variable to be put
     in the reduction phi node.  The actual statement creation is done in
     vect_create_epilog_for_reduction.  */
  if (adjustment_def && nested_in_vect_loop
      && TREE_CODE (init_val) == SSA_NAME
      && (def_stmt = SSA_NAME_DEF_STMT (init_val))
      && gimple_code (def_stmt) == GIMPLE_PHI
      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
      && vinfo_for_stmt (def_stmt)
      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
	  == vect_double_reduction_def)
    {
      *adjustment_def = NULL;
      return vect_create_destination_var (init_val, vectype);
    }

  vect_reduction_type reduction_type
    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);

  /* In case of a nested reduction do not use an adjustment def as
     that case is not supported by the epilogue generation correctly
     if ncopies is not one.  */
  if (adjustment_def && nested_in_vect_loop)
    {
      *adjustment_def = NULL;
      return vect_get_vec_def_for_operand (init_val, stmt);
    }

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case SAD_EXPR:
    case PLUS_EXPR:
    case MINUS_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case MULT_EXPR:
    case BIT_AND_EXPR:
      {
        /* ADJUSTMENT_DEF is NULL when called from
           vect_create_epilog_for_reduction to vectorize double reduction.  */
        if (adjustment_def)
	  *adjustment_def = init_val;

	/* Identity for multiplication is 1 (dconst1 for floats).  */
        if (code == MULT_EXPR)
          {
            real_init_val = dconst1;
            int_init_val = 1;
          }

	/* Identity for bitwise AND is all-ones.  */
        if (code == BIT_AND_EXPR)
          int_init_val = -1;

        if (SCALAR_FLOAT_TYPE_P (scalar_type))
          def_for_init = build_real (scalar_type, real_init_val);
        else
          def_for_init = build_int_cst (scalar_type, int_init_val);

	if (adjustment_def)
	  /* Option1: the first element is '0' or '1' as well.  */
	  init_def = gimple_build_vector_from_val (&stmts, vectype,
						   def_for_init);
	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
	  {
	    /* Option2 (variable length): the first element is INIT_VAL.
	       Build a vector of the identity value and shift INIT_VAL
	       into its first lane via IFN_VEC_SHL_INSERT.  */
	    init_def = build_vector_from_val (vectype, def_for_init);
	    gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
						      2, init_def, init_val);
	    init_def = make_ssa_name (vectype);
	    gimple_call_set_lhs (call, init_def);
	    gimple_seq_add_stmt (&stmts, call);
	  }
	else
	  {
	    /* Option2: the first element is INIT_VAL.  The builder pattern
	       (1, 2) replicates { init_val, def_for_init } with the
	       identity filling all remaining lanes.  */
	    tree_vector_builder elts (vectype, 1, 2);
	    elts.quick_push (init_val);
	    elts.quick_push (def_for_init);
	    init_def = gimple_build_vector (&stmts, &elts);
	  }
      }
      break;

    case MIN_EXPR:
    case MAX_EXPR:
    case COND_EXPR:
      {
	if (adjustment_def)
          {
	    /* min/max/cond reductions never use an adjustment; splat
	       INIT_VAL unless the (non-COND) epilogue path handles it.  */
	    *adjustment_def = NULL_TREE;
	    if (reduction_type != COND_REDUCTION
		&& reduction_type != EXTRACT_LAST_REDUCTION)
	      {
		init_def = vect_get_vec_def_for_operand (init_val, stmt);
		break;
	      }
	  }
	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
      }
      break;

    default:
      gcc_unreachable ();
    }

  /* Emit any statements built above on the preheader edge so that the
     initial def dominates the loop.  */
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
  return init_def;
}
453638fd1498Szrj
/* Get at the initial defs for the reduction PHIs in SLP_NODE.
   NUMBER_OF_VECTORS is the number of vector defs to create.
   If NEUTRAL_OP is nonnull, introducing extra elements of that
   value will not change the result.

   The initial values of the SLP group's reduction variables are packed
   into NUMBER_OF_VECTORS vectors, which are pushed onto VEC_OPRNDS.
   For a reduction chain (REDUC_CHAIN) only the first scalar keeps its
   real initial value; all other lanes get NEUTRAL_OP.  */

static void
get_initial_defs_for_reduction (slp_tree slp_node,
				vec<tree> *vec_oprnds,
				unsigned int number_of_vectors,
				bool reduc_chain, tree neutral_op)
{
  vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple *stmt = stmts[0];
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = stmts.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  vec<tree> voprnds;
  voprnds.create (number_of_vectors);
  struct loop *loop;
  auto_vec<tree, 16> permute_results;

  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);

  loop = (gimple_bb (stmt))->loop_father;
  gcc_assert (loop);
  /* Initial defs are materialized on the preheader edge.  */
  edge pe = loop_preheader_edge (loop);

  /* A reduction chain requires a neutral value for the non-initial lanes.  */
  gcc_assert (!reduc_chain || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors. It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* For variable-length vectors, treat GROUP_SIZE as the builder's
     element count.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  /* Fill vectors lane-by-lane, walking the scalars in reverse so that
     lane order comes out right; each full vector is flushed below.  */
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
        {
          tree op;
          /* Get the def before the loop.  In reduction chain we have only
	     one initial value.  */
          if ((j != (number_of_copies - 1)
	       || (reduc_chain && i != 0))
	      && neutral_op)
	    op = neutral_op;
	  else
	    op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;

          if (number_of_places_left_in_vector == 0)
            {
	      /* The vector is full: materialize it by one of three
		 strategies depending on vector length and NEUTRAL_OP.  */
	      gimple_seq ctor_seq = NULL;
	      tree init;
	      if (constant_p && !neutral_op
		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
		/* Build the vector directly from ELTS.  */
		init = gimple_build_vector (&ctor_seq, &elts);
	      else if (neutral_op)
		{
		  /* Build a vector of the neutral value and shift the
		     other elements into place.  */
		  init = gimple_build_vector_from_val (&ctor_seq, vector_type,
						       neutral_op);
		  int k = nunits;
		  /* Trailing neutral lanes are already in place; skip them.  */
		  while (k > 0 && elts[k - 1] == neutral_op)
		    k -= 1;
		  while (k > 0)
		    {
		      k -= 1;
		      gcall *call = gimple_build_call_internal
			(IFN_VEC_SHL_INSERT, 2, init, elts[k]);
		      init = make_ssa_name (vector_type);
		      gimple_call_set_lhs (call, init);
		      gimple_seq_add_stmt (&ctor_seq, call);
		    }
		}
	      else
		{
		  /* First time round, duplicate ELTS to fill the
		     required number of vectors, then cherry pick the
		     appropriate result for each iteration.  */
		  if (vec_oprnds->is_empty ())
		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
					      number_of_vectors,
					      permute_results);
		  init = permute_results[number_of_vectors - j - 1];
		}
	      if (ctor_seq != NULL)
		gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
	      voprnds.quick_push (init);

              number_of_places_left_in_vector = nunits;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	      constant_p = true;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      vec_oprnds->quick_push (vop);
    }

  voprnds.release ();

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  tree neutral_vec = NULL;
  while (number_of_vectors > vec_oprnds->length ())
    {
      if (neutral_op)
        {
	  /* Pad with a (lazily built, cached) splat of the neutral value.  */
          if (!neutral_vec)
	    {
	      gimple_seq ctor_seq = NULL;
	      neutral_vec = gimple_build_vector_from_val
		(&ctor_seq, vector_type, neutral_op);
	      if (ctor_seq != NULL)
		gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
	    }
          vec_oprnds->quick_push (neutral_vec);
        }
      else
        {
	  /* No neutral value: repeat the vectors already created.  */
          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
            vec_oprnds->quick_push (vop);
        }
    }
}
470638fd1498Szrj
470738fd1498Szrj
470838fd1498Szrj /* Function vect_create_epilog_for_reduction
470938fd1498Szrj
471038fd1498Szrj Create code at the loop-epilog to finalize the result of a reduction
471138fd1498Szrj computation.
471238fd1498Szrj
471338fd1498Szrj VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
471438fd1498Szrj reduction statements.
471538fd1498Szrj STMT is the scalar reduction stmt that is being vectorized.
471638fd1498Szrj NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
471738fd1498Szrj number of elements that we can fit in a vectype (nunits). In this case
471838fd1498Szrj we have to generate more than one vector stmt - i.e - we need to "unroll"
471938fd1498Szrj the vector stmt by a factor VF/nunits. For more details see documentation
472038fd1498Szrj in vectorizable_operation.
472138fd1498Szrj REDUC_FN is the internal function for the epilog reduction.
472238fd1498Szrj REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
472338fd1498Szrj computation.
472438fd1498Szrj REDUC_INDEX is the index of the operand in the right hand side of the
472538fd1498Szrj statement that is defined by REDUCTION_PHI.
472638fd1498Szrj DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
472738fd1498Szrj SLP_NODE is an SLP node containing a group of reduction statements. The
472838fd1498Szrj first one in this group is STMT.
472938fd1498Szrj INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
473038fd1498Szrj when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
473138fd1498Szrj be smaller than any value of the IV in the loop, for MIN_EXPR larger than
473238fd1498Szrj any value of the IV in the loop.
473338fd1498Szrj INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
473438fd1498Szrj NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
473538fd1498Szrj null if this is not an SLP reduction
473638fd1498Szrj
473738fd1498Szrj This function:
473838fd1498Szrj 1. Creates the reduction def-use cycles: sets the arguments for
473938fd1498Szrj REDUCTION_PHIS:
474038fd1498Szrj The loop-entry argument is the vectorized initial-value of the reduction.
474138fd1498Szrj The loop-latch argument is taken from VECT_DEFS - the vector of partial
474238fd1498Szrj sums.
474338fd1498Szrj 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
474438fd1498Szrj by calling the function specified by REDUC_FN if available, or by
474538fd1498Szrj other means (whole-vector shifts or a scalar loop).
474638fd1498Szrj The function also creates a new phi node at the loop exit to preserve
474738fd1498Szrj loop-closed form, as illustrated below.
474838fd1498Szrj
474938fd1498Szrj The flow at the entry to this function:
475038fd1498Szrj
475138fd1498Szrj loop:
475238fd1498Szrj vec_def = phi <null, null> # REDUCTION_PHI
475338fd1498Szrj VECT_DEF = vector_stmt # vectorized form of STMT
475438fd1498Szrj s_loop = scalar_stmt # (scalar) STMT
475538fd1498Szrj loop_exit:
475638fd1498Szrj s_out0 = phi <s_loop> # (scalar) EXIT_PHI
475738fd1498Szrj use <s_out0>
475838fd1498Szrj use <s_out0>
475938fd1498Szrj
476038fd1498Szrj The above is transformed by this function into:
476138fd1498Szrj
476238fd1498Szrj loop:
476338fd1498Szrj vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
476438fd1498Szrj VECT_DEF = vector_stmt # vectorized form of STMT
476538fd1498Szrj s_loop = scalar_stmt # (scalar) STMT
476638fd1498Szrj loop_exit:
476738fd1498Szrj s_out0 = phi <s_loop> # (scalar) EXIT_PHI
476838fd1498Szrj v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
476938fd1498Szrj v_out2 = reduce <v_out1>
477038fd1498Szrj s_out3 = extract_field <v_out2, 0>
477138fd1498Szrj s_out4 = adjust_result <s_out3>
477238fd1498Szrj use <s_out4>
477338fd1498Szrj use <s_out4>
477438fd1498Szrj */
477538fd1498Szrj
477638fd1498Szrj static void
vect_create_epilog_for_reduction(vec<tree> vect_defs,gimple * stmt,gimple * reduc_def_stmt,int ncopies,internal_fn reduc_fn,vec<gimple * > reduction_phis,bool double_reduc,slp_tree slp_node,slp_instance slp_node_instance,tree induc_val,enum tree_code induc_code,tree neutral_op)477738fd1498Szrj vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
477838fd1498Szrj gimple *reduc_def_stmt,
477938fd1498Szrj int ncopies, internal_fn reduc_fn,
478038fd1498Szrj vec<gimple *> reduction_phis,
478138fd1498Szrj bool double_reduc,
478238fd1498Szrj slp_tree slp_node,
478338fd1498Szrj slp_instance slp_node_instance,
478438fd1498Szrj tree induc_val, enum tree_code induc_code,
478538fd1498Szrj tree neutral_op)
478638fd1498Szrj {
478738fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
478838fd1498Szrj stmt_vec_info prev_phi_info;
478938fd1498Szrj tree vectype;
479038fd1498Szrj machine_mode mode;
479138fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
479238fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
479338fd1498Szrj basic_block exit_bb;
479438fd1498Szrj tree scalar_dest;
479538fd1498Szrj tree scalar_type;
479638fd1498Szrj gimple *new_phi = NULL, *phi;
479738fd1498Szrj gimple_stmt_iterator exit_gsi;
479838fd1498Szrj tree vec_dest;
479938fd1498Szrj tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
480038fd1498Szrj gimple *epilog_stmt = NULL;
480138fd1498Szrj enum tree_code code = gimple_assign_rhs_code (stmt);
480238fd1498Szrj gimple *exit_phi;
480338fd1498Szrj tree bitsize;
480438fd1498Szrj tree adjustment_def = NULL;
480538fd1498Szrj tree vec_initial_def = NULL;
480638fd1498Szrj tree expr, def, initial_def = NULL;
480738fd1498Szrj tree orig_name, scalar_result;
480838fd1498Szrj imm_use_iterator imm_iter, phi_imm_iter;
480938fd1498Szrj use_operand_p use_p, phi_use_p;
481038fd1498Szrj gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
481138fd1498Szrj bool nested_in_vect_loop = false;
481238fd1498Szrj auto_vec<gimple *> new_phis;
481338fd1498Szrj auto_vec<gimple *> inner_phis;
481438fd1498Szrj enum vect_def_type dt = vect_unknown_def_type;
481538fd1498Szrj int j, i;
481638fd1498Szrj auto_vec<tree> scalar_results;
481738fd1498Szrj unsigned int group_size = 1, k, ratio;
481838fd1498Szrj auto_vec<tree> vec_initial_defs;
481938fd1498Szrj auto_vec<gimple *> phis;
482038fd1498Szrj bool slp_reduc = false;
482138fd1498Szrj bool direct_slp_reduc;
482238fd1498Szrj tree new_phi_result;
482338fd1498Szrj gimple *inner_phi = NULL;
482438fd1498Szrj tree induction_index = NULL_TREE;
482538fd1498Szrj
482638fd1498Szrj if (slp_node)
482738fd1498Szrj group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
482838fd1498Szrj
482938fd1498Szrj if (nested_in_vect_loop_p (loop, stmt))
483038fd1498Szrj {
483138fd1498Szrj outer_loop = loop;
483238fd1498Szrj loop = loop->inner;
483338fd1498Szrj nested_in_vect_loop = true;
483438fd1498Szrj gcc_assert (!slp_node);
483538fd1498Szrj }
483638fd1498Szrj
483738fd1498Szrj vectype = STMT_VINFO_VECTYPE (stmt_info);
483838fd1498Szrj gcc_assert (vectype);
483938fd1498Szrj mode = TYPE_MODE (vectype);
484038fd1498Szrj
484138fd1498Szrj /* 1. Create the reduction def-use cycle:
484238fd1498Szrj Set the arguments of REDUCTION_PHIS, i.e., transform
484338fd1498Szrj
484438fd1498Szrj loop:
484538fd1498Szrj vec_def = phi <null, null> # REDUCTION_PHI
484638fd1498Szrj VECT_DEF = vector_stmt # vectorized form of STMT
484738fd1498Szrj ...
484838fd1498Szrj
484938fd1498Szrj into:
485038fd1498Szrj
485138fd1498Szrj loop:
485238fd1498Szrj vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
485338fd1498Szrj VECT_DEF = vector_stmt # vectorized form of STMT
485438fd1498Szrj ...
485538fd1498Szrj
485638fd1498Szrj (in case of SLP, do it for all the phis). */
485738fd1498Szrj
485838fd1498Szrj /* Get the loop-entry arguments. */
485938fd1498Szrj enum vect_def_type initial_def_dt = vect_unknown_def_type;
486038fd1498Szrj if (slp_node)
486138fd1498Szrj {
486238fd1498Szrj unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
486338fd1498Szrj vec_initial_defs.reserve (vec_num);
486438fd1498Szrj get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
486538fd1498Szrj &vec_initial_defs, vec_num,
486638fd1498Szrj GROUP_FIRST_ELEMENT (stmt_info),
486738fd1498Szrj neutral_op);
486838fd1498Szrj }
486938fd1498Szrj else
487038fd1498Szrj {
487138fd1498Szrj /* Get at the scalar def before the loop, that defines the initial value
487238fd1498Szrj of the reduction variable. */
487338fd1498Szrj gimple *def_stmt;
487438fd1498Szrj initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
487538fd1498Szrj loop_preheader_edge (loop));
487638fd1498Szrj /* Optimize: if initial_def is for REDUC_MAX smaller than the base
487738fd1498Szrj and we can't use zero for induc_val, use initial_def. Similarly
487838fd1498Szrj for REDUC_MIN and initial_def larger than the base. */
487938fd1498Szrj if (TREE_CODE (initial_def) == INTEGER_CST
488038fd1498Szrj && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
488138fd1498Szrj == INTEGER_INDUC_COND_REDUCTION)
488238fd1498Szrj && !integer_zerop (induc_val)
488338fd1498Szrj && ((induc_code == MAX_EXPR
488438fd1498Szrj && tree_int_cst_lt (initial_def, induc_val))
488538fd1498Szrj || (induc_code == MIN_EXPR
488638fd1498Szrj && tree_int_cst_lt (induc_val, initial_def))))
488738fd1498Szrj induc_val = initial_def;
488838fd1498Szrj vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
488938fd1498Szrj vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
489038fd1498Szrj &adjustment_def);
489138fd1498Szrj vec_initial_defs.create (1);
489238fd1498Szrj vec_initial_defs.quick_push (vec_initial_def);
489338fd1498Szrj }
489438fd1498Szrj
489538fd1498Szrj /* Set phi nodes arguments. */
489638fd1498Szrj FOR_EACH_VEC_ELT (reduction_phis, i, phi)
489738fd1498Szrj {
489838fd1498Szrj tree vec_init_def = vec_initial_defs[i];
489938fd1498Szrj tree def = vect_defs[i];
490038fd1498Szrj for (j = 0; j < ncopies; j++)
490138fd1498Szrj {
490238fd1498Szrj if (j != 0)
490338fd1498Szrj {
490438fd1498Szrj phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
490538fd1498Szrj if (nested_in_vect_loop)
490638fd1498Szrj vec_init_def
490738fd1498Szrj = vect_get_vec_def_for_stmt_copy (initial_def_dt,
490838fd1498Szrj vec_init_def);
490938fd1498Szrj }
491038fd1498Szrj
491138fd1498Szrj /* Set the loop-entry arg of the reduction-phi. */
491238fd1498Szrj
491338fd1498Szrj if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
491438fd1498Szrj == INTEGER_INDUC_COND_REDUCTION)
491538fd1498Szrj {
491638fd1498Szrj /* Initialise the reduction phi to zero. This prevents initial
491738fd1498Szrj values of non-zero interferring with the reduction op. */
491838fd1498Szrj gcc_assert (ncopies == 1);
491938fd1498Szrj gcc_assert (i == 0);
492038fd1498Szrj
492138fd1498Szrj tree vec_init_def_type = TREE_TYPE (vec_init_def);
492238fd1498Szrj tree induc_val_vec
492338fd1498Szrj = build_vector_from_val (vec_init_def_type, induc_val);
492438fd1498Szrj
492538fd1498Szrj add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
492638fd1498Szrj loop_preheader_edge (loop), UNKNOWN_LOCATION);
492738fd1498Szrj }
492838fd1498Szrj else
492938fd1498Szrj add_phi_arg (as_a <gphi *> (phi), vec_init_def,
493038fd1498Szrj loop_preheader_edge (loop), UNKNOWN_LOCATION);
493138fd1498Szrj
493238fd1498Szrj /* Set the loop-latch arg for the reduction-phi. */
493338fd1498Szrj if (j > 0)
493438fd1498Szrj def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
493538fd1498Szrj
493638fd1498Szrj add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
493738fd1498Szrj UNKNOWN_LOCATION);
493838fd1498Szrj
493938fd1498Szrj if (dump_enabled_p ())
494038fd1498Szrj {
494138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
494238fd1498Szrj "transform reduction: created def-use cycle: ");
494338fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
494438fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
494538fd1498Szrj }
494638fd1498Szrj }
494738fd1498Szrj }
494838fd1498Szrj
494938fd1498Szrj /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
495038fd1498Szrj which is updated with the current index of the loop for every match of
495138fd1498Szrj the original loop's cond_expr (VEC_STMT). This results in a vector
495238fd1498Szrj containing the last time the condition passed for that vector lane.
495338fd1498Szrj The first match will be a 1 to allow 0 to be used for non-matching
495438fd1498Szrj indexes. If there are no matches at all then the vector will be all
495538fd1498Szrj zeroes. */
495638fd1498Szrj if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
495738fd1498Szrj {
495838fd1498Szrj tree indx_before_incr, indx_after_incr;
495938fd1498Szrj poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
496038fd1498Szrj
496138fd1498Szrj gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
496238fd1498Szrj gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
496338fd1498Szrj
496438fd1498Szrj int scalar_precision
496538fd1498Szrj = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
496638fd1498Szrj tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
496738fd1498Szrj tree cr_index_vector_type = build_vector_type
496838fd1498Szrj (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
496938fd1498Szrj
497038fd1498Szrj /* First we create a simple vector induction variable which starts
497138fd1498Szrj with the values {1,2,3,...} (SERIES_VECT) and increments by the
497238fd1498Szrj vector size (STEP). */
497338fd1498Szrj
497438fd1498Szrj /* Create a {1,2,3,...} vector. */
497538fd1498Szrj tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
497638fd1498Szrj
497738fd1498Szrj /* Create a vector of the step value. */
497838fd1498Szrj tree step = build_int_cst (cr_index_scalar_type, nunits_out);
497938fd1498Szrj tree vec_step = build_vector_from_val (cr_index_vector_type, step);
498038fd1498Szrj
498138fd1498Szrj /* Create an induction variable. */
498238fd1498Szrj gimple_stmt_iterator incr_gsi;
498338fd1498Szrj bool insert_after;
498438fd1498Szrj standard_iv_increment_position (loop, &incr_gsi, &insert_after);
498538fd1498Szrj create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
498638fd1498Szrj insert_after, &indx_before_incr, &indx_after_incr);
498738fd1498Szrj
498838fd1498Szrj /* Next create a new phi node vector (NEW_PHI_TREE) which starts
498938fd1498Szrj filled with zeros (VEC_ZERO). */
499038fd1498Szrj
499138fd1498Szrj /* Create a vector of 0s. */
499238fd1498Szrj tree zero = build_zero_cst (cr_index_scalar_type);
499338fd1498Szrj tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
499438fd1498Szrj
499538fd1498Szrj /* Create a vector phi node. */
499638fd1498Szrj tree new_phi_tree = make_ssa_name (cr_index_vector_type);
499738fd1498Szrj new_phi = create_phi_node (new_phi_tree, loop->header);
499838fd1498Szrj set_vinfo_for_stmt (new_phi,
499938fd1498Szrj new_stmt_vec_info (new_phi, loop_vinfo));
500038fd1498Szrj add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
500138fd1498Szrj loop_preheader_edge (loop), UNKNOWN_LOCATION);
500238fd1498Szrj
500338fd1498Szrj /* Now take the condition from the loops original cond_expr
500438fd1498Szrj (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
500538fd1498Szrj every match uses values from the induction variable
500638fd1498Szrj (INDEX_BEFORE_INCR) otherwise uses values from the phi node
500738fd1498Szrj (NEW_PHI_TREE).
500838fd1498Szrj Finally, we update the phi (NEW_PHI_TREE) to take the value of
500938fd1498Szrj the new cond_expr (INDEX_COND_EXPR). */
501038fd1498Szrj
501138fd1498Szrj /* Duplicate the condition from vec_stmt. */
501238fd1498Szrj tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
501338fd1498Szrj
501438fd1498Szrj /* Create a conditional, where the condition is taken from vec_stmt
501538fd1498Szrj (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
501638fd1498Szrj else is the phi (NEW_PHI_TREE). */
501738fd1498Szrj tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
501838fd1498Szrj ccompare, indx_before_incr,
501938fd1498Szrj new_phi_tree);
502038fd1498Szrj induction_index = make_ssa_name (cr_index_vector_type);
502138fd1498Szrj gimple *index_condition = gimple_build_assign (induction_index,
502238fd1498Szrj index_cond_expr);
502338fd1498Szrj gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
502438fd1498Szrj stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
502538fd1498Szrj loop_vinfo);
502638fd1498Szrj STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
502738fd1498Szrj set_vinfo_for_stmt (index_condition, index_vec_info);
502838fd1498Szrj
502938fd1498Szrj /* Update the phi with the vec cond. */
503038fd1498Szrj add_phi_arg (as_a <gphi *> (new_phi), induction_index,
503138fd1498Szrj loop_latch_edge (loop), UNKNOWN_LOCATION);
503238fd1498Szrj }
503338fd1498Szrj
503438fd1498Szrj /* 2. Create epilog code.
503538fd1498Szrj The reduction epilog code operates across the elements of the vector
503638fd1498Szrj of partial results computed by the vectorized loop.
503738fd1498Szrj The reduction epilog code consists of:
503838fd1498Szrj
503938fd1498Szrj step 1: compute the scalar result in a vector (v_out2)
504038fd1498Szrj step 2: extract the scalar result (s_out3) from the vector (v_out2)
504138fd1498Szrj step 3: adjust the scalar result (s_out3) if needed.
504238fd1498Szrj
504338fd1498Szrj Step 1 can be accomplished using one of the following three schemes:
504438fd1498Szrj (scheme 1) using reduc_fn, if available.
504538fd1498Szrj (scheme 2) using whole-vector shifts, if available.
504638fd1498Szrj (scheme 3) using a scalar loop. In this case steps 1+2 above are
504738fd1498Szrj combined.
504838fd1498Szrj
504938fd1498Szrj The overall epilog code looks like this:
505038fd1498Szrj
505138fd1498Szrj s_out0 = phi <s_loop> # original EXIT_PHI
505238fd1498Szrj v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
505338fd1498Szrj v_out2 = reduce <v_out1> # step 1
505438fd1498Szrj s_out3 = extract_field <v_out2, 0> # step 2
505538fd1498Szrj s_out4 = adjust_result <s_out3> # step 3
505638fd1498Szrj
505738fd1498Szrj (step 3 is optional, and steps 1 and 2 may be combined).
505838fd1498Szrj Lastly, the uses of s_out0 are replaced by s_out4. */
505938fd1498Szrj
506038fd1498Szrj
506138fd1498Szrj /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
506238fd1498Szrj v_out1 = phi <VECT_DEF>
506338fd1498Szrj Store them in NEW_PHIS. */
506438fd1498Szrj
506538fd1498Szrj exit_bb = single_exit (loop)->dest;
506638fd1498Szrj prev_phi_info = NULL;
506738fd1498Szrj new_phis.create (vect_defs.length ());
506838fd1498Szrj FOR_EACH_VEC_ELT (vect_defs, i, def)
506938fd1498Szrj {
507038fd1498Szrj for (j = 0; j < ncopies; j++)
507138fd1498Szrj {
507238fd1498Szrj tree new_def = copy_ssa_name (def);
507338fd1498Szrj phi = create_phi_node (new_def, exit_bb);
507438fd1498Szrj set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
507538fd1498Szrj if (j == 0)
507638fd1498Szrj new_phis.quick_push (phi);
507738fd1498Szrj else
507838fd1498Szrj {
507938fd1498Szrj def = vect_get_vec_def_for_stmt_copy (dt, def);
508038fd1498Szrj STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
508138fd1498Szrj }
508238fd1498Szrj
508338fd1498Szrj SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
508438fd1498Szrj prev_phi_info = vinfo_for_stmt (phi);
508538fd1498Szrj }
508638fd1498Szrj }
508738fd1498Szrj
508838fd1498Szrj /* The epilogue is created for the outer-loop, i.e., for the loop being
508938fd1498Szrj vectorized. Create exit phis for the outer loop. */
509038fd1498Szrj if (double_reduc)
509138fd1498Szrj {
509238fd1498Szrj loop = outer_loop;
509338fd1498Szrj exit_bb = single_exit (loop)->dest;
509438fd1498Szrj inner_phis.create (vect_defs.length ());
509538fd1498Szrj FOR_EACH_VEC_ELT (new_phis, i, phi)
509638fd1498Szrj {
509738fd1498Szrj tree new_result = copy_ssa_name (PHI_RESULT (phi));
509838fd1498Szrj gphi *outer_phi = create_phi_node (new_result, exit_bb);
509938fd1498Szrj SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
510038fd1498Szrj PHI_RESULT (phi));
510138fd1498Szrj set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
510238fd1498Szrj loop_vinfo));
510338fd1498Szrj inner_phis.quick_push (phi);
510438fd1498Szrj new_phis[i] = outer_phi;
510538fd1498Szrj prev_phi_info = vinfo_for_stmt (outer_phi);
510638fd1498Szrj while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
510738fd1498Szrj {
510838fd1498Szrj phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
510938fd1498Szrj new_result = copy_ssa_name (PHI_RESULT (phi));
511038fd1498Szrj outer_phi = create_phi_node (new_result, exit_bb);
511138fd1498Szrj SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
511238fd1498Szrj PHI_RESULT (phi));
511338fd1498Szrj set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
511438fd1498Szrj loop_vinfo));
511538fd1498Szrj STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
511638fd1498Szrj prev_phi_info = vinfo_for_stmt (outer_phi);
511738fd1498Szrj }
511838fd1498Szrj }
511938fd1498Szrj }
512038fd1498Szrj
512138fd1498Szrj exit_gsi = gsi_after_labels (exit_bb);
512238fd1498Szrj
512338fd1498Szrj /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
512438fd1498Szrj (i.e. when reduc_fn is not available) and in the final adjustment
512538fd1498Szrj code (if needed). Also get the original scalar reduction variable as
512638fd1498Szrj defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
512738fd1498Szrj represents a reduction pattern), the tree-code and scalar-def are
512838fd1498Szrj taken from the original stmt that the pattern-stmt (STMT) replaces.
512938fd1498Szrj Otherwise (it is a regular reduction) - the tree-code and scalar-def
513038fd1498Szrj are taken from STMT. */
513138fd1498Szrj
513238fd1498Szrj orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
513338fd1498Szrj if (!orig_stmt)
513438fd1498Szrj {
513538fd1498Szrj /* Regular reduction */
513638fd1498Szrj orig_stmt = stmt;
513738fd1498Szrj }
513838fd1498Szrj else
513938fd1498Szrj {
514038fd1498Szrj /* Reduction pattern */
514138fd1498Szrj stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
514238fd1498Szrj gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
514338fd1498Szrj gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
514438fd1498Szrj }
514538fd1498Szrj
514638fd1498Szrj code = gimple_assign_rhs_code (orig_stmt);
514738fd1498Szrj /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
514838fd1498Szrj partial results are added and not subtracted. */
514938fd1498Szrj if (code == MINUS_EXPR)
515038fd1498Szrj code = PLUS_EXPR;
515138fd1498Szrj
515238fd1498Szrj scalar_dest = gimple_assign_lhs (orig_stmt);
515338fd1498Szrj scalar_type = TREE_TYPE (scalar_dest);
515438fd1498Szrj scalar_results.create (group_size);
515538fd1498Szrj new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
515638fd1498Szrj bitsize = TYPE_SIZE (scalar_type);
515738fd1498Szrj
515838fd1498Szrj /* In case this is a reduction in an inner-loop while vectorizing an outer
515938fd1498Szrj loop - we don't need to extract a single scalar result at the end of the
516038fd1498Szrj inner-loop (unless it is double reduction, i.e., the use of reduction is
516138fd1498Szrj outside the outer-loop). The final vector of partial results will be used
516238fd1498Szrj in the vectorized outer-loop, or reduced to a scalar result at the end of
516338fd1498Szrj the outer-loop. */
516438fd1498Szrj if (nested_in_vect_loop && !double_reduc)
516538fd1498Szrj goto vect_finalize_reduction;
516638fd1498Szrj
516738fd1498Szrj /* SLP reduction without reduction chain, e.g.,
516838fd1498Szrj # a1 = phi <a2, a0>
516938fd1498Szrj # b1 = phi <b2, b0>
517038fd1498Szrj a2 = operation (a1)
517138fd1498Szrj b2 = operation (b1) */
517238fd1498Szrj slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
517338fd1498Szrj
517438fd1498Szrj /* True if we should implement SLP_REDUC using native reduction operations
517538fd1498Szrj instead of scalar operations. */
517638fd1498Szrj direct_slp_reduc = (reduc_fn != IFN_LAST
517738fd1498Szrj && slp_reduc
517838fd1498Szrj && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
517938fd1498Szrj
518038fd1498Szrj /* In case of reduction chain, e.g.,
518138fd1498Szrj # a1 = phi <a3, a0>
518238fd1498Szrj a2 = operation (a1)
518338fd1498Szrj a3 = operation (a2),
518438fd1498Szrj
518538fd1498Szrj we may end up with more than one vector result. Here we reduce them to
518638fd1498Szrj one vector. */
518738fd1498Szrj if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
518838fd1498Szrj {
518938fd1498Szrj tree first_vect = PHI_RESULT (new_phis[0]);
519038fd1498Szrj gassign *new_vec_stmt = NULL;
519138fd1498Szrj vec_dest = vect_create_destination_var (scalar_dest, vectype);
519238fd1498Szrj for (k = 1; k < new_phis.length (); k++)
519338fd1498Szrj {
519438fd1498Szrj gimple *next_phi = new_phis[k];
519538fd1498Szrj tree second_vect = PHI_RESULT (next_phi);
519638fd1498Szrj tree tem = make_ssa_name (vec_dest, new_vec_stmt);
519738fd1498Szrj new_vec_stmt = gimple_build_assign (tem, code,
519838fd1498Szrj first_vect, second_vect);
519938fd1498Szrj gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
520038fd1498Szrj first_vect = tem;
520138fd1498Szrj }
520238fd1498Szrj
520338fd1498Szrj new_phi_result = first_vect;
520438fd1498Szrj if (new_vec_stmt)
520538fd1498Szrj {
520638fd1498Szrj new_phis.truncate (0);
520738fd1498Szrj new_phis.safe_push (new_vec_stmt);
520838fd1498Szrj }
520938fd1498Szrj }
521038fd1498Szrj /* Likewise if we couldn't use a single def-use cycle. */
521138fd1498Szrj else if (ncopies > 1)
521238fd1498Szrj {
521338fd1498Szrj gcc_assert (new_phis.length () == 1);
521438fd1498Szrj tree first_vect = PHI_RESULT (new_phis[0]);
521538fd1498Szrj gassign *new_vec_stmt = NULL;
521638fd1498Szrj vec_dest = vect_create_destination_var (scalar_dest, vectype);
521738fd1498Szrj gimple *next_phi = new_phis[0];
521838fd1498Szrj for (int k = 1; k < ncopies; ++k)
521938fd1498Szrj {
522038fd1498Szrj next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
522138fd1498Szrj tree second_vect = PHI_RESULT (next_phi);
522238fd1498Szrj tree tem = make_ssa_name (vec_dest, new_vec_stmt);
522338fd1498Szrj new_vec_stmt = gimple_build_assign (tem, code,
522438fd1498Szrj first_vect, second_vect);
522538fd1498Szrj gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
522638fd1498Szrj first_vect = tem;
522738fd1498Szrj }
522838fd1498Szrj new_phi_result = first_vect;
522938fd1498Szrj new_phis.truncate (0);
523038fd1498Szrj new_phis.safe_push (new_vec_stmt);
523138fd1498Szrj }
523238fd1498Szrj else
523338fd1498Szrj new_phi_result = PHI_RESULT (new_phis[0]);
523438fd1498Szrj
523538fd1498Szrj if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
523638fd1498Szrj && reduc_fn != IFN_LAST)
523738fd1498Szrj {
523838fd1498Szrj /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
523938fd1498Szrj various data values where the condition matched and another vector
524038fd1498Szrj (INDUCTION_INDEX) containing all the indexes of those matches. We
524138fd1498Szrj need to extract the last matching index (which will be the index with
524238fd1498Szrj highest value) and use this to index into the data vector.
524338fd1498Szrj For the case where there were no matches, the data vector will contain
524438fd1498Szrj all default values and the index vector will be all zeros. */
524538fd1498Szrj
524638fd1498Szrj /* Get various versions of the type of the vector of indexes. */
524738fd1498Szrj tree index_vec_type = TREE_TYPE (induction_index);
524838fd1498Szrj gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
524938fd1498Szrj tree index_scalar_type = TREE_TYPE (index_vec_type);
525038fd1498Szrj tree index_vec_cmp_type = build_same_sized_truth_vector_type
525138fd1498Szrj (index_vec_type);
525238fd1498Szrj
525338fd1498Szrj /* Get an unsigned integer version of the type of the data vector. */
525438fd1498Szrj int scalar_precision
525538fd1498Szrj = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
525638fd1498Szrj tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
525738fd1498Szrj tree vectype_unsigned = build_vector_type
525838fd1498Szrj (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
525938fd1498Szrj
526038fd1498Szrj /* First we need to create a vector (ZERO_VEC) of zeros and another
526138fd1498Szrj vector (MAX_INDEX_VEC) filled with the last matching index, which we
526238fd1498Szrj can create using a MAX reduction and then expanding.
526338fd1498Szrj In the case where the loop never made any matches, the max index will
526438fd1498Szrj be zero. */
526538fd1498Szrj
526638fd1498Szrj /* Vector of {0, 0, 0,...}. */
526738fd1498Szrj tree zero_vec = make_ssa_name (vectype);
526838fd1498Szrj tree zero_vec_rhs = build_zero_cst (vectype);
526938fd1498Szrj gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
527038fd1498Szrj gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
527138fd1498Szrj
527238fd1498Szrj /* Find maximum value from the vector of found indexes. */
527338fd1498Szrj tree max_index = make_ssa_name (index_scalar_type);
527438fd1498Szrj gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
527538fd1498Szrj 1, induction_index);
527638fd1498Szrj gimple_call_set_lhs (max_index_stmt, max_index);
527738fd1498Szrj gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
527838fd1498Szrj
527938fd1498Szrj /* Vector of {max_index, max_index, max_index,...}. */
528038fd1498Szrj tree max_index_vec = make_ssa_name (index_vec_type);
528138fd1498Szrj tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
528238fd1498Szrj max_index);
528338fd1498Szrj gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
528438fd1498Szrj max_index_vec_rhs);
528538fd1498Szrj gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
528638fd1498Szrj
528738fd1498Szrj /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
528838fd1498Szrj with the vector (INDUCTION_INDEX) of found indexes, choosing values
528938fd1498Szrj from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
529038fd1498Szrj otherwise. Only one value should match, resulting in a vector
529138fd1498Szrj (VEC_COND) with one data value and the rest zeros.
529238fd1498Szrj In the case where the loop never made any matches, every index will
529338fd1498Szrj match, resulting in a vector with all data values (which will all be
529438fd1498Szrj the default value). */
529538fd1498Szrj
529638fd1498Szrj /* Compare the max index vector to the vector of found indexes to find
529738fd1498Szrj the position of the max value. */
529838fd1498Szrj tree vec_compare = make_ssa_name (index_vec_cmp_type);
529938fd1498Szrj gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
530038fd1498Szrj induction_index,
530138fd1498Szrj max_index_vec);
530238fd1498Szrj gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
530338fd1498Szrj
530438fd1498Szrj /* Use the compare to choose either values from the data vector or
530538fd1498Szrj zero. */
530638fd1498Szrj tree vec_cond = make_ssa_name (vectype);
530738fd1498Szrj gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
530838fd1498Szrj vec_compare, new_phi_result,
530938fd1498Szrj zero_vec);
531038fd1498Szrj gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
531138fd1498Szrj
531238fd1498Szrj /* Finally we need to extract the data value from the vector (VEC_COND)
531338fd1498Szrj into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
531438fd1498Szrj reduction, but because this doesn't exist, we can use a MAX reduction
531538fd1498Szrj instead. The data value might be signed or a float so we need to cast
531638fd1498Szrj it first.
531738fd1498Szrj In the case where the loop never made any matches, the data values are
531838fd1498Szrj all identical, and so will reduce down correctly. */
531938fd1498Szrj
532038fd1498Szrj /* Make the matched data values unsigned. */
532138fd1498Szrj tree vec_cond_cast = make_ssa_name (vectype_unsigned);
532238fd1498Szrj tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
532338fd1498Szrj vec_cond);
532438fd1498Szrj gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
532538fd1498Szrj VIEW_CONVERT_EXPR,
532638fd1498Szrj vec_cond_cast_rhs);
532738fd1498Szrj gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
532838fd1498Szrj
532938fd1498Szrj /* Reduce down to a scalar value. */
533038fd1498Szrj tree data_reduc = make_ssa_name (scalar_type_unsigned);
533138fd1498Szrj gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
533238fd1498Szrj 1, vec_cond_cast);
533338fd1498Szrj gimple_call_set_lhs (data_reduc_stmt, data_reduc);
533438fd1498Szrj gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
533538fd1498Szrj
533638fd1498Szrj /* Convert the reduced value back to the result type and set as the
533738fd1498Szrj result. */
533838fd1498Szrj gimple_seq stmts = NULL;
533938fd1498Szrj new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
534038fd1498Szrj data_reduc);
534138fd1498Szrj gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
534238fd1498Szrj scalar_results.safe_push (new_temp);
534338fd1498Szrj }
534438fd1498Szrj else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
534538fd1498Szrj && reduc_fn == IFN_LAST)
534638fd1498Szrj {
534738fd1498Szrj /* Condition reduction without supported IFN_REDUC_MAX. Generate
534838fd1498Szrj idx = 0;
534938fd1498Szrj idx_val = induction_index[0];
535038fd1498Szrj val = data_reduc[0];
535138fd1498Szrj for (idx = 0, val = init, i = 0; i < nelts; ++i)
535238fd1498Szrj if (induction_index[i] > idx_val)
535338fd1498Szrj val = data_reduc[i], idx_val = induction_index[i];
535438fd1498Szrj return val; */
535538fd1498Szrj
535638fd1498Szrj tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
535738fd1498Szrj tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
535838fd1498Szrj unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
535938fd1498Szrj poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
536038fd1498Szrj /* Enforced by vectorizable_reduction, which ensures we have target
536138fd1498Szrj support before allowing a conditional reduction on variable-length
536238fd1498Szrj vectors. */
536338fd1498Szrj unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
536438fd1498Szrj tree idx_val = NULL_TREE, val = NULL_TREE;
536538fd1498Szrj for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
536638fd1498Szrj {
536738fd1498Szrj tree old_idx_val = idx_val;
536838fd1498Szrj tree old_val = val;
536938fd1498Szrj idx_val = make_ssa_name (idx_eltype);
537038fd1498Szrj epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
537138fd1498Szrj build3 (BIT_FIELD_REF, idx_eltype,
537238fd1498Szrj induction_index,
537338fd1498Szrj bitsize_int (el_size),
537438fd1498Szrj bitsize_int (off)));
537538fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
537638fd1498Szrj val = make_ssa_name (data_eltype);
537738fd1498Szrj epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
537838fd1498Szrj build3 (BIT_FIELD_REF,
537938fd1498Szrj data_eltype,
538038fd1498Szrj new_phi_result,
538138fd1498Szrj bitsize_int (el_size),
538238fd1498Szrj bitsize_int (off)));
538338fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
538438fd1498Szrj if (off != 0)
538538fd1498Szrj {
538638fd1498Szrj tree new_idx_val = idx_val;
538738fd1498Szrj tree new_val = val;
538838fd1498Szrj if (off != v_size - el_size)
538938fd1498Szrj {
539038fd1498Szrj new_idx_val = make_ssa_name (idx_eltype);
539138fd1498Szrj epilog_stmt = gimple_build_assign (new_idx_val,
539238fd1498Szrj MAX_EXPR, idx_val,
539338fd1498Szrj old_idx_val);
539438fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
539538fd1498Szrj }
539638fd1498Szrj new_val = make_ssa_name (data_eltype);
539738fd1498Szrj epilog_stmt = gimple_build_assign (new_val,
539838fd1498Szrj COND_EXPR,
539938fd1498Szrj build2 (GT_EXPR,
540038fd1498Szrj boolean_type_node,
540138fd1498Szrj idx_val,
540238fd1498Szrj old_idx_val),
540338fd1498Szrj val, old_val);
540438fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
540538fd1498Szrj idx_val = new_idx_val;
540638fd1498Szrj val = new_val;
540738fd1498Szrj }
540838fd1498Szrj }
540938fd1498Szrj /* Convert the reduced value back to the result type and set as the
541038fd1498Szrj result. */
541138fd1498Szrj gimple_seq stmts = NULL;
541238fd1498Szrj val = gimple_convert (&stmts, scalar_type, val);
541338fd1498Szrj gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
541438fd1498Szrj scalar_results.safe_push (val);
541538fd1498Szrj }
541638fd1498Szrj
541738fd1498Szrj /* 2.3 Create the reduction code, using one of the three schemes described
541838fd1498Szrj above. In SLP we simply need to extract all the elements from the
541938fd1498Szrj vector (without reducing them), so we use scalar shifts. */
542038fd1498Szrj else if (reduc_fn != IFN_LAST && !slp_reduc)
542138fd1498Szrj {
542238fd1498Szrj tree tmp;
542338fd1498Szrj tree vec_elem_type;
542438fd1498Szrj
542538fd1498Szrj /* Case 1: Create:
542638fd1498Szrj v_out2 = reduc_expr <v_out1> */
542738fd1498Szrj
542838fd1498Szrj if (dump_enabled_p ())
542938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
543038fd1498Szrj "Reduce using direct vector reduction.\n");
543138fd1498Szrj
543238fd1498Szrj vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
543338fd1498Szrj if (!useless_type_conversion_p (scalar_type, vec_elem_type))
543438fd1498Szrj {
543538fd1498Szrj tree tmp_dest
543638fd1498Szrj = vect_create_destination_var (scalar_dest, vec_elem_type);
543738fd1498Szrj epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
543838fd1498Szrj new_phi_result);
543938fd1498Szrj gimple_set_lhs (epilog_stmt, tmp_dest);
544038fd1498Szrj new_temp = make_ssa_name (tmp_dest, epilog_stmt);
544138fd1498Szrj gimple_set_lhs (epilog_stmt, new_temp);
544238fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
544338fd1498Szrj
544438fd1498Szrj epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
544538fd1498Szrj new_temp);
544638fd1498Szrj }
544738fd1498Szrj else
544838fd1498Szrj {
544938fd1498Szrj epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
545038fd1498Szrj new_phi_result);
545138fd1498Szrj gimple_set_lhs (epilog_stmt, new_scalar_dest);
545238fd1498Szrj }
545338fd1498Szrj
545438fd1498Szrj new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
545538fd1498Szrj gimple_set_lhs (epilog_stmt, new_temp);
545638fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
545738fd1498Szrj
545838fd1498Szrj if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
545938fd1498Szrj == INTEGER_INDUC_COND_REDUCTION)
546038fd1498Szrj && !operand_equal_p (initial_def, induc_val, 0))
546138fd1498Szrj {
546238fd1498Szrj /* Earlier we set the initial value to be a vector of induc_val
546338fd1498Szrj values. Check the result and if it is induc_val then replace
546438fd1498Szrj with the original initial value, unless induc_val is
546538fd1498Szrj the same as initial_def already. */
546638fd1498Szrj tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
546738fd1498Szrj induc_val);
546838fd1498Szrj
546938fd1498Szrj tmp = make_ssa_name (new_scalar_dest);
547038fd1498Szrj epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
547138fd1498Szrj initial_def, new_temp);
547238fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
547338fd1498Szrj new_temp = tmp;
547438fd1498Szrj }
547538fd1498Szrj
547638fd1498Szrj scalar_results.safe_push (new_temp);
547738fd1498Szrj }
547838fd1498Szrj else if (direct_slp_reduc)
547938fd1498Szrj {
548038fd1498Szrj /* Here we create one vector for each of the GROUP_SIZE results,
548138fd1498Szrj with the elements for other SLP statements replaced with the
548238fd1498Szrj neutral value. We can then do a normal reduction on each vector. */
548338fd1498Szrj
548438fd1498Szrj /* Enforced by vectorizable_reduction. */
548538fd1498Szrj gcc_assert (new_phis.length () == 1);
548638fd1498Szrj gcc_assert (pow2p_hwi (group_size));
548738fd1498Szrj
548838fd1498Szrj slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
548938fd1498Szrj vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
549038fd1498Szrj gimple_seq seq = NULL;
549138fd1498Szrj
549238fd1498Szrj /* Build a vector {0, 1, 2, ...}, with the same number of elements
549338fd1498Szrj and the same element size as VECTYPE. */
549438fd1498Szrj tree index = build_index_vector (vectype, 0, 1);
549538fd1498Szrj tree index_type = TREE_TYPE (index);
549638fd1498Szrj tree index_elt_type = TREE_TYPE (index_type);
549738fd1498Szrj tree mask_type = build_same_sized_truth_vector_type (index_type);
549838fd1498Szrj
549938fd1498Szrj /* Create a vector that, for each element, identifies which of
550038fd1498Szrj the GROUP_SIZE results should use it. */
550138fd1498Szrj tree index_mask = build_int_cst (index_elt_type, group_size - 1);
550238fd1498Szrj index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
550338fd1498Szrj build_vector_from_val (index_type, index_mask));
550438fd1498Szrj
550538fd1498Szrj /* Get a neutral vector value. This is simply a splat of the neutral
550638fd1498Szrj scalar value if we have one, otherwise the initial scalar value
550738fd1498Szrj is itself a neutral value. */
550838fd1498Szrj tree vector_identity = NULL_TREE;
550938fd1498Szrj if (neutral_op)
551038fd1498Szrj vector_identity = gimple_build_vector_from_val (&seq, vectype,
551138fd1498Szrj neutral_op);
551238fd1498Szrj for (unsigned int i = 0; i < group_size; ++i)
551338fd1498Szrj {
551438fd1498Szrj /* If there's no universal neutral value, we can use the
551538fd1498Szrj initial scalar value from the original PHI. This is used
551638fd1498Szrj for MIN and MAX reduction, for example. */
551738fd1498Szrj if (!neutral_op)
551838fd1498Szrj {
551938fd1498Szrj tree scalar_value
552038fd1498Szrj = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
552138fd1498Szrj loop_preheader_edge (loop));
552238fd1498Szrj vector_identity = gimple_build_vector_from_val (&seq, vectype,
552338fd1498Szrj scalar_value);
552438fd1498Szrj }
552538fd1498Szrj
552638fd1498Szrj /* Calculate the equivalent of:
552738fd1498Szrj
552838fd1498Szrj sel[j] = (index[j] == i);
552938fd1498Szrj
553038fd1498Szrj which selects the elements of NEW_PHI_RESULT that should
553138fd1498Szrj be included in the result. */
553238fd1498Szrj tree compare_val = build_int_cst (index_elt_type, i);
553338fd1498Szrj compare_val = build_vector_from_val (index_type, compare_val);
553438fd1498Szrj tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
553538fd1498Szrj index, compare_val);
553638fd1498Szrj
553738fd1498Szrj /* Calculate the equivalent of:
553838fd1498Szrj
553938fd1498Szrj vec = seq ? new_phi_result : vector_identity;
554038fd1498Szrj
554138fd1498Szrj VEC is now suitable for a full vector reduction. */
554238fd1498Szrj tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
554338fd1498Szrj sel, new_phi_result, vector_identity);
554438fd1498Szrj
554538fd1498Szrj /* Do the reduction and convert it to the appropriate type. */
554638fd1498Szrj gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
554738fd1498Szrj tree scalar = make_ssa_name (TREE_TYPE (vectype));
554838fd1498Szrj gimple_call_set_lhs (call, scalar);
554938fd1498Szrj gimple_seq_add_stmt (&seq, call);
555038fd1498Szrj scalar = gimple_convert (&seq, scalar_type, scalar);
555138fd1498Szrj scalar_results.safe_push (scalar);
555238fd1498Szrj }
555338fd1498Szrj gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
555438fd1498Szrj }
555538fd1498Szrj else
555638fd1498Szrj {
555738fd1498Szrj bool reduce_with_shift;
555838fd1498Szrj tree vec_temp;
555938fd1498Szrj
556038fd1498Szrj /* COND reductions all do the final reduction with MAX_EXPR
556138fd1498Szrj or MIN_EXPR. */
556238fd1498Szrj if (code == COND_EXPR)
556338fd1498Szrj {
556438fd1498Szrj if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
556538fd1498Szrj == INTEGER_INDUC_COND_REDUCTION)
556638fd1498Szrj code = induc_code;
5567*58e805e6Szrj else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5568*58e805e6Szrj == CONST_COND_REDUCTION)
5569*58e805e6Szrj code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
557038fd1498Szrj else
557138fd1498Szrj code = MAX_EXPR;
557238fd1498Szrj }
557338fd1498Szrj
557438fd1498Szrj /* See if the target wants to do the final (shift) reduction
557538fd1498Szrj in a vector mode of smaller size and first reduce upper/lower
557638fd1498Szrj halves against each other. */
557738fd1498Szrj enum machine_mode mode1 = mode;
557838fd1498Szrj tree vectype1 = vectype;
557938fd1498Szrj unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
558038fd1498Szrj unsigned sz1 = sz;
558138fd1498Szrj if (!slp_reduc
558238fd1498Szrj && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
558338fd1498Szrj sz1 = GET_MODE_SIZE (mode1).to_constant ();
558438fd1498Szrj
558538fd1498Szrj vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
558638fd1498Szrj reduce_with_shift = have_whole_vector_shift (mode1);
558738fd1498Szrj if (!VECTOR_MODE_P (mode1))
558838fd1498Szrj reduce_with_shift = false;
558938fd1498Szrj else
559038fd1498Szrj {
559138fd1498Szrj optab optab = optab_for_tree_code (code, vectype1, optab_default);
559238fd1498Szrj if (optab_handler (optab, mode1) == CODE_FOR_nothing)
559338fd1498Szrj reduce_with_shift = false;
559438fd1498Szrj }
559538fd1498Szrj
559638fd1498Szrj /* First reduce the vector to the desired vector size we should
559738fd1498Szrj do shift reduction on by combining upper and lower halves. */
559838fd1498Szrj new_temp = new_phi_result;
559938fd1498Szrj while (sz > sz1)
560038fd1498Szrj {
560138fd1498Szrj gcc_assert (!slp_reduc);
560238fd1498Szrj sz /= 2;
560338fd1498Szrj vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
560438fd1498Szrj
560538fd1498Szrj /* The target has to make sure we support lowpart/highpart
560638fd1498Szrj extraction, either via direct vector extract or through
560738fd1498Szrj an integer mode punning. */
560838fd1498Szrj tree dst1, dst2;
560938fd1498Szrj if (convert_optab_handler (vec_extract_optab,
561038fd1498Szrj TYPE_MODE (TREE_TYPE (new_temp)),
561138fd1498Szrj TYPE_MODE (vectype1))
561238fd1498Szrj != CODE_FOR_nothing)
561338fd1498Szrj {
561438fd1498Szrj /* Extract sub-vectors directly once vec_extract becomes
561538fd1498Szrj a conversion optab. */
561638fd1498Szrj dst1 = make_ssa_name (vectype1);
561738fd1498Szrj epilog_stmt
561838fd1498Szrj = gimple_build_assign (dst1, BIT_FIELD_REF,
561938fd1498Szrj build3 (BIT_FIELD_REF, vectype1,
562038fd1498Szrj new_temp, TYPE_SIZE (vectype1),
562138fd1498Szrj bitsize_int (0)));
562238fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
562338fd1498Szrj dst2 = make_ssa_name (vectype1);
562438fd1498Szrj epilog_stmt
562538fd1498Szrj = gimple_build_assign (dst2, BIT_FIELD_REF,
562638fd1498Szrj build3 (BIT_FIELD_REF, vectype1,
562738fd1498Szrj new_temp, TYPE_SIZE (vectype1),
562838fd1498Szrj bitsize_int (sz * BITS_PER_UNIT)));
562938fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
563038fd1498Szrj }
563138fd1498Szrj else
563238fd1498Szrj {
563338fd1498Szrj /* Extract via punning to appropriately sized integer mode
563438fd1498Szrj vector. */
563538fd1498Szrj tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
563638fd1498Szrj 1);
563738fd1498Szrj tree etype = build_vector_type (eltype, 2);
563838fd1498Szrj gcc_assert (convert_optab_handler (vec_extract_optab,
563938fd1498Szrj TYPE_MODE (etype),
564038fd1498Szrj TYPE_MODE (eltype))
564138fd1498Szrj != CODE_FOR_nothing);
564238fd1498Szrj tree tem = make_ssa_name (etype);
564338fd1498Szrj epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
564438fd1498Szrj build1 (VIEW_CONVERT_EXPR,
564538fd1498Szrj etype, new_temp));
564638fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
564738fd1498Szrj new_temp = tem;
564838fd1498Szrj tem = make_ssa_name (eltype);
564938fd1498Szrj epilog_stmt
565038fd1498Szrj = gimple_build_assign (tem, BIT_FIELD_REF,
565138fd1498Szrj build3 (BIT_FIELD_REF, eltype,
565238fd1498Szrj new_temp, TYPE_SIZE (eltype),
565338fd1498Szrj bitsize_int (0)));
565438fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
565538fd1498Szrj dst1 = make_ssa_name (vectype1);
565638fd1498Szrj epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
565738fd1498Szrj build1 (VIEW_CONVERT_EXPR,
565838fd1498Szrj vectype1, tem));
565938fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
566038fd1498Szrj tem = make_ssa_name (eltype);
566138fd1498Szrj epilog_stmt
566238fd1498Szrj = gimple_build_assign (tem, BIT_FIELD_REF,
566338fd1498Szrj build3 (BIT_FIELD_REF, eltype,
566438fd1498Szrj new_temp, TYPE_SIZE (eltype),
566538fd1498Szrj bitsize_int (sz * BITS_PER_UNIT)));
566638fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
566738fd1498Szrj dst2 = make_ssa_name (vectype1);
566838fd1498Szrj epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
566938fd1498Szrj build1 (VIEW_CONVERT_EXPR,
567038fd1498Szrj vectype1, tem));
567138fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
567238fd1498Szrj }
567338fd1498Szrj
567438fd1498Szrj new_temp = make_ssa_name (vectype1);
567538fd1498Szrj epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
567638fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
567738fd1498Szrj }
567838fd1498Szrj
567938fd1498Szrj if (reduce_with_shift && !slp_reduc)
568038fd1498Szrj {
568138fd1498Szrj int element_bitsize = tree_to_uhwi (bitsize);
568238fd1498Szrj /* Enforced by vectorizable_reduction, which disallows SLP reductions
568338fd1498Szrj for variable-length vectors and also requires direct target support
568438fd1498Szrj for loop reductions. */
568538fd1498Szrj int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
568638fd1498Szrj int nelements = vec_size_in_bits / element_bitsize;
568738fd1498Szrj vec_perm_builder sel;
568838fd1498Szrj vec_perm_indices indices;
568938fd1498Szrj
569038fd1498Szrj int elt_offset;
569138fd1498Szrj
569238fd1498Szrj tree zero_vec = build_zero_cst (vectype1);
569338fd1498Szrj /* Case 2: Create:
569438fd1498Szrj for (offset = nelements/2; offset >= 1; offset/=2)
569538fd1498Szrj {
569638fd1498Szrj Create: va' = vec_shift <va, offset>
569738fd1498Szrj Create: va = vop <va, va'>
569838fd1498Szrj } */
569938fd1498Szrj
570038fd1498Szrj tree rhs;
570138fd1498Szrj
570238fd1498Szrj if (dump_enabled_p ())
570338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
570438fd1498Szrj "Reduce using vector shifts\n");
570538fd1498Szrj
570638fd1498Szrj mode1 = TYPE_MODE (vectype1);
570738fd1498Szrj vec_dest = vect_create_destination_var (scalar_dest, vectype1);
570838fd1498Szrj for (elt_offset = nelements / 2;
570938fd1498Szrj elt_offset >= 1;
571038fd1498Szrj elt_offset /= 2)
571138fd1498Szrj {
571238fd1498Szrj calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
571338fd1498Szrj indices.new_vector (sel, 2, nelements);
571438fd1498Szrj tree mask = vect_gen_perm_mask_any (vectype1, indices);
571538fd1498Szrj epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
571638fd1498Szrj new_temp, zero_vec, mask);
571738fd1498Szrj new_name = make_ssa_name (vec_dest, epilog_stmt);
571838fd1498Szrj gimple_assign_set_lhs (epilog_stmt, new_name);
571938fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
572038fd1498Szrj
572138fd1498Szrj epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
572238fd1498Szrj new_temp);
572338fd1498Szrj new_temp = make_ssa_name (vec_dest, epilog_stmt);
572438fd1498Szrj gimple_assign_set_lhs (epilog_stmt, new_temp);
572538fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
572638fd1498Szrj }
572738fd1498Szrj
572838fd1498Szrj /* 2.4 Extract the final scalar result. Create:
572938fd1498Szrj s_out3 = extract_field <v_out2, bitpos> */
573038fd1498Szrj
573138fd1498Szrj if (dump_enabled_p ())
573238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
573338fd1498Szrj "extract scalar result\n");
573438fd1498Szrj
573538fd1498Szrj rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
573638fd1498Szrj bitsize, bitsize_zero_node);
573738fd1498Szrj epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
573838fd1498Szrj new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
573938fd1498Szrj gimple_assign_set_lhs (epilog_stmt, new_temp);
574038fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
574138fd1498Szrj scalar_results.safe_push (new_temp);
574238fd1498Szrj }
574338fd1498Szrj else
574438fd1498Szrj {
574538fd1498Szrj /* Case 3: Create:
574638fd1498Szrj s = extract_field <v_out2, 0>
574738fd1498Szrj for (offset = element_size;
574838fd1498Szrj offset < vector_size;
574938fd1498Szrj offset += element_size;)
575038fd1498Szrj {
575138fd1498Szrj Create: s' = extract_field <v_out2, offset>
575238fd1498Szrj Create: s = op <s, s'> // For non SLP cases
575338fd1498Szrj } */
575438fd1498Szrj
575538fd1498Szrj if (dump_enabled_p ())
575638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
575738fd1498Szrj "Reduce using scalar code.\n");
575838fd1498Szrj
575938fd1498Szrj int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
576038fd1498Szrj int element_bitsize = tree_to_uhwi (bitsize);
576138fd1498Szrj FOR_EACH_VEC_ELT (new_phis, i, new_phi)
576238fd1498Szrj {
576338fd1498Szrj int bit_offset;
576438fd1498Szrj if (gimple_code (new_phi) == GIMPLE_PHI)
576538fd1498Szrj vec_temp = PHI_RESULT (new_phi);
576638fd1498Szrj else
576738fd1498Szrj vec_temp = gimple_assign_lhs (new_phi);
576838fd1498Szrj tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
576938fd1498Szrj bitsize_zero_node);
577038fd1498Szrj epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
577138fd1498Szrj new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
577238fd1498Szrj gimple_assign_set_lhs (epilog_stmt, new_temp);
577338fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
577438fd1498Szrj
577538fd1498Szrj /* In SLP we don't need to apply reduction operation, so we just
577638fd1498Szrj collect s' values in SCALAR_RESULTS. */
577738fd1498Szrj if (slp_reduc)
577838fd1498Szrj scalar_results.safe_push (new_temp);
577938fd1498Szrj
578038fd1498Szrj for (bit_offset = element_bitsize;
578138fd1498Szrj bit_offset < vec_size_in_bits;
578238fd1498Szrj bit_offset += element_bitsize)
578338fd1498Szrj {
578438fd1498Szrj tree bitpos = bitsize_int (bit_offset);
578538fd1498Szrj tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
578638fd1498Szrj bitsize, bitpos);
578738fd1498Szrj
578838fd1498Szrj epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
578938fd1498Szrj new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
579038fd1498Szrj gimple_assign_set_lhs (epilog_stmt, new_name);
579138fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
579238fd1498Szrj
579338fd1498Szrj if (slp_reduc)
579438fd1498Szrj {
579538fd1498Szrj /* In SLP we don't need to apply reduction operation, so
579638fd1498Szrj we just collect s' values in SCALAR_RESULTS. */
579738fd1498Szrj new_temp = new_name;
579838fd1498Szrj scalar_results.safe_push (new_name);
579938fd1498Szrj }
580038fd1498Szrj else
580138fd1498Szrj {
580238fd1498Szrj epilog_stmt = gimple_build_assign (new_scalar_dest, code,
580338fd1498Szrj new_name, new_temp);
580438fd1498Szrj new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
580538fd1498Szrj gimple_assign_set_lhs (epilog_stmt, new_temp);
580638fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
580738fd1498Szrj }
580838fd1498Szrj }
580938fd1498Szrj }
581038fd1498Szrj
581138fd1498Szrj /* The only case where we need to reduce scalar results in SLP, is
581238fd1498Szrj unrolling. If the size of SCALAR_RESULTS is greater than
581338fd1498Szrj GROUP_SIZE, we reduce them combining elements modulo
581438fd1498Szrj GROUP_SIZE. */
581538fd1498Szrj if (slp_reduc)
581638fd1498Szrj {
581738fd1498Szrj tree res, first_res, new_res;
581838fd1498Szrj gimple *new_stmt;
581938fd1498Szrj
582038fd1498Szrj /* Reduce multiple scalar results in case of SLP unrolling. */
582138fd1498Szrj for (j = group_size; scalar_results.iterate (j, &res);
582238fd1498Szrj j++)
582338fd1498Szrj {
582438fd1498Szrj first_res = scalar_results[j % group_size];
582538fd1498Szrj new_stmt = gimple_build_assign (new_scalar_dest, code,
582638fd1498Szrj first_res, res);
582738fd1498Szrj new_res = make_ssa_name (new_scalar_dest, new_stmt);
582838fd1498Szrj gimple_assign_set_lhs (new_stmt, new_res);
582938fd1498Szrj gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
583038fd1498Szrj scalar_results[j % group_size] = new_res;
583138fd1498Szrj }
583238fd1498Szrj }
583338fd1498Szrj else
583438fd1498Szrj /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
583538fd1498Szrj scalar_results.safe_push (new_temp);
583638fd1498Szrj }
583738fd1498Szrj
583838fd1498Szrj if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
583938fd1498Szrj == INTEGER_INDUC_COND_REDUCTION)
584038fd1498Szrj && !operand_equal_p (initial_def, induc_val, 0))
584138fd1498Szrj {
      /* Earlier we set the initial value to be a vector of induc_val
	 values.  Check the result and if it is induc_val then replace
	 with the original initial value, unless induc_val is
	 the same as initial_def already.  */
584638fd1498Szrj tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
584738fd1498Szrj induc_val);
584838fd1498Szrj
584938fd1498Szrj tree tmp = make_ssa_name (new_scalar_dest);
585038fd1498Szrj epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
585138fd1498Szrj initial_def, new_temp);
585238fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
585338fd1498Szrj scalar_results[0] = tmp;
585438fd1498Szrj }
585538fd1498Szrj }
585638fd1498Szrj
585738fd1498Szrj vect_finalize_reduction:
585838fd1498Szrj
585938fd1498Szrj if (double_reduc)
586038fd1498Szrj loop = loop->inner;
586138fd1498Szrj
586238fd1498Szrj /* 2.5 Adjust the final result by the initial value of the reduction
586338fd1498Szrj variable. (When such adjustment is not needed, then
586438fd1498Szrj 'adjustment_def' is zero). For example, if code is PLUS we create:
586538fd1498Szrj new_temp = loop_exit_def + adjustment_def */
586638fd1498Szrj
586738fd1498Szrj if (adjustment_def)
586838fd1498Szrj {
586938fd1498Szrj gcc_assert (!slp_reduc);
587038fd1498Szrj if (nested_in_vect_loop)
587138fd1498Szrj {
587238fd1498Szrj new_phi = new_phis[0];
587338fd1498Szrj gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
587438fd1498Szrj expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
587538fd1498Szrj new_dest = vect_create_destination_var (scalar_dest, vectype);
587638fd1498Szrj }
587738fd1498Szrj else
587838fd1498Szrj {
587938fd1498Szrj new_temp = scalar_results[0];
588038fd1498Szrj gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
588138fd1498Szrj expr = build2 (code, scalar_type, new_temp, adjustment_def);
588238fd1498Szrj new_dest = vect_create_destination_var (scalar_dest, scalar_type);
588338fd1498Szrj }
588438fd1498Szrj
588538fd1498Szrj epilog_stmt = gimple_build_assign (new_dest, expr);
588638fd1498Szrj new_temp = make_ssa_name (new_dest, epilog_stmt);
588738fd1498Szrj gimple_assign_set_lhs (epilog_stmt, new_temp);
588838fd1498Szrj gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
588938fd1498Szrj if (nested_in_vect_loop)
589038fd1498Szrj {
589138fd1498Szrj set_vinfo_for_stmt (epilog_stmt,
589238fd1498Szrj new_stmt_vec_info (epilog_stmt, loop_vinfo));
589338fd1498Szrj STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
589438fd1498Szrj STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
589538fd1498Szrj
589638fd1498Szrj if (!double_reduc)
589738fd1498Szrj scalar_results.quick_push (new_temp);
589838fd1498Szrj else
589938fd1498Szrj scalar_results[0] = new_temp;
590038fd1498Szrj }
590138fd1498Szrj else
590238fd1498Szrj scalar_results[0] = new_temp;
590338fd1498Szrj
590438fd1498Szrj new_phis[0] = epilog_stmt;
590538fd1498Szrj }
590638fd1498Szrj
590738fd1498Szrj /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
590838fd1498Szrj phis with new adjusted scalar results, i.e., replace use <s_out0>
590938fd1498Szrj with use <s_out4>.
591038fd1498Szrj
591138fd1498Szrj Transform:
591238fd1498Szrj loop_exit:
591338fd1498Szrj s_out0 = phi <s_loop> # (scalar) EXIT_PHI
591438fd1498Szrj v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
591538fd1498Szrj v_out2 = reduce <v_out1>
591638fd1498Szrj s_out3 = extract_field <v_out2, 0>
591738fd1498Szrj s_out4 = adjust_result <s_out3>
591838fd1498Szrj use <s_out0>
591938fd1498Szrj use <s_out0>
592038fd1498Szrj
592138fd1498Szrj into:
592238fd1498Szrj
592338fd1498Szrj loop_exit:
592438fd1498Szrj s_out0 = phi <s_loop> # (scalar) EXIT_PHI
592538fd1498Szrj v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
592638fd1498Szrj v_out2 = reduce <v_out1>
592738fd1498Szrj s_out3 = extract_field <v_out2, 0>
592838fd1498Szrj s_out4 = adjust_result <s_out3>
592938fd1498Szrj use <s_out4>
593038fd1498Szrj use <s_out4> */
593138fd1498Szrj
593238fd1498Szrj
593338fd1498Szrj /* In SLP reduction chain we reduce vector results into one vector if
593438fd1498Szrj necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
593538fd1498Szrj the last stmt in the reduction chain, since we are looking for the loop
593638fd1498Szrj exit phi node. */
593738fd1498Szrj if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
593838fd1498Szrj {
593938fd1498Szrj gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
594038fd1498Szrj /* Handle reduction patterns. */
594138fd1498Szrj if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
594238fd1498Szrj dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
594338fd1498Szrj
594438fd1498Szrj scalar_dest = gimple_assign_lhs (dest_stmt);
594538fd1498Szrj group_size = 1;
594638fd1498Szrj }
594738fd1498Szrj
594838fd1498Szrj /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
594938fd1498Szrj case that GROUP_SIZE is greater than vectorization factor). Therefore, we
595038fd1498Szrj need to match SCALAR_RESULTS with corresponding statements. The first
595138fd1498Szrj (GROUP_SIZE / number of new vector stmts) scalar results correspond to
595238fd1498Szrj the first vector stmt, etc.
595338fd1498Szrj (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
595438fd1498Szrj if (group_size > new_phis.length ())
595538fd1498Szrj {
595638fd1498Szrj ratio = group_size / new_phis.length ();
595738fd1498Szrj gcc_assert (!(group_size % new_phis.length ()));
595838fd1498Szrj }
595938fd1498Szrj else
596038fd1498Szrj ratio = 1;
596138fd1498Szrj
596238fd1498Szrj for (k = 0; k < group_size; k++)
596338fd1498Szrj {
596438fd1498Szrj if (k % ratio == 0)
596538fd1498Szrj {
596638fd1498Szrj epilog_stmt = new_phis[k / ratio];
596738fd1498Szrj reduction_phi = reduction_phis[k / ratio];
596838fd1498Szrj if (double_reduc)
596938fd1498Szrj inner_phi = inner_phis[k / ratio];
597038fd1498Szrj }
597138fd1498Szrj
597238fd1498Szrj if (slp_reduc)
597338fd1498Szrj {
597438fd1498Szrj gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
597538fd1498Szrj
597638fd1498Szrj orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
597738fd1498Szrj /* SLP statements can't participate in patterns. */
597838fd1498Szrj gcc_assert (!orig_stmt);
597938fd1498Szrj scalar_dest = gimple_assign_lhs (current_stmt);
598038fd1498Szrj }
598138fd1498Szrj
598238fd1498Szrj phis.create (3);
598338fd1498Szrj /* Find the loop-closed-use at the loop exit of the original scalar
598438fd1498Szrj result. (The reduction result is expected to have two immediate uses -
598538fd1498Szrj one at the latch block, and one at the loop exit). */
598638fd1498Szrj FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
598738fd1498Szrj if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
598838fd1498Szrj && !is_gimple_debug (USE_STMT (use_p)))
598938fd1498Szrj phis.safe_push (USE_STMT (use_p));
599038fd1498Szrj
599138fd1498Szrj /* While we expect to have found an exit_phi because of loop-closed-ssa
599238fd1498Szrj form we can end up without one if the scalar cycle is dead. */
599338fd1498Szrj
599438fd1498Szrj FOR_EACH_VEC_ELT (phis, i, exit_phi)
599538fd1498Szrj {
599638fd1498Szrj if (outer_loop)
599738fd1498Szrj {
599838fd1498Szrj stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
599938fd1498Szrj gphi *vect_phi;
600038fd1498Szrj
600138fd1498Szrj /* FORNOW. Currently not supporting the case that an inner-loop
600238fd1498Szrj reduction is not used in the outer-loop (but only outside the
600338fd1498Szrj outer-loop), unless it is double reduction. */
600438fd1498Szrj gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
600538fd1498Szrj && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
600638fd1498Szrj || double_reduc);
600738fd1498Szrj
600838fd1498Szrj if (double_reduc)
600938fd1498Szrj STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
601038fd1498Szrj else
601138fd1498Szrj STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
601238fd1498Szrj if (!double_reduc
601338fd1498Szrj || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
601438fd1498Szrj != vect_double_reduction_def)
601538fd1498Szrj continue;
601638fd1498Szrj
601738fd1498Szrj /* Handle double reduction:
601838fd1498Szrj
601938fd1498Szrj stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
602038fd1498Szrj stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
602138fd1498Szrj stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
602238fd1498Szrj stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
602338fd1498Szrj
602438fd1498Szrj At that point the regular reduction (stmt2 and stmt3) is
602538fd1498Szrj already vectorized, as well as the exit phi node, stmt4.
602638fd1498Szrj Here we vectorize the phi node of double reduction, stmt1, and
602738fd1498Szrj update all relevant statements. */
602838fd1498Szrj
602938fd1498Szrj /* Go through all the uses of s2 to find double reduction phi
603038fd1498Szrj node, i.e., stmt1 above. */
603138fd1498Szrj orig_name = PHI_RESULT (exit_phi);
603238fd1498Szrj FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
603338fd1498Szrj {
603438fd1498Szrj stmt_vec_info use_stmt_vinfo;
603538fd1498Szrj stmt_vec_info new_phi_vinfo;
603638fd1498Szrj tree vect_phi_init, preheader_arg, vect_phi_res;
603738fd1498Szrj basic_block bb = gimple_bb (use_stmt);
603838fd1498Szrj gimple *use;
603938fd1498Szrj
604038fd1498Szrj /* Check that USE_STMT is really double reduction phi
604138fd1498Szrj node. */
604238fd1498Szrj if (gimple_code (use_stmt) != GIMPLE_PHI
604338fd1498Szrj || gimple_phi_num_args (use_stmt) != 2
604438fd1498Szrj || bb->loop_father != outer_loop)
604538fd1498Szrj continue;
604638fd1498Szrj use_stmt_vinfo = vinfo_for_stmt (use_stmt);
604738fd1498Szrj if (!use_stmt_vinfo
604838fd1498Szrj || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
604938fd1498Szrj != vect_double_reduction_def)
605038fd1498Szrj continue;
605138fd1498Szrj
605238fd1498Szrj /* Create vector phi node for double reduction:
605338fd1498Szrj vs1 = phi <vs0, vs2>
605438fd1498Szrj vs1 was created previously in this function by a call to
605538fd1498Szrj vect_get_vec_def_for_operand and is stored in
605638fd1498Szrj vec_initial_def;
605738fd1498Szrj vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
605838fd1498Szrj vs0 is created here. */
605938fd1498Szrj
606038fd1498Szrj /* Create vector phi node. */
606138fd1498Szrj vect_phi = create_phi_node (vec_initial_def, bb);
606238fd1498Szrj new_phi_vinfo = new_stmt_vec_info (vect_phi,
606338fd1498Szrj loop_vec_info_for_loop (outer_loop));
606438fd1498Szrj set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
606538fd1498Szrj
606638fd1498Szrj /* Create vs0 - initial def of the double reduction phi. */
606738fd1498Szrj preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
606838fd1498Szrj loop_preheader_edge (outer_loop));
606938fd1498Szrj vect_phi_init = get_initial_def_for_reduction
607038fd1498Szrj (stmt, preheader_arg, NULL);
607138fd1498Szrj
607238fd1498Szrj /* Update phi node arguments with vs0 and vs2. */
607338fd1498Szrj add_phi_arg (vect_phi, vect_phi_init,
607438fd1498Szrj loop_preheader_edge (outer_loop),
607538fd1498Szrj UNKNOWN_LOCATION);
607638fd1498Szrj add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
607738fd1498Szrj loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
607838fd1498Szrj if (dump_enabled_p ())
607938fd1498Szrj {
608038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
608138fd1498Szrj "created double reduction phi node: ");
608238fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
608338fd1498Szrj }
608438fd1498Szrj
608538fd1498Szrj vect_phi_res = PHI_RESULT (vect_phi);
608638fd1498Szrj
608738fd1498Szrj /* Replace the use, i.e., set the correct vs1 in the regular
608838fd1498Szrj reduction phi node. FORNOW, NCOPIES is always 1, so the
608938fd1498Szrj loop is redundant. */
609038fd1498Szrj use = reduction_phi;
609138fd1498Szrj for (j = 0; j < ncopies; j++)
609238fd1498Szrj {
609338fd1498Szrj edge pr_edge = loop_preheader_edge (loop);
609438fd1498Szrj SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
609538fd1498Szrj use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
609638fd1498Szrj }
609738fd1498Szrj }
609838fd1498Szrj }
609938fd1498Szrj }
610038fd1498Szrj
610138fd1498Szrj phis.release ();
610238fd1498Szrj if (nested_in_vect_loop)
610338fd1498Szrj {
610438fd1498Szrj if (double_reduc)
610538fd1498Szrj loop = outer_loop;
610638fd1498Szrj else
610738fd1498Szrj continue;
610838fd1498Szrj }
610938fd1498Szrj
611038fd1498Szrj phis.create (3);
611138fd1498Szrj /* Find the loop-closed-use at the loop exit of the original scalar
611238fd1498Szrj result. (The reduction result is expected to have two immediate uses,
611338fd1498Szrj one at the latch block, and one at the loop exit). For double
611438fd1498Szrj reductions we are looking for exit phis of the outer loop. */
611538fd1498Szrj FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
611638fd1498Szrj {
611738fd1498Szrj if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
611838fd1498Szrj {
611938fd1498Szrj if (!is_gimple_debug (USE_STMT (use_p)))
612038fd1498Szrj phis.safe_push (USE_STMT (use_p));
612138fd1498Szrj }
612238fd1498Szrj else
612338fd1498Szrj {
612438fd1498Szrj if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
612538fd1498Szrj {
612638fd1498Szrj tree phi_res = PHI_RESULT (USE_STMT (use_p));
612738fd1498Szrj
612838fd1498Szrj FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
612938fd1498Szrj {
613038fd1498Szrj if (!flow_bb_inside_loop_p (loop,
613138fd1498Szrj gimple_bb (USE_STMT (phi_use_p)))
613238fd1498Szrj && !is_gimple_debug (USE_STMT (phi_use_p)))
613338fd1498Szrj phis.safe_push (USE_STMT (phi_use_p));
613438fd1498Szrj }
613538fd1498Szrj }
613638fd1498Szrj }
613738fd1498Szrj }
613838fd1498Szrj
613938fd1498Szrj FOR_EACH_VEC_ELT (phis, i, exit_phi)
614038fd1498Szrj {
614138fd1498Szrj /* Replace the uses: */
614238fd1498Szrj orig_name = PHI_RESULT (exit_phi);
614338fd1498Szrj scalar_result = scalar_results[k];
614438fd1498Szrj FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
614538fd1498Szrj FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
614638fd1498Szrj SET_USE (use_p, scalar_result);
614738fd1498Szrj }
614838fd1498Szrj
614938fd1498Szrj phis.release ();
615038fd1498Szrj }
615138fd1498Szrj }
615238fd1498Szrj
615338fd1498Szrj /* Return a vector of type VECTYPE that is equal to the vector select
615438fd1498Szrj operation "MASK ? VEC : IDENTITY". Insert the select statements
615538fd1498Szrj before GSI. */
615638fd1498Szrj
615738fd1498Szrj static tree
merge_with_identity(gimple_stmt_iterator * gsi,tree mask,tree vectype,tree vec,tree identity)615838fd1498Szrj merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
615938fd1498Szrj tree vec, tree identity)
616038fd1498Szrj {
616138fd1498Szrj tree cond = make_temp_ssa_name (vectype, NULL, "cond");
616238fd1498Szrj gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
616338fd1498Szrj mask, vec, identity);
616438fd1498Szrj gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
616538fd1498Szrj return cond;
616638fd1498Szrj }
616738fd1498Szrj
616838fd1498Szrj /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
616938fd1498Szrj order, starting with LHS. Insert the extraction statements before GSI and
617038fd1498Szrj associate the new scalar SSA names with variable SCALAR_DEST.
617138fd1498Szrj Return the SSA name for the result. */
617238fd1498Szrj
617338fd1498Szrj static tree
vect_expand_fold_left(gimple_stmt_iterator * gsi,tree scalar_dest,tree_code code,tree lhs,tree vector_rhs)617438fd1498Szrj vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
617538fd1498Szrj tree_code code, tree lhs, tree vector_rhs)
617638fd1498Szrj {
617738fd1498Szrj tree vectype = TREE_TYPE (vector_rhs);
617838fd1498Szrj tree scalar_type = TREE_TYPE (vectype);
617938fd1498Szrj tree bitsize = TYPE_SIZE (scalar_type);
618038fd1498Szrj unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
618138fd1498Szrj unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
618238fd1498Szrj
618338fd1498Szrj for (unsigned HOST_WIDE_INT bit_offset = 0;
618438fd1498Szrj bit_offset < vec_size_in_bits;
618538fd1498Szrj bit_offset += element_bitsize)
618638fd1498Szrj {
618738fd1498Szrj tree bitpos = bitsize_int (bit_offset);
618838fd1498Szrj tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
618938fd1498Szrj bitsize, bitpos);
619038fd1498Szrj
619138fd1498Szrj gassign *stmt = gimple_build_assign (scalar_dest, rhs);
619238fd1498Szrj rhs = make_ssa_name (scalar_dest, stmt);
619338fd1498Szrj gimple_assign_set_lhs (stmt, rhs);
619438fd1498Szrj gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
619538fd1498Szrj
619638fd1498Szrj stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
619738fd1498Szrj tree new_name = make_ssa_name (scalar_dest, stmt);
619838fd1498Szrj gimple_assign_set_lhs (stmt, new_name);
619938fd1498Szrj gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
620038fd1498Szrj lhs = new_name;
620138fd1498Szrj }
620238fd1498Szrj return lhs;
620338fd1498Szrj }
620438fd1498Szrj
620538fd1498Szrj /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
620638fd1498Szrj statement that sets the live-out value. REDUC_DEF_STMT is the phi
620738fd1498Szrj statement. CODE is the operation performed by STMT and OPS are
620838fd1498Szrj its scalar operands. REDUC_INDEX is the index of the operand in
620938fd1498Szrj OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
621038fd1498Szrj implements in-order reduction, or IFN_LAST if we should open-code it.
621138fd1498Szrj VECTYPE_IN is the type of the vector input. MASKS specifies the masks
621238fd1498Szrj that should be used to control the operation in a fully-masked loop. */
621338fd1498Szrj
621438fd1498Szrj static bool
vectorize_fold_left_reduction(gimple * stmt,gimple_stmt_iterator * gsi,gimple ** vec_stmt,slp_tree slp_node,gimple * reduc_def_stmt,tree_code code,internal_fn reduc_fn,tree ops[3],tree vectype_in,int reduc_index,vec_loop_masks * masks)621538fd1498Szrj vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
621638fd1498Szrj gimple **vec_stmt, slp_tree slp_node,
621738fd1498Szrj gimple *reduc_def_stmt,
621838fd1498Szrj tree_code code, internal_fn reduc_fn,
621938fd1498Szrj tree ops[3], tree vectype_in,
622038fd1498Szrj int reduc_index, vec_loop_masks *masks)
622138fd1498Szrj {
622238fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
622338fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
622438fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
622538fd1498Szrj tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
/* Last vector statement created; recorded as STMT_VINFO_VEC_STMT below
   for the non-SLP case.  */
622638fd1498Szrj gimple *new_stmt = NULL;
622738fd1498Szrj
/* With an SLP node all vector statements are generated in one go, so a
   single "copy" suffices; otherwise the copy count follows from the
   vectorization factor and VECTYPE_IN.  */
622838fd1498Szrj int ncopies;
622938fd1498Szrj if (slp_node)
623038fd1498Szrj ncopies = 1;
623138fd1498Szrj else
623238fd1498Szrj ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
623338fd1498Szrj
/* The transform only supports single-copy, non-nested, binary in-order
   reductions; for MINUS_EXPR the reduction (accumulator) operand must be
   operand 0, otherwise operand 1.  */
623438fd1498Szrj gcc_assert (!nested_in_vect_loop_p (loop, stmt));
623538fd1498Szrj gcc_assert (ncopies == 1);
623638fd1498Szrj gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
623738fd1498Szrj gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
623838fd1498Szrj gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
623938fd1498Szrj == FOLD_LEFT_REDUCTION);
624038fd1498Szrj
624138fd1498Szrj if (slp_node)
624238fd1498Szrj gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
624338fd1498Szrj TYPE_VECTOR_SUBPARTS (vectype_in)));
624438fd1498Szrj
/* OP0 is the (non-reduction) operand whose vector defs get folded into
   the accumulator.  */
624538fd1498Szrj tree op0 = ops[1 - reduc_index];
624638fd1498Szrj
624738fd1498Szrj int group_size = 1;
624838fd1498Szrj gimple *scalar_dest_def;
624938fd1498Szrj auto_vec<tree> vec_oprnds0;
/* Collect the vector definitions of OP0.  For SLP the last scalar
   statement in the group is the one that provides the live-out value.  */
625038fd1498Szrj if (slp_node)
625138fd1498Szrj {
625238fd1498Szrj vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
625338fd1498Szrj group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
625438fd1498Szrj scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
625538fd1498Szrj }
625638fd1498Szrj else
625738fd1498Szrj {
625838fd1498Szrj tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
625938fd1498Szrj vec_oprnds0.create (1);
626038fd1498Szrj vec_oprnds0.quick_push (loop_vec_def0);
626138fd1498Szrj scalar_dest_def = stmt;
626238fd1498Szrj }
626338fd1498Szrj
626438fd1498Szrj tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
626538fd1498Szrj tree scalar_type = TREE_TYPE (scalar_dest);
626638fd1498Szrj tree reduc_var = gimple_phi_result (reduc_def_stmt);
626738fd1498Szrj
626838fd1498Szrj int vec_num = vec_oprnds0.length ();
626938fd1498Szrj gcc_assert (vec_num == 1 || slp_node);
627038fd1498Szrj tree vec_elem_type = TREE_TYPE (vectype_out);
627138fd1498Szrj gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
627238fd1498Szrj
/* In a fully-masked loop, masked-off lanes are merged with this zero
   vector (see merge_with_identity below) before the reduction.  */
627338fd1498Szrj tree vector_identity = NULL_TREE;
627438fd1498Szrj if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
627538fd1498Szrj vector_identity = build_zero_cst (vectype_out);
627638fd1498Szrj
627738fd1498Szrj tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
627838fd1498Szrj int i;
627938fd1498Szrj tree def0;
628038fd1498Szrj FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
628138fd1498Szrj {
628238fd1498Szrj tree mask = NULL_TREE;
628338fd1498Szrj if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
628438fd1498Szrj mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
628538fd1498Szrj
628638fd1498Szrj /* Handle MINUS by adding the negative. */
628738fd1498Szrj if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
628838fd1498Szrj {
628938fd1498Szrj tree negated = make_ssa_name (vectype_out);
629038fd1498Szrj new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
629138fd1498Szrj gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
629238fd1498Szrj def0 = negated;
629338fd1498Szrj }
629438fd1498Szrj
629538fd1498Szrj if (mask)
629638fd1498Szrj def0 = merge_with_identity (gsi, mask, vectype_out, def0,
629738fd1498Szrj vector_identity);
629838fd1498Szrj
629938fd1498Szrj /* On the first iteration the input is simply the scalar phi
630038fd1498Szrj result, and for subsequent iterations it is the output of
630138fd1498Szrj the preceding operation. */
630238fd1498Szrj if (reduc_fn != IFN_LAST)
630338fd1498Szrj {
630438fd1498Szrj new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
630538fd1498Szrj /* For chained SLP reductions the output of the previous reduction
630638fd1498Szrj operation serves as the input of the next. For the final statement
630738fd1498Szrj the output cannot be a temporary - we reuse the original
630838fd1498Szrj scalar destination of the last statement. */
630938fd1498Szrj if (i != vec_num - 1)
631038fd1498Szrj {
631138fd1498Szrj gimple_set_lhs (new_stmt, scalar_dest_var);
631238fd1498Szrj reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
631338fd1498Szrj gimple_set_lhs (new_stmt, reduc_var);
631438fd1498Szrj }
631538fd1498Szrj }
631638fd1498Szrj else
631738fd1498Szrj {
/* No target support: open-code the fold-left as a sequence of scalar
   element extracts and scalar CODE operations.  */
631838fd1498Szrj reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
631938fd1498Szrj reduc_var, def0);
632038fd1498Szrj new_stmt = SSA_NAME_DEF_STMT (reduc_var);
632138fd1498Szrj /* Remove the statement, so that we can use the same code paths
632238fd1498Szrj as for statements that we've just created. */
632338fd1498Szrj gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6324*58e805e6Szrj gsi_remove (&tmp_gsi, true);
632538fd1498Szrj }
632638fd1498Szrj
/* The last vector statement takes over the original scalar destination
   and replaces the scalar statement itself.  */
632738fd1498Szrj if (i == vec_num - 1)
632838fd1498Szrj {
632938fd1498Szrj gimple_set_lhs (new_stmt, scalar_dest);
633038fd1498Szrj vect_finish_replace_stmt (scalar_dest_def, new_stmt);
633138fd1498Szrj }
633238fd1498Szrj else
633338fd1498Szrj vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
633438fd1498Szrj
633538fd1498Szrj if (slp_node)
633638fd1498Szrj SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
633738fd1498Szrj }
633838fd1498Szrj
633938fd1498Szrj if (!slp_node)
634038fd1498Szrj STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
634138fd1498Szrj
634238fd1498Szrj return true;
634338fd1498Szrj }
634438fd1498Szrj
634538fd1498Szrj /* Function is_nonwrapping_integer_induction.
634638fd1498Szrj
634738fd1498Szrj Check if STMT (which is part of loop LOOP) both increments and
634838fd1498Szrj does not cause overflow. */
634938fd1498Szrj
635038fd1498Szrj static bool
is_nonwrapping_integer_induction(gimple * stmt,struct loop * loop)635138fd1498Szrj is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
635238fd1498Szrj {
635338fd1498Szrj stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
635438fd1498Szrj tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
635538fd1498Szrj tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
635638fd1498Szrj tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
635738fd1498Szrj widest_int ni, max_loop_value, lhs_max;
635838fd1498Szrj bool overflow = false;
635938fd1498Szrj
636038fd1498Szrj /* Make sure the loop is integer based. */
636138fd1498Szrj if (TREE_CODE (base) != INTEGER_CST
636238fd1498Szrj || TREE_CODE (step) != INTEGER_CST)
636338fd1498Szrj return false;
636438fd1498Szrj
636538fd1498Szrj /* Check that the max size of the loop will not wrap. */
636638fd1498Szrj
636738fd1498Szrj if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
636838fd1498Szrj return true;
636938fd1498Szrj
637038fd1498Szrj if (! max_stmt_executions (loop, &ni))
637138fd1498Szrj return false;
637238fd1498Szrj
637338fd1498Szrj max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
637438fd1498Szrj &overflow);
637538fd1498Szrj if (overflow)
637638fd1498Szrj return false;
637738fd1498Szrj
637838fd1498Szrj max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
637938fd1498Szrj TYPE_SIGN (lhs_type), &overflow);
638038fd1498Szrj if (overflow)
638138fd1498Szrj return false;
638238fd1498Szrj
638338fd1498Szrj return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
638438fd1498Szrj <= TYPE_PRECISION (lhs_type));
638538fd1498Szrj }
638638fd1498Szrj
638738fd1498Szrj /* Function vectorizable_reduction.
638838fd1498Szrj
638938fd1498Szrj Check if STMT performs a reduction operation that can be vectorized.
639038fd1498Szrj If VEC_STMT is also passed, vectorize the STMT: create a vectorized
639138fd1498Szrj stmt to replace it, put it in VEC_STMT, and insert it at GSI.
639238fd1498Szrj Return FALSE if not a vectorizable STMT, TRUE otherwise.
639338fd1498Szrj
639438fd1498Szrj This function also handles reduction idioms (patterns) that have been
639538fd1498Szrj recognized in advance during vect_pattern_recog. In this case, STMT may be
639638fd1498Szrj of this form:
639738fd1498Szrj X = pattern_expr (arg0, arg1, ..., X)
639838fd1498Szrj and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
639938fd1498Szrj sequence that had been detected and replaced by the pattern-stmt (STMT).
640038fd1498Szrj
640138fd1498Szrj This function also handles reduction of condition expressions, for example:
640238fd1498Szrj for (int i = 0; i < N; i++)
640338fd1498Szrj if (a[i] < value)
640438fd1498Szrj last = a[i];
640538fd1498Szrj This is handled by vectorising the loop and creating an additional vector
640638fd1498Szrj containing the loop indexes for which "a[i] < value" was true. In the
640738fd1498Szrj function epilogue this is reduced to a single max value and then used to
640838fd1498Szrj index into the vector of results.
640938fd1498Szrj
641038fd1498Szrj In some cases of reduction patterns, the type of the reduction variable X is
641138fd1498Szrj different than the type of the other arguments of STMT.
641238fd1498Szrj In such cases, the vectype that is used when transforming STMT into a vector
641338fd1498Szrj stmt is different than the vectype that is used to determine the
641438fd1498Szrj vectorization factor, because it consists of a different number of elements
641538fd1498Szrj than the actual number of elements that are being operated upon in parallel.
641638fd1498Szrj
641738fd1498Szrj For example, consider an accumulation of shorts into an int accumulator.
641838fd1498Szrj On some targets it's possible to vectorize this pattern operating on 8
641938fd1498Szrj shorts at a time (hence, the vectype for purposes of determining the
642038fd1498Szrj vectorization factor should be V8HI); on the other hand, the vectype that
642138fd1498Szrj is used to create the vector form is actually V4SI (the type of the result).
642238fd1498Szrj
642338fd1498Szrj Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
642438fd1498Szrj indicates what is the actual level of parallelism (V8HI in the example), so
642538fd1498Szrj that the right vectorization factor would be derived. This vectype
642638fd1498Szrj corresponds to the type of arguments to the reduction stmt, and should *NOT*
642738fd1498Szrj be used to create the vectorized stmt. The right vectype for the vectorized
642838fd1498Szrj stmt is obtained from the type of the result X:
642938fd1498Szrj get_vectype_for_scalar_type (TREE_TYPE (X))
643038fd1498Szrj
643138fd1498Szrj This means that, contrary to "regular" reductions (or "regular" stmts in
643238fd1498Szrj general), the following equation:
643338fd1498Szrj STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
643438fd1498Szrj does *NOT* necessarily hold for reduction patterns. */
643538fd1498Szrj
643638fd1498Szrj bool
vectorizable_reduction(gimple * stmt,gimple_stmt_iterator * gsi,gimple ** vec_stmt,slp_tree slp_node,slp_instance slp_node_instance)643738fd1498Szrj vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
643838fd1498Szrj gimple **vec_stmt, slp_tree slp_node,
643938fd1498Szrj slp_instance slp_node_instance)
644038fd1498Szrj {
644138fd1498Szrj tree vec_dest;
644238fd1498Szrj tree scalar_dest;
644338fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
644438fd1498Szrj tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
644538fd1498Szrj tree vectype_in = NULL_TREE;
644638fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
644738fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
644838fd1498Szrj enum tree_code code, orig_code;
644938fd1498Szrj internal_fn reduc_fn;
645038fd1498Szrj machine_mode vec_mode;
645138fd1498Szrj int op_type;
645238fd1498Szrj optab optab;
645338fd1498Szrj tree new_temp = NULL_TREE;
645438fd1498Szrj gimple *def_stmt;
645538fd1498Szrj enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
645638fd1498Szrj gimple *cond_reduc_def_stmt = NULL;
645738fd1498Szrj enum tree_code cond_reduc_op_code = ERROR_MARK;
645838fd1498Szrj tree scalar_type;
645938fd1498Szrj bool is_simple_use;
646038fd1498Szrj gimple *orig_stmt;
646138fd1498Szrj stmt_vec_info orig_stmt_info = NULL;
646238fd1498Szrj int i;
646338fd1498Szrj int ncopies;
646438fd1498Szrj int epilog_copies;
646538fd1498Szrj stmt_vec_info prev_stmt_info, prev_phi_info;
646638fd1498Szrj bool single_defuse_cycle = false;
646738fd1498Szrj gimple *new_stmt = NULL;
646838fd1498Szrj int j;
646938fd1498Szrj tree ops[3];
647038fd1498Szrj enum vect_def_type dts[3];
647138fd1498Szrj bool nested_cycle = false, found_nested_cycle_def = false;
647238fd1498Szrj bool double_reduc = false;
647338fd1498Szrj basic_block def_bb;
647438fd1498Szrj struct loop * def_stmt_loop, *outer_loop = NULL;
647538fd1498Szrj tree def_arg;
647638fd1498Szrj gimple *def_arg_stmt;
647738fd1498Szrj auto_vec<tree> vec_oprnds0;
647838fd1498Szrj auto_vec<tree> vec_oprnds1;
647938fd1498Szrj auto_vec<tree> vec_oprnds2;
648038fd1498Szrj auto_vec<tree> vect_defs;
648138fd1498Szrj auto_vec<gimple *> phis;
648238fd1498Szrj int vec_num;
648338fd1498Szrj tree def0, tem;
648438fd1498Szrj bool first_p = true;
648538fd1498Szrj tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
648638fd1498Szrj tree cond_reduc_val = NULL_TREE;
648738fd1498Szrj
648838fd1498Szrj /* Make sure it was already recognized as a reduction computation. */
648938fd1498Szrj if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
649038fd1498Szrj && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
649138fd1498Szrj return false;
649238fd1498Szrj
649338fd1498Szrj if (nested_in_vect_loop_p (loop, stmt))
649438fd1498Szrj {
649538fd1498Szrj outer_loop = loop;
649638fd1498Szrj loop = loop->inner;
649738fd1498Szrj nested_cycle = true;
649838fd1498Szrj }
649938fd1498Szrj
650038fd1498Szrj /* In case of reduction chain we switch to the first stmt in the chain, but
650138fd1498Szrj we don't update STMT_INFO, since only the last stmt is marked as reduction
650238fd1498Szrj and has reduction properties. */
650338fd1498Szrj if (GROUP_FIRST_ELEMENT (stmt_info)
650438fd1498Szrj && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
650538fd1498Szrj {
650638fd1498Szrj stmt = GROUP_FIRST_ELEMENT (stmt_info);
650738fd1498Szrj first_p = false;
650838fd1498Szrj }
650938fd1498Szrj
651038fd1498Szrj if (gimple_code (stmt) == GIMPLE_PHI)
651138fd1498Szrj {
651238fd1498Szrj /* Analysis is fully done on the reduction stmt invocation. */
651338fd1498Szrj if (! vec_stmt)
651438fd1498Szrj {
651538fd1498Szrj if (slp_node)
651638fd1498Szrj slp_node_instance->reduc_phis = slp_node;
651738fd1498Szrj
651838fd1498Szrj STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
651938fd1498Szrj return true;
652038fd1498Szrj }
652138fd1498Szrj
652238fd1498Szrj if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
652338fd1498Szrj /* Leave the scalar phi in place. Note that checking
652438fd1498Szrj STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
652538fd1498Szrj for reductions involving a single statement. */
652638fd1498Szrj return true;
652738fd1498Szrj
652838fd1498Szrj gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
652938fd1498Szrj if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
653038fd1498Szrj reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
653138fd1498Szrj
653238fd1498Szrj if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
653338fd1498Szrj == EXTRACT_LAST_REDUCTION)
653438fd1498Szrj /* Leave the scalar phi in place. */
653538fd1498Szrj return true;
653638fd1498Szrj
653738fd1498Szrj gcc_assert (is_gimple_assign (reduc_stmt));
653838fd1498Szrj for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
653938fd1498Szrj {
654038fd1498Szrj tree op = gimple_op (reduc_stmt, k);
654138fd1498Szrj if (op == gimple_phi_result (stmt))
654238fd1498Szrj continue;
654338fd1498Szrj if (k == 1
654438fd1498Szrj && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
654538fd1498Szrj continue;
654638fd1498Szrj if (!vectype_in
654738fd1498Szrj || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
654838fd1498Szrj < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
654938fd1498Szrj vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
655038fd1498Szrj break;
655138fd1498Szrj }
655238fd1498Szrj gcc_assert (vectype_in);
655338fd1498Szrj
655438fd1498Szrj if (slp_node)
655538fd1498Szrj ncopies = 1;
655638fd1498Szrj else
655738fd1498Szrj ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
655838fd1498Szrj
655938fd1498Szrj use_operand_p use_p;
656038fd1498Szrj gimple *use_stmt;
656138fd1498Szrj if (ncopies > 1
656238fd1498Szrj && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
656338fd1498Szrj <= vect_used_only_live)
656438fd1498Szrj && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
656538fd1498Szrj && (use_stmt == reduc_stmt
656638fd1498Szrj || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
656738fd1498Szrj == reduc_stmt)))
656838fd1498Szrj single_defuse_cycle = true;
656938fd1498Szrj
657038fd1498Szrj /* Create the destination vector */
657138fd1498Szrj scalar_dest = gimple_assign_lhs (reduc_stmt);
657238fd1498Szrj vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
657338fd1498Szrj
657438fd1498Szrj if (slp_node)
657538fd1498Szrj /* The size vect_schedule_slp_instance computes is off for us. */
657638fd1498Szrj vec_num = vect_get_num_vectors
657738fd1498Szrj (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
657838fd1498Szrj * SLP_TREE_SCALAR_STMTS (slp_node).length (),
657938fd1498Szrj vectype_in);
658038fd1498Szrj else
658138fd1498Szrj vec_num = 1;
658238fd1498Szrj
658338fd1498Szrj /* Generate the reduction PHIs upfront. */
658438fd1498Szrj prev_phi_info = NULL;
658538fd1498Szrj for (j = 0; j < ncopies; j++)
658638fd1498Szrj {
658738fd1498Szrj if (j == 0 || !single_defuse_cycle)
658838fd1498Szrj {
658938fd1498Szrj for (i = 0; i < vec_num; i++)
659038fd1498Szrj {
659138fd1498Szrj /* Create the reduction-phi that defines the reduction
659238fd1498Szrj operand. */
659338fd1498Szrj gimple *new_phi = create_phi_node (vec_dest, loop->header);
659438fd1498Szrj set_vinfo_for_stmt (new_phi,
659538fd1498Szrj new_stmt_vec_info (new_phi, loop_vinfo));
659638fd1498Szrj
659738fd1498Szrj if (slp_node)
659838fd1498Szrj SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
659938fd1498Szrj else
660038fd1498Szrj {
660138fd1498Szrj if (j == 0)
660238fd1498Szrj STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
660338fd1498Szrj else
660438fd1498Szrj STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
660538fd1498Szrj prev_phi_info = vinfo_for_stmt (new_phi);
660638fd1498Szrj }
660738fd1498Szrj }
660838fd1498Szrj }
660938fd1498Szrj }
661038fd1498Szrj
661138fd1498Szrj return true;
661238fd1498Szrj }
661338fd1498Szrj
661438fd1498Szrj /* 1. Is vectorizable reduction? */
661538fd1498Szrj /* Not supportable if the reduction variable is used in the loop, unless
661638fd1498Szrj it's a reduction chain. */
661738fd1498Szrj if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
661838fd1498Szrj && !GROUP_FIRST_ELEMENT (stmt_info))
661938fd1498Szrj return false;
662038fd1498Szrj
662138fd1498Szrj /* Reductions that are not used even in an enclosing outer-loop,
662238fd1498Szrj are expected to be "live" (used out of the loop). */
662338fd1498Szrj if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
662438fd1498Szrj && !STMT_VINFO_LIVE_P (stmt_info))
662538fd1498Szrj return false;
662638fd1498Szrj
662738fd1498Szrj /* 2. Has this been recognized as a reduction pattern?
662838fd1498Szrj
662938fd1498Szrj Check if STMT represents a pattern that has been recognized
663038fd1498Szrj in earlier analysis stages. For stmts that represent a pattern,
663138fd1498Szrj the STMT_VINFO_RELATED_STMT field records the last stmt in
663238fd1498Szrj the original sequence that constitutes the pattern. */
663338fd1498Szrj
663438fd1498Szrj orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
663538fd1498Szrj if (orig_stmt)
663638fd1498Szrj {
663738fd1498Szrj orig_stmt_info = vinfo_for_stmt (orig_stmt);
663838fd1498Szrj gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
663938fd1498Szrj gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
664038fd1498Szrj }
664138fd1498Szrj
664238fd1498Szrj /* 3. Check the operands of the operation. The first operands are defined
664338fd1498Szrj inside the loop body. The last operand is the reduction variable,
664438fd1498Szrj which is defined by the loop-header-phi. */
664538fd1498Szrj
664638fd1498Szrj gcc_assert (is_gimple_assign (stmt));
664738fd1498Szrj
664838fd1498Szrj /* Flatten RHS. */
664938fd1498Szrj switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
665038fd1498Szrj {
665138fd1498Szrj case GIMPLE_BINARY_RHS:
665238fd1498Szrj code = gimple_assign_rhs_code (stmt);
665338fd1498Szrj op_type = TREE_CODE_LENGTH (code);
665438fd1498Szrj gcc_assert (op_type == binary_op);
665538fd1498Szrj ops[0] = gimple_assign_rhs1 (stmt);
665638fd1498Szrj ops[1] = gimple_assign_rhs2 (stmt);
665738fd1498Szrj break;
665838fd1498Szrj
665938fd1498Szrj case GIMPLE_TERNARY_RHS:
666038fd1498Szrj code = gimple_assign_rhs_code (stmt);
666138fd1498Szrj op_type = TREE_CODE_LENGTH (code);
666238fd1498Szrj gcc_assert (op_type == ternary_op);
666338fd1498Szrj ops[0] = gimple_assign_rhs1 (stmt);
666438fd1498Szrj ops[1] = gimple_assign_rhs2 (stmt);
666538fd1498Szrj ops[2] = gimple_assign_rhs3 (stmt);
666638fd1498Szrj break;
666738fd1498Szrj
666838fd1498Szrj case GIMPLE_UNARY_RHS:
666938fd1498Szrj return false;
667038fd1498Szrj
667138fd1498Szrj default:
667238fd1498Szrj gcc_unreachable ();
667338fd1498Szrj }
667438fd1498Szrj
667538fd1498Szrj if (code == COND_EXPR && slp_node)
667638fd1498Szrj return false;
667738fd1498Szrj
667838fd1498Szrj scalar_dest = gimple_assign_lhs (stmt);
667938fd1498Szrj scalar_type = TREE_TYPE (scalar_dest);
668038fd1498Szrj if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
668138fd1498Szrj && !SCALAR_FLOAT_TYPE_P (scalar_type))
668238fd1498Szrj return false;
668338fd1498Szrj
668438fd1498Szrj /* Do not try to vectorize bit-precision reductions. */
668538fd1498Szrj if (!type_has_mode_precision_p (scalar_type))
668638fd1498Szrj return false;
668738fd1498Szrj
668838fd1498Szrj /* All uses but the last are expected to be defined in the loop.
668938fd1498Szrj The last use is the reduction variable. In case of nested cycle this
669038fd1498Szrj assumption is not true: we use reduc_index to record the index of the
669138fd1498Szrj reduction variable. */
669238fd1498Szrj gimple *reduc_def_stmt = NULL;
669338fd1498Szrj int reduc_index = -1;
669438fd1498Szrj for (i = 0; i < op_type; i++)
669538fd1498Szrj {
669638fd1498Szrj /* The condition of COND_EXPR is checked in vectorizable_condition(). */
669738fd1498Szrj if (i == 0 && code == COND_EXPR)
669838fd1498Szrj continue;
669938fd1498Szrj
670038fd1498Szrj is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
670138fd1498Szrj &def_stmt, &dts[i], &tem);
670238fd1498Szrj dt = dts[i];
670338fd1498Szrj gcc_assert (is_simple_use);
670438fd1498Szrj if (dt == vect_reduction_def)
670538fd1498Szrj {
670638fd1498Szrj reduc_def_stmt = def_stmt;
670738fd1498Szrj reduc_index = i;
670838fd1498Szrj continue;
670938fd1498Szrj }
671038fd1498Szrj else if (tem)
671138fd1498Szrj {
671238fd1498Szrj /* To properly compute ncopies we are interested in the widest
671338fd1498Szrj input type in case we're looking at a widening accumulation. */
671438fd1498Szrj if (!vectype_in
671538fd1498Szrj || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
671638fd1498Szrj < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
671738fd1498Szrj vectype_in = tem;
671838fd1498Szrj }
671938fd1498Szrj
672038fd1498Szrj if (dt != vect_internal_def
672138fd1498Szrj && dt != vect_external_def
672238fd1498Szrj && dt != vect_constant_def
672338fd1498Szrj && dt != vect_induction_def
672438fd1498Szrj && !(dt == vect_nested_cycle && nested_cycle))
672538fd1498Szrj return false;
672638fd1498Szrj
672738fd1498Szrj if (dt == vect_nested_cycle)
672838fd1498Szrj {
672938fd1498Szrj found_nested_cycle_def = true;
673038fd1498Szrj reduc_def_stmt = def_stmt;
673138fd1498Szrj reduc_index = i;
673238fd1498Szrj }
673338fd1498Szrj
673438fd1498Szrj if (i == 1 && code == COND_EXPR)
673538fd1498Szrj {
673638fd1498Szrj /* Record how value of COND_EXPR is defined. */
673738fd1498Szrj if (dt == vect_constant_def)
673838fd1498Szrj {
673938fd1498Szrj cond_reduc_dt = dt;
674038fd1498Szrj cond_reduc_val = ops[i];
674138fd1498Szrj }
674238fd1498Szrj if (dt == vect_induction_def
674338fd1498Szrj && def_stmt != NULL
674438fd1498Szrj && is_nonwrapping_integer_induction (def_stmt, loop))
674538fd1498Szrj {
674638fd1498Szrj cond_reduc_dt = dt;
674738fd1498Szrj cond_reduc_def_stmt = def_stmt;
674838fd1498Szrj }
674938fd1498Szrj }
675038fd1498Szrj }
675138fd1498Szrj
675238fd1498Szrj if (!vectype_in)
675338fd1498Szrj vectype_in = vectype_out;
675438fd1498Szrj
675538fd1498Szrj /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
675638fd1498Szrj directly used in stmt. */
675738fd1498Szrj if (reduc_index == -1)
675838fd1498Szrj {
675938fd1498Szrj if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
676038fd1498Szrj {
676138fd1498Szrj if (dump_enabled_p ())
676238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676338fd1498Szrj "in-order reduction chain without SLP.\n");
676438fd1498Szrj return false;
676538fd1498Szrj }
676638fd1498Szrj
676738fd1498Szrj if (orig_stmt)
676838fd1498Szrj reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
676938fd1498Szrj else
677038fd1498Szrj reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
677138fd1498Szrj }
677238fd1498Szrj
677338fd1498Szrj if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
677438fd1498Szrj return false;
677538fd1498Szrj
677638fd1498Szrj if (!(reduc_index == -1
677738fd1498Szrj || dts[reduc_index] == vect_reduction_def
677838fd1498Szrj || dts[reduc_index] == vect_nested_cycle
677938fd1498Szrj || ((dts[reduc_index] == vect_internal_def
678038fd1498Szrj || dts[reduc_index] == vect_external_def
678138fd1498Szrj || dts[reduc_index] == vect_constant_def
678238fd1498Szrj || dts[reduc_index] == vect_induction_def)
678338fd1498Szrj && nested_cycle && found_nested_cycle_def)))
678438fd1498Szrj {
678538fd1498Szrj /* For pattern recognized stmts, orig_stmt might be a reduction,
678638fd1498Szrj but some helper statements for the pattern might not, or
678738fd1498Szrj might be COND_EXPRs with reduction uses in the condition. */
678838fd1498Szrj gcc_assert (orig_stmt);
678938fd1498Szrj return false;
679038fd1498Szrj }
679138fd1498Szrj
679238fd1498Szrj stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
679338fd1498Szrj enum vect_reduction_type v_reduc_type
679438fd1498Szrj = STMT_VINFO_REDUC_TYPE (reduc_def_info);
679538fd1498Szrj gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
679638fd1498Szrj
679738fd1498Szrj STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
679838fd1498Szrj /* If we have a condition reduction, see if we can simplify it further. */
679938fd1498Szrj if (v_reduc_type == COND_REDUCTION)
680038fd1498Szrj {
680138fd1498Szrj /* TODO: We can't yet handle reduction chains, since we need to treat
680238fd1498Szrj each COND_EXPR in the chain specially, not just the last one.
680338fd1498Szrj E.g. for:
680438fd1498Szrj
680538fd1498Szrj x_1 = PHI <x_3, ...>
680638fd1498Szrj x_2 = a_2 ? ... : x_1;
680738fd1498Szrj x_3 = a_3 ? ... : x_2;
680838fd1498Szrj
680938fd1498Szrj we're interested in the last element in x_3 for which a_2 || a_3
681038fd1498Szrj is true, whereas the current reduction chain handling would
681138fd1498Szrj vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
681238fd1498Szrj as a reduction operation. */
681338fd1498Szrj if (reduc_index == -1)
681438fd1498Szrj {
681538fd1498Szrj if (dump_enabled_p ())
681638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681738fd1498Szrj "conditional reduction chains not supported\n");
681838fd1498Szrj return false;
681938fd1498Szrj }
682038fd1498Szrj
682138fd1498Szrj /* vect_is_simple_reduction ensured that operand 2 is the
682238fd1498Szrj loop-carried operand. */
682338fd1498Szrj gcc_assert (reduc_index == 2);
682438fd1498Szrj
682538fd1498Szrj /* Loop peeling modifies initial value of reduction PHI, which
682638fd1498Szrj makes the reduction stmt to be transformed different to the
682738fd1498Szrj original stmt analyzed. We need to record reduction code for
682838fd1498Szrj CONST_COND_REDUCTION type reduction at analyzing stage, thus
682938fd1498Szrj it can be used directly at transform stage. */
683038fd1498Szrj if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
683138fd1498Szrj || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
683238fd1498Szrj {
683338fd1498Szrj /* Also set the reduction type to CONST_COND_REDUCTION. */
683438fd1498Szrj gcc_assert (cond_reduc_dt == vect_constant_def);
683538fd1498Szrj STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
683638fd1498Szrj }
683738fd1498Szrj else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
683838fd1498Szrj vectype_in, OPTIMIZE_FOR_SPEED))
683938fd1498Szrj {
684038fd1498Szrj if (dump_enabled_p ())
684138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
684238fd1498Szrj "optimizing condition reduction with"
684338fd1498Szrj " FOLD_EXTRACT_LAST.\n");
684438fd1498Szrj STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
684538fd1498Szrj }
684638fd1498Szrj else if (cond_reduc_dt == vect_induction_def)
684738fd1498Szrj {
684838fd1498Szrj stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
684938fd1498Szrj tree base
685038fd1498Szrj = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
685138fd1498Szrj tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
685238fd1498Szrj
685338fd1498Szrj gcc_assert (TREE_CODE (base) == INTEGER_CST
685438fd1498Szrj && TREE_CODE (step) == INTEGER_CST);
685538fd1498Szrj cond_reduc_val = NULL_TREE;
685638fd1498Szrj /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
685738fd1498Szrj above base; punt if base is the minimum value of the type for
685838fd1498Szrj MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
685938fd1498Szrj if (tree_int_cst_sgn (step) == -1)
686038fd1498Szrj {
686138fd1498Szrj cond_reduc_op_code = MIN_EXPR;
686238fd1498Szrj if (tree_int_cst_sgn (base) == -1)
686338fd1498Szrj cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
686438fd1498Szrj else if (tree_int_cst_lt (base,
686538fd1498Szrj TYPE_MAX_VALUE (TREE_TYPE (base))))
686638fd1498Szrj cond_reduc_val
686738fd1498Szrj = int_const_binop (PLUS_EXPR, base, integer_one_node);
686838fd1498Szrj }
686938fd1498Szrj else
687038fd1498Szrj {
687138fd1498Szrj cond_reduc_op_code = MAX_EXPR;
687238fd1498Szrj if (tree_int_cst_sgn (base) == 1)
687338fd1498Szrj cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
687438fd1498Szrj else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
687538fd1498Szrj base))
687638fd1498Szrj cond_reduc_val
687738fd1498Szrj = int_const_binop (MINUS_EXPR, base, integer_one_node);
687838fd1498Szrj }
687938fd1498Szrj if (cond_reduc_val)
688038fd1498Szrj {
688138fd1498Szrj if (dump_enabled_p ())
688238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
688338fd1498Szrj "condition expression based on "
688438fd1498Szrj "integer induction.\n");
688538fd1498Szrj STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
688638fd1498Szrj = INTEGER_INDUC_COND_REDUCTION;
688738fd1498Szrj }
688838fd1498Szrj }
688938fd1498Szrj else if (cond_reduc_dt == vect_constant_def)
689038fd1498Szrj {
689138fd1498Szrj enum vect_def_type cond_initial_dt;
689238fd1498Szrj gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
689338fd1498Szrj tree cond_initial_val
689438fd1498Szrj = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
689538fd1498Szrj
689638fd1498Szrj gcc_assert (cond_reduc_val != NULL_TREE);
689738fd1498Szrj vect_is_simple_use (cond_initial_val, loop_vinfo,
689838fd1498Szrj &def_stmt, &cond_initial_dt);
689938fd1498Szrj if (cond_initial_dt == vect_constant_def
690038fd1498Szrj && types_compatible_p (TREE_TYPE (cond_initial_val),
690138fd1498Szrj TREE_TYPE (cond_reduc_val)))
690238fd1498Szrj {
690338fd1498Szrj tree e = fold_binary (LE_EXPR, boolean_type_node,
690438fd1498Szrj cond_initial_val, cond_reduc_val);
690538fd1498Szrj if (e && (integer_onep (e) || integer_zerop (e)))
690638fd1498Szrj {
690738fd1498Szrj if (dump_enabled_p ())
690838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
690938fd1498Szrj "condition expression based on "
691038fd1498Szrj "compile time constant.\n");
691138fd1498Szrj /* Record reduction code at analysis stage. */
691238fd1498Szrj STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
691338fd1498Szrj = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
691438fd1498Szrj STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
691538fd1498Szrj = CONST_COND_REDUCTION;
691638fd1498Szrj }
691738fd1498Szrj }
691838fd1498Szrj }
691938fd1498Szrj }
692038fd1498Szrj
692138fd1498Szrj if (orig_stmt)
692238fd1498Szrj gcc_assert (tmp == orig_stmt
692338fd1498Szrj || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
692438fd1498Szrj else
692538fd1498Szrj /* We changed STMT to be the first stmt in reduction chain, hence we
692638fd1498Szrj check that in this case the first element in the chain is STMT. */
692738fd1498Szrj gcc_assert (stmt == tmp
692838fd1498Szrj || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
692938fd1498Szrj
693038fd1498Szrj if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
693138fd1498Szrj return false;
693238fd1498Szrj
693338fd1498Szrj if (slp_node)
693438fd1498Szrj ncopies = 1;
693538fd1498Szrj else
693638fd1498Szrj ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
693738fd1498Szrj
693838fd1498Szrj gcc_assert (ncopies >= 1);
693938fd1498Szrj
694038fd1498Szrj vec_mode = TYPE_MODE (vectype_in);
694138fd1498Szrj poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
694238fd1498Szrj
694338fd1498Szrj if (code == COND_EXPR)
694438fd1498Szrj {
694538fd1498Szrj /* Only call during the analysis stage, otherwise we'll lose
694638fd1498Szrj STMT_VINFO_TYPE. */
694738fd1498Szrj if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
694838fd1498Szrj ops[reduc_index], 0, NULL))
694938fd1498Szrj {
695038fd1498Szrj if (dump_enabled_p ())
695138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695238fd1498Szrj "unsupported condition in reduction\n");
695338fd1498Szrj return false;
695438fd1498Szrj }
695538fd1498Szrj }
695638fd1498Szrj else
695738fd1498Szrj {
695838fd1498Szrj /* 4. Supportable by target? */
695938fd1498Szrj
696038fd1498Szrj if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
696138fd1498Szrj || code == LROTATE_EXPR || code == RROTATE_EXPR)
696238fd1498Szrj {
696338fd1498Szrj /* Shifts and rotates are only supported by vectorizable_shifts,
696438fd1498Szrj not vectorizable_reduction. */
696538fd1498Szrj if (dump_enabled_p ())
696638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696738fd1498Szrj "unsupported shift or rotation.\n");
696838fd1498Szrj return false;
696938fd1498Szrj }
697038fd1498Szrj
697138fd1498Szrj /* 4.1. check support for the operation in the loop */
697238fd1498Szrj optab = optab_for_tree_code (code, vectype_in, optab_default);
697338fd1498Szrj if (!optab)
697438fd1498Szrj {
697538fd1498Szrj if (dump_enabled_p ())
697638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
697738fd1498Szrj "no optab.\n");
697838fd1498Szrj
697938fd1498Szrj return false;
698038fd1498Szrj }
698138fd1498Szrj
698238fd1498Szrj if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
698338fd1498Szrj {
698438fd1498Szrj if (dump_enabled_p ())
698538fd1498Szrj dump_printf (MSG_NOTE, "op not supported by target.\n");
698638fd1498Szrj
698738fd1498Szrj if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
698838fd1498Szrj || !vect_worthwhile_without_simd_p (loop_vinfo, code))
698938fd1498Szrj return false;
699038fd1498Szrj
699138fd1498Szrj if (dump_enabled_p ())
699238fd1498Szrj dump_printf (MSG_NOTE, "proceeding using word mode.\n");
699338fd1498Szrj }
699438fd1498Szrj
699538fd1498Szrj /* Worthwhile without SIMD support? */
699638fd1498Szrj if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
699738fd1498Szrj && !vect_worthwhile_without_simd_p (loop_vinfo, code))
699838fd1498Szrj {
699938fd1498Szrj if (dump_enabled_p ())
700038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
700138fd1498Szrj "not worthwhile without SIMD support.\n");
700238fd1498Szrj
700338fd1498Szrj return false;
700438fd1498Szrj }
700538fd1498Szrj }
700638fd1498Szrj
700738fd1498Szrj /* 4.2. Check support for the epilog operation.
700838fd1498Szrj
700938fd1498Szrj If STMT represents a reduction pattern, then the type of the
701038fd1498Szrj reduction variable may be different than the type of the rest
701138fd1498Szrj of the arguments. For example, consider the case of accumulation
701238fd1498Szrj of shorts into an int accumulator; The original code:
701338fd1498Szrj S1: int_a = (int) short_a;
701438fd1498Szrj orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
701538fd1498Szrj
701638fd1498Szrj was replaced with:
701738fd1498Szrj STMT: int_acc = widen_sum <short_a, int_acc>
701838fd1498Szrj
701938fd1498Szrj This means that:
702038fd1498Szrj 1. The tree-code that is used to create the vector operation in the
702138fd1498Szrj epilog code (that reduces the partial results) is not the
702238fd1498Szrj tree-code of STMT, but is rather the tree-code of the original
702338fd1498Szrj stmt from the pattern that STMT is replacing. I.e, in the example
702438fd1498Szrj above we want to use 'widen_sum' in the loop, but 'plus' in the
702538fd1498Szrj epilog.
702638fd1498Szrj 2. The type (mode) we use to check available target support
702738fd1498Szrj for the vector operation to be created in the *epilog*, is
702838fd1498Szrj determined by the type of the reduction variable (in the example
702938fd1498Szrj above we'd check this: optab_handler (plus_optab, vect_int_mode])).
703038fd1498Szrj However the type (mode) we use to check available target support
703138fd1498Szrj for the vector operation to be created *inside the loop*, is
703238fd1498Szrj determined by the type of the other arguments to STMT (in the
703338fd1498Szrj example we'd check this: optab_handler (widen_sum_optab,
703438fd1498Szrj vect_short_mode)).
703538fd1498Szrj
703638fd1498Szrj This is contrary to "regular" reductions, in which the types of all
703738fd1498Szrj the arguments are the same as the type of the reduction variable.
703838fd1498Szrj For "regular" reductions we can therefore use the same vector type
703938fd1498Szrj (and also the same tree-code) when generating the epilog code and
704038fd1498Szrj when generating the code inside the loop. */
704138fd1498Szrj
704238fd1498Szrj vect_reduction_type reduction_type
704338fd1498Szrj = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
704438fd1498Szrj if (orig_stmt
704538fd1498Szrj && (reduction_type == TREE_CODE_REDUCTION
704638fd1498Szrj || reduction_type == FOLD_LEFT_REDUCTION))
704738fd1498Szrj {
704838fd1498Szrj /* This is a reduction pattern: get the vectype from the type of the
704938fd1498Szrj reduction variable, and get the tree-code from orig_stmt. */
705038fd1498Szrj orig_code = gimple_assign_rhs_code (orig_stmt);
705138fd1498Szrj gcc_assert (vectype_out);
705238fd1498Szrj vec_mode = TYPE_MODE (vectype_out);
705338fd1498Szrj }
705438fd1498Szrj else
705538fd1498Szrj {
705638fd1498Szrj /* Regular reduction: use the same vectype and tree-code as used for
705738fd1498Szrj the vector code inside the loop can be used for the epilog code. */
705838fd1498Szrj orig_code = code;
705938fd1498Szrj
706038fd1498Szrj if (code == MINUS_EXPR)
706138fd1498Szrj orig_code = PLUS_EXPR;
706238fd1498Szrj
706338fd1498Szrj /* For simple condition reductions, replace with the actual expression
706438fd1498Szrj we want to base our reduction around. */
706538fd1498Szrj if (reduction_type == CONST_COND_REDUCTION)
706638fd1498Szrj {
706738fd1498Szrj orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
706838fd1498Szrj gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
706938fd1498Szrj }
707038fd1498Szrj else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
707138fd1498Szrj orig_code = cond_reduc_op_code;
707238fd1498Szrj }
707338fd1498Szrj
707438fd1498Szrj if (nested_cycle)
707538fd1498Szrj {
707638fd1498Szrj def_bb = gimple_bb (reduc_def_stmt);
707738fd1498Szrj def_stmt_loop = def_bb->loop_father;
707838fd1498Szrj def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
707938fd1498Szrj loop_preheader_edge (def_stmt_loop));
708038fd1498Szrj if (TREE_CODE (def_arg) == SSA_NAME
708138fd1498Szrj && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
708238fd1498Szrj && gimple_code (def_arg_stmt) == GIMPLE_PHI
708338fd1498Szrj && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
708438fd1498Szrj && vinfo_for_stmt (def_arg_stmt)
708538fd1498Szrj && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
708638fd1498Szrj == vect_double_reduction_def)
708738fd1498Szrj double_reduc = true;
708838fd1498Szrj }
708938fd1498Szrj
709038fd1498Szrj reduc_fn = IFN_LAST;
709138fd1498Szrj
709238fd1498Szrj if (reduction_type == TREE_CODE_REDUCTION
709338fd1498Szrj || reduction_type == FOLD_LEFT_REDUCTION
709438fd1498Szrj || reduction_type == INTEGER_INDUC_COND_REDUCTION
709538fd1498Szrj || reduction_type == CONST_COND_REDUCTION)
709638fd1498Szrj {
709738fd1498Szrj if (reduction_type == FOLD_LEFT_REDUCTION
709838fd1498Szrj ? fold_left_reduction_fn (orig_code, &reduc_fn)
709938fd1498Szrj : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
710038fd1498Szrj {
710138fd1498Szrj if (reduc_fn != IFN_LAST
710238fd1498Szrj && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
710338fd1498Szrj OPTIMIZE_FOR_SPEED))
710438fd1498Szrj {
710538fd1498Szrj if (dump_enabled_p ())
710638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
710738fd1498Szrj "reduc op not supported by target.\n");
710838fd1498Szrj
710938fd1498Szrj reduc_fn = IFN_LAST;
711038fd1498Szrj }
711138fd1498Szrj }
711238fd1498Szrj else
711338fd1498Szrj {
711438fd1498Szrj if (!nested_cycle || double_reduc)
711538fd1498Szrj {
711638fd1498Szrj if (dump_enabled_p ())
711738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
711838fd1498Szrj "no reduc code for scalar code.\n");
711938fd1498Szrj
712038fd1498Szrj return false;
712138fd1498Szrj }
712238fd1498Szrj }
712338fd1498Szrj }
712438fd1498Szrj else if (reduction_type == COND_REDUCTION)
712538fd1498Szrj {
712638fd1498Szrj int scalar_precision
712738fd1498Szrj = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
712838fd1498Szrj cr_index_scalar_type = make_unsigned_type (scalar_precision);
712938fd1498Szrj cr_index_vector_type = build_vector_type (cr_index_scalar_type,
713038fd1498Szrj nunits_out);
713138fd1498Szrj
713238fd1498Szrj if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
713338fd1498Szrj OPTIMIZE_FOR_SPEED))
713438fd1498Szrj reduc_fn = IFN_REDUC_MAX;
713538fd1498Szrj }
713638fd1498Szrj
713738fd1498Szrj if (reduction_type != EXTRACT_LAST_REDUCTION
713838fd1498Szrj && reduc_fn == IFN_LAST
713938fd1498Szrj && !nunits_out.is_constant ())
714038fd1498Szrj {
714138fd1498Szrj if (dump_enabled_p ())
714238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714338fd1498Szrj "missing target support for reduction on"
714438fd1498Szrj " variable-length vectors.\n");
714538fd1498Szrj return false;
714638fd1498Szrj }
714738fd1498Szrj
714838fd1498Szrj if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
714938fd1498Szrj && ncopies > 1)
715038fd1498Szrj {
715138fd1498Szrj if (dump_enabled_p ())
715238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
715338fd1498Szrj "multiple types in double reduction or condition "
715438fd1498Szrj "reduction.\n");
715538fd1498Szrj return false;
715638fd1498Szrj }
715738fd1498Szrj
715838fd1498Szrj /* For SLP reductions, see if there is a neutral value we can use. */
715938fd1498Szrj tree neutral_op = NULL_TREE;
716038fd1498Szrj if (slp_node)
716138fd1498Szrj neutral_op
716238fd1498Szrj = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
716338fd1498Szrj GROUP_FIRST_ELEMENT (stmt_info) != NULL);
716438fd1498Szrj
716538fd1498Szrj if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
716638fd1498Szrj {
716738fd1498Szrj /* We can't support in-order reductions of code such as this:
716838fd1498Szrj
716938fd1498Szrj for (int i = 0; i < n1; ++i)
717038fd1498Szrj for (int j = 0; j < n2; ++j)
717138fd1498Szrj l += a[j];
717238fd1498Szrj
717338fd1498Szrj since GCC effectively transforms the loop when vectorizing:
717438fd1498Szrj
717538fd1498Szrj for (int i = 0; i < n1 / VF; ++i)
717638fd1498Szrj for (int j = 0; j < n2; ++j)
717738fd1498Szrj for (int k = 0; k < VF; ++k)
717838fd1498Szrj l += a[j];
717938fd1498Szrj
718038fd1498Szrj which is a reassociation of the original operation. */
718138fd1498Szrj if (dump_enabled_p ())
718238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718338fd1498Szrj "in-order double reduction not supported.\n");
718438fd1498Szrj
718538fd1498Szrj return false;
718638fd1498Szrj }
718738fd1498Szrj
718838fd1498Szrj if (reduction_type == FOLD_LEFT_REDUCTION
718938fd1498Szrj && slp_node
719038fd1498Szrj && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
719138fd1498Szrj {
719238fd1498Szrj /* We cannot use in-order reductions in this case because there is
719338fd1498Szrj an implicit reassociation of the operations involved. */
719438fd1498Szrj if (dump_enabled_p ())
719538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719638fd1498Szrj "in-order unchained SLP reductions not supported.\n");
719738fd1498Szrj return false;
719838fd1498Szrj }
719938fd1498Szrj
720038fd1498Szrj /* For double reductions, and for SLP reductions with a neutral value,
720138fd1498Szrj we construct a variable-length initial vector by loading a vector
720238fd1498Szrj full of the neutral value and then shift-and-inserting the start
720338fd1498Szrj values into the low-numbered elements. */
720438fd1498Szrj if ((double_reduc || neutral_op)
720538fd1498Szrj && !nunits_out.is_constant ()
720638fd1498Szrj && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
720738fd1498Szrj vectype_out, OPTIMIZE_FOR_SPEED))
720838fd1498Szrj {
720938fd1498Szrj if (dump_enabled_p ())
721038fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721138fd1498Szrj "reduction on variable-length vectors requires"
721238fd1498Szrj " target support for a vector-shift-and-insert"
721338fd1498Szrj " operation.\n");
721438fd1498Szrj return false;
721538fd1498Szrj }
721638fd1498Szrj
721738fd1498Szrj /* Check extra constraints for variable-length unchained SLP reductions. */
721838fd1498Szrj if (STMT_SLP_TYPE (stmt_info)
721938fd1498Szrj && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
722038fd1498Szrj && !nunits_out.is_constant ())
722138fd1498Szrj {
722238fd1498Szrj /* We checked above that we could build the initial vector when
722338fd1498Szrj there's a neutral element value. Check here for the case in
722438fd1498Szrj which each SLP statement has its own initial value and in which
722538fd1498Szrj that value needs to be repeated for every instance of the
722638fd1498Szrj statement within the initial vector. */
722738fd1498Szrj unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
722838fd1498Szrj scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
722938fd1498Szrj if (!neutral_op
723038fd1498Szrj && !can_duplicate_and_interleave_p (group_size, elt_mode))
723138fd1498Szrj {
723238fd1498Szrj if (dump_enabled_p ())
723338fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
723438fd1498Szrj "unsupported form of SLP reduction for"
723538fd1498Szrj " variable-length vectors: cannot build"
723638fd1498Szrj " initial vector.\n");
723738fd1498Szrj return false;
723838fd1498Szrj }
723938fd1498Szrj /* The epilogue code relies on the number of elements being a multiple
724038fd1498Szrj of the group size. The duplicate-and-interleave approach to setting
724138fd1498Szrj up the initial vector does too. */
724238fd1498Szrj if (!multiple_p (nunits_out, group_size))
724338fd1498Szrj {
724438fd1498Szrj if (dump_enabled_p ())
724538fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
724638fd1498Szrj "unsupported form of SLP reduction for"
724738fd1498Szrj " variable-length vectors: the vector size"
724838fd1498Szrj " is not a multiple of the number of results.\n");
724938fd1498Szrj return false;
725038fd1498Szrj }
725138fd1498Szrj }
725238fd1498Szrj
725338fd1498Szrj /* In case of widening multiplication by a constant, we update the type
725438fd1498Szrj of the constant to be the type of the other operand. We check that the
725538fd1498Szrj constant fits the type in the pattern recognition pass. */
725638fd1498Szrj if (code == DOT_PROD_EXPR
725738fd1498Szrj && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
725838fd1498Szrj {
725938fd1498Szrj if (TREE_CODE (ops[0]) == INTEGER_CST)
726038fd1498Szrj ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
726138fd1498Szrj else if (TREE_CODE (ops[1]) == INTEGER_CST)
726238fd1498Szrj ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
726338fd1498Szrj else
726438fd1498Szrj {
726538fd1498Szrj if (dump_enabled_p ())
726638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
726738fd1498Szrj "invalid types in dot-prod\n");
726838fd1498Szrj
726938fd1498Szrj return false;
727038fd1498Szrj }
727138fd1498Szrj }
727238fd1498Szrj
727338fd1498Szrj if (reduction_type == COND_REDUCTION)
727438fd1498Szrj {
727538fd1498Szrj widest_int ni;
727638fd1498Szrj
727738fd1498Szrj if (! max_loop_iterations (loop, &ni))
727838fd1498Szrj {
727938fd1498Szrj if (dump_enabled_p ())
728038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
728138fd1498Szrj "loop count not known, cannot create cond "
728238fd1498Szrj "reduction.\n");
728338fd1498Szrj return false;
728438fd1498Szrj }
728538fd1498Szrj /* Convert backedges to iterations. */
728638fd1498Szrj ni += 1;
728738fd1498Szrj
728838fd1498Szrj /* The additional index will be the same type as the condition. Check
728938fd1498Szrj that the loop can fit into this less one (because we'll use up the
729038fd1498Szrj zero slot for when there are no matches). */
729138fd1498Szrj tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
729238fd1498Szrj if (wi::geu_p (ni, wi::to_widest (max_index)))
729338fd1498Szrj {
729438fd1498Szrj if (dump_enabled_p ())
729538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
729638fd1498Szrj "loop size is greater than data size.\n");
729738fd1498Szrj return false;
729838fd1498Szrj }
729938fd1498Szrj }
730038fd1498Szrj
730138fd1498Szrj /* In case the vectorization factor (VF) is bigger than the number
730238fd1498Szrj of elements that we can fit in a vectype (nunits), we have to generate
730338fd1498Szrj more than one vector stmt - i.e - we need to "unroll" the
730438fd1498Szrj vector stmt by a factor VF/nunits. For more details see documentation
730538fd1498Szrj in vectorizable_operation. */
730638fd1498Szrj
730738fd1498Szrj /* If the reduction is used in an outer loop we need to generate
730838fd1498Szrj VF intermediate results, like so (e.g. for ncopies=2):
730938fd1498Szrj r0 = phi (init, r0)
731038fd1498Szrj r1 = phi (init, r1)
731138fd1498Szrj r0 = x0 + r0;
731238fd1498Szrj r1 = x1 + r1;
731338fd1498Szrj (i.e. we generate VF results in 2 registers).
731438fd1498Szrj In this case we have a separate def-use cycle for each copy, and therefore
731538fd1498Szrj for each copy we get the vector def for the reduction variable from the
731638fd1498Szrj respective phi node created for this copy.
731738fd1498Szrj
731838fd1498Szrj Otherwise (the reduction is unused in the loop nest), we can combine
731938fd1498Szrj together intermediate results, like so (e.g. for ncopies=2):
732038fd1498Szrj r = phi (init, r)
732138fd1498Szrj r = x0 + r;
732238fd1498Szrj r = x1 + r;
732338fd1498Szrj (i.e. we generate VF/2 results in a single register).
732438fd1498Szrj In this case for each copy we get the vector def for the reduction variable
732538fd1498Szrj from the vectorized reduction operation generated in the previous iteration.
732638fd1498Szrj
732738fd1498Szrj This only works when we see both the reduction PHI and its only consumer
732838fd1498Szrj in vectorizable_reduction and there are no intermediate stmts
732938fd1498Szrj participating. */
733038fd1498Szrj use_operand_p use_p;
733138fd1498Szrj gimple *use_stmt;
733238fd1498Szrj if (ncopies > 1
733338fd1498Szrj && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
733438fd1498Szrj && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
733538fd1498Szrj && (use_stmt == stmt
733638fd1498Szrj || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
733738fd1498Szrj {
733838fd1498Szrj single_defuse_cycle = true;
733938fd1498Szrj epilog_copies = 1;
734038fd1498Szrj }
734138fd1498Szrj else
734238fd1498Szrj epilog_copies = ncopies;
734338fd1498Szrj
734438fd1498Szrj /* If the reduction stmt is one of the patterns that have lane
734538fd1498Szrj reduction embedded we cannot handle the case of ! single_defuse_cycle. */
734638fd1498Szrj if ((ncopies > 1
734738fd1498Szrj && ! single_defuse_cycle)
734838fd1498Szrj && (code == DOT_PROD_EXPR
734938fd1498Szrj || code == WIDEN_SUM_EXPR
735038fd1498Szrj || code == SAD_EXPR))
735138fd1498Szrj {
735238fd1498Szrj if (dump_enabled_p ())
735338fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
735438fd1498Szrj "multi def-use cycle not possible for lane-reducing "
735538fd1498Szrj "reduction operation\n");
735638fd1498Szrj return false;
735738fd1498Szrj }
735838fd1498Szrj
735938fd1498Szrj if (slp_node)
736038fd1498Szrj vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
736138fd1498Szrj else
736238fd1498Szrj vec_num = 1;
736338fd1498Szrj
736438fd1498Szrj internal_fn cond_fn = get_conditional_internal_fn (code);
736538fd1498Szrj vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
736638fd1498Szrj
736738fd1498Szrj if (!vec_stmt) /* transformation not required. */
736838fd1498Szrj {
736938fd1498Szrj if (first_p)
737038fd1498Szrj vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
737138fd1498Szrj if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
737238fd1498Szrj {
737338fd1498Szrj if (reduction_type != FOLD_LEFT_REDUCTION
737438fd1498Szrj && (cond_fn == IFN_LAST
737538fd1498Szrj || !direct_internal_fn_supported_p (cond_fn, vectype_in,
737638fd1498Szrj OPTIMIZE_FOR_SPEED)))
737738fd1498Szrj {
737838fd1498Szrj if (dump_enabled_p ())
737938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738038fd1498Szrj "can't use a fully-masked loop because no"
738138fd1498Szrj " conditional operation is available.\n");
738238fd1498Szrj LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
738338fd1498Szrj }
738438fd1498Szrj else if (reduc_index == -1)
738538fd1498Szrj {
738638fd1498Szrj if (dump_enabled_p ())
738738fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738838fd1498Szrj "can't use a fully-masked loop for chained"
738938fd1498Szrj " reductions.\n");
739038fd1498Szrj LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
739138fd1498Szrj }
739238fd1498Szrj else
739338fd1498Szrj vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
739438fd1498Szrj vectype_in);
739538fd1498Szrj }
739638fd1498Szrj if (dump_enabled_p ()
739738fd1498Szrj && reduction_type == FOLD_LEFT_REDUCTION)
739838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
739938fd1498Szrj "using an in-order (fold-left) reduction.\n");
740038fd1498Szrj STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
740138fd1498Szrj return true;
740238fd1498Szrj }
740338fd1498Szrj
740438fd1498Szrj /* Transform. */
740538fd1498Szrj
740638fd1498Szrj if (dump_enabled_p ())
740738fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
740838fd1498Szrj
740938fd1498Szrj /* FORNOW: Multiple types are not supported for condition. */
741038fd1498Szrj if (code == COND_EXPR)
741138fd1498Szrj gcc_assert (ncopies == 1);
741238fd1498Szrj
741338fd1498Szrj bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
741438fd1498Szrj
741538fd1498Szrj if (reduction_type == FOLD_LEFT_REDUCTION)
741638fd1498Szrj return vectorize_fold_left_reduction
741738fd1498Szrj (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
741838fd1498Szrj reduc_fn, ops, vectype_in, reduc_index, masks);
741938fd1498Szrj
742038fd1498Szrj if (reduction_type == EXTRACT_LAST_REDUCTION)
742138fd1498Szrj {
742238fd1498Szrj gcc_assert (!slp_node);
742338fd1498Szrj return vectorizable_condition (stmt, gsi, vec_stmt,
742438fd1498Szrj NULL, reduc_index, NULL);
742538fd1498Szrj }
742638fd1498Szrj
742738fd1498Szrj /* Create the destination vector */
742838fd1498Szrj vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
742938fd1498Szrj
743038fd1498Szrj prev_stmt_info = NULL;
743138fd1498Szrj prev_phi_info = NULL;
743238fd1498Szrj if (!slp_node)
743338fd1498Szrj {
743438fd1498Szrj vec_oprnds0.create (1);
743538fd1498Szrj vec_oprnds1.create (1);
743638fd1498Szrj if (op_type == ternary_op)
743738fd1498Szrj vec_oprnds2.create (1);
743838fd1498Szrj }
743938fd1498Szrj
744038fd1498Szrj phis.create (vec_num);
744138fd1498Szrj vect_defs.create (vec_num);
744238fd1498Szrj if (!slp_node)
744338fd1498Szrj vect_defs.quick_push (NULL_TREE);
744438fd1498Szrj
744538fd1498Szrj if (slp_node)
744638fd1498Szrj phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
744738fd1498Szrj else
744838fd1498Szrj phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
744938fd1498Szrj
745038fd1498Szrj for (j = 0; j < ncopies; j++)
745138fd1498Szrj {
745238fd1498Szrj if (code == COND_EXPR)
745338fd1498Szrj {
745438fd1498Szrj gcc_assert (!slp_node);
745538fd1498Szrj vectorizable_condition (stmt, gsi, vec_stmt,
745638fd1498Szrj PHI_RESULT (phis[0]),
745738fd1498Szrj reduc_index, NULL);
745838fd1498Szrj /* Multiple types are not supported for condition. */
745938fd1498Szrj break;
746038fd1498Szrj }
746138fd1498Szrj
746238fd1498Szrj /* Handle uses. */
746338fd1498Szrj if (j == 0)
746438fd1498Szrj {
746538fd1498Szrj if (slp_node)
746638fd1498Szrj {
746738fd1498Szrj /* Get vec defs for all the operands except the reduction index,
746838fd1498Szrj ensuring the ordering of the ops in the vector is kept. */
746938fd1498Szrj auto_vec<tree, 3> slp_ops;
747038fd1498Szrj auto_vec<vec<tree>, 3> vec_defs;
747138fd1498Szrj
747238fd1498Szrj slp_ops.quick_push (ops[0]);
747338fd1498Szrj slp_ops.quick_push (ops[1]);
747438fd1498Szrj if (op_type == ternary_op)
747538fd1498Szrj slp_ops.quick_push (ops[2]);
747638fd1498Szrj
747738fd1498Szrj vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
747838fd1498Szrj
747938fd1498Szrj vec_oprnds0.safe_splice (vec_defs[0]);
748038fd1498Szrj vec_defs[0].release ();
748138fd1498Szrj vec_oprnds1.safe_splice (vec_defs[1]);
748238fd1498Szrj vec_defs[1].release ();
748338fd1498Szrj if (op_type == ternary_op)
748438fd1498Szrj {
748538fd1498Szrj vec_oprnds2.safe_splice (vec_defs[2]);
748638fd1498Szrj vec_defs[2].release ();
748738fd1498Szrj }
748838fd1498Szrj }
748938fd1498Szrj else
749038fd1498Szrj {
749138fd1498Szrj vec_oprnds0.quick_push
749238fd1498Szrj (vect_get_vec_def_for_operand (ops[0], stmt));
749338fd1498Szrj vec_oprnds1.quick_push
749438fd1498Szrj (vect_get_vec_def_for_operand (ops[1], stmt));
749538fd1498Szrj if (op_type == ternary_op)
749638fd1498Szrj vec_oprnds2.quick_push
749738fd1498Szrj (vect_get_vec_def_for_operand (ops[2], stmt));
749838fd1498Szrj }
749938fd1498Szrj }
750038fd1498Szrj else
750138fd1498Szrj {
750238fd1498Szrj if (!slp_node)
750338fd1498Szrj {
750438fd1498Szrj gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
750538fd1498Szrj
750638fd1498Szrj if (single_defuse_cycle && reduc_index == 0)
750738fd1498Szrj vec_oprnds0[0] = gimple_get_lhs (new_stmt);
750838fd1498Szrj else
750938fd1498Szrj vec_oprnds0[0]
751038fd1498Szrj = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
751138fd1498Szrj if (single_defuse_cycle && reduc_index == 1)
751238fd1498Szrj vec_oprnds1[0] = gimple_get_lhs (new_stmt);
751338fd1498Szrj else
751438fd1498Szrj vec_oprnds1[0]
751538fd1498Szrj = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
751638fd1498Szrj if (op_type == ternary_op)
751738fd1498Szrj {
751838fd1498Szrj if (single_defuse_cycle && reduc_index == 2)
751938fd1498Szrj vec_oprnds2[0] = gimple_get_lhs (new_stmt);
752038fd1498Szrj else
752138fd1498Szrj vec_oprnds2[0]
752238fd1498Szrj = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
752338fd1498Szrj }
752438fd1498Szrj }
752538fd1498Szrj }
752638fd1498Szrj
752738fd1498Szrj FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
752838fd1498Szrj {
752938fd1498Szrj tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
753038fd1498Szrj if (masked_loop_p)
753138fd1498Szrj {
753238fd1498Szrj /* Make sure that the reduction accumulator is vop[0]. */
753338fd1498Szrj if (reduc_index == 1)
753438fd1498Szrj {
753538fd1498Szrj gcc_assert (commutative_tree_code (code));
753638fd1498Szrj std::swap (vop[0], vop[1]);
753738fd1498Szrj }
753838fd1498Szrj tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
753938fd1498Szrj vectype_in, i * ncopies + j);
754038fd1498Szrj gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
754138fd1498Szrj vop[0], vop[1]);
754238fd1498Szrj new_temp = make_ssa_name (vec_dest, call);
754338fd1498Szrj gimple_call_set_lhs (call, new_temp);
754438fd1498Szrj gimple_call_set_nothrow (call, true);
754538fd1498Szrj new_stmt = call;
754638fd1498Szrj }
754738fd1498Szrj else
754838fd1498Szrj {
754938fd1498Szrj if (op_type == ternary_op)
755038fd1498Szrj vop[2] = vec_oprnds2[i];
755138fd1498Szrj
755238fd1498Szrj new_temp = make_ssa_name (vec_dest, new_stmt);
755338fd1498Szrj new_stmt = gimple_build_assign (new_temp, code,
755438fd1498Szrj vop[0], vop[1], vop[2]);
755538fd1498Szrj }
755638fd1498Szrj vect_finish_stmt_generation (stmt, new_stmt, gsi);
755738fd1498Szrj
755838fd1498Szrj if (slp_node)
755938fd1498Szrj {
756038fd1498Szrj SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
756138fd1498Szrj vect_defs.quick_push (new_temp);
756238fd1498Szrj }
756338fd1498Szrj else
756438fd1498Szrj vect_defs[0] = new_temp;
756538fd1498Szrj }
756638fd1498Szrj
756738fd1498Szrj if (slp_node)
756838fd1498Szrj continue;
756938fd1498Szrj
757038fd1498Szrj if (j == 0)
757138fd1498Szrj STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
757238fd1498Szrj else
757338fd1498Szrj STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
757438fd1498Szrj
757538fd1498Szrj prev_stmt_info = vinfo_for_stmt (new_stmt);
757638fd1498Szrj }
757738fd1498Szrj
757838fd1498Szrj /* Finalize the reduction-phi (set its arguments) and create the
757938fd1498Szrj epilog reduction code. */
758038fd1498Szrj if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
758138fd1498Szrj vect_defs[0] = gimple_get_lhs (*vec_stmt);
758238fd1498Szrj
758338fd1498Szrj vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
758438fd1498Szrj epilog_copies, reduc_fn, phis,
758538fd1498Szrj double_reduc, slp_node, slp_node_instance,
758638fd1498Szrj cond_reduc_val, cond_reduc_op_code,
758738fd1498Szrj neutral_op);
758838fd1498Szrj
758938fd1498Szrj return true;
759038fd1498Szrj }
759138fd1498Szrj
759238fd1498Szrj /* Function vect_min_worthwhile_factor.
759338fd1498Szrj
759438fd1498Szrj For a loop where we could vectorize the operation indicated by CODE,
759538fd1498Szrj return the minimum vectorization factor that makes it worthwhile
759638fd1498Szrj to use generic vectors. */
759738fd1498Szrj static unsigned int
vect_min_worthwhile_factor(enum tree_code code)759838fd1498Szrj vect_min_worthwhile_factor (enum tree_code code)
759938fd1498Szrj {
760038fd1498Szrj switch (code)
760138fd1498Szrj {
760238fd1498Szrj case PLUS_EXPR:
760338fd1498Szrj case MINUS_EXPR:
760438fd1498Szrj case NEGATE_EXPR:
760538fd1498Szrj return 4;
760638fd1498Szrj
760738fd1498Szrj case BIT_AND_EXPR:
760838fd1498Szrj case BIT_IOR_EXPR:
760938fd1498Szrj case BIT_XOR_EXPR:
761038fd1498Szrj case BIT_NOT_EXPR:
761138fd1498Szrj return 2;
761238fd1498Szrj
761338fd1498Szrj default:
761438fd1498Szrj return INT_MAX;
761538fd1498Szrj }
761638fd1498Szrj }
761738fd1498Szrj
761838fd1498Szrj /* Return true if VINFO indicates we are doing loop vectorization and if
761938fd1498Szrj it is worth decomposing CODE operations into scalar operations for
762038fd1498Szrj that loop's vectorization factor. */
762138fd1498Szrj
762238fd1498Szrj bool
vect_worthwhile_without_simd_p(vec_info * vinfo,tree_code code)762338fd1498Szrj vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
762438fd1498Szrj {
762538fd1498Szrj loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
762638fd1498Szrj unsigned HOST_WIDE_INT value;
762738fd1498Szrj return (loop_vinfo
762838fd1498Szrj && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
762938fd1498Szrj && value >= vect_min_worthwhile_factor (code));
763038fd1498Szrj }
763138fd1498Szrj
/* Function vectorizable_induction

   Check if PHI performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   PHI       - the scalar induction phi node under consideration.
   GSI       - unused here; insertion points are derived from the loop.
   VEC_STMT  - NULL during analysis; during transformation, receives the
	       new vector phi (also recorded in STMT_VINFO_VEC_STMT).
   SLP_NODE  - the SLP node this phi belongs to, or NULL for loop-based
	       vectorization.  */

bool
vectorizable_induction (gimple *phi,
			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
			gimple **vec_stmt, slp_tree slp_node)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  struct loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree new_vec, vec_init, vec_step, t;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  tree expr;
  gimple_seq stmts;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple *exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (phi);

  if (gimple_code (phi) != GIMPLE_PHI)
    return false;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* Make sure it was recognized as induction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
    return false;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* For SLP the whole group is covered by one copy of each IV.  */
  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  gcc_assert (ncopies >= 1);

  /* FORNOW. These restrictions should be relaxed.  */
  if (nested_in_vect_loop_p (loop, phi))
    {
      /* These deliberately shadow the function-scope variables of the
	 same names; they are only used for this legality check.  */
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *exit_phi;
      edge latch_e;
      tree loop_arg;

      if (ncopies > 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "multiple types in nested loop.\n");
	  return false;
	}

      /* FORNOW: outer loop induction with SLP not supported.  */
      if (STMT_SLP_TYPE (stmt_info))
	return false;

      /* Look for a use of the latch definition outside the inner loop;
	 such a use is the loop-closed exit phi in the outer loop.  */
      exit_phi = NULL;
      latch_e = loop_latch_edge (loop->inner);
      loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_gimple_debug (use_stmt))
	    continue;

	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
	    {
	      exit_phi = use_stmt;
	      break;
	    }
	}
      if (exit_phi)
	{
	  stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
	  /* An induction used only outside the outer vectorized loop
	     is not supported: it would need the scalar final value.  */
	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "inner-loop induction only used outside "
				 "of the outer vectorized loop.\n");
	      return false;
	    }
	}

      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  if (slp_node && !nunits.is_constant ())
    {
      /* The current SLP code creates the initial value element-by-element.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "SLP induction not supported for variable-length"
			 " vectors.\n");
      return false;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "=== vectorizable_induction ===\n");
      vect_model_induction_cost (stmt_info, ncopies);
      return true;
    }

  /* Transform.  */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable.  E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S].  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  gcc_assert (step_expr != NULL_TREE);

  pe = loop_preheader_edge (iv_loop);
  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
				     loop_preheader_edge (iv_loop));

  stmts = NULL;
  if (!nested_in_vect_loop)
    {
      /* Convert the initial value to the desired type.  */
      tree new_type = TREE_TYPE (vectype);
      init_expr = gimple_convert (&stmts, new_type, init_expr);

      /* If we are using the loop mask to "peel" for alignment then we need
	 to adjust the start value here.  */
      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
      if (skip_niters != NULL_TREE)
	{
	  if (FLOAT_TYPE_P (vectype))
	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
					skip_niters);
	  else
	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
	  /* init_expr -= skip_niters * step, i.e. rewind the IV by the
	     number of skipped iterations.  */
	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
					 skip_niters, step_expr);
	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
				    init_expr, skip_step);
	}
    }

  /* Convert the step to the desired type.  */
  step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);

  if (stmts)
    {
      /* The preparation statements belong on the preheader edge; inserting
	 them must not create a new basic block.  */
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);

  /* For SLP induction we have to generate several IVs as for example
     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
     [VF*S, VF*S, VF*S, VF*S] for all.  */
  if (slp_node)
    {
      /* Enforced above.  */
      unsigned int const_nunits = nunits.to_constant ();

      /* Generate [VF*S, VF*S, ... ].  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	{
	  expr = build_int_cst (integer_type_node, vf);
	  expr = fold_convert (TREE_TYPE (step_expr), expr);
	}
      else
	expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
			      expr, step_expr);
      if (! CONSTANT_CLASS_P (new_name))
	new_name = vect_init_vector (phi, new_name,
				     TREE_TYPE (step_expr), NULL);
      new_vec = build_vector_from_val (vectype, new_name);
      vec_step = vect_init_vector (phi, new_vec, vectype, NULL);

      /* Now generate the IVs.  */
      unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      unsigned elts = const_nunits * nvects;
      unsigned nivs = least_common_multiple (group_size,
					     const_nunits) / const_nunits;
      gcc_assert (elts % group_size == 0);
      tree elt = init_expr;
      unsigned ivn;
      for (ivn = 0; ivn < nivs; ++ivn)
	{
	  /* NB: this builder shadows the element count ELTS above, which
	     is no longer needed at this point.  */
	  tree_vector_builder elts (vectype, const_nunits, 1);
	  stmts = NULL;
	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
	    {
	      /* Advance ELT by one step each time we cross a group
	         boundary within the concatenated lane sequence.  */
	      if (ivn*const_nunits + eltn >= group_size
		  && (ivn * const_nunits + eltn) % group_size == 0)
		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
				    elt, step_expr);
	      elts.quick_push (elt);
	    }
	  vec_init = gimple_build_vector (&stmts, &elts);
	  if (stmts)
	    {
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }

	  /* Create the induction-phi that defines the induction-operand.  */
	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
	  set_vinfo_for_stmt (induction_phi,
			      new_stmt_vec_info (induction_phi, loop_vinfo));
	  induc_def = PHI_RESULT (induction_phi);

	  /* Create the iv update inside the loop  */
	  vec_def = make_ssa_name (vec_dest);
	  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
	  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

	  /* Set the arguments of the phi node:  */
	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
		       UNKNOWN_LOCATION);

	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
	}

      /* Re-use IVs when we can.  */
      if (ivn < nvects)
	{
	  unsigned vfp
	    = least_common_multiple (group_size, const_nunits) / group_size;
	  /* Generate [VF'*S, VF'*S, ... ].  */
	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	    {
	      expr = build_int_cst (integer_type_node, vfp);
	      expr = fold_convert (TREE_TYPE (step_expr), expr);
	    }
	  else
	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
				  expr, step_expr);
	  if (! CONSTANT_CLASS_P (new_name))
	    new_name = vect_init_vector (phi, new_name,
					 TREE_TYPE (step_expr), NULL);
	  new_vec = build_vector_from_val (vectype, new_name);
	  vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
	  for (; ivn < nvects; ++ivn)
	    {
	      /* Each further vector is the IV NIVS positions back, advanced
		 by the adjusted step VEC_STEP.  */
	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
	      tree def;
	      if (gimple_code (iv) == GIMPLE_PHI)
		def = gimple_phi_result (iv);
	      else
		def = gimple_assign_lhs (iv);
	      new_stmt = gimple_build_assign (make_ssa_name (vectype),
					      PLUS_EXPR,
					      def, vec_step);
	      if (gimple_code (iv) == GIMPLE_PHI)
		gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
	      else
		{
		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
		  gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
		}
	      set_vinfo_for_stmt (new_stmt,
				  new_stmt_vec_info (new_stmt, loop_vinfo));
	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
	    }
	}

      return true;
    }

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
	 been created during vectorization of previous stmts.  We obtain it
	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
      vec_init = vect_get_vec_def_for_operand (init_expr, phi);
      /* If the initial value is not of proper type, convert it.  */
      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
	{
	  new_stmt
	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
							  vect_simple_var,
							  "vec_iv_"),
				   VIEW_CONVERT_EXPR,
				   build1 (VIEW_CONVERT_EXPR, vectype,
					   vec_init));
	  vec_init = gimple_assign_lhs (new_stmt);
	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
						 new_stmt);
	  gcc_assert (!new_bb);
	  set_vinfo_for_stmt (new_stmt,
			      new_stmt_vec_info (new_stmt, loop_vinfo));
	}
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      stmts = NULL;
      new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);

      unsigned HOST_WIDE_INT const_nunits;
      if (nunits.is_constant (&const_nunits))
	{
	  tree_vector_builder elts (vectype, const_nunits, 1);
	  elts.quick_push (new_name);
	  for (i = 1; i < const_nunits; i++)
	    {
	      /* Create: new_name_i = new_name + step_expr  */
	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
				       new_name, step_expr);
	      elts.quick_push (new_name);
	    }
	  /* Create a vector from [new_name_0, new_name_1, ...,
	     new_name_nunits-1]  */
	  vec_init = gimple_build_vector (&stmts, &elts);
	}
      else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
				 new_name, step_expr);
      else
	{
	  /* Build:
	       [base, base, base, ...]
	       + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
	  gcc_assert (flag_associative_math);
	  tree index = build_index_vector (vectype, 0, 1);
	  tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
							new_name);
	  tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
							step_expr);
	  vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
	  vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
				   vec_init, step_vec);
	  vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
				   vec_init, base_vec);
	}

      if (stmts)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	  gcc_assert (!new_bb);
	}
    }


  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized. Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized. Generate:
	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      gimple_seq seq = NULL;
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	{
	  expr = build_int_cst (integer_type_node, vf);
	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
	}
      else
	expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
			       expr, step_expr);
      if (seq)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
    }

  t = unshare_expr (new_name);
  gcc_assert (CONSTANT_CLASS_P (new_name)
	      || TREE_CODE (new_name) == SSA_NAME);
  new_vec = build_vector_from_val (vectype, t);
  vec_step = vect_init_vector (phi, new_vec, vectype, NULL);


  /* Create the following def-use cycle:
     loop prolog:
	 vec_init = ...
	 vec_step = ...
     loop:
	 vec_iv = PHI <vec_init, vec_loop>
	 ...
	 STMT
	 ...
	 vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
		      new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  vec_def = make_ssa_name (vec_dest);
  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      gimple_seq seq = NULL;
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	{
	  expr = build_int_cst (integer_type_node, nunits);
	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
	}
      else
	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
			       expr, step_expr);
      if (seq)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}

      t = unshare_expr (new_name);
      gcc_assert (CONSTANT_CLASS_P (new_name)
		  || TREE_CODE (new_name) == SSA_NAME);
      new_vec = build_vector_from_val (vectype, t);
      vec_step = vect_init_vector (phi, new_vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
	{
	  /* vec_i = vec_prev + vec_step  */
	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
					  vec_def, vec_step);
	  vec_def = make_ssa_name (vec_dest, new_stmt);
	  gimple_assign_set_lhs (new_stmt, vec_def);

	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
	  set_vinfo_for_stmt (new_stmt,
			      new_stmt_vec_info (new_stmt, loop_vinfo));
	  /* Chain the copies together via STMT_VINFO_RELATED_STMT so later
	     phases can walk from the first copy to the rest.  */
	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
	}
    }

  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
	 the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_gimple_debug (use_stmt))
	    continue;

	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
	    {
	      exit_phi = use_stmt;
	      break;
	    }
	}
      if (exit_phi)
	{
	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
	  /* FORNOW. Currently not supporting the case that an inner-loop induction
	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
		      && !STMT_VINFO_LIVE_P (stmt_vinfo));

	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "vector of inductions after inner-loop:");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
	    }
	}
    }


  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "transform induction: created def-use cycle: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
			SSA_NAME_DEF_STMT (vec_def), 0);
    }

  return true;
}
818738fd1498Szrj
818838fd1498Szrj /* Function vectorizable_live_operation.
818938fd1498Szrj
819038fd1498Szrj STMT computes a value that is used outside the loop. Check if
819138fd1498Szrj it can be supported. */
819238fd1498Szrj
819338fd1498Szrj bool
vectorizable_live_operation(gimple * stmt,gimple_stmt_iterator * gsi ATTRIBUTE_UNUSED,slp_tree slp_node,int slp_index,gimple ** vec_stmt)819438fd1498Szrj vectorizable_live_operation (gimple *stmt,
819538fd1498Szrj gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
819638fd1498Szrj slp_tree slp_node, int slp_index,
819738fd1498Szrj gimple **vec_stmt)
819838fd1498Szrj {
819938fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
820038fd1498Szrj loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
820138fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
820238fd1498Szrj imm_use_iterator imm_iter;
820338fd1498Szrj tree lhs, lhs_type, bitsize, vec_bitsize;
820438fd1498Szrj tree vectype = STMT_VINFO_VECTYPE (stmt_info);
820538fd1498Szrj poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
820638fd1498Szrj int ncopies;
820738fd1498Szrj gimple *use_stmt;
820838fd1498Szrj auto_vec<tree> vec_oprnds;
820938fd1498Szrj int vec_entry = 0;
821038fd1498Szrj poly_uint64 vec_index = 0;
821138fd1498Szrj
821238fd1498Szrj gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
821338fd1498Szrj
821438fd1498Szrj if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
821538fd1498Szrj return false;
821638fd1498Szrj
821738fd1498Szrj /* FORNOW. CHECKME. */
821838fd1498Szrj if (nested_in_vect_loop_p (loop, stmt))
821938fd1498Szrj return false;
822038fd1498Szrj
822138fd1498Szrj /* If STMT is not relevant and it is a simple assignment and its inputs are
822238fd1498Szrj invariant then it can remain in place, unvectorized. The original last
822338fd1498Szrj scalar value that it computes will be used. */
822438fd1498Szrj if (!STMT_VINFO_RELEVANT_P (stmt_info))
822538fd1498Szrj {
822638fd1498Szrj gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
822738fd1498Szrj if (dump_enabled_p ())
822838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
822938fd1498Szrj "statement is simple and uses invariant. Leaving in "
823038fd1498Szrj "place.\n");
823138fd1498Szrj return true;
823238fd1498Szrj }
823338fd1498Szrj
823438fd1498Szrj if (slp_node)
823538fd1498Szrj ncopies = 1;
823638fd1498Szrj else
823738fd1498Szrj ncopies = vect_get_num_copies (loop_vinfo, vectype);
823838fd1498Szrj
823938fd1498Szrj if (slp_node)
824038fd1498Szrj {
824138fd1498Szrj gcc_assert (slp_index >= 0);
824238fd1498Szrj
824338fd1498Szrj int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
824438fd1498Szrj int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
824538fd1498Szrj
824638fd1498Szrj /* Get the last occurrence of the scalar index from the concatenation of
824738fd1498Szrj all the slp vectors. Calculate which slp vector it is and the index
824838fd1498Szrj within. */
824938fd1498Szrj poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
825038fd1498Szrj
825138fd1498Szrj /* Calculate which vector contains the result, and which lane of
825238fd1498Szrj that vector we need. */
825338fd1498Szrj if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
825438fd1498Szrj {
825538fd1498Szrj if (dump_enabled_p ())
825638fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
825738fd1498Szrj "Cannot determine which vector holds the"
825838fd1498Szrj " final result.\n");
825938fd1498Szrj return false;
826038fd1498Szrj }
826138fd1498Szrj }
826238fd1498Szrj
826338fd1498Szrj if (!vec_stmt)
826438fd1498Szrj {
826538fd1498Szrj /* No transformation required. */
826638fd1498Szrj if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
826738fd1498Szrj {
826838fd1498Szrj if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
826938fd1498Szrj OPTIMIZE_FOR_SPEED))
827038fd1498Szrj {
827138fd1498Szrj if (dump_enabled_p ())
827238fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
827338fd1498Szrj "can't use a fully-masked loop because "
827438fd1498Szrj "the target doesn't support extract last "
827538fd1498Szrj "reduction.\n");
827638fd1498Szrj LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
827738fd1498Szrj }
827838fd1498Szrj else if (slp_node)
827938fd1498Szrj {
828038fd1498Szrj if (dump_enabled_p ())
828138fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
828238fd1498Szrj "can't use a fully-masked loop because an "
828338fd1498Szrj "SLP statement is live after the loop.\n");
828438fd1498Szrj LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
828538fd1498Szrj }
828638fd1498Szrj else if (ncopies > 1)
828738fd1498Szrj {
828838fd1498Szrj if (dump_enabled_p ())
828938fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
829038fd1498Szrj "can't use a fully-masked loop because"
829138fd1498Szrj " ncopies is greater than 1.\n");
829238fd1498Szrj LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
829338fd1498Szrj }
829438fd1498Szrj else
829538fd1498Szrj {
829638fd1498Szrj gcc_assert (ncopies == 1 && !slp_node);
829738fd1498Szrj vect_record_loop_mask (loop_vinfo,
829838fd1498Szrj &LOOP_VINFO_MASKS (loop_vinfo),
829938fd1498Szrj 1, vectype);
830038fd1498Szrj }
830138fd1498Szrj }
830238fd1498Szrj return true;
830338fd1498Szrj }
830438fd1498Szrj
830538fd1498Szrj /* If stmt has a related stmt, then use that for getting the lhs. */
830638fd1498Szrj if (is_pattern_stmt_p (stmt_info))
830738fd1498Szrj stmt = STMT_VINFO_RELATED_STMT (stmt_info);
830838fd1498Szrj
830938fd1498Szrj lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
831038fd1498Szrj : gimple_get_lhs (stmt);
831138fd1498Szrj lhs_type = TREE_TYPE (lhs);
831238fd1498Szrj
831338fd1498Szrj bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
831438fd1498Szrj ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
831538fd1498Szrj : TYPE_SIZE (TREE_TYPE (vectype)));
831638fd1498Szrj vec_bitsize = TYPE_SIZE (vectype);
831738fd1498Szrj
831838fd1498Szrj /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
831938fd1498Szrj tree vec_lhs, bitstart;
832038fd1498Szrj if (slp_node)
832138fd1498Szrj {
832238fd1498Szrj gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
832338fd1498Szrj
832438fd1498Szrj /* Get the correct slp vectorized stmt. */
832538fd1498Szrj gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
832638fd1498Szrj if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
832738fd1498Szrj vec_lhs = gimple_phi_result (phi);
832838fd1498Szrj else
832938fd1498Szrj vec_lhs = gimple_get_lhs (vec_stmt);
833038fd1498Szrj
833138fd1498Szrj /* Get entry to use. */
833238fd1498Szrj bitstart = bitsize_int (vec_index);
833338fd1498Szrj bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
833438fd1498Szrj }
833538fd1498Szrj else
833638fd1498Szrj {
833738fd1498Szrj enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
833838fd1498Szrj vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
833938fd1498Szrj gcc_checking_assert (ncopies == 1
834038fd1498Szrj || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
834138fd1498Szrj
834238fd1498Szrj /* For multiple copies, get the last copy. */
834338fd1498Szrj for (int i = 1; i < ncopies; ++i)
834438fd1498Szrj vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
834538fd1498Szrj vec_lhs);
834638fd1498Szrj
834738fd1498Szrj /* Get the last lane in the vector. */
834838fd1498Szrj bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
834938fd1498Szrj }
835038fd1498Szrj
835138fd1498Szrj gimple_seq stmts = NULL;
835238fd1498Szrj tree new_tree;
835338fd1498Szrj if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
835438fd1498Szrj {
835538fd1498Szrj /* Emit:
835638fd1498Szrj
835738fd1498Szrj SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
835838fd1498Szrj
835938fd1498Szrj where VEC_LHS is the vectorized live-out result and MASK is
836038fd1498Szrj the loop mask for the final iteration. */
836138fd1498Szrj gcc_assert (ncopies == 1 && !slp_node);
836238fd1498Szrj tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
836338fd1498Szrj tree scalar_res = make_ssa_name (scalar_type);
836438fd1498Szrj tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
836538fd1498Szrj 1, vectype, 0);
836638fd1498Szrj gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
836738fd1498Szrj 2, mask, vec_lhs);
836838fd1498Szrj gimple_call_set_lhs (new_stmt, scalar_res);
836938fd1498Szrj gimple_seq_add_stmt (&stmts, new_stmt);
837038fd1498Szrj
837138fd1498Szrj /* Convert the extracted vector element to the required scalar type. */
837238fd1498Szrj new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
837338fd1498Szrj }
837438fd1498Szrj else
837538fd1498Szrj {
837638fd1498Szrj tree bftype = TREE_TYPE (vectype);
837738fd1498Szrj if (VECTOR_BOOLEAN_TYPE_P (vectype))
837838fd1498Szrj bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
837938fd1498Szrj new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
838038fd1498Szrj new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
838138fd1498Szrj &stmts, true, NULL_TREE);
838238fd1498Szrj }
838338fd1498Szrj
838438fd1498Szrj if (stmts)
838538fd1498Szrj gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
838638fd1498Szrj
838738fd1498Szrj /* Replace use of lhs with newly computed result. If the use stmt is a
838838fd1498Szrj single arg PHI, just replace all uses of PHI result. It's necessary
838938fd1498Szrj because lcssa PHI defining lhs may be before newly inserted stmt. */
839038fd1498Szrj use_operand_p use_p;
839138fd1498Szrj FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
839238fd1498Szrj if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
839338fd1498Szrj && !is_gimple_debug (use_stmt))
839438fd1498Szrj {
839538fd1498Szrj if (gimple_code (use_stmt) == GIMPLE_PHI
839638fd1498Szrj && gimple_phi_num_args (use_stmt) == 1)
839738fd1498Szrj {
839838fd1498Szrj replace_uses_by (gimple_phi_result (use_stmt), new_tree);
839938fd1498Szrj }
840038fd1498Szrj else
840138fd1498Szrj {
840238fd1498Szrj FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
840338fd1498Szrj SET_USE (use_p, new_tree);
840438fd1498Szrj }
840538fd1498Szrj update_stmt (use_stmt);
840638fd1498Szrj }
840738fd1498Szrj
840838fd1498Szrj return true;
840938fd1498Szrj }
841038fd1498Szrj
841138fd1498Szrj /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
841238fd1498Szrj
841338fd1498Szrj static void
vect_loop_kill_debug_uses(struct loop * loop,gimple * stmt)841438fd1498Szrj vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
841538fd1498Szrj {
841638fd1498Szrj ssa_op_iter op_iter;
841738fd1498Szrj imm_use_iterator imm_iter;
841838fd1498Szrj def_operand_p def_p;
841938fd1498Szrj gimple *ustmt;
842038fd1498Szrj
842138fd1498Szrj FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
842238fd1498Szrj {
842338fd1498Szrj FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
842438fd1498Szrj {
842538fd1498Szrj basic_block bb;
842638fd1498Szrj
842738fd1498Szrj if (!is_gimple_debug (ustmt))
842838fd1498Szrj continue;
842938fd1498Szrj
843038fd1498Szrj bb = gimple_bb (ustmt);
843138fd1498Szrj
843238fd1498Szrj if (!flow_bb_inside_loop_p (loop, bb))
843338fd1498Szrj {
843438fd1498Szrj if (gimple_debug_bind_p (ustmt))
843538fd1498Szrj {
843638fd1498Szrj if (dump_enabled_p ())
843738fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
843838fd1498Szrj "killing debug use\n");
843938fd1498Szrj
844038fd1498Szrj gimple_debug_bind_reset_value (ustmt);
844138fd1498Szrj update_stmt (ustmt);
844238fd1498Szrj }
844338fd1498Szrj else
844438fd1498Szrj gcc_unreachable ();
844538fd1498Szrj }
844638fd1498Szrj }
844738fd1498Szrj }
844838fd1498Szrj }
844938fd1498Szrj
845038fd1498Szrj /* Given loop represented by LOOP_VINFO, return true if computation of
845138fd1498Szrj LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
845238fd1498Szrj otherwise. */
845338fd1498Szrj
845438fd1498Szrj static bool
loop_niters_no_overflow(loop_vec_info loop_vinfo)845538fd1498Szrj loop_niters_no_overflow (loop_vec_info loop_vinfo)
845638fd1498Szrj {
845738fd1498Szrj /* Constant case. */
845838fd1498Szrj if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
845938fd1498Szrj {
846038fd1498Szrj tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
846138fd1498Szrj tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
846238fd1498Szrj
846338fd1498Szrj gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
846438fd1498Szrj gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
846538fd1498Szrj if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
846638fd1498Szrj return true;
846738fd1498Szrj }
846838fd1498Szrj
846938fd1498Szrj widest_int max;
847038fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
847138fd1498Szrj /* Check the upper bound of loop niters. */
847238fd1498Szrj if (get_max_loop_iterations (loop, &max))
847338fd1498Szrj {
847438fd1498Szrj tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
847538fd1498Szrj signop sgn = TYPE_SIGN (type);
847638fd1498Szrj widest_int type_max = widest_int::from (wi::max_value (type), sgn);
847738fd1498Szrj if (max < type_max)
847838fd1498Szrj return true;
847938fd1498Szrj }
848038fd1498Szrj return false;
848138fd1498Szrj }
848238fd1498Szrj
848338fd1498Szrj /* Return a mask type with half the number of elements as TYPE. */
848438fd1498Szrj
848538fd1498Szrj tree
vect_halve_mask_nunits(tree type)848638fd1498Szrj vect_halve_mask_nunits (tree type)
848738fd1498Szrj {
848838fd1498Szrj poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
848938fd1498Szrj return build_truth_vector_type (nunits, current_vector_size);
849038fd1498Szrj }
849138fd1498Szrj
849238fd1498Szrj /* Return a mask type with twice as many elements as TYPE. */
849338fd1498Szrj
849438fd1498Szrj tree
vect_double_mask_nunits(tree type)849538fd1498Szrj vect_double_mask_nunits (tree type)
849638fd1498Szrj {
849738fd1498Szrj poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
849838fd1498Szrj return build_truth_vector_type (nunits, current_vector_size);
849938fd1498Szrj }
850038fd1498Szrj
850138fd1498Szrj /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
850238fd1498Szrj contain a sequence of NVECTORS masks that each control a vector of type
850338fd1498Szrj VECTYPE. */
850438fd1498Szrj
850538fd1498Szrj void
vect_record_loop_mask(loop_vec_info loop_vinfo,vec_loop_masks * masks,unsigned int nvectors,tree vectype)850638fd1498Szrj vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
850738fd1498Szrj unsigned int nvectors, tree vectype)
850838fd1498Szrj {
850938fd1498Szrj gcc_assert (nvectors != 0);
851038fd1498Szrj if (masks->length () < nvectors)
851138fd1498Szrj masks->safe_grow_cleared (nvectors);
851238fd1498Szrj rgroup_masks *rgm = &(*masks)[nvectors - 1];
851338fd1498Szrj /* The number of scalars per iteration and the number of vectors are
851438fd1498Szrj both compile-time constants. */
851538fd1498Szrj unsigned int nscalars_per_iter
851638fd1498Szrj = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
851738fd1498Szrj LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
851838fd1498Szrj if (rgm->max_nscalars_per_iter < nscalars_per_iter)
851938fd1498Szrj {
852038fd1498Szrj rgm->max_nscalars_per_iter = nscalars_per_iter;
852138fd1498Szrj rgm->mask_type = build_same_sized_truth_vector_type (vectype);
852238fd1498Szrj }
852338fd1498Szrj }
852438fd1498Szrj
852538fd1498Szrj /* Given a complete set of masks MASKS, extract mask number INDEX
852638fd1498Szrj for an rgroup that operates on NVECTORS vectors of type VECTYPE,
852738fd1498Szrj where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
852838fd1498Szrj
852938fd1498Szrj See the comment above vec_loop_masks for more details about the mask
853038fd1498Szrj arrangement. */
853138fd1498Szrj
853238fd1498Szrj tree
vect_get_loop_mask(gimple_stmt_iterator * gsi,vec_loop_masks * masks,unsigned int nvectors,tree vectype,unsigned int index)853338fd1498Szrj vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
853438fd1498Szrj unsigned int nvectors, tree vectype, unsigned int index)
853538fd1498Szrj {
853638fd1498Szrj rgroup_masks *rgm = &(*masks)[nvectors - 1];
853738fd1498Szrj tree mask_type = rgm->mask_type;
853838fd1498Szrj
853938fd1498Szrj /* Populate the rgroup's mask array, if this is the first time we've
854038fd1498Szrj used it. */
854138fd1498Szrj if (rgm->masks.is_empty ())
854238fd1498Szrj {
854338fd1498Szrj rgm->masks.safe_grow_cleared (nvectors);
854438fd1498Szrj for (unsigned int i = 0; i < nvectors; ++i)
854538fd1498Szrj {
854638fd1498Szrj tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
854738fd1498Szrj /* Provide a dummy definition until the real one is available. */
854838fd1498Szrj SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
854938fd1498Szrj rgm->masks[i] = mask;
855038fd1498Szrj }
855138fd1498Szrj }
855238fd1498Szrj
855338fd1498Szrj tree mask = rgm->masks[index];
855438fd1498Szrj if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
855538fd1498Szrj TYPE_VECTOR_SUBPARTS (vectype)))
855638fd1498Szrj {
855738fd1498Szrj /* A loop mask for data type X can be reused for data type Y
855838fd1498Szrj if X has N times more elements than Y and if Y's elements
855938fd1498Szrj are N times bigger than X's. In this case each sequence
856038fd1498Szrj of N elements in the loop mask will be all-zero or all-one.
856138fd1498Szrj We can then view-convert the mask so that each sequence of
856238fd1498Szrj N elements is replaced by a single element. */
856338fd1498Szrj gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
856438fd1498Szrj TYPE_VECTOR_SUBPARTS (vectype)));
856538fd1498Szrj gimple_seq seq = NULL;
856638fd1498Szrj mask_type = build_same_sized_truth_vector_type (vectype);
856738fd1498Szrj mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
856838fd1498Szrj if (seq)
856938fd1498Szrj gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
857038fd1498Szrj }
857138fd1498Szrj return mask;
857238fd1498Szrj }
857338fd1498Szrj
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor VF.  Adjusts the loop body frequencies, the exit-edge
   probability and the latch-edge probability so that the profile
   reflects the reduced iteration count.  */

static void
scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
{
  edge preheader = loop_preheader_edge (loop);
  /* Reduce loop iterations by the vectorization factor.  */
  gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
  profile_count freq_h = loop->header->count, freq_e = preheader->count ();

  if (freq_h.nonzero_p ())
    {
      profile_probability p;

      /* Avoid dropping loop body profile counter to 0 because of zero count
	 in loop's preheader.  */
      if (!(freq_e == profile_count::zero ()))
	freq_e = freq_e.force_nonzero ();
      /* New header frequency is the entry frequency times the expected
	 iteration count (+1 converts latch count to iteration count).  */
      p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
      scale_loop_frequencies (loop, p);
    }

  /* The loop now exits after about NEW_EST_NITER + 1 header executions.  */
  edge exit_e = single_exit (loop);
  exit_e->probability = profile_probability::always ()
			    .apply_scale (1, new_est_niter + 1);

  /* The latch edge takes whatever probability the exit does not.
     NOTE: the old probability must be saved before the assignment below,
     since the scaling ratio is new probability / old probability.  */
  edge exit_l = single_pred_edge (loop->latch);
  profile_probability prob = exit_l->probability;
  exit_l->probability = exit_e->probability.invert ();
  if (prob.initialized_p () && exit_l->probability.initialized_p ())
    scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
}
860738fd1498Szrj
860838fd1498Szrj /* Function vect_transform_loop.
860938fd1498Szrj
861038fd1498Szrj The analysis phase has determined that the loop is vectorizable.
861138fd1498Szrj Vectorize the loop - created vectorized stmts to replace the scalar
861238fd1498Szrj stmts in the loop, and update the loop exit condition.
861338fd1498Szrj Returns scalar epilogue loop if any. */
861438fd1498Szrj
861538fd1498Szrj struct loop *
vect_transform_loop(loop_vec_info loop_vinfo)861638fd1498Szrj vect_transform_loop (loop_vec_info loop_vinfo)
861738fd1498Szrj {
861838fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
861938fd1498Szrj struct loop *epilogue = NULL;
862038fd1498Szrj basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
862138fd1498Szrj int nbbs = loop->num_nodes;
862238fd1498Szrj int i;
862338fd1498Szrj tree niters_vector = NULL_TREE;
862438fd1498Szrj tree step_vector = NULL_TREE;
862538fd1498Szrj tree niters_vector_mult_vf = NULL_TREE;
862638fd1498Szrj poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
862738fd1498Szrj unsigned int lowest_vf = constant_lower_bound (vf);
862838fd1498Szrj bool grouped_store;
862938fd1498Szrj bool slp_scheduled = false;
863038fd1498Szrj gimple *stmt, *pattern_stmt;
863138fd1498Szrj gimple_seq pattern_def_seq = NULL;
863238fd1498Szrj gimple_stmt_iterator pattern_def_si = gsi_none ();
863338fd1498Szrj bool transform_pattern_stmt = false;
863438fd1498Szrj bool check_profitability = false;
863538fd1498Szrj unsigned int th;
863638fd1498Szrj
863738fd1498Szrj if (dump_enabled_p ())
863838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
863938fd1498Szrj
864038fd1498Szrj /* Use the more conservative vectorization threshold. If the number
864138fd1498Szrj of iterations is constant assume the cost check has been performed
864238fd1498Szrj by our caller. If the threshold makes all loops profitable that
864338fd1498Szrj run at least the (estimated) vectorization factor number of times
864438fd1498Szrj checking is pointless, too. */
864538fd1498Szrj th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
864638fd1498Szrj if (th >= vect_vf_for_cost (loop_vinfo)
864738fd1498Szrj && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
864838fd1498Szrj {
864938fd1498Szrj if (dump_enabled_p ())
865038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
865138fd1498Szrj "Profitability threshold is %d loop iterations.\n",
865238fd1498Szrj th);
865338fd1498Szrj check_profitability = true;
865438fd1498Szrj }
865538fd1498Szrj
865638fd1498Szrj /* Make sure there exists a single-predecessor exit bb. Do this before
865738fd1498Szrj versioning. */
865838fd1498Szrj edge e = single_exit (loop);
865938fd1498Szrj if (! single_pred_p (e->dest))
866038fd1498Szrj {
866138fd1498Szrj split_loop_exit_edge (e);
866238fd1498Szrj if (dump_enabled_p ())
866338fd1498Szrj dump_printf (MSG_NOTE, "split exit edge\n");
866438fd1498Szrj }
866538fd1498Szrj
866638fd1498Szrj /* Version the loop first, if required, so the profitability check
866738fd1498Szrj comes first. */
866838fd1498Szrj
866938fd1498Szrj if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
867038fd1498Szrj {
867138fd1498Szrj poly_uint64 versioning_threshold
867238fd1498Szrj = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
867338fd1498Szrj if (check_profitability
867438fd1498Szrj && ordered_p (poly_uint64 (th), versioning_threshold))
867538fd1498Szrj {
867638fd1498Szrj versioning_threshold = ordered_max (poly_uint64 (th),
867738fd1498Szrj versioning_threshold);
867838fd1498Szrj check_profitability = false;
867938fd1498Szrj }
868038fd1498Szrj vect_loop_versioning (loop_vinfo, th, check_profitability,
868138fd1498Szrj versioning_threshold);
868238fd1498Szrj check_profitability = false;
868338fd1498Szrj }
868438fd1498Szrj
868538fd1498Szrj /* Make sure there exists a single-predecessor exit bb also on the
868638fd1498Szrj scalar loop copy. Do this after versioning but before peeling
868738fd1498Szrj so CFG structure is fine for both scalar and if-converted loop
868838fd1498Szrj to make slpeel_duplicate_current_defs_from_edges face matched
868938fd1498Szrj loop closed PHI nodes on the exit. */
869038fd1498Szrj if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
869138fd1498Szrj {
869238fd1498Szrj e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
869338fd1498Szrj if (! single_pred_p (e->dest))
869438fd1498Szrj {
869538fd1498Szrj split_loop_exit_edge (e);
869638fd1498Szrj if (dump_enabled_p ())
869738fd1498Szrj dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
869838fd1498Szrj }
869938fd1498Szrj }
870038fd1498Szrj
870138fd1498Szrj tree niters = vect_build_loop_niters (loop_vinfo);
870238fd1498Szrj LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
870338fd1498Szrj tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
870438fd1498Szrj bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
870538fd1498Szrj epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
870638fd1498Szrj &step_vector, &niters_vector_mult_vf, th,
870738fd1498Szrj check_profitability, niters_no_overflow);
870838fd1498Szrj
870938fd1498Szrj if (niters_vector == NULL_TREE)
871038fd1498Szrj {
871138fd1498Szrj if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
871238fd1498Szrj && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
871338fd1498Szrj && known_eq (lowest_vf, vf))
871438fd1498Szrj {
871538fd1498Szrj niters_vector
871638fd1498Szrj = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
871738fd1498Szrj LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
871838fd1498Szrj step_vector = build_one_cst (TREE_TYPE (niters));
871938fd1498Szrj }
872038fd1498Szrj else
872138fd1498Szrj vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
872238fd1498Szrj &step_vector, niters_no_overflow);
872338fd1498Szrj }
872438fd1498Szrj
872538fd1498Szrj /* 1) Make sure the loop header has exactly two entries
872638fd1498Szrj 2) Make sure we have a preheader basic block. */
872738fd1498Szrj
872838fd1498Szrj gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
872938fd1498Szrj
873038fd1498Szrj split_edge (loop_preheader_edge (loop));
873138fd1498Szrj
873238fd1498Szrj if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
873338fd1498Szrj && vect_use_loop_mask_for_alignment_p (loop_vinfo))
873438fd1498Szrj /* This will deal with any possible peeling. */
873538fd1498Szrj vect_prepare_for_masked_peels (loop_vinfo);
873638fd1498Szrj
873738fd1498Szrj /* FORNOW: the vectorizer supports only loops which body consist
873838fd1498Szrj of one basic block (header + empty latch). When the vectorizer will
873938fd1498Szrj support more involved loop forms, the order by which the BBs are
874038fd1498Szrj traversed need to be reconsidered. */
874138fd1498Szrj
874238fd1498Szrj for (i = 0; i < nbbs; i++)
874338fd1498Szrj {
874438fd1498Szrj basic_block bb = bbs[i];
874538fd1498Szrj stmt_vec_info stmt_info;
874638fd1498Szrj
874738fd1498Szrj for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
874838fd1498Szrj gsi_next (&si))
874938fd1498Szrj {
875038fd1498Szrj gphi *phi = si.phi ();
875138fd1498Szrj if (dump_enabled_p ())
875238fd1498Szrj {
875338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
875438fd1498Szrj "------>vectorizing phi: ");
875538fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
875638fd1498Szrj }
875738fd1498Szrj stmt_info = vinfo_for_stmt (phi);
875838fd1498Szrj if (!stmt_info)
875938fd1498Szrj continue;
876038fd1498Szrj
876138fd1498Szrj if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
876238fd1498Szrj vect_loop_kill_debug_uses (loop, phi);
876338fd1498Szrj
876438fd1498Szrj if (!STMT_VINFO_RELEVANT_P (stmt_info)
876538fd1498Szrj && !STMT_VINFO_LIVE_P (stmt_info))
876638fd1498Szrj continue;
876738fd1498Szrj
876838fd1498Szrj if (STMT_VINFO_VECTYPE (stmt_info)
876938fd1498Szrj && (maybe_ne
877038fd1498Szrj (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
877138fd1498Szrj && dump_enabled_p ())
877238fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
877338fd1498Szrj
877438fd1498Szrj if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
877538fd1498Szrj || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
877638fd1498Szrj || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
877738fd1498Szrj && ! PURE_SLP_STMT (stmt_info))
877838fd1498Szrj {
877938fd1498Szrj if (dump_enabled_p ())
878038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
878138fd1498Szrj vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
878238fd1498Szrj }
878338fd1498Szrj }
878438fd1498Szrj
878538fd1498Szrj pattern_stmt = NULL;
878638fd1498Szrj for (gimple_stmt_iterator si = gsi_start_bb (bb);
878738fd1498Szrj !gsi_end_p (si) || transform_pattern_stmt;)
878838fd1498Szrj {
878938fd1498Szrj bool is_store;
879038fd1498Szrj
879138fd1498Szrj if (transform_pattern_stmt)
879238fd1498Szrj stmt = pattern_stmt;
879338fd1498Szrj else
879438fd1498Szrj {
879538fd1498Szrj stmt = gsi_stmt (si);
879638fd1498Szrj /* During vectorization remove existing clobber stmts. */
879738fd1498Szrj if (gimple_clobber_p (stmt))
879838fd1498Szrj {
879938fd1498Szrj unlink_stmt_vdef (stmt);
880038fd1498Szrj gsi_remove (&si, true);
880138fd1498Szrj release_defs (stmt);
880238fd1498Szrj continue;
880338fd1498Szrj }
880438fd1498Szrj }
880538fd1498Szrj
880638fd1498Szrj if (dump_enabled_p ())
880738fd1498Szrj {
880838fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
880938fd1498Szrj "------>vectorizing statement: ");
881038fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
881138fd1498Szrj }
881238fd1498Szrj
881338fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
881438fd1498Szrj
881538fd1498Szrj /* vector stmts created in the outer-loop during vectorization of
881638fd1498Szrj stmts in an inner-loop may not have a stmt_info, and do not
881738fd1498Szrj need to be vectorized. */
881838fd1498Szrj if (!stmt_info)
881938fd1498Szrj {
882038fd1498Szrj gsi_next (&si);
882138fd1498Szrj continue;
882238fd1498Szrj }
882338fd1498Szrj
882438fd1498Szrj if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
882538fd1498Szrj vect_loop_kill_debug_uses (loop, stmt);
882638fd1498Szrj
882738fd1498Szrj if (!STMT_VINFO_RELEVANT_P (stmt_info)
882838fd1498Szrj && !STMT_VINFO_LIVE_P (stmt_info))
882938fd1498Szrj {
883038fd1498Szrj if (STMT_VINFO_IN_PATTERN_P (stmt_info)
883138fd1498Szrj && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
883238fd1498Szrj && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
883338fd1498Szrj || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
883438fd1498Szrj {
883538fd1498Szrj stmt = pattern_stmt;
883638fd1498Szrj stmt_info = vinfo_for_stmt (stmt);
883738fd1498Szrj }
883838fd1498Szrj else
883938fd1498Szrj {
884038fd1498Szrj gsi_next (&si);
884138fd1498Szrj continue;
884238fd1498Szrj }
884338fd1498Szrj }
884438fd1498Szrj else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
884538fd1498Szrj && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
884638fd1498Szrj && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
884738fd1498Szrj || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
884838fd1498Szrj transform_pattern_stmt = true;
884938fd1498Szrj
885038fd1498Szrj /* If pattern statement has def stmts, vectorize them too. */
885138fd1498Szrj if (is_pattern_stmt_p (stmt_info))
885238fd1498Szrj {
885338fd1498Szrj if (pattern_def_seq == NULL)
885438fd1498Szrj {
885538fd1498Szrj pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
885638fd1498Szrj pattern_def_si = gsi_start (pattern_def_seq);
885738fd1498Szrj }
885838fd1498Szrj else if (!gsi_end_p (pattern_def_si))
885938fd1498Szrj gsi_next (&pattern_def_si);
886038fd1498Szrj if (pattern_def_seq != NULL)
886138fd1498Szrj {
886238fd1498Szrj gimple *pattern_def_stmt = NULL;
886338fd1498Szrj stmt_vec_info pattern_def_stmt_info = NULL;
886438fd1498Szrj
886538fd1498Szrj while (!gsi_end_p (pattern_def_si))
886638fd1498Szrj {
886738fd1498Szrj pattern_def_stmt = gsi_stmt (pattern_def_si);
886838fd1498Szrj pattern_def_stmt_info
886938fd1498Szrj = vinfo_for_stmt (pattern_def_stmt);
887038fd1498Szrj if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
887138fd1498Szrj || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
887238fd1498Szrj break;
887338fd1498Szrj gsi_next (&pattern_def_si);
887438fd1498Szrj }
887538fd1498Szrj
887638fd1498Szrj if (!gsi_end_p (pattern_def_si))
887738fd1498Szrj {
887838fd1498Szrj if (dump_enabled_p ())
887938fd1498Szrj {
888038fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
888138fd1498Szrj "==> vectorizing pattern def "
888238fd1498Szrj "stmt: ");
888338fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
888438fd1498Szrj pattern_def_stmt, 0);
888538fd1498Szrj }
888638fd1498Szrj
888738fd1498Szrj stmt = pattern_def_stmt;
888838fd1498Szrj stmt_info = pattern_def_stmt_info;
888938fd1498Szrj }
889038fd1498Szrj else
889138fd1498Szrj {
889238fd1498Szrj pattern_def_si = gsi_none ();
889338fd1498Szrj transform_pattern_stmt = false;
889438fd1498Szrj }
889538fd1498Szrj }
889638fd1498Szrj else
889738fd1498Szrj transform_pattern_stmt = false;
889838fd1498Szrj }
889938fd1498Szrj
890038fd1498Szrj if (STMT_VINFO_VECTYPE (stmt_info))
890138fd1498Szrj {
890238fd1498Szrj poly_uint64 nunits
890338fd1498Szrj = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
890438fd1498Szrj if (!STMT_SLP_TYPE (stmt_info)
890538fd1498Szrj && maybe_ne (nunits, vf)
890638fd1498Szrj && dump_enabled_p ())
890738fd1498Szrj /* For SLP VF is set according to unrolling factor, and not
890838fd1498Szrj to vector size, hence for SLP this print is not valid. */
890938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
891038fd1498Szrj }
891138fd1498Szrj
891238fd1498Szrj /* SLP. Schedule all the SLP instances when the first SLP stmt is
891338fd1498Szrj reached. */
891438fd1498Szrj if (STMT_SLP_TYPE (stmt_info))
891538fd1498Szrj {
891638fd1498Szrj if (!slp_scheduled)
891738fd1498Szrj {
891838fd1498Szrj slp_scheduled = true;
891938fd1498Szrj
892038fd1498Szrj if (dump_enabled_p ())
892138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
892238fd1498Szrj "=== scheduling SLP instances ===\n");
892338fd1498Szrj
892438fd1498Szrj vect_schedule_slp (loop_vinfo);
892538fd1498Szrj }
892638fd1498Szrj
892738fd1498Szrj /* Hybrid SLP stmts must be vectorized in addition to SLP. */
892838fd1498Szrj if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
892938fd1498Szrj {
893038fd1498Szrj if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
893138fd1498Szrj {
893238fd1498Szrj pattern_def_seq = NULL;
893338fd1498Szrj gsi_next (&si);
893438fd1498Szrj }
893538fd1498Szrj continue;
893638fd1498Szrj }
893738fd1498Szrj }
893838fd1498Szrj
893938fd1498Szrj /* -------- vectorize statement ------------ */
894038fd1498Szrj if (dump_enabled_p ())
894138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
894238fd1498Szrj
894338fd1498Szrj grouped_store = false;
894438fd1498Szrj is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
894538fd1498Szrj if (is_store)
894638fd1498Szrj {
894738fd1498Szrj if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
894838fd1498Szrj {
894938fd1498Szrj /* Interleaving. If IS_STORE is TRUE, the vectorization of the
895038fd1498Szrj interleaving chain was completed - free all the stores in
895138fd1498Szrj the chain. */
895238fd1498Szrj gsi_next (&si);
895338fd1498Szrj vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
895438fd1498Szrj }
895538fd1498Szrj else
895638fd1498Szrj {
895738fd1498Szrj /* Free the attached stmt_vec_info and remove the stmt. */
895838fd1498Szrj gimple *store = gsi_stmt (si);
895938fd1498Szrj free_stmt_vec_info (store);
896038fd1498Szrj unlink_stmt_vdef (store);
896138fd1498Szrj gsi_remove (&si, true);
896238fd1498Szrj release_defs (store);
896338fd1498Szrj }
896438fd1498Szrj
896538fd1498Szrj /* Stores can only appear at the end of pattern statements. */
896638fd1498Szrj gcc_assert (!transform_pattern_stmt);
896738fd1498Szrj pattern_def_seq = NULL;
896838fd1498Szrj }
896938fd1498Szrj else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
897038fd1498Szrj {
897138fd1498Szrj pattern_def_seq = NULL;
897238fd1498Szrj gsi_next (&si);
897338fd1498Szrj }
897438fd1498Szrj } /* stmts in BB */
897538fd1498Szrj
897638fd1498Szrj /* Stub out scalar statements that must not survive vectorization.
897738fd1498Szrj Doing this here helps with grouped statements, or statements that
897838fd1498Szrj are involved in patterns. */
897938fd1498Szrj for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
898038fd1498Szrj !gsi_end_p (gsi); gsi_next (&gsi))
898138fd1498Szrj {
898238fd1498Szrj gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
898338fd1498Szrj if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
898438fd1498Szrj {
898538fd1498Szrj tree lhs = gimple_get_lhs (call);
898638fd1498Szrj if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
898738fd1498Szrj {
898838fd1498Szrj tree zero = build_zero_cst (TREE_TYPE (lhs));
898938fd1498Szrj gimple *new_stmt = gimple_build_assign (lhs, zero);
899038fd1498Szrj gsi_replace (&gsi, new_stmt, true);
899138fd1498Szrj }
899238fd1498Szrj }
899338fd1498Szrj }
899438fd1498Szrj } /* BBs in loop */
899538fd1498Szrj
899638fd1498Szrj /* The vectorization factor is always > 1, so if we use an IV increment of 1.
899738fd1498Szrj a zero NITERS becomes a nonzero NITERS_VECTOR. */
899838fd1498Szrj if (integer_onep (step_vector))
899938fd1498Szrj niters_no_overflow = true;
900038fd1498Szrj vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
900138fd1498Szrj niters_vector_mult_vf, !niters_no_overflow);
900238fd1498Szrj
900338fd1498Szrj unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
900438fd1498Szrj scale_profile_for_vect_loop (loop, assumed_vf);
900538fd1498Szrj
900638fd1498Szrj /* True if the final iteration might not handle a full vector's
900738fd1498Szrj worth of scalar iterations. */
900838fd1498Szrj bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
900938fd1498Szrj /* The minimum number of iterations performed by the epilogue. This
901038fd1498Szrj is 1 when peeling for gaps because we always need a final scalar
901138fd1498Szrj iteration. */
901238fd1498Szrj int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
901338fd1498Szrj /* +1 to convert latch counts to loop iteration counts,
901438fd1498Szrj -min_epilogue_iters to remove iterations that cannot be performed
901538fd1498Szrj by the vector code. */
901638fd1498Szrj int bias_for_lowest = 1 - min_epilogue_iters;
901738fd1498Szrj int bias_for_assumed = bias_for_lowest;
901838fd1498Szrj int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
901938fd1498Szrj if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
902038fd1498Szrj {
902138fd1498Szrj /* When the amount of peeling is known at compile time, the first
902238fd1498Szrj iteration will have exactly alignment_npeels active elements.
902338fd1498Szrj In the worst case it will have at least one. */
902438fd1498Szrj int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
902538fd1498Szrj bias_for_lowest += lowest_vf - min_first_active;
902638fd1498Szrj bias_for_assumed += assumed_vf - min_first_active;
902738fd1498Szrj }
902838fd1498Szrj /* In these calculations the "- 1" converts loop iteration counts
902938fd1498Szrj back to latch counts. */
903038fd1498Szrj if (loop->any_upper_bound)
903138fd1498Szrj loop->nb_iterations_upper_bound
903238fd1498Szrj = (final_iter_may_be_partial
903338fd1498Szrj ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
903438fd1498Szrj lowest_vf) - 1
903538fd1498Szrj : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
903638fd1498Szrj lowest_vf) - 1);
903738fd1498Szrj if (loop->any_likely_upper_bound)
903838fd1498Szrj loop->nb_iterations_likely_upper_bound
903938fd1498Szrj = (final_iter_may_be_partial
904038fd1498Szrj ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
904138fd1498Szrj + bias_for_lowest, lowest_vf) - 1
904238fd1498Szrj : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
904338fd1498Szrj + bias_for_lowest, lowest_vf) - 1);
904438fd1498Szrj if (loop->any_estimate)
904538fd1498Szrj loop->nb_iterations_estimate
904638fd1498Szrj = (final_iter_may_be_partial
904738fd1498Szrj ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
904838fd1498Szrj assumed_vf) - 1
904938fd1498Szrj : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
905038fd1498Szrj assumed_vf) - 1);
905138fd1498Szrj
905238fd1498Szrj if (dump_enabled_p ())
905338fd1498Szrj {
905438fd1498Szrj if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
905538fd1498Szrj {
905638fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
905738fd1498Szrj "LOOP VECTORIZED\n");
905838fd1498Szrj if (loop->inner)
905938fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
906038fd1498Szrj "OUTER LOOP VECTORIZED\n");
906138fd1498Szrj dump_printf (MSG_NOTE, "\n");
906238fd1498Szrj }
906338fd1498Szrj else
906438fd1498Szrj {
906538fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
906638fd1498Szrj "LOOP EPILOGUE VECTORIZED (VS=");
906738fd1498Szrj dump_dec (MSG_NOTE, current_vector_size);
906838fd1498Szrj dump_printf (MSG_NOTE, ")\n");
906938fd1498Szrj }
907038fd1498Szrj }
907138fd1498Szrj
907238fd1498Szrj /* Free SLP instances here because otherwise stmt reference counting
907338fd1498Szrj won't work. */
907438fd1498Szrj slp_instance instance;
907538fd1498Szrj FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
907638fd1498Szrj vect_free_slp_instance (instance);
907738fd1498Szrj LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
907838fd1498Szrj /* Clear-up safelen field since its value is invalid after vectorization
907938fd1498Szrj since vectorized loop can have loop-carried dependencies. */
908038fd1498Szrj loop->safelen = 0;
908138fd1498Szrj
908238fd1498Szrj /* Don't vectorize epilogue for epilogue. */
908338fd1498Szrj if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
908438fd1498Szrj epilogue = NULL;
908538fd1498Szrj
908638fd1498Szrj if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
908738fd1498Szrj epilogue = NULL;
908838fd1498Szrj
908938fd1498Szrj if (epilogue)
909038fd1498Szrj {
909138fd1498Szrj auto_vector_sizes vector_sizes;
909238fd1498Szrj targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
909338fd1498Szrj unsigned int next_size = 0;
909438fd1498Szrj
909538fd1498Szrj if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
909638fd1498Szrj && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
909738fd1498Szrj && known_eq (vf, lowest_vf))
909838fd1498Szrj {
909938fd1498Szrj unsigned int eiters
910038fd1498Szrj = (LOOP_VINFO_INT_NITERS (loop_vinfo)
910138fd1498Szrj - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
910238fd1498Szrj eiters = eiters % lowest_vf;
910338fd1498Szrj epilogue->nb_iterations_upper_bound = eiters - 1;
910438fd1498Szrj
910538fd1498Szrj unsigned int ratio;
910638fd1498Szrj while (next_size < vector_sizes.length ()
910738fd1498Szrj && !(constant_multiple_p (current_vector_size,
910838fd1498Szrj vector_sizes[next_size], &ratio)
910938fd1498Szrj && eiters >= lowest_vf / ratio))
911038fd1498Szrj next_size += 1;
911138fd1498Szrj }
911238fd1498Szrj else
911338fd1498Szrj while (next_size < vector_sizes.length ()
911438fd1498Szrj && maybe_lt (current_vector_size, vector_sizes[next_size]))
911538fd1498Szrj next_size += 1;
911638fd1498Szrj
911738fd1498Szrj if (next_size == vector_sizes.length ())
911838fd1498Szrj epilogue = NULL;
911938fd1498Szrj }
912038fd1498Szrj
912138fd1498Szrj if (epilogue)
912238fd1498Szrj {
912338fd1498Szrj epilogue->force_vectorize = loop->force_vectorize;
912438fd1498Szrj epilogue->safelen = loop->safelen;
912538fd1498Szrj epilogue->dont_vectorize = false;
912638fd1498Szrj
912738fd1498Szrj /* We may need to if-convert epilogue to vectorize it. */
912838fd1498Szrj if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
912938fd1498Szrj tree_if_conversion (epilogue);
913038fd1498Szrj }
913138fd1498Szrj
913238fd1498Szrj return epilogue;
913338fd1498Szrj }
913438fd1498Szrj
913538fd1498Szrj /* The code below performs a simple optimization - it reverts
913638fd1498Szrj if-conversion for masked stores: if the mask of a store is zero,
913738fd1498Szrj skip the store and, where possible, the statements producing the stored values.
913838fd1498Szrj For example,
913938fd1498Szrj for (i=0; i<n; i++)
914038fd1498Szrj if (c[i])
914138fd1498Szrj {
914238fd1498Szrj p1[i] += 1;
914338fd1498Szrj p2[i] = p3[i] +2;
914438fd1498Szrj }
914538fd1498Szrj this transformation will produce the following semi-hammock:
914638fd1498Szrj
914738fd1498Szrj if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
914838fd1498Szrj {
914938fd1498Szrj vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
915038fd1498Szrj vect__12.22_172 = vect__11.19_170 + vect_cst__171;
915138fd1498Szrj MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
915238fd1498Szrj vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
915338fd1498Szrj vect__19.28_184 = vect__18.25_182 + vect_cst__183;
915438fd1498Szrj MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
915538fd1498Szrj }
915638fd1498Szrj */
915738fd1498Szrj
915838fd1498Szrj void
optimize_mask_stores(struct loop * loop)915938fd1498Szrj optimize_mask_stores (struct loop *loop)
916038fd1498Szrj {
  /* Sink IFN_MASK_STORE calls in LOOP into new conditionally-executed
     blocks guarded by a "mask == 0" test, so that a group of masked
     stores sharing one mask (and, where legal, the statements computing
     the stored values) is bypassed entirely when the mask is all-zero.  */
916138fd1498Szrj basic_block *bbs = get_loop_body (loop);
916238fd1498Szrj unsigned nbbs = loop->num_nodes;
916338fd1498Szrj unsigned i;
916438fd1498Szrj basic_block bb;
916538fd1498Szrj struct loop *bb_loop;
916638fd1498Szrj gimple_stmt_iterator gsi;
916738fd1498Szrj gimple *stmt;
916838fd1498Szrj auto_vec<gimple *> worklist;
916938fd1498Szrj
917038fd1498Szrj vect_location = find_loop_location (loop);
917138fd1498Szrj /* Pick up all masked stores in loop if any.  Stores are pushed in
   block/statement order, so worklist.pop () below yields them in
   reverse (latest first).  */
917238fd1498Szrj for (i = 0; i < nbbs; i++)
917338fd1498Szrj {
917438fd1498Szrj bb = bbs[i];
917538fd1498Szrj for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
917638fd1498Szrj gsi_next (&gsi))
917738fd1498Szrj {
917838fd1498Szrj stmt = gsi_stmt (gsi);
917938fd1498Szrj if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
918038fd1498Szrj worklist.safe_push (stmt);
918138fd1498Szrj }
918238fd1498Szrj }
918338fd1498Szrj
918438fd1498Szrj free (bbs);
918538fd1498Szrj if (worklist.is_empty ())
918638fd1498Szrj return;
918738fd1498Szrj
918838fd1498Szrj /* Loop has masked stores.  Each outer iteration handles one group of
   stores that share the same mask.  */
918938fd1498Szrj while (!worklist.is_empty ())
919038fd1498Szrj {
919138fd1498Szrj gimple *last, *last_store;
919238fd1498Szrj edge e, efalse;
919338fd1498Szrj tree mask;
919438fd1498Szrj basic_block store_bb, join_bb;
919538fd1498Szrj gimple_stmt_iterator gsi_to;
919638fd1498Szrj tree vdef, new_vdef;
919738fd1498Szrj gphi *phi;
919838fd1498Szrj tree vectype;
919938fd1498Szrj tree zero;
920038fd1498Szrj
920138fd1498Szrj last = worklist.pop ();
  /* Argument 2 of an IFN_MASK_STORE call is the mask operand.  */
920238fd1498Szrj mask = gimple_call_arg (last, 2);
920338fd1498Szrj bb = gimple_bb (last);
920438fd1498Szrj /* Create then_bb and if-then structure in CFG, then_bb belongs to
920538fd1498Szrj the same loop as if_bb. It could be different to LOOP when two
920638fd1498Szrj level loop-nest is vectorized and mask_store belongs to the inner
920738fd1498Szrj one. */
920838fd1498Szrj e = split_block (bb, last);
920938fd1498Szrj bb_loop = bb->loop_father;
921038fd1498Szrj gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
921138fd1498Szrj join_bb = e->dest;
921238fd1498Szrj store_bb = create_empty_bb (bb);
921338fd1498Szrj add_bb_to_loop (store_bb, bb_loop);
  /* E becomes the TRUE edge of the "mask == 0" test built below: it
     bypasses STORE_BB and goes straight to JOIN_BB.  EFALSE is the
     edge that enters STORE_BB and executes the sunk stores.  */
921438fd1498Szrj e->flags = EDGE_TRUE_VALUE;
921538fd1498Szrj efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
921638fd1498Szrj /* Put STORE_BB to likely part. */
  /* NOTE(review): EFALSE is the edge leading into STORE_BB, yet it is
     given an unlikely probability, which appears to contradict the
     comment above — confirm the intended branch profile.  */
921738fd1498Szrj efalse->probability = profile_probability::unlikely ();
921838fd1498Szrj store_bb->count = efalse->count ();
921938fd1498Szrj make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
922038fd1498Szrj if (dom_info_available_p (CDI_DOMINATORS))
922138fd1498Szrj set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
922238fd1498Szrj if (dump_enabled_p ())
922338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
922438fd1498Szrj "Create new block %d to sink mask stores.",
922538fd1498Szrj store_bb->index);
922638fd1498Szrj /* Create vector comparison with boolean result. */
922738fd1498Szrj vectype = TREE_TYPE (mask);
922838fd1498Szrj zero = build_zero_cst (vectype);
922938fd1498Szrj stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
923038fd1498Szrj gsi = gsi_last_bb (bb);
923138fd1498Szrj gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
923238fd1498Szrj /* Create new PHI node for vdef of the last masked store:
923338fd1498Szrj .MEM_2 = VDEF <.MEM_1>
923438fd1498Szrj will be converted to
923538fd1498Szrj .MEM.3 = VDEF <.MEM_1>
923638fd1498Szrj and new PHI node will be created in join bb
923738fd1498Szrj .MEM_2 = PHI <.MEM_1, .MEM_3>
923838fd1498Szrj */
923938fd1498Szrj vdef = gimple_vdef (last);
924038fd1498Szrj new_vdef = make_ssa_name (gimple_vop (cfun), last);
924138fd1498Szrj gimple_set_vdef (last, new_vdef);
  /* Reuse the old vdef name as the PHI result so downstream vuses in
     JOIN_BB and beyond need no rewriting.  The STORE_BB argument is
     added now; the bypass-edge argument is added after the whole group
     has been sunk (see add_phi_arg at the bottom).  */
924238fd1498Szrj phi = create_phi_node (vdef, join_bb);
924338fd1498Szrj add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
924438fd1498Szrj
924538fd1498Szrj /* Put all masked stores with the same mask to STORE_BB if possible. */
924638fd1498Szrj while (true)
924738fd1498Szrj {
924838fd1498Szrj gimple_stmt_iterator gsi_from;
924938fd1498Szrj gimple *stmt1 = NULL;
925038fd1498Szrj
925138fd1498Szrj /* Move masked store to STORE_BB. */
925238fd1498Szrj last_store = last;
925338fd1498Szrj gsi = gsi_for_stmt (last);
925438fd1498Szrj gsi_from = gsi;
925538fd1498Szrj /* Shift GSI to the previous stmt for further traversal. */
925638fd1498Szrj gsi_prev (&gsi);
925738fd1498Szrj gsi_to = gsi_start_bb (store_bb);
925838fd1498Szrj gsi_move_before (&gsi_from, &gsi_to);
925938fd1498Szrj /* Setup GSI_TO to the non-empty block start. */
926038fd1498Szrj gsi_to = gsi_start_bb (store_bb);
926138fd1498Szrj if (dump_enabled_p ())
926238fd1498Szrj {
926338fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
926438fd1498Szrj "Move stmt to created bb\n");
926538fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
926638fd1498Szrj }
926738fd1498Szrj /* Move all stored value producers if possible.  Walk backwards
   from the (already moved) store; each producer that is safe to
   move is inserted at the top of STORE_BB, preserving order.  */
926838fd1498Szrj while (!gsi_end_p (gsi))
926938fd1498Szrj {
927038fd1498Szrj tree lhs;
927138fd1498Szrj imm_use_iterator imm_iter;
927238fd1498Szrj use_operand_p use_p;
927338fd1498Szrj bool res;
927438fd1498Szrj
927538fd1498Szrj /* Skip debug statements. */
927638fd1498Szrj if (is_gimple_debug (gsi_stmt (gsi)))
927738fd1498Szrj {
927838fd1498Szrj gsi_prev (&gsi);
927938fd1498Szrj continue;
928038fd1498Szrj }
928138fd1498Szrj stmt1 = gsi_stmt (gsi);
928238fd1498Szrj /* Do not consider statements writing to memory or having
928338fd1498Szrj volatile operand. */
928438fd1498Szrj if (gimple_vdef (stmt1)
928538fd1498Szrj || gimple_has_volatile_ops (stmt1))
928638fd1498Szrj break;
928738fd1498Szrj gsi_from = gsi;
928838fd1498Szrj gsi_prev (&gsi);
928938fd1498Szrj lhs = gimple_get_lhs (stmt1);
929038fd1498Szrj if (!lhs)
929138fd1498Szrj break;
929238fd1498Szrj
929338fd1498Szrj /* LHS of vectorized stmt must be SSA_NAME. */
929438fd1498Szrj if (TREE_CODE (lhs) != SSA_NAME)
929538fd1498Szrj break;
929638fd1498Szrj
  /* Scalar (non-vector) results are not sunk: delete them when
     dead, otherwise fall through to the use checks below.  */
929738fd1498Szrj if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
929838fd1498Szrj {
929938fd1498Szrj /* Remove dead scalar statement. */
930038fd1498Szrj if (has_zero_uses (lhs))
930138fd1498Szrj {
930238fd1498Szrj gsi_remove (&gsi_from, true);
930338fd1498Szrj continue;
930438fd1498Szrj }
930538fd1498Szrj }
930638fd1498Szrj
930738fd1498Szrj /* Check that LHS does not have uses outside of STORE_BB. */
930838fd1498Szrj res = true;
930938fd1498Szrj FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
931038fd1498Szrj {
931138fd1498Szrj gimple *use_stmt;
931238fd1498Szrj use_stmt = USE_STMT (use_p);
931338fd1498Szrj if (is_gimple_debug (use_stmt))
931438fd1498Szrj continue;
931538fd1498Szrj if (gimple_bb (use_stmt) != store_bb)
931638fd1498Szrj {
931738fd1498Szrj res = false;
931838fd1498Szrj break;
931938fd1498Szrj }
932038fd1498Szrj }
932138fd1498Szrj if (!res)
932238fd1498Szrj break;
932338fd1498Szrj
  /* A loading statement may only be sunk if it reads the same
     memory state the store group consumes.  */
932438fd1498Szrj if (gimple_vuse (stmt1)
932538fd1498Szrj && gimple_vuse (stmt1) != gimple_vuse (last_store))
932638fd1498Szrj break;
932738fd1498Szrj
932838fd1498Szrj /* Can move STMT1 to STORE_BB. */
932938fd1498Szrj if (dump_enabled_p ())
933038fd1498Szrj {
933138fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
933238fd1498Szrj "Move stmt to created bb\n");
933338fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
933438fd1498Szrj }
933538fd1498Szrj gsi_move_before (&gsi_from, &gsi_to);
933638fd1498Szrj /* Shift GSI_TO for further insertion. */
933738fd1498Szrj gsi_prev (&gsi_to);
933838fd1498Szrj }
933938fd1498Szrj /* Put other masked stores with the same mask to STORE_BB.
   Continue only when the next worklist entry has the same mask and
   is exactly the statement where the backward scan stopped (STMT1),
   i.e. it immediately precedes the group already moved.  */
934038fd1498Szrj if (worklist.is_empty ()
934138fd1498Szrj || gimple_call_arg (worklist.last (), 2) != mask
934238fd1498Szrj || worklist.last () != stmt1)
934338fd1498Szrj break;
934438fd1498Szrj last = worklist.pop ();
934538fd1498Szrj }
  /* Complete the memory PHI: on the bypass edge E the state reaching
     JOIN_BB is the group's incoming memory state, i.e. the vuse of
     LAST_STORE (the earliest store sunk, since the walk went
     backwards).  */
934638fd1498Szrj add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
934738fd1498Szrj }
934838fd1498Szrj }
9349