1*38fd1498Szrj /* Vectorizer Specific Loop Manipulations
2*38fd1498Szrj Copyright (C) 2003-2018 Free Software Foundation, Inc.
3*38fd1498Szrj Contributed by Dorit Naishlos <dorit@il.ibm.com>
4*38fd1498Szrj and Ira Rosen <irar@il.ibm.com>
5*38fd1498Szrj
6*38fd1498Szrj This file is part of GCC.
7*38fd1498Szrj
8*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
9*38fd1498Szrj the terms of the GNU General Public License as published by the Free
10*38fd1498Szrj Software Foundation; either version 3, or (at your option) any later
11*38fd1498Szrj version.
12*38fd1498Szrj
13*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14*38fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
15*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16*38fd1498Szrj for more details.
17*38fd1498Szrj
18*38fd1498Szrj You should have received a copy of the GNU General Public License
19*38fd1498Szrj along with GCC; see the file COPYING3. If not see
20*38fd1498Szrj <http://www.gnu.org/licenses/>. */
21*38fd1498Szrj
22*38fd1498Szrj #include "config.h"
23*38fd1498Szrj #include "system.h"
24*38fd1498Szrj #include "coretypes.h"
25*38fd1498Szrj #include "backend.h"
26*38fd1498Szrj #include "tree.h"
27*38fd1498Szrj #include "gimple.h"
28*38fd1498Szrj #include "cfghooks.h"
29*38fd1498Szrj #include "tree-pass.h"
30*38fd1498Szrj #include "ssa.h"
31*38fd1498Szrj #include "fold-const.h"
32*38fd1498Szrj #include "cfganal.h"
33*38fd1498Szrj #include "gimplify.h"
34*38fd1498Szrj #include "gimple-iterator.h"
35*38fd1498Szrj #include "gimplify-me.h"
36*38fd1498Szrj #include "tree-cfg.h"
37*38fd1498Szrj #include "tree-ssa-loop-manip.h"
38*38fd1498Szrj #include "tree-into-ssa.h"
39*38fd1498Szrj #include "tree-ssa.h"
40*38fd1498Szrj #include "cfgloop.h"
41*38fd1498Szrj #include "tree-scalar-evolution.h"
42*38fd1498Szrj #include "tree-vectorizer.h"
43*38fd1498Szrj #include "tree-ssa-loop-ivopts.h"
44*38fd1498Szrj #include "gimple-fold.h"
45*38fd1498Szrj #include "tree-ssa-loop-niter.h"
46*38fd1498Szrj #include "internal-fn.h"
47*38fd1498Szrj #include "stor-layout.h"
48*38fd1498Szrj #include "optabs-query.h"
49*38fd1498Szrj #include "vec-perm-indices.h"
50*38fd1498Szrj
51*38fd1498Szrj /*************************************************************************
52*38fd1498Szrj Simple Loop Peeling Utilities
53*38fd1498Szrj
54*38fd1498Szrj Utilities to support loop peeling for vectorization purposes.
55*38fd1498Szrj *************************************************************************/
56*38fd1498Szrj
57*38fd1498Szrj
58*38fd1498Szrj /* Renames the use *OP_P. */
59*38fd1498Szrj
60*38fd1498Szrj static void
rename_use_op(use_operand_p op_p)61*38fd1498Szrj rename_use_op (use_operand_p op_p)
62*38fd1498Szrj {
63*38fd1498Szrj tree new_name;
64*38fd1498Szrj
65*38fd1498Szrj if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66*38fd1498Szrj return;
67*38fd1498Szrj
68*38fd1498Szrj new_name = get_current_def (USE_FROM_PTR (op_p));
69*38fd1498Szrj
70*38fd1498Szrj /* Something defined outside of the loop. */
71*38fd1498Szrj if (!new_name)
72*38fd1498Szrj return;
73*38fd1498Szrj
74*38fd1498Szrj /* An ordinary ssa name defined in the loop. */
75*38fd1498Szrj
76*38fd1498Szrj SET_USE (op_p, new_name);
77*38fd1498Szrj }
78*38fd1498Szrj
79*38fd1498Szrj
80*38fd1498Szrj /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81*38fd1498Szrj on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82*38fd1498Szrj true. */
83*38fd1498Szrj
84*38fd1498Szrj static void
rename_variables_in_bb(basic_block bb,bool rename_from_outer_loop)85*38fd1498Szrj rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86*38fd1498Szrj {
87*38fd1498Szrj gimple *stmt;
88*38fd1498Szrj use_operand_p use_p;
89*38fd1498Szrj ssa_op_iter iter;
90*38fd1498Szrj edge e;
91*38fd1498Szrj edge_iterator ei;
92*38fd1498Szrj struct loop *loop = bb->loop_father;
93*38fd1498Szrj struct loop *outer_loop = NULL;
94*38fd1498Szrj
95*38fd1498Szrj if (rename_from_outer_loop)
96*38fd1498Szrj {
97*38fd1498Szrj gcc_assert (loop);
98*38fd1498Szrj outer_loop = loop_outer (loop);
99*38fd1498Szrj }
100*38fd1498Szrj
101*38fd1498Szrj for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102*38fd1498Szrj gsi_next (&gsi))
103*38fd1498Szrj {
104*38fd1498Szrj stmt = gsi_stmt (gsi);
105*38fd1498Szrj FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106*38fd1498Szrj rename_use_op (use_p);
107*38fd1498Szrj }
108*38fd1498Szrj
109*38fd1498Szrj FOR_EACH_EDGE (e, ei, bb->preds)
110*38fd1498Szrj {
111*38fd1498Szrj if (!flow_bb_inside_loop_p (loop, e->src))
112*38fd1498Szrj {
113*38fd1498Szrj if (!rename_from_outer_loop)
114*38fd1498Szrj continue;
115*38fd1498Szrj if (e->src != outer_loop->header)
116*38fd1498Szrj {
117*38fd1498Szrj if (outer_loop->inner->next)
118*38fd1498Szrj {
119*38fd1498Szrj /* If outer_loop has 2 inner loops, allow there to
120*38fd1498Szrj be an extra basic block which decides which of the
121*38fd1498Szrj two loops to use using LOOP_VECTORIZED. */
122*38fd1498Szrj if (!single_pred_p (e->src)
123*38fd1498Szrj || single_pred (e->src) != outer_loop->header)
124*38fd1498Szrj continue;
125*38fd1498Szrj }
126*38fd1498Szrj }
127*38fd1498Szrj }
128*38fd1498Szrj for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129*38fd1498Szrj gsi_next (&gsi))
130*38fd1498Szrj rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131*38fd1498Szrj }
132*38fd1498Szrj }
133*38fd1498Szrj
134*38fd1498Szrj
/* One pending debug-stmt fixup: occurrences of FROM that are dominated
   by BB (but not by FROM's definition) should be rewritten to the
   loop-closed name TO.  See adjust_debug_stmts_now.  */
struct adjust_info
{
  tree from, to;	/* Old SSA name and its loop-closed replacement.  */
  basic_block bb;	/* Block of the PHI that defines TO.  */
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;
148*38fd1498Szrj
/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  /* Block containing the (non-loop-closed) definition of ORIG_DEF.  */
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      /* Non-debug uses are left alone; only debug binds are fixed up.  */
      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      /* The use must be dominated by the block of the replacement PHI
	 but not by the original definition, otherwise ORIG_DEF is
	 still the dominant (and correct) value at the use.  */
      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    /* Substitute the loop-closed name in every use operand of
	       this debug bind.  */
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      /* No replacement is available: drop the bound value so
		 the debug info doesn't claim a stale location.  */
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}
195*38fd1498Szrj
196*38fd1498Szrj /* Adjust debug stmts as scheduled before. */
197*38fd1498Szrj
198*38fd1498Szrj static void
adjust_vec_debug_stmts(void)199*38fd1498Szrj adjust_vec_debug_stmts (void)
200*38fd1498Szrj {
201*38fd1498Szrj if (!MAY_HAVE_DEBUG_BIND_STMTS)
202*38fd1498Szrj return;
203*38fd1498Szrj
204*38fd1498Szrj gcc_assert (adjust_vec.exists ());
205*38fd1498Szrj
206*38fd1498Szrj while (!adjust_vec.is_empty ())
207*38fd1498Szrj {
208*38fd1498Szrj adjust_debug_stmts_now (&adjust_vec.last ());
209*38fd1498Szrj adjust_vec.pop ();
210*38fd1498Szrj }
211*38fd1498Szrj }
212*38fd1498Szrj
213*38fd1498Szrj /* Adjust any debug stmts that referenced FROM values to use the
214*38fd1498Szrj loop-closed TO, if the references are dominated by BB and not by
215*38fd1498Szrj the definition of FROM. If adjust_vec is non-NULL, adjustments
216*38fd1498Szrj will be postponed until adjust_vec_debug_stmts is called. */
217*38fd1498Szrj
218*38fd1498Szrj static void
adjust_debug_stmts(tree from,tree to,basic_block bb)219*38fd1498Szrj adjust_debug_stmts (tree from, tree to, basic_block bb)
220*38fd1498Szrj {
221*38fd1498Szrj adjust_info ai;
222*38fd1498Szrj
223*38fd1498Szrj if (MAY_HAVE_DEBUG_BIND_STMTS
224*38fd1498Szrj && TREE_CODE (from) == SSA_NAME
225*38fd1498Szrj && ! SSA_NAME_IS_DEFAULT_DEF (from)
226*38fd1498Szrj && ! virtual_operand_p (from))
227*38fd1498Szrj {
228*38fd1498Szrj ai.from = from;
229*38fd1498Szrj ai.to = to;
230*38fd1498Szrj ai.bb = bb;
231*38fd1498Szrj
232*38fd1498Szrj if (adjust_vec.exists ())
233*38fd1498Szrj adjust_vec.safe_push (ai);
234*38fd1498Szrj else
235*38fd1498Szrj adjust_debug_stmts_now (&ai);
236*38fd1498Szrj }
237*38fd1498Szrj }
238*38fd1498Szrj
239*38fd1498Szrj /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240*38fd1498Szrj to adjust any debug stmts that referenced the old phi arg,
241*38fd1498Szrj presumably non-loop-closed references left over from other
242*38fd1498Szrj transformations. */
243*38fd1498Szrj
244*38fd1498Szrj static void
adjust_phi_and_debug_stmts(gimple * update_phi,edge e,tree new_def)245*38fd1498Szrj adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246*38fd1498Szrj {
247*38fd1498Szrj tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248*38fd1498Szrj
249*38fd1498Szrj SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250*38fd1498Szrj
251*38fd1498Szrj if (MAY_HAVE_DEBUG_BIND_STMTS)
252*38fd1498Szrj adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253*38fd1498Szrj gimple_bb (update_phi));
254*38fd1498Szrj }
255*38fd1498Szrj
256*38fd1498Szrj /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257*38fd1498Szrj the mask should have during the first iteration and NEXT_MASK is the
258*38fd1498Szrj value that it should have on subsequent iterations. */
259*38fd1498Szrj
260*38fd1498Szrj static void
vect_set_loop_mask(struct loop * loop,tree mask,tree init_mask,tree next_mask)261*38fd1498Szrj vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262*38fd1498Szrj tree next_mask)
263*38fd1498Szrj {
264*38fd1498Szrj gphi *phi = create_phi_node (mask, loop->header);
265*38fd1498Szrj add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266*38fd1498Szrj add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267*38fd1498Szrj }
268*38fd1498Szrj
269*38fd1498Szrj /* Add SEQ to the end of LOOP's preheader block. */
270*38fd1498Szrj
271*38fd1498Szrj static void
add_preheader_seq(struct loop * loop,gimple_seq seq)272*38fd1498Szrj add_preheader_seq (struct loop *loop, gimple_seq seq)
273*38fd1498Szrj {
274*38fd1498Szrj if (seq)
275*38fd1498Szrj {
276*38fd1498Szrj edge pe = loop_preheader_edge (loop);
277*38fd1498Szrj basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278*38fd1498Szrj gcc_assert (!new_bb);
279*38fd1498Szrj }
280*38fd1498Szrj }
281*38fd1498Szrj
282*38fd1498Szrj /* Add SEQ to the beginning of LOOP's header block. */
283*38fd1498Szrj
284*38fd1498Szrj static void
add_header_seq(struct loop * loop,gimple_seq seq)285*38fd1498Szrj add_header_seq (struct loop *loop, gimple_seq seq)
286*38fd1498Szrj {
287*38fd1498Szrj if (seq)
288*38fd1498Szrj {
289*38fd1498Szrj gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290*38fd1498Szrj gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291*38fd1498Szrj }
292*38fd1498Szrj }
293*38fd1498Szrj
294*38fd1498Szrj /* Return true if the target can interleave elements of two vectors.
295*38fd1498Szrj OFFSET is 0 if the first half of the vectors should be interleaved
296*38fd1498Szrj or 1 if the second half should. When returning true, store the
297*38fd1498Szrj associated permutation in INDICES. */
298*38fd1498Szrj
299*38fd1498Szrj static bool
interleave_supported_p(vec_perm_indices * indices,tree vectype,unsigned int offset)300*38fd1498Szrj interleave_supported_p (vec_perm_indices *indices, tree vectype,
301*38fd1498Szrj unsigned int offset)
302*38fd1498Szrj {
303*38fd1498Szrj poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304*38fd1498Szrj poly_uint64 base = exact_div (nelts, 2) * offset;
305*38fd1498Szrj vec_perm_builder sel (nelts, 2, 3);
306*38fd1498Szrj for (unsigned int i = 0; i < 3; ++i)
307*38fd1498Szrj {
308*38fd1498Szrj sel.quick_push (base + i);
309*38fd1498Szrj sel.quick_push (base + i + nelts);
310*38fd1498Szrj }
311*38fd1498Szrj indices->new_vector (sel, 2, nelts);
312*38fd1498Szrj return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313*38fd1498Szrj }
314*38fd1498Szrj
/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
			       rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  /* First strategy: widen the source masks with unpack-hi/lo, when the
     target supports both directions for the source mode.  */
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  /* Each source mask feeds two destination masks.  */
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  /* Which half is "hi" versus "lo" depends on endianness.  */
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      /* Unpack into a temporary of the wider mask type, then
		 reinterpret the bits as DEST_MASKTYPE.  */
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  /* Second strategy: same mask type on both sides, using interleaving
     permutes of each half of the source with itself.  */
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  /* Neither strategy is available on this target.  */
  return false;
}
378*38fd1498Szrj
/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.  Each iteration
   of the vectorized loop handles VF iterations of the scalar loop.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */

static tree
vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
			      gimple_seq *preheader_seq,
			      gimple_stmt_iterator loop_cond_gsi,
			      rgroup_masks *rgm, tree vf,
			      tree niters, tree niters_skip,
			      bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = vf;
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
	 multiplications don't overflow.  */
      tree factor = build_int_cst (compare_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				     nscalars_total, factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				    nscalars_step, factor);
      if (nscalars_skip)
	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				      nscalars_skip, factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree zero_index = build_int_cst (compare_type, 0);
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
	     insert_after, &index_before_incr, &index_after_incr);

  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[infinite-prec] NSCALARS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[sat] NSCALARS_STEP

	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
      test_index = index_before_incr;
      tree adjust = nscalars_step;
      if (nscalars_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nscalars_skip);
      /* MAX followed by MINUS implements the saturating subtraction
	 NSCALARS_TOTAL -[sat] ADJUST in the unsigned COMPARE_TYPE.  */
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
	{
	  /* The first vector iteration can handle at most NSCALARS_STEP
	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
	     NSCALARS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nscalars_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nscalars_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nscalars_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NSCALARS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Provide a definition of each mask in the group.  Iterating in
     reverse means the value left in NEXT_MASK on return is the one for
     mask 0, whose limit needs no bias adjustment.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
	 next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
	 to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  /* MAX/MINUS pair: saturating subtraction of BIAS, as above.  */
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial mask.  First include all scalars that
	 are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of scalars handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this mask.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
	  tmp_stmt = vect_gen_while (init_mask, start, end);
	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
	}

      /* Now AND out the bits that are within the number of skipped
	 scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
						    bias_tree, nscalars_skip);
	  if (init_mask)
	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
				      init_mask, unskipped_mask);
	  else
	    init_mask = unskipped_mask;
	}

      if (!init_mask)
	/* First iteration is full.  */
	init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}
619*38fd1498Szrj
620*38fd1498Szrj /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621*38fd1498Szrj LOOP_VINFO describes the vectorization of LOOP. NITERS is the
622*38fd1498Szrj number of iterations of the original scalar loop that should be
623*38fd1498Szrj handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
624*38fd1498Szrj as for vect_set_loop_condition.
625*38fd1498Szrj
626*38fd1498Szrj Insert the branch-back condition before LOOP_COND_GSI and return the
627*38fd1498Szrj final gcond. */
628*38fd1498Szrj
629*38fd1498Szrj static gcond *
vect_set_loop_condition_masked(struct loop * loop,loop_vec_info loop_vinfo,tree niters,tree final_iv,bool niters_maybe_zero,gimple_stmt_iterator loop_cond_gsi)630*38fd1498Szrj vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631*38fd1498Szrj tree niters, tree final_iv,
632*38fd1498Szrj bool niters_maybe_zero,
633*38fd1498Szrj gimple_stmt_iterator loop_cond_gsi)
634*38fd1498Szrj {
635*38fd1498Szrj gimple_seq preheader_seq = NULL;
636*38fd1498Szrj gimple_seq header_seq = NULL;
637*38fd1498Szrj
638*38fd1498Szrj tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639*38fd1498Szrj unsigned int compare_precision = TYPE_PRECISION (compare_type);
640*38fd1498Szrj unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641*38fd1498Szrj tree orig_niters = niters;
642*38fd1498Szrj
643*38fd1498Szrj /* Type of the initial value of NITERS. */
644*38fd1498Szrj tree ni_actual_type = TREE_TYPE (niters);
645*38fd1498Szrj unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
646*38fd1498Szrj
647*38fd1498Szrj /* Convert NITERS to the same size as the compare. */
648*38fd1498Szrj if (compare_precision > ni_actual_precision
649*38fd1498Szrj && niters_maybe_zero)
650*38fd1498Szrj {
651*38fd1498Szrj /* We know that there is always at least one iteration, so if the
652*38fd1498Szrj count is zero then it must have wrapped. Cope with this by
653*38fd1498Szrj subtracting 1 before the conversion and adding 1 to the result. */
654*38fd1498Szrj gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655*38fd1498Szrj niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656*38fd1498Szrj niters, build_minus_one_cst (ni_actual_type));
657*38fd1498Szrj niters = gimple_convert (&preheader_seq, compare_type, niters);
658*38fd1498Szrj niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659*38fd1498Szrj niters, build_one_cst (compare_type));
660*38fd1498Szrj }
661*38fd1498Szrj else
662*38fd1498Szrj niters = gimple_convert (&preheader_seq, compare_type, niters);
663*38fd1498Szrj
664*38fd1498Szrj /* Convert skip_niters to the right type. */
665*38fd1498Szrj tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
666*38fd1498Szrj
667*38fd1498Szrj /* Now calculate the value that the induction variable must be able
668*38fd1498Szrj to hit in order to ensure that we end the loop with an all-false mask.
669*38fd1498Szrj This involves adding the maximum number of inactive trailing scalar
670*38fd1498Szrj iterations. */
671*38fd1498Szrj widest_int iv_limit;
672*38fd1498Szrj bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673*38fd1498Szrj if (known_max_iters)
674*38fd1498Szrj {
675*38fd1498Szrj if (niters_skip)
676*38fd1498Szrj {
677*38fd1498Szrj /* Add the maximum number of skipped iterations to the
678*38fd1498Szrj maximum iteration count. */
679*38fd1498Szrj if (TREE_CODE (niters_skip) == INTEGER_CST)
680*38fd1498Szrj iv_limit += wi::to_widest (niters_skip);
681*38fd1498Szrj else
682*38fd1498Szrj iv_limit += max_vf - 1;
683*38fd1498Szrj }
684*38fd1498Szrj /* IV_LIMIT is the maximum number of latch iterations, which is also
685*38fd1498Szrj the maximum in-range IV value. Round this value down to the previous
686*38fd1498Szrj vector alignment boundary and then add an extra full iteration. */
687*38fd1498Szrj poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688*38fd1498Szrj iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
689*38fd1498Szrj }
690*38fd1498Szrj
691*38fd1498Szrj /* Get the vectorization factor in tree form. */
692*38fd1498Szrj tree vf = build_int_cst (compare_type,
693*38fd1498Szrj LOOP_VINFO_VECT_FACTOR (loop_vinfo));
694*38fd1498Szrj
695*38fd1498Szrj /* Iterate over all the rgroups and fill in their masks. We could use
696*38fd1498Szrj the first mask from any rgroup for the loop condition; here we
697*38fd1498Szrj arbitrarily pick the last. */
698*38fd1498Szrj tree test_mask = NULL_TREE;
699*38fd1498Szrj rgroup_masks *rgm;
700*38fd1498Szrj unsigned int i;
701*38fd1498Szrj vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702*38fd1498Szrj FOR_EACH_VEC_ELT (*masks, i, rgm)
703*38fd1498Szrj if (!rgm->masks.is_empty ())
704*38fd1498Szrj {
705*38fd1498Szrj /* First try using permutes. This adds a single vector
706*38fd1498Szrj instruction to the loop for each mask, but needs no extra
707*38fd1498Szrj loop invariants or IVs. */
708*38fd1498Szrj unsigned int nmasks = i + 1;
709*38fd1498Szrj if ((nmasks & 1) == 0)
710*38fd1498Szrj {
711*38fd1498Szrj rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712*38fd1498Szrj if (!half_rgm->masks.is_empty ()
713*38fd1498Szrj && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714*38fd1498Szrj continue;
715*38fd1498Szrj }
716*38fd1498Szrj
717*38fd1498Szrj /* See whether zero-based IV would ever generate all-false masks
718*38fd1498Szrj before wrapping around. */
719*38fd1498Szrj bool might_wrap_p
720*38fd1498Szrj = (!known_max_iters
721*38fd1498Szrj || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722*38fd1498Szrj UNSIGNED)
723*38fd1498Szrj > compare_precision));
724*38fd1498Szrj
725*38fd1498Szrj /* Set up all masks for this group. */
726*38fd1498Szrj test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727*38fd1498Szrj &preheader_seq,
728*38fd1498Szrj loop_cond_gsi, rgm, vf,
729*38fd1498Szrj niters, niters_skip,
730*38fd1498Szrj might_wrap_p);
731*38fd1498Szrj }
732*38fd1498Szrj
733*38fd1498Szrj /* Emit all accumulated statements. */
734*38fd1498Szrj add_preheader_seq (loop, preheader_seq);
735*38fd1498Szrj add_header_seq (loop, header_seq);
736*38fd1498Szrj
737*38fd1498Szrj /* Get a boolean result that tells us whether to iterate. */
738*38fd1498Szrj edge exit_edge = single_exit (loop);
739*38fd1498Szrj tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740*38fd1498Szrj tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741*38fd1498Szrj gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742*38fd1498Szrj NULL_TREE, NULL_TREE);
743*38fd1498Szrj gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
744*38fd1498Szrj
745*38fd1498Szrj /* The loop iterates (NITERS - 1) / VF + 1 times.
746*38fd1498Szrj Subtract one from this to get the latch count. */
747*38fd1498Szrj tree step = build_int_cst (compare_type,
748*38fd1498Szrj LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749*38fd1498Szrj tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750*38fd1498Szrj build_minus_one_cst (compare_type));
751*38fd1498Szrj loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752*38fd1498Szrj niters_minus_one, step);
753*38fd1498Szrj
754*38fd1498Szrj if (final_iv)
755*38fd1498Szrj {
756*38fd1498Szrj gassign *assign = gimple_build_assign (final_iv, orig_niters);
757*38fd1498Szrj gsi_insert_on_edge_immediate (single_exit (loop), assign);
758*38fd1498Szrj }
759*38fd1498Szrj
760*38fd1498Szrj return cond_stmt;
761*38fd1498Szrj }
762*38fd1498Szrj
/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  Insert the new exit condition immediately before
   LOOP_COND_GSI and return it; the caller is responsible for removing
   the old exit condition.  */

static gcond *
vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
				  tree step, tree final_iv,
				  bool niters_maybe_zero,
				  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  /* Insert all new statements at the position of the old exit test.  */
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

	 A:
	   x = 0;
	   do
	     {
	       ...
	       x += 1;
	     }
	   while (x < NITERS);  */
      /* The comparison code depends on whether the exit edge is taken
	 when the condition is true or when it is false.  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

	 B:
	   x = 0;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x <= NITERS - STEP);

	 so that the loop continues to iterate if x + STEP - 1 < NITERS
	 but stops if x + STEP - 1 >= NITERS.

	 However, if NITERS is zero, x never hits a value above NITERS - STEP
	 before wrapping around.  There are two obvious ways of dealing with
	 this:

	 - start at STEP - 1 and compare x before incrementing it
	 - start at -1 and compare x after incrementing it

	 The latter is simpler and is what we use.  The loop in this case
	 looks like:

	 C:
	   x = -1;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x < NITERS - STEP);

	 In both cases the loop limit is NITERS - STEP.  */
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
	{
	  /* NITERS - STEP is loop-invariant, so emit it on the preheader
	     edge.  The insertion must not create a new basic block.  */
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
      if (niters_maybe_zero)
	{
	  /* Case C.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
	  init = build_all_ones_cst (niters_type);
	}
      else
	{
	  /* Case B.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
	  init = build_zero_cst (niters_type);
	}
    }

  /* Create the IV x above and get its value after the increment, which
     is what the new exit condition tests.  */
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
				     true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
				 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
				       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);

  if (final_iv)
    {
      /* On exit, FINAL_IV is the IV value relative to INIT, i.e.
	 N * STEP for N executed iterations.  */
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
					     indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}
896*38fd1498Szrj
897*38fd1498Szrj /* If we're using fully-masked loops, make LOOP iterate:
898*38fd1498Szrj
899*38fd1498Szrj N == (NITERS - 1) / STEP + 1
900*38fd1498Szrj
901*38fd1498Szrj times. When NITERS is zero, this is equivalent to making the loop
902*38fd1498Szrj execute (1 << M) / STEP times, where M is the precision of NITERS.
903*38fd1498Szrj NITERS_MAYBE_ZERO is true if this last case might occur.
904*38fd1498Szrj
905*38fd1498Szrj If we're not using fully-masked loops, make LOOP iterate:
906*38fd1498Szrj
907*38fd1498Szrj N == (NITERS - STEP) / STEP + 1
908*38fd1498Szrj
909*38fd1498Szrj times, where NITERS is known to be outside the range [1, STEP - 1].
910*38fd1498Szrj This is equivalent to making the loop execute NITERS / STEP times
911*38fd1498Szrj when NITERS is nonzero and (1 << M) / STEP times otherwise.
912*38fd1498Szrj NITERS_MAYBE_ZERO again indicates whether this last case might occur.
913*38fd1498Szrj
914*38fd1498Szrj If FINAL_IV is nonnull, it is an SSA name that should be set to
915*38fd1498Szrj N * STEP on exit from the loop.
916*38fd1498Szrj
917*38fd1498Szrj Assumption: the exit-condition of LOOP is the last stmt in the loop. */
918*38fd1498Szrj
919*38fd1498Szrj void
vect_set_loop_condition(struct loop * loop,loop_vec_info loop_vinfo,tree niters,tree step,tree final_iv,bool niters_maybe_zero)920*38fd1498Szrj vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921*38fd1498Szrj tree niters, tree step, tree final_iv,
922*38fd1498Szrj bool niters_maybe_zero)
923*38fd1498Szrj {
924*38fd1498Szrj gcond *cond_stmt;
925*38fd1498Szrj gcond *orig_cond = get_loop_exit_condition (loop);
926*38fd1498Szrj gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
927*38fd1498Szrj
928*38fd1498Szrj if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929*38fd1498Szrj cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930*38fd1498Szrj final_iv, niters_maybe_zero,
931*38fd1498Szrj loop_cond_gsi);
932*38fd1498Szrj else
933*38fd1498Szrj cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934*38fd1498Szrj final_iv, niters_maybe_zero,
935*38fd1498Szrj loop_cond_gsi);
936*38fd1498Szrj
937*38fd1498Szrj /* Remove old loop exit test. */
938*38fd1498Szrj gsi_remove (&loop_cond_gsi, true);
939*38fd1498Szrj free_stmt_vec_info (orig_cond);
940*38fd1498Szrj
941*38fd1498Szrj if (dump_enabled_p ())
942*38fd1498Szrj {
943*38fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
944*38fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
945*38fd1498Szrj }
946*38fd1498Szrj }
947*38fd1498Szrj
948*38fd1498Szrj /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
949*38fd1498Szrj For all PHI arguments in FROM->dest and TO->dest from those
950*38fd1498Szrj edges ensure that TO->dest PHI arguments have current_def
951*38fd1498Szrj to that in from. */
952*38fd1498Szrj
953*38fd1498Szrj static void
slpeel_duplicate_current_defs_from_edges(edge from,edge to)954*38fd1498Szrj slpeel_duplicate_current_defs_from_edges (edge from, edge to)
955*38fd1498Szrj {
956*38fd1498Szrj gimple_stmt_iterator gsi_from, gsi_to;
957*38fd1498Szrj
958*38fd1498Szrj for (gsi_from = gsi_start_phis (from->dest),
959*38fd1498Szrj gsi_to = gsi_start_phis (to->dest);
960*38fd1498Szrj !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
961*38fd1498Szrj {
962*38fd1498Szrj gimple *from_phi = gsi_stmt (gsi_from);
963*38fd1498Szrj gimple *to_phi = gsi_stmt (gsi_to);
964*38fd1498Szrj tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
965*38fd1498Szrj tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
966*38fd1498Szrj if (virtual_operand_p (from_arg))
967*38fd1498Szrj {
968*38fd1498Szrj gsi_next (&gsi_from);
969*38fd1498Szrj continue;
970*38fd1498Szrj }
971*38fd1498Szrj if (virtual_operand_p (to_arg))
972*38fd1498Szrj {
973*38fd1498Szrj gsi_next (&gsi_to);
974*38fd1498Szrj continue;
975*38fd1498Szrj }
976*38fd1498Szrj if (TREE_CODE (from_arg) != SSA_NAME)
977*38fd1498Szrj gcc_assert (operand_equal_p (from_arg, to_arg, 0));
978*38fd1498Szrj else
979*38fd1498Szrj {
980*38fd1498Szrj if (get_current_def (to_arg) == NULL_TREE)
981*38fd1498Szrj set_current_def (to_arg, get_current_def (from_arg));
982*38fd1498Szrj }
983*38fd1498Szrj gsi_next (&gsi_from);
984*38fd1498Szrj gsi_next (&gsi_to);
985*38fd1498Szrj }
986*38fd1498Szrj
987*38fd1498Szrj gphi *from_phi = get_virtual_phi (from->dest);
988*38fd1498Szrj gphi *to_phi = get_virtual_phi (to->dest);
989*38fd1498Szrj if (from_phi)
990*38fd1498Szrj set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
991*38fd1498Szrj get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
992*38fd1498Szrj }
993*38fd1498Szrj
994*38fd1498Szrj
/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  Returns the new loop, or NULL if E is
   neither the entry nor the exit edge, or if the blocks cannot be
   copied.  */

struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
					struct loop *scalar_loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  /* E must be either the (single) exit edge or the preheader edge.  */
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  /* BBS[0] is reserved for the preheader; the loop body goes in
     BBS[1..num_nodes].  */
  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  /* Remember whether the exit destination was immediately dominated by
     the loop header, so dominators can be fixed up after redirection.  */
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
					  exit_dest) == loop->header ?
		 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
	    &exit, 1, &new_exit, NULL,
	    at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
	 header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
						exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
							   0),
						EDGE_SUCC (loop->latch, 0));
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
	{
	  gphi_iterator gsi;
	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);

	  /* The copy's exit now reaches EXIT_DEST as well; give its PHIs
	     the same arguments they receive from the original loop.  */
	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      gphi *phi = gsi.phi ();
	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
	      location_t orig_locus
		= gimple_phi_arg_location_from_edge (phi, e);

	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
	    }
	}
      /* Route the original loop's exit into the copy's preheader.  */
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
			  single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
			       loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
	{
	  /* Remove the non-necessary forwarder of scalar_loop again.  */
	  redirect_edge_pred (single_succ_edge (preheader),
			      single_pred (preheader));
	  delete_basic_block (preheader);
	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
				   loop_preheader_edge (scalar_loop)->src);
	  preheader = split_edge (loop_preheader_edge (loop));
	  entry_e = single_pred_edge (preheader);
	}

      /* Route the original entry into the copy, and the copy's exit
	 into the original loop's preheader.  */
      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (new_preheader),
			  single_pred (new_preheader));
      delete_basic_block (new_preheader);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
			       loop_preheader_edge (new_loop)->src);
    }

  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);

  if (scalar_loop != loop)
    {
      /* Update new_loop->header PHIs, so that on the preheader
	 edge they are the ones from loop rather than scalar_loop.  */
      gphi_iterator gsi_orig, gsi_new;
      edge orig_e = loop_preheader_edge (loop);
      edge new_e = loop_preheader_edge (new_loop);

      for (gsi_orig = gsi_start_phis (loop->header),
	   gsi_new = gsi_start_phis (new_loop->header);
	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
	{
	  gphi *orig_phi = gsi_orig.phi ();
	  gphi *new_phi = gsi_new.phi ();
	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
	  location_t orig_locus
	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);

	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
	}
    }

  free (new_bbs);
  free (bbs);

  checking_verify_dominators (CDI_DOMINATORS);

  return new_loop;
}
1171*38fd1498Szrj
1172*38fd1498Szrj
1173*38fd1498Szrj /* Given the condition expression COND, put it as the last statement of
1174*38fd1498Szrj GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1175*38fd1498Szrj DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1176*38fd1498Szrj skip the loop. PROBABILITY is the skip edge's probability. Mark the
1177*38fd1498Szrj new edge as irreducible if IRREDUCIBLE_P is true. */
1178*38fd1498Szrj
static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
		       basic_block guard_to, basic_block dom_bb,
		       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  /* The existing successor of GUARD_BB becomes the "condition false"
     (fall into the loop) edge.  */
  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  /* COND may need gimplification; any statements it produces go at the
     end of GUARD_BB, before the condition itself.  */
  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
				 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  /* Re-fetch the iterator: the insertions above may have moved the end
     of GUARD_BB.  */
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}
1219*38fd1498Szrj
1220*38fd1498Szrj
1221*38fd1498Szrj /* This function verifies that the following restrictions apply to LOOP:
1222*38fd1498Szrj (1) it consists of exactly 2 basic blocks - header, and an empty latch
1223*38fd1498Szrj for innermost loop and 5 basic blocks for outer-loop.
1224*38fd1498Szrj (2) it is single entry, single exit
1225*38fd1498Szrj (3) its exit condition is the last stmt in the header
1226*38fd1498Szrj (4) E is the entry/exit edge of LOOP.
1227*38fd1498Szrj */
1228*38fd1498Szrj
1229*38fd1498Szrj bool
slpeel_can_duplicate_loop_p(const struct loop * loop,const_edge e)1230*38fd1498Szrj slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1231*38fd1498Szrj {
1232*38fd1498Szrj edge exit_e = single_exit (loop);
1233*38fd1498Szrj edge entry_e = loop_preheader_edge (loop);
1234*38fd1498Szrj gcond *orig_cond = get_loop_exit_condition (loop);
1235*38fd1498Szrj gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1236*38fd1498Szrj unsigned int num_bb = loop->inner? 5 : 2;
1237*38fd1498Szrj
1238*38fd1498Szrj /* All loops have an outer scope; the only case loop->outer is NULL is for
1239*38fd1498Szrj the function itself. */
1240*38fd1498Szrj if (!loop_outer (loop)
1241*38fd1498Szrj || loop->num_nodes != num_bb
1242*38fd1498Szrj || !empty_block_p (loop->latch)
1243*38fd1498Szrj || !single_exit (loop)
1244*38fd1498Szrj /* Verify that new loop exit condition can be trivially modified. */
1245*38fd1498Szrj || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1246*38fd1498Szrj || (e != exit_e && e != entry_e))
1247*38fd1498Szrj return false;
1248*38fd1498Szrj
1249*38fd1498Szrj return true;
1250*38fd1498Szrj }
1251*38fd1498Szrj
1252*38fd1498Szrj /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1253*38fd1498Szrj in the exit bb and rename all the uses after the loop. This simplifies
1254*38fd1498Szrj the *guard[12] routines, which assume loop closed SSA form for all PHIs
1255*38fd1498Szrj (but normally loop closed SSA form doesn't require virtual PHIs to be
1256*38fd1498Szrj in the same form). Doing this early simplifies the checking what
1257*38fd1498Szrj uses should be renamed. */
1258*38fd1498Szrj
static void
create_lcssa_for_virtual_phi (struct loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  /* Find the loop header's virtual PHI, if any.  */
  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
	gphi *phi = gsi.phi ();
	/* GSI is deliberately reused here to scan the exit block's PHIs;
	   the outer loop is left via the "break" below, so the reuse is
	   safe.  */
	for (gsi = gsi_start_phis (exit_e->dest);
	     !gsi_end_p (gsi); gsi_next (&gsi))
	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
	    break;
	if (gsi_end_p (gsi))
	  {
	    /* The exit block has no virtual PHI yet: create one fed by
	       the VOP live on the latch->header edge, and rewrite all
	       uses of that VOP outside the loop to the new result.  */
	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
	    imm_use_iterator imm_iter;
	    gimple *stmt;
	    use_operand_p use_p;

	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
	    gimple_phi_set_result (new_phi, new_vop);
	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
	      if (stmt != new_phi
		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		  SET_USE (use_p, new_vop);
	  }
	/* At most one virtual PHI exists in the header; done.  */
	break;
      }

}
1296*38fd1498Szrj
1297*38fd1498Szrj /* Function vect_get_loop_location.
1298*38fd1498Szrj
1299*38fd1498Szrj Extract the location of the loop in the source code.
1300*38fd1498Szrj If the loop is not well formed for vectorization, an estimated
1301*38fd1498Szrj location is calculated.
1302*38fd1498Szrj Return the loop location if succeed and NULL if not. */
1303*38fd1498Szrj
1304*38fd1498Szrj source_location
find_loop_location(struct loop * loop)1305*38fd1498Szrj find_loop_location (struct loop *loop)
1306*38fd1498Szrj {
1307*38fd1498Szrj gimple *stmt = NULL;
1308*38fd1498Szrj basic_block bb;
1309*38fd1498Szrj gimple_stmt_iterator si;
1310*38fd1498Szrj
1311*38fd1498Szrj if (!loop)
1312*38fd1498Szrj return UNKNOWN_LOCATION;
1313*38fd1498Szrj
1314*38fd1498Szrj stmt = get_loop_exit_condition (loop);
1315*38fd1498Szrj
1316*38fd1498Szrj if (stmt
1317*38fd1498Szrj && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318*38fd1498Szrj return gimple_location (stmt);
1319*38fd1498Szrj
1320*38fd1498Szrj /* If we got here the loop is probably not "well formed",
1321*38fd1498Szrj try to estimate the loop location */
1322*38fd1498Szrj
1323*38fd1498Szrj if (!loop->header)
1324*38fd1498Szrj return UNKNOWN_LOCATION;
1325*38fd1498Szrj
1326*38fd1498Szrj bb = loop->header;
1327*38fd1498Szrj
1328*38fd1498Szrj for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329*38fd1498Szrj {
1330*38fd1498Szrj stmt = gsi_stmt (si);
1331*38fd1498Szrj if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332*38fd1498Szrj return gimple_location (stmt);
1333*38fd1498Szrj }
1334*38fd1498Szrj
1335*38fd1498Szrj return UNKNOWN_LOCATION;
1336*38fd1498Szrj }
1337*38fd1498Szrj
1338*38fd1498Szrj /* Return true if PHI defines an IV of the loop to be vectorized. */
1339*38fd1498Szrj
1340*38fd1498Szrj static bool
iv_phi_p(gphi * phi)1341*38fd1498Szrj iv_phi_p (gphi *phi)
1342*38fd1498Szrj {
1343*38fd1498Szrj if (virtual_operand_p (PHI_RESULT (phi)))
1344*38fd1498Szrj return false;
1345*38fd1498Szrj
1346*38fd1498Szrj stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1347*38fd1498Szrj gcc_assert (stmt_info != NULL);
1348*38fd1498Szrj if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349*38fd1498Szrj || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350*38fd1498Szrj return false;
1351*38fd1498Szrj
1352*38fd1498Szrj return true;
1353*38fd1498Szrj }
1354*38fd1498Szrj
1355*38fd1498Szrj /* Function vect_can_advance_ivs_p
1356*38fd1498Szrj
1357*38fd1498Szrj In case the number of iterations that LOOP iterates is unknown at compile
1358*38fd1498Szrj time, an epilog loop will be generated, and the loop induction variables
1359*38fd1498Szrj (IVs) will be "advanced" to the value they are supposed to take just before
1360*38fd1498Szrj the epilog loop. Here we check that the access function of the loop IVs
1361*38fd1498Szrj and the expression that represents the loop bound are simple enough.
1362*38fd1498Szrj These restrictions will be relaxed in the future. */
1363*38fd1498Szrj
1364*38fd1498Szrj bool
vect_can_advance_ivs_p(loop_vec_info loop_vinfo)1365*38fd1498Szrj vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366*38fd1498Szrj {
1367*38fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368*38fd1498Szrj basic_block bb = loop->header;
1369*38fd1498Szrj gphi_iterator gsi;
1370*38fd1498Szrj
1371*38fd1498Szrj /* Analyze phi functions of the loop header. */
1372*38fd1498Szrj
1373*38fd1498Szrj if (dump_enabled_p ())
1374*38fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375*38fd1498Szrj for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376*38fd1498Szrj {
1377*38fd1498Szrj tree evolution_part;
1378*38fd1498Szrj
1379*38fd1498Szrj gphi *phi = gsi.phi ();
1380*38fd1498Szrj if (dump_enabled_p ())
1381*38fd1498Szrj {
1382*38fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1383*38fd1498Szrj dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1384*38fd1498Szrj }
1385*38fd1498Szrj
1386*38fd1498Szrj /* Skip virtual phi's. The data dependences that are associated with
1387*38fd1498Szrj virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1388*38fd1498Szrj
1389*38fd1498Szrj Skip reduction phis. */
1390*38fd1498Szrj if (!iv_phi_p (phi))
1391*38fd1498Szrj {
1392*38fd1498Szrj if (dump_enabled_p ())
1393*38fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
1394*38fd1498Szrj "reduc or virtual phi. skip.\n");
1395*38fd1498Szrj continue;
1396*38fd1498Szrj }
1397*38fd1498Szrj
1398*38fd1498Szrj /* Analyze the evolution function. */
1399*38fd1498Szrj
1400*38fd1498Szrj evolution_part
1401*38fd1498Szrj = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1402*38fd1498Szrj if (evolution_part == NULL_TREE)
1403*38fd1498Szrj {
1404*38fd1498Szrj if (dump_enabled_p ())
1405*38fd1498Szrj dump_printf (MSG_MISSED_OPTIMIZATION,
1406*38fd1498Szrj "No access function or evolution.\n");
1407*38fd1498Szrj return false;
1408*38fd1498Szrj }
1409*38fd1498Szrj
1410*38fd1498Szrj /* FORNOW: We do not transform initial conditions of IVs
1411*38fd1498Szrj which evolution functions are not invariants in the loop. */
1412*38fd1498Szrj
1413*38fd1498Szrj if (!expr_invariant_in_loop_p (loop, evolution_part))
1414*38fd1498Szrj {
1415*38fd1498Szrj if (dump_enabled_p ())
1416*38fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417*38fd1498Szrj "evolution not invariant in loop.\n");
1418*38fd1498Szrj return false;
1419*38fd1498Szrj }
1420*38fd1498Szrj
1421*38fd1498Szrj /* FORNOW: We do not transform initial conditions of IVs
1422*38fd1498Szrj which evolution functions are a polynomial of degree >= 2. */
1423*38fd1498Szrj
1424*38fd1498Szrj if (tree_is_chrec (evolution_part))
1425*38fd1498Szrj {
1426*38fd1498Szrj if (dump_enabled_p ())
1427*38fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428*38fd1498Szrj "evolution is chrec.\n");
1429*38fd1498Szrj return false;
1430*38fd1498Szrj }
1431*38fd1498Szrj }
1432*38fd1498Szrj
1433*38fd1498Szrj return true;
1434*38fd1498Szrj }
1435*38fd1498Szrj
1436*38fd1498Szrj
/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized. The last few iterations
            of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
              vectorized). i.e, the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
                coming out from LOOP on which there are uses of the LOOP ivs
		(this is the path from LOOP->exit to epilog_loop->preheader).

   The new definitions of the ivs are placed in LOOP->exit.
   The phi args associated with the edge UPDATE_E in the bb
   UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.
 */

static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
				  tree niters, edge update_e)
{
  gphi_iterator gsi, gsi1;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block update_bb = update_e->dest;
  basic_block exit_bb = single_exit (loop)->dest;

  /* Make sure there exists a single-predecessor exit bb:  */
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_succ_edge (exit_bb) == update_e);

  /* Walk the header phis and the update_bb phis in lockstep; per
     Assumption 2 they are organized in the same order.  */
  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
       gsi_next (&gsi), gsi_next (&gsi1))
    {
      tree init_expr;
      tree step_expr, off;
      tree type;
      tree var, ni, ni_name;
      gimple_stmt_iterator last_gsi;

      gphi *phi = gsi.phi ();
      gphi *phi1 = gsi1.phi ();
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_update_ivs_after_vectorizer: phi: ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
	}

      /* Skip reduction and virtual phis.  */
      if (!iv_phi_p (phi))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n");
	  continue;
	}

      type = TREE_TYPE (gimple_phi_result (phi));
      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
      step_expr = unshare_expr (step_expr);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
         of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (step_expr));

      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));

      /* The IV value after NITERS iterations is init + niters * step;
	 pointer IVs need POINTER_PLUS instead of PLUS.  */
      off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
			 fold_convert (TREE_TYPE (step_expr), niters),
			 step_expr);
      if (POINTER_TYPE_P (type))
	ni = fold_build_pointer_plus (init_expr, off);
      else
	ni = fold_build2 (PLUS_EXPR, type,
			  init_expr, fold_convert (type, off));

      var = create_tmp_var (type, "tmp");

      last_gsi = gsi_last_bb (exit_bb);
      gimple_seq new_stmts = NULL;
      ni_name = force_gimple_operand (ni, &new_stmts, false, var);
      /* Exit_bb shouldn't be empty.  */
      if (!gsi_end_p (last_gsi))
	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
      else
	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);

      /* Fix phi expressions in the successor bb.  */
      adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
    }
}
1553*38fd1498Szrj
/* Return a gimple value containing the misalignment (measured in vector
   elements) for the loop described by LOOP_VINFO, i.e. how many elements
   it is away from a perfectly aligned address.  Add any new statements
   to SEQ.  */

static tree
get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
{
  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  gimple *dr_stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
  gcc_assert (target_align != 0);

  /* For a negative-step access the vector is loaded ending at the DR
     address, so measure alignment at the lowest-addressed element
     (DR address minus SUBPARTS-1 elements).  */
  bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
  tree offset = (negative
		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
		 : size_zero_node);
  tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
							  offset);
  tree type = unsigned_type_for (TREE_TYPE (start_addr));
  tree target_align_minus_1 = build_int_cst (type, target_align - 1);
  /* Element size is assumed to be a power of two here (exact_log2),
     so the division below can be done as a right shift.  */
  HOST_WIDE_INT elem_size
    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));

  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
  tree int_start_addr = fold_convert (type, start_addr);
  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
					target_align_minus_1);

  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
					elem_size_log);

  return misalign_in_elems;
}
1593*38fd1498Szrj
/* Function vect_gen_prolog_loop_niters

   Generate the number of iterations which should be peeled as prolog for the
   loop represented by LOOP_VINFO.  It is calculated as the misalignment of
   DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
   As a result, after the execution of this loop, the data reference DR will
   refer to an aligned location.  The following computation is generated:

   If the misalignment of DR is known at compile time:
     addr_mis = int mis = DR_MISALIGNMENT (dr);
   Else, compute address misalignment in bytes:
     addr_mis = addr & (target_align - 1)

   prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step

   (elem_size = element type size; an element is the scalar element whose type
   is the inner type of the vectype)

   The computations will be emitted at the end of BB.  We also compute and
   store upper bound (included) of the result in BOUND.

   When the step of the data-ref in the loop is not 1 (as in interleaved data
   and SLP), the number of iterations of the prolog must be divided by the step
   (which is equal to the size of interleaved group).

   The above formulas assume that VF == number of elements in the vector. This
   may not hold when there are multiple-types in the loop.
   In this case, for some data-references in the loop the VF does not represent
   the number of elements that fit in the vector.  Therefore, instead of VF we
   use TYPE_VECTOR_SUBPARTS.  */

static tree
vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
			     basic_block bb, int *bound)
{
  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  tree var;
  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
  gimple_seq stmts = NULL, new_stmts = NULL;
  tree iters, iters_name;
  gimple *dr_stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int target_align = DR_TARGET_ALIGNMENT (dr);

  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      /* Compile-time known misalignment: the peel count is a constant.  */
      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "known peeling = %d.\n", npeel);

      iters = build_int_cst (niters_type, npeel);
      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    }
  else
    {
      /* Runtime misalignment: emit code that computes the peel count
	 from the actual address.  */
      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
      tree type = TREE_TYPE (misalign_in_elems);
      HOST_WIDE_INT elem_size
	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      HOST_WIDE_INT align_in_elems = target_align / elem_size;
      tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
      tree align_in_elems_tree = build_int_cst (type, align_in_elems);

      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
				 & (align_in_elems - 1)).
	 For a negative step we instead peel misalign - align elements
	 (masked below), since the access runs downwards.  */
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      if (negative)
	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
			     align_in_elems_tree);
      else
	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
			     misalign_in_elems);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
      iters = fold_convert (niters_type, iters);
      *bound = align_in_elems - 1;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "niters for prolog loop: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
      dump_printf (MSG_NOTE, "\n");
    }

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  iters_name = force_gimple_operand (iters, &new_stmts, false, var);

  if (new_stmts)
    gimple_seq_add_seq (&stmts, new_stmts);
  if (stmts)
    {
      gcc_assert (single_succ_p (bb));
      /* Emit at the end of BB, before any control statement if present.  */
      gimple_stmt_iterator gsi = gsi_last_bb (bb);
      if (gsi_end_p (gsi))
	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
      else
	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
    }
  return iters_name;
}
1698*38fd1498Szrj
1699*38fd1498Szrj
1700*38fd1498Szrj /* Function vect_update_init_of_dr
1701*38fd1498Szrj
1702*38fd1498Szrj If CODE is PLUS, the vector loop starts NITERS iterations after the
1703*38fd1498Szrj scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1704*38fd1498Szrj iterations before the scalar one (using masking to skip inactive
1705*38fd1498Szrj elements). This function updates the information recorded in DR to
1706*38fd1498Szrj account for the difference. Specifically, it updates the OFFSET
1707*38fd1498Szrj field of DR. */
1708*38fd1498Szrj
1709*38fd1498Szrj static void
vect_update_init_of_dr(struct data_reference * dr,tree niters,tree_code code)1710*38fd1498Szrj vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1711*38fd1498Szrj {
1712*38fd1498Szrj tree offset = DR_OFFSET (dr);
1713*38fd1498Szrj
1714*38fd1498Szrj niters = fold_build2 (MULT_EXPR, sizetype,
1715*38fd1498Szrj fold_convert (sizetype, niters),
1716*38fd1498Szrj fold_convert (sizetype, DR_STEP (dr)));
1717*38fd1498Szrj offset = fold_build2 (code, sizetype,
1718*38fd1498Szrj fold_convert (sizetype, offset), niters);
1719*38fd1498Szrj DR_OFFSET (dr) = offset;
1720*38fd1498Szrj }
1721*38fd1498Szrj
1722*38fd1498Szrj
/* Function vect_update_inits_of_drs

   Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
   CODE and NITERS are as for vect_update_inits_of_dr.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
			  tree_code code)
{
  unsigned int i;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_update_inits_of_dr ===\n");

  /* Adjust niters to sizetype and insert stmts on loop preheader edge.
     This avoids repeating the conversion for every data reference.  */
  if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
    {
      gimple_seq seq;
      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
      tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");

      niters = fold_convert (sizetype, niters);
      niters = force_gimple_operand (niters, &seq, false, var);
      if (seq)
	{
	  /* The preheader edge must not need splitting here.  */
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
    }

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    vect_update_init_of_dr (dr, niters, code);
}
1759*38fd1498Szrj
/* For the information recorded in LOOP_VINFO prepare the loop for peeling
   by masking.  This involves calculating the number of iterations to
   be peeled and then aligning all memory references appropriately.  */

void
vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
{
  tree misalign_in_elems;
  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));

  /* From the information recorded in LOOP_VINFO get the number of iterations
     that need to be skipped via masking.  */
  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      /* Known peel count: the masked-off iterations are VF minus the
	 recorded peeling amount.  */
      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
      misalign_in_elems = build_int_cst (type, misalign);
    }
  else
    {
      /* Unknown misalignment: compute it at runtime on the preheader edge.  */
      gimple_seq seq1 = NULL, seq2 = NULL;
      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
      misalign_in_elems = fold_convert (type, misalign_in_elems);
      misalign_in_elems = force_gimple_operand (misalign_in_elems,
						&seq2, true, NULL_TREE);
      gimple_seq_add_seq (&seq1, seq2);
      if (seq1)
	{
	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
	  /* The preheader edge must not need splitting here.  */
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
	  gcc_assert (!new_bb);
	}
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "misalignment for fully-masked loop: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
      dump_printf (MSG_NOTE, "\n");
    }

  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;

  /* Shift all data references back so the masked-off iterations line up
     with the start of each access.  */
  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
}
1808*38fd1498Szrj
1809*38fd1498Szrj /* This function builds ni_name = number of iterations. Statements
1810*38fd1498Szrj are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1811*38fd1498Szrj it to TRUE if new ssa_var is generated. */
1812*38fd1498Szrj
1813*38fd1498Szrj tree
vect_build_loop_niters(loop_vec_info loop_vinfo,bool * new_var_p)1814*38fd1498Szrj vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1815*38fd1498Szrj {
1816*38fd1498Szrj tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1817*38fd1498Szrj if (TREE_CODE (ni) == INTEGER_CST)
1818*38fd1498Szrj return ni;
1819*38fd1498Szrj else
1820*38fd1498Szrj {
1821*38fd1498Szrj tree ni_name, var;
1822*38fd1498Szrj gimple_seq stmts = NULL;
1823*38fd1498Szrj edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1824*38fd1498Szrj
1825*38fd1498Szrj var = create_tmp_var (TREE_TYPE (ni), "niters");
1826*38fd1498Szrj ni_name = force_gimple_operand (ni, &stmts, false, var);
1827*38fd1498Szrj if (stmts)
1828*38fd1498Szrj {
1829*38fd1498Szrj gsi_insert_seq_on_edge_immediate (pe, stmts);
1830*38fd1498Szrj if (new_var_p != NULL)
1831*38fd1498Szrj *new_var_p = true;
1832*38fd1498Szrj }
1833*38fd1498Szrj
1834*38fd1498Szrj return ni_name;
1835*38fd1498Szrj }
1836*38fd1498Szrj }
1837*38fd1498Szrj
/* Calculate the number of iterations above which vectorized loop will be
   preferred than scalar loop.  NITERS_PROLOG is the number of iterations
   of prolog loop.  If it's integer const, the integer number is also passed
   in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
   number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
   value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
   threshold below which the scalar (rather than vectorized) loop will be
   executed.  This function stores the upper bound (inclusive) of the result
   in BOUND_SCALAR.  */

static tree
vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
			     int bound_prolog, poly_int64 bound_epilog, int th,
			     poly_uint64 *bound_scalar,
			     bool check_profitability)
{
  tree type = TREE_TYPE (niters_prolog);
  /* Iterations executed scalarly = prolog iterations + epilog iterations.  */
  tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
			     build_int_cst (type, bound_epilog));

  *bound_scalar = bound_prolog + bound_epilog;
  if (check_profitability)
    {
      /* TH indicates the minimum niters of vectorized loop, while we
	 compute the maximum niters of scalar loop.  */
      th--;
      /* Peeling for constant times.  */
      if (int_niters_prolog >= 0)
	{
	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
	  return build_int_cst (type, *bound_scalar);
	}
      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
	 and BOUND_EPILOG are inclusive upper bounds.
	 If the threshold dominates any possible peeling, the result is
	 simply TH.  */
      if (known_ge (th, bound_prolog + bound_epilog))
	{
	  *bound_scalar = th;
	  return build_int_cst (type, th);
	}
      /* Need to do runtime comparison.  */
      else if (maybe_gt (th, bound_epilog))
	{
	  *bound_scalar = upper_bound (*bound_scalar, th);
	  return fold_build2 (MAX_EXPR, type,
			      build_int_cst (type, th), niters);
	}
      /* Otherwise TH can never exceed the peeled iterations; fall through
	 and return NITERS unchanged.  */
    }
  return niters;
}
1887*38fd1498Szrj
/* NITERS is the number of times that the original scalar loop executes
   after peeling.  Work out the maximum number of iterations N that can
   be handled by the vectorized form of the loop and then either:

   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:

	niters_vector = N

   b) set *STEP_VECTOR_PTR to one and generate:

        niters_vector = N / vf

   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */

void
vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
			     tree *niters_vector_ptr, tree *step_vector_ptr,
			     bool niters_no_overflow)
{
  tree ni_minus_gap, var;
  tree niters_vector, step_vector, type = TREE_TYPE (niters);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  tree log_vf = NULL_TREE;

  /* If epilogue loop is required because of data accesses with gaps, we
     subtract one iteration from the total number of iterations here for
     correct calculation of RATIO.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    {
      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
				  build_one_cst (type));
      if (!is_gimple_val (ni_minus_gap))
	{
	  var = create_tmp_var (type, "ni_gap");
	  gimple *stmts = NULL;
	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
					       true, var);
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	}
    }
  else
    ni_minus_gap = niters;

  unsigned HOST_WIDE_INT const_vf;
  if (vf.is_constant (&const_vf)
      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Case b) above: constant VF and not fully masked, so divide.  */
      /* Create: niters >> log2(vf) */
      /* If it's known that niters == number of latch executions + 1 doesn't
	 overflow, we can generate niters >> log2(vf); otherwise we generate
	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
	 will be at least one.  */
      log_vf = build_int_cst (type, exact_log2 (const_vf));
      if (niters_no_overflow)
	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
      else
	niters_vector
	  = fold_build2 (PLUS_EXPR, type,
			 fold_build2 (RSHIFT_EXPR, type,
				      fold_build2 (MINUS_EXPR, type,
						   ni_minus_gap,
						   build_int_cst (type, vf)),
				      log_vf),
			 build_int_cst (type, 1));
      step_vector = build_one_cst (type);
    }
  else
    {
      /* Case a) above: the vector loop counts scalar iterations and
	 steps by VF each time.  */
      niters_vector = ni_minus_gap;
      step_vector = build_int_cst (type, vf);
    }

  if (!is_gimple_val (niters_vector))
    {
      var = create_tmp_var (type, "bnd");
      gimple_seq stmts = NULL;
      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
      /* Peeling algorithm guarantees that vector loop bound is at least ONE,
	 we set range information to make niters analyzer's life easier.  */
      if (stmts != NULL && log_vf)
	set_range_info (niters_vector, VR_RANGE,
			wi::to_wide (build_int_cst (type, 1)),
			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
						  TYPE_MAX_VALUE (type),
						  log_vf)));
    }
  *niters_vector_ptr = niters_vector;
  *step_vector_ptr = step_vector;

  return;
}
1983*38fd1498Szrj
1984*38fd1498Szrj /* Given NITERS_VECTOR which is the number of iterations for vectorized
1985*38fd1498Szrj loop specified by LOOP_VINFO after vectorization, compute the number
1986*38fd1498Szrj of iterations before vectorization (niters_vector * vf) and store it
1987*38fd1498Szrj to NITERS_VECTOR_MULT_VF_PTR. */
1988*38fd1498Szrj
1989*38fd1498Szrj static void
vect_gen_vector_loop_niters_mult_vf(loop_vec_info loop_vinfo,tree niters_vector,tree * niters_vector_mult_vf_ptr)1990*38fd1498Szrj vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1991*38fd1498Szrj tree niters_vector,
1992*38fd1498Szrj tree *niters_vector_mult_vf_ptr)
1993*38fd1498Szrj {
1994*38fd1498Szrj /* We should be using a step_vector of VF if VF is variable.  */
1995*38fd1498Szrj int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1996*38fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1997*38fd1498Szrj tree type = TREE_TYPE (niters_vector);
   /* NOTE(review): exact_log2 assumes VF is a power of two here; presumably
      guaranteed by earlier analysis — confirm before relying on it.  */
1998*38fd1498Szrj tree log_vf = build_int_cst (type, exact_log2 (vf));
1999*38fd1498Szrj basic_block exit_bb = single_exit (loop)->dest;
2000*38fd1498Szrj
2001*38fd1498Szrj gcc_assert (niters_vector_mult_vf_ptr != NULL);
   /* niters_vector * VF computed as a left shift by log2 (VF).  */
2002*38fd1498Szrj tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2003*38fd1498Szrj niters_vector, log_vf);
   /* If folding did not yield a valid gimple operand, force it into a
      temporary and emit the computing statements at the start of the
      loop's exit block, where the value is needed.  */
2004*38fd1498Szrj if (!is_gimple_val (niters_vector_mult_vf))
2005*38fd1498Szrj {
2006*38fd1498Szrj tree var = create_tmp_var (type, "niters_vector_mult_vf");
2007*38fd1498Szrj gimple_seq stmts = NULL;
2008*38fd1498Szrj niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2009*38fd1498Szrj &stmts, true, var);
2010*38fd1498Szrj gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2011*38fd1498Szrj gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2012*38fd1498Szrj }
2013*38fd1498Szrj *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2014*38fd1498Szrj }
2015*38fd1498Szrj
2016*38fd1498Szrj /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2017*38fd1498Szrj from SECOND/FIRST and puts it at the original loop's preheader/exit
2018*38fd1498Szrj edge, the two loops are arranged as below:
2019*38fd1498Szrj
2020*38fd1498Szrj preheader_a:
2021*38fd1498Szrj first_loop:
2022*38fd1498Szrj header_a:
2023*38fd1498Szrj i_1 = PHI<i_0, i_2>;
2024*38fd1498Szrj ...
2025*38fd1498Szrj i_2 = i_1 + 1;
2026*38fd1498Szrj if (cond_a)
2027*38fd1498Szrj goto latch_a;
2028*38fd1498Szrj else
2029*38fd1498Szrj goto between_bb;
2030*38fd1498Szrj latch_a:
2031*38fd1498Szrj goto header_a;
2032*38fd1498Szrj
2033*38fd1498Szrj between_bb:
2034*38fd1498Szrj ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2035*38fd1498Szrj
2036*38fd1498Szrj second_loop:
2037*38fd1498Szrj header_b:
2038*38fd1498Szrj i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2039*38fd1498Szrj or with i_2 if no LCSSA phi is created
2040*38fd1498Szrj under condition of CREATE_LCSSA_FOR_IV_PHIS.
2041*38fd1498Szrj ...
2042*38fd1498Szrj i_4 = i_3 + 1;
2043*38fd1498Szrj if (cond_b)
2044*38fd1498Szrj goto latch_b;
2045*38fd1498Szrj else
2046*38fd1498Szrj goto exit_bb;
2047*38fd1498Szrj latch_b:
2048*38fd1498Szrj goto header_b;
2049*38fd1498Szrj
2050*38fd1498Szrj exit_bb:
2051*38fd1498Szrj
2052*38fd1498Szrj This function creates loop closed SSA for the first loop; update the
2053*38fd1498Szrj second loop's PHI nodes by replacing argument on incoming edge with the
2054*38fd1498Szrj result of newly created lcssa PHI nodes. IF CREATE_LCSSA_FOR_IV_PHIS
2055*38fd1498Szrj is false, Loop closed ssa phis will only be created for non-iv phis for
2056*38fd1498Szrj the first loop.
2057*38fd1498Szrj
2058*38fd1498Szrj This function assumes exit bb of the first loop is preheader bb of the
2059*38fd1498Szrj second loop, i.e, between_bb in the example code. With PHIs updated,
2060*38fd1498Szrj the second loop will execute rest iterations of the first. */
2061*38fd1498Szrj
2062*38fd1498Szrj static void
slpeel_update_phi_nodes_for_loops(loop_vec_info loop_vinfo,struct loop * first,struct loop * second,bool create_lcssa_for_iv_phis)2063*38fd1498Szrj slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2064*38fd1498Szrj struct loop *first, struct loop *second,
2065*38fd1498Szrj bool create_lcssa_for_iv_phis)
2066*38fd1498Szrj {
2067*38fd1498Szrj gphi_iterator gsi_update, gsi_orig;
2068*38fd1498Szrj struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2069*38fd1498Szrj
   /* FIRST's latch has a single successor (the back edge to the header),
      so EDGE_SUCC (latch, 0) is the edge carrying the latch values.  */
2070*38fd1498Szrj edge first_latch_e = EDGE_SUCC (first->latch, 0);
2071*38fd1498Szrj edge second_preheader_e = loop_preheader_edge (second);
2072*38fd1498Szrj basic_block between_bb = single_exit (first)->dest;
2073*38fd1498Szrj
2074*38fd1498Szrj gcc_assert (between_bb == second_preheader_e->src);
2075*38fd1498Szrj gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2076*38fd1498Szrj /* Either the first loop or the second is the loop to be vectorized.  */
2077*38fd1498Szrj gcc_assert (loop == first || loop == second);
2078*38fd1498Szrj
   /* Walk the PHI nodes of both loop headers in lockstep; since SECOND
      was duplicated from FIRST (or vice versa) the PHIs pair up
      positionally.  */
2079*38fd1498Szrj for (gsi_orig = gsi_start_phis (first->header),
2080*38fd1498Szrj gsi_update = gsi_start_phis (second->header);
2081*38fd1498Szrj !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2082*38fd1498Szrj gsi_next (&gsi_orig), gsi_next (&gsi_update))
2083*38fd1498Szrj {
2084*38fd1498Szrj gphi *orig_phi = gsi_orig.phi ();
2085*38fd1498Szrj gphi *update_phi = gsi_update.phi ();
2086*38fd1498Szrj
   /* Value of ORIG_PHI coming around FIRST's latch, i.e. the value live
      on exit from FIRST.  */
2087*38fd1498Szrj tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2088*38fd1498Szrj /* Generate lcssa PHI node for the first loop.  */
   /* iv_phi_p is tested on the PHI belonging to the loop being
      vectorized; IV PHIs may skip lcssa creation when the caller asks
      for it (create_lcssa_for_iv_phis == false).  */
2089*38fd1498Szrj gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2090*38fd1498Szrj if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
2091*38fd1498Szrj {
2092*38fd1498Szrj tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2093*38fd1498Szrj gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2094*38fd1498Szrj add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2095*38fd1498Szrj arg = new_res;
2096*38fd1498Szrj }
2097*38fd1498Szrj
2098*38fd1498Szrj /* Update PHI node in the second loop by replacing arg on the loop's
2099*38fd1498Szrj incoming edge.  */
2100*38fd1498Szrj adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2101*38fd1498Szrj }
2102*38fd1498Szrj }
2103*38fd1498Szrj
2104*38fd1498Szrj /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2105*38fd1498Szrj of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2106*38fd1498Szrj are two pred edges of the merge point before UPDATE_LOOP. The two loops
2107*38fd1498Szrj appear like below:
2108*38fd1498Szrj
2109*38fd1498Szrj guard_bb:
2110*38fd1498Szrj if (cond)
2111*38fd1498Szrj goto merge_bb;
2112*38fd1498Szrj else
2113*38fd1498Szrj goto skip_loop;
2114*38fd1498Szrj
2115*38fd1498Szrj skip_loop:
2116*38fd1498Szrj header_a:
2117*38fd1498Szrj i_1 = PHI<i_0, i_2>;
2118*38fd1498Szrj ...
2119*38fd1498Szrj i_2 = i_1 + 1;
2120*38fd1498Szrj if (cond_a)
2121*38fd1498Szrj goto latch_a;
2122*38fd1498Szrj else
2123*38fd1498Szrj goto exit_a;
2124*38fd1498Szrj latch_a:
2125*38fd1498Szrj goto header_a;
2126*38fd1498Szrj
2127*38fd1498Szrj exit_a:
2128*38fd1498Szrj i_5 = PHI<i_2>;
2129*38fd1498Szrj
2130*38fd1498Szrj merge_bb:
2131*38fd1498Szrj ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2132*38fd1498Szrj
2133*38fd1498Szrj update_loop:
2134*38fd1498Szrj header_b:
2135*38fd1498Szrj i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2136*38fd1498Szrj ...
2137*38fd1498Szrj i_4 = i_3 + 1;
2138*38fd1498Szrj if (cond_b)
2139*38fd1498Szrj goto latch_b;
2140*38fd1498Szrj else
2141*38fd1498Szrj goto exit_bb;
2142*38fd1498Szrj latch_b:
2143*38fd1498Szrj goto header_b;
2144*38fd1498Szrj
2145*38fd1498Szrj exit_bb:
2146*38fd1498Szrj
2147*38fd1498Szrj This function creates PHI nodes at merge_bb and replaces the use of i_5
2148*38fd1498Szrj in the update_loop's PHI node with the result of new PHI result. */
2149*38fd1498Szrj
2150*38fd1498Szrj static void
slpeel_update_phi_nodes_for_guard1(struct loop * skip_loop,struct loop * update_loop,edge guard_edge,edge merge_edge)2151*38fd1498Szrj slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2152*38fd1498Szrj struct loop *update_loop,
2153*38fd1498Szrj edge guard_edge, edge merge_edge)
2154*38fd1498Szrj {
2155*38fd1498Szrj source_location merge_loc, guard_loc;
2156*38fd1498Szrj edge orig_e = loop_preheader_edge (skip_loop);
2157*38fd1498Szrj edge update_e = loop_preheader_edge (update_loop);
2158*38fd1498Szrj gphi_iterator gsi_orig, gsi_update;
2159*38fd1498Szrj
   /* Walk the header PHIs of both loops in lockstep; the loops are copies
      of each other so the PHIs pair up positionally.  */
2160*38fd1498Szrj for ((gsi_orig = gsi_start_phis (skip_loop->header),
2161*38fd1498Szrj gsi_update = gsi_start_phis (update_loop->header));
2162*38fd1498Szrj !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2163*38fd1498Szrj gsi_next (&gsi_orig), gsi_next (&gsi_update))
2164*38fd1498Szrj {
2165*38fd1498Szrj gphi *orig_phi = gsi_orig.phi ();
2166*38fd1498Szrj gphi *update_phi = gsi_update.phi ();
2167*38fd1498Szrj
2168*38fd1498Szrj /* Generate new phi node at merge bb of the guard.  */
2169*38fd1498Szrj tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2170*38fd1498Szrj gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2171*38fd1498Szrj
2172*38fd1498Szrj /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2173*38fd1498Szrj args in NEW_PHI for these edges.  */
   /* MERGE_EDGE carries the value coming out of SKIP_LOOP; GUARD_EDGE
      carries the initial value from before SKIP_LOOP (which was skipped).  */
2174*38fd1498Szrj tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2175*38fd1498Szrj tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2176*38fd1498Szrj merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2177*38fd1498Szrj guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2178*38fd1498Szrj add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2179*38fd1498Szrj add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2180*38fd1498Szrj
2181*38fd1498Szrj /* Update phi in UPDATE_PHI.  */
2182*38fd1498Szrj adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2183*38fd1498Szrj }
2184*38fd1498Szrj }
2185*38fd1498Szrj
2186*38fd1498Szrj /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2187*38fd1498Szrj this function searches for the corresponding lcssa phi node in exit
2188*38fd1498Szrj bb of LOOP. If it is found, return the phi result; otherwise return
2189*38fd1498Szrj NULL. */
2190*38fd1498Szrj
2191*38fd1498Szrj static tree
find_guard_arg(struct loop * loop,struct loop * epilog ATTRIBUTE_UNUSED,gphi * lcssa_phi)2192*38fd1498Szrj find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2193*38fd1498Szrj gphi *lcssa_phi)
2194*38fd1498Szrj {
2195*38fd1498Szrj gphi_iterator gsi;
2196*38fd1498Szrj edge e = single_exit (loop);
2197*38fd1498Szrj
   /* LOOP's exit block has a single predecessor, so each PHI there is an
      lcssa PHI with exactly one argument (index 0).  */
2198*38fd1498Szrj gcc_assert (single_pred_p (e->dest));
2199*38fd1498Szrj for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2200*38fd1498Szrj {
2201*38fd1498Szrj gphi *phi = gsi.phi ();
   /* Match by the single PHI argument: since EPILOG was copied from LOOP,
      corresponding lcssa PHIs feed on equal operands.  */
2202*38fd1498Szrj if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2203*38fd1498Szrj PHI_ARG_DEF (lcssa_phi, 0), 0))
2204*38fd1498Szrj return PHI_RESULT (phi);
2205*38fd1498Szrj }
2206*38fd1498Szrj return NULL_TREE;
2207*38fd1498Szrj }
2208*38fd1498Szrj
2209*38fd1498Szrj /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2210*38fd1498Szrj from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2211*38fd1498Szrj point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2212*38fd1498Szrj and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2213*38fd1498Szrj The CFG looks like:
2214*38fd1498Szrj
2215*38fd1498Szrj loop:
2216*38fd1498Szrj header_a:
2217*38fd1498Szrj i_1 = PHI<i_0, i_2>;
2218*38fd1498Szrj ...
2219*38fd1498Szrj i_2 = i_1 + 1;
2220*38fd1498Szrj if (cond_a)
2221*38fd1498Szrj goto latch_a;
2222*38fd1498Szrj else
2223*38fd1498Szrj goto exit_a;
2224*38fd1498Szrj latch_a:
2225*38fd1498Szrj goto header_a;
2226*38fd1498Szrj
2227*38fd1498Szrj exit_a:
2228*38fd1498Szrj
2229*38fd1498Szrj guard_bb:
2230*38fd1498Szrj if (cond)
2231*38fd1498Szrj goto merge_bb;
2232*38fd1498Szrj else
2233*38fd1498Szrj goto epilog_loop;
2234*38fd1498Szrj
2235*38fd1498Szrj ;; fall_through_bb
2236*38fd1498Szrj
2237*38fd1498Szrj epilog_loop:
2238*38fd1498Szrj header_b:
2239*38fd1498Szrj i_3 = PHI<i_2, i_4>;
2240*38fd1498Szrj ...
2241*38fd1498Szrj i_4 = i_3 + 1;
2242*38fd1498Szrj if (cond_b)
2243*38fd1498Szrj goto latch_b;
2244*38fd1498Szrj else
2245*38fd1498Szrj goto merge_bb;
2246*38fd1498Szrj latch_b:
2247*38fd1498Szrj goto header_b;
2248*38fd1498Szrj
2249*38fd1498Szrj merge_bb:
2250*38fd1498Szrj ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2251*38fd1498Szrj
2252*38fd1498Szrj exit_bb:
2253*38fd1498Szrj i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2254*38fd1498Szrj
2255*38fd1498Szrj For each name used outside EPILOG (i.e - for each name that has a lcssa
2256*38fd1498Szrj phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2257*38fd1498Szrj args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2258*38fd1498Szrj the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2259*38fd1498Szrj by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2260*38fd1498Szrj in exit_bb will also be updated. */
2261*38fd1498Szrj
2262*38fd1498Szrj static void
slpeel_update_phi_nodes_for_guard2(struct loop * loop,struct loop * epilog,edge guard_edge,edge merge_edge)2263*38fd1498Szrj slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2264*38fd1498Szrj edge guard_edge, edge merge_edge)
2265*38fd1498Szrj {
2266*38fd1498Szrj gphi_iterator gsi;
2267*38fd1498Szrj basic_block merge_bb = guard_edge->dest;
2268*38fd1498Szrj
   /* Sanity-check the expected CFG shape: merge_bb falls through to the
      single exit_bb, which in turn has EPILOG's exit as its only
      predecessor chain.  */
2269*38fd1498Szrj gcc_assert (single_succ_p (merge_bb));
2270*38fd1498Szrj edge e = single_succ_edge (merge_bb);
2271*38fd1498Szrj basic_block exit_bb = e->dest;
2272*38fd1498Szrj gcc_assert (single_pred_p (exit_bb));
2273*38fd1498Szrj gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2274*38fd1498Szrj
2275*38fd1498Szrj for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2276*38fd1498Szrj {
2277*38fd1498Szrj gphi *update_phi = gsi.phi ();
2278*38fd1498Szrj tree old_arg = PHI_ARG_DEF (update_phi, 0);
2279*38fd1498Szrj /* This loop-closed-phi actually doesn't represent a use out of the
2280*38fd1498Szrj loop - the phi arg is a constant.  */
2281*38fd1498Szrj if (TREE_CODE (old_arg) != SSA_NAME)
2282*38fd1498Szrj continue;
2283*38fd1498Szrj
   /* Prefer the current definition recorded for OLD_ARG (set up by the
      peeling machinery); fall back to OLD_ARG itself if there is none.  */
2284*38fd1498Szrj tree merge_arg = get_current_def (old_arg);
2285*38fd1498Szrj if (!merge_arg)
2286*38fd1498Szrj merge_arg = old_arg;
2287*38fd1498Szrj
2288*38fd1498Szrj tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2289*38fd1498Szrj /* If the var is live after loop but not a reduction, we simply
2290*38fd1498Szrj use the old arg.  */
2291*38fd1498Szrj if (!guard_arg)
2292*38fd1498Szrj guard_arg = old_arg;
2293*38fd1498Szrj
2294*38fd1498Szrj /* Create new phi node in MERGE_BB: */
2295*38fd1498Szrj tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2296*38fd1498Szrj gphi *merge_phi = create_phi_node (new_res, merge_bb);
2297*38fd1498Szrj
2298*38fd1498Szrj /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2299*38fd1498Szrj the two PHI args in merge_phi for these edges.  */
2300*38fd1498Szrj add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2301*38fd1498Szrj add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2302*38fd1498Szrj
2303*38fd1498Szrj /* Update the original phi in exit_bb.  */
2304*38fd1498Szrj adjust_phi_and_debug_stmts (update_phi, e, new_res);
2305*38fd1498Szrj }
2306*38fd1498Szrj }
2307*38fd1498Szrj
2308*38fd1498Szrj /* EPILOG loop is duplicated from the original loop for vectorizing,
2309*38fd1498Szrj the arg of its loop closed ssa PHI needs to be updated. */
2310*38fd1498Szrj
2311*38fd1498Szrj static void
slpeel_update_phi_nodes_for_lcssa(struct loop * epilog)2312*38fd1498Szrj slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2313*38fd1498Szrj {
2314*38fd1498Szrj gphi_iterator gsi;
2315*38fd1498Szrj basic_block exit_bb = single_exit (epilog)->dest;
2316*38fd1498Szrj
2317*38fd1498Szrj gcc_assert (single_pred_p (exit_bb));
2318*38fd1498Szrj edge e = EDGE_PRED (exit_bb, 0);
   /* Rename the argument of each lcssa PHI on the (single) exit edge so
      it refers to the SSA names of the duplicated EPILOG body.  */
2319*38fd1498Szrj for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2320*38fd1498Szrj rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2321*38fd1498Szrj }
2322*38fd1498Szrj
2323*38fd1498Szrj /* Function vect_do_peeling.
2324*38fd1498Szrj
2325*38fd1498Szrj Input:
2326*38fd1498Szrj - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2327*38fd1498Szrj
2328*38fd1498Szrj preheader:
2329*38fd1498Szrj LOOP:
2330*38fd1498Szrj header_bb:
2331*38fd1498Szrj loop_body
2332*38fd1498Szrj if (exit_loop_cond) goto exit_bb
2333*38fd1498Szrj else goto header_bb
2334*38fd1498Szrj exit_bb:
2335*38fd1498Szrj
2336*38fd1498Szrj - NITERS: The number of iterations of the loop.
2337*38fd1498Szrj - NITERSM1: The number of iterations of the loop's latch.
2338*38fd1498Szrj - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2339*38fd1498Szrj - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2340*38fd1498Szrj CHECK_PROFITABILITY is true.
2341*38fd1498Szrj Output:
2342*38fd1498Szrj - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2343*38fd1498Szrj iterate after vectorization; see vect_set_loop_condition for details.
2344*38fd1498Szrj - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2345*38fd1498Szrj should be set to the number of scalar iterations handled by the
2346*38fd1498Szrj vector loop. The SSA name is only used on exit from the loop.
2347*38fd1498Szrj
2348*38fd1498Szrj This function peels prolog and epilog from the loop, adds guards skipping
2349*38fd1498Szrj PROLOG and EPILOG for various conditions. As a result, the changed CFG
2350*38fd1498Szrj would look like:
2351*38fd1498Szrj
2352*38fd1498Szrj guard_bb_1:
2353*38fd1498Szrj if (prefer_scalar_loop) goto merge_bb_1
2354*38fd1498Szrj else goto guard_bb_2
2355*38fd1498Szrj
2356*38fd1498Szrj guard_bb_2:
2357*38fd1498Szrj if (skip_prolog) goto merge_bb_2
2358*38fd1498Szrj else goto prolog_preheader
2359*38fd1498Szrj
2360*38fd1498Szrj prolog_preheader:
2361*38fd1498Szrj PROLOG:
2362*38fd1498Szrj prolog_header_bb:
2363*38fd1498Szrj prolog_body
2364*38fd1498Szrj if (exit_prolog_cond) goto prolog_exit_bb
2365*38fd1498Szrj else goto prolog_header_bb
2366*38fd1498Szrj prolog_exit_bb:
2367*38fd1498Szrj
2368*38fd1498Szrj merge_bb_2:
2369*38fd1498Szrj
2370*38fd1498Szrj vector_preheader:
2371*38fd1498Szrj VECTOR LOOP:
2372*38fd1498Szrj vector_header_bb:
2373*38fd1498Szrj vector_body
2374*38fd1498Szrj if (exit_vector_cond) goto vector_exit_bb
2375*38fd1498Szrj else goto vector_header_bb
2376*38fd1498Szrj vector_exit_bb:
2377*38fd1498Szrj
2378*38fd1498Szrj guard_bb_3:
2379*38fd1498Szrj if (skip_epilog) goto merge_bb_3
2380*38fd1498Szrj else goto epilog_preheader
2381*38fd1498Szrj
2382*38fd1498Szrj merge_bb_1:
2383*38fd1498Szrj
2384*38fd1498Szrj epilog_preheader:
2385*38fd1498Szrj EPILOG:
2386*38fd1498Szrj epilog_header_bb:
2387*38fd1498Szrj epilog_body
2388*38fd1498Szrj if (exit_epilog_cond) goto merge_bb_3
2389*38fd1498Szrj else goto epilog_header_bb
2390*38fd1498Szrj
2391*38fd1498Szrj merge_bb_3:
2392*38fd1498Szrj
2393*38fd1498Szrj Note this function peels prolog and epilog only if it's necessary,
2394*38fd1498Szrj as well as guards.
2395*38fd1498Szrj Returns created epilogue or NULL.
2396*38fd1498Szrj
2397*38fd1498Szrj TODO: Guard for prefer_scalar_loop should be emitted along with
2398*38fd1498Szrj versioning conditions if loop versioning is needed. */
2399*38fd1498Szrj
2400*38fd1498Szrj
2401*38fd1498Szrj struct loop *
vect_do_peeling(loop_vec_info loop_vinfo,tree niters,tree nitersm1,tree * niters_vector,tree * step_vector,tree * niters_vector_mult_vf_var,int th,bool check_profitability,bool niters_no_overflow)2402*38fd1498Szrj vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2403*38fd1498Szrj tree *niters_vector, tree *step_vector,
2404*38fd1498Szrj tree *niters_vector_mult_vf_var, int th,
2405*38fd1498Szrj bool check_profitability, bool niters_no_overflow)
2406*38fd1498Szrj {
2407*38fd1498Szrj edge e, guard_e;
2408*38fd1498Szrj tree type = TREE_TYPE (niters), guard_cond;
2409*38fd1498Szrj basic_block guard_bb, guard_to;
2410*38fd1498Szrj profile_probability prob_prolog, prob_vector, prob_epilog;
2411*38fd1498Szrj int estimated_vf;
2412*38fd1498Szrj int prolog_peeling = 0;
2413*38fd1498Szrj if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2414*38fd1498Szrj prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2415*38fd1498Szrj
2416*38fd1498Szrj poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2417*38fd1498Szrj poly_uint64 bound_epilog = 0;
2418*38fd1498Szrj if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2419*38fd1498Szrj && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2420*38fd1498Szrj bound_epilog += vf - 1;
2421*38fd1498Szrj if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2422*38fd1498Szrj bound_epilog += 1;
2423*38fd1498Szrj bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2424*38fd1498Szrj poly_uint64 bound_scalar = bound_epilog;
2425*38fd1498Szrj
2426*38fd1498Szrj if (!prolog_peeling && !epilog_peeling)
2427*38fd1498Szrj return NULL;
2428*38fd1498Szrj
2429*38fd1498Szrj prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2430*38fd1498Szrj estimated_vf = vect_vf_for_cost (loop_vinfo);
2431*38fd1498Szrj if (estimated_vf == 2)
2432*38fd1498Szrj estimated_vf = 3;
2433*38fd1498Szrj prob_prolog = prob_epilog = profile_probability::guessed_always ()
2434*38fd1498Szrj .apply_scale (estimated_vf - 1, estimated_vf);
2435*38fd1498Szrj
2436*38fd1498Szrj struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2437*38fd1498Szrj struct loop *first_loop = loop;
2438*38fd1498Szrj bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2439*38fd1498Szrj create_lcssa_for_virtual_phi (loop);
2440*38fd1498Szrj update_ssa (TODO_update_ssa_only_virtuals);
2441*38fd1498Szrj
2442*38fd1498Szrj if (MAY_HAVE_DEBUG_BIND_STMTS)
2443*38fd1498Szrj {
2444*38fd1498Szrj gcc_assert (!adjust_vec.exists ());
2445*38fd1498Szrj adjust_vec.create (32);
2446*38fd1498Szrj }
2447*38fd1498Szrj initialize_original_copy_tables ();
2448*38fd1498Szrj
2449*38fd1498Szrj /* Record the anchor bb at which the guard should be placed if the scalar
2450*38fd1498Szrj loop might be preferred. */
2451*38fd1498Szrj basic_block anchor = loop_preheader_edge (loop)->src;
2452*38fd1498Szrj
2453*38fd1498Szrj /* Generate the number of iterations for the prolog loop. We do this here
2454*38fd1498Szrj so that we can also get the upper bound on the number of iterations. */
2455*38fd1498Szrj tree niters_prolog;
2456*38fd1498Szrj int bound_prolog = 0;
2457*38fd1498Szrj if (prolog_peeling)
2458*38fd1498Szrj niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2459*38fd1498Szrj &bound_prolog);
2460*38fd1498Szrj else
2461*38fd1498Szrj niters_prolog = build_int_cst (type, 0);
2462*38fd1498Szrj
2463*38fd1498Szrj /* Prolog loop may be skipped. */
2464*38fd1498Szrj bool skip_prolog = (prolog_peeling != 0);
2465*38fd1498Szrj /* Skip to epilog if scalar loop may be preferred. It's only needed
2466*38fd1498Szrj when we peel for epilog loop and when it hasn't been checked with
2467*38fd1498Szrj loop versioning. */
2468*38fd1498Szrj bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2469*38fd1498Szrj ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2470*38fd1498Szrj bound_prolog + bound_epilog)
2471*38fd1498Szrj : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2472*38fd1498Szrj /* Epilog loop must be executed if the number of iterations for epilog
2473*38fd1498Szrj loop is known at compile time, otherwise we need to add a check at
2474*38fd1498Szrj the end of vector loop and skip to the end of epilog loop. */
2475*38fd1498Szrj bool skip_epilog = (prolog_peeling < 0
2476*38fd1498Szrj || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2477*38fd1498Szrj || !vf.is_constant ());
2478*38fd1498Szrj /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2479*38fd1498Szrj if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2480*38fd1498Szrj skip_epilog = false;
2481*38fd1498Szrj
2482*38fd1498Szrj if (skip_vector)
2483*38fd1498Szrj {
2484*38fd1498Szrj split_edge (loop_preheader_edge (loop));
2485*38fd1498Szrj
2486*38fd1498Szrj /* Due to the order in which we peel prolog and epilog, we first
2487*38fd1498Szrj propagate probability to the whole loop. The purpose is to
2488*38fd1498Szrj avoid adjusting probabilities of both prolog and vector loops
2489*38fd1498Szrj separately. Note in this case, the probability of epilog loop
2490*38fd1498Szrj needs to be scaled back later. */
2491*38fd1498Szrj basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2492*38fd1498Szrj if (prob_vector.initialized_p ())
2493*38fd1498Szrj {
2494*38fd1498Szrj scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2495*38fd1498Szrj scale_loop_profile (loop, prob_vector, 0);
2496*38fd1498Szrj }
2497*38fd1498Szrj }
2498*38fd1498Szrj
2499*38fd1498Szrj source_location loop_loc = find_loop_location (loop);
2500*38fd1498Szrj struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2501*38fd1498Szrj if (prolog_peeling)
2502*38fd1498Szrj {
2503*38fd1498Szrj e = loop_preheader_edge (loop);
2504*38fd1498Szrj if (!slpeel_can_duplicate_loop_p (loop, e))
2505*38fd1498Szrj {
2506*38fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2507*38fd1498Szrj "loop can't be duplicated to preheader edge.\n");
2508*38fd1498Szrj gcc_unreachable ();
2509*38fd1498Szrj }
2510*38fd1498Szrj /* Peel prolog and put it on preheader edge of loop. */
2511*38fd1498Szrj prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2512*38fd1498Szrj if (!prolog)
2513*38fd1498Szrj {
2514*38fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2515*38fd1498Szrj "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2516*38fd1498Szrj gcc_unreachable ();
2517*38fd1498Szrj }
2518*38fd1498Szrj slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2519*38fd1498Szrj first_loop = prolog;
2520*38fd1498Szrj reset_original_copy_tables ();
2521*38fd1498Szrj
2522*38fd1498Szrj /* Update the number of iterations for prolog loop. */
2523*38fd1498Szrj tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2524*38fd1498Szrj vect_set_loop_condition (prolog, NULL, niters_prolog,
2525*38fd1498Szrj step_prolog, NULL_TREE, false);
2526*38fd1498Szrj
2527*38fd1498Szrj /* Skip the prolog loop. */
2528*38fd1498Szrj if (skip_prolog)
2529*38fd1498Szrj {
2530*38fd1498Szrj guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2531*38fd1498Szrj niters_prolog, build_int_cst (type, 0));
2532*38fd1498Szrj guard_bb = loop_preheader_edge (prolog)->src;
2533*38fd1498Szrj basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2534*38fd1498Szrj guard_to = split_edge (loop_preheader_edge (loop));
2535*38fd1498Szrj guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2536*38fd1498Szrj guard_to, guard_bb,
2537*38fd1498Szrj prob_prolog.invert (),
2538*38fd1498Szrj irred_flag);
2539*38fd1498Szrj e = EDGE_PRED (guard_to, 0);
2540*38fd1498Szrj e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2541*38fd1498Szrj slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2542*38fd1498Szrj
2543*38fd1498Szrj scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2544*38fd1498Szrj scale_loop_profile (prolog, prob_prolog, bound_prolog);
2545*38fd1498Szrj }
2546*38fd1498Szrj /* Update init address of DRs. */
2547*38fd1498Szrj vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2548*38fd1498Szrj /* Update niters for vector loop. */
2549*38fd1498Szrj LOOP_VINFO_NITERS (loop_vinfo)
2550*38fd1498Szrj = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2551*38fd1498Szrj LOOP_VINFO_NITERSM1 (loop_vinfo)
2552*38fd1498Szrj = fold_build2 (MINUS_EXPR, type,
2553*38fd1498Szrj LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2554*38fd1498Szrj bool new_var_p = false;
2555*38fd1498Szrj niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2556*38fd1498Szrj /* It's guaranteed that vector loop bound before vectorization is at
2557*38fd1498Szrj least VF, so set range information for newly generated var. */
2558*38fd1498Szrj if (new_var_p)
2559*38fd1498Szrj set_range_info (niters, VR_RANGE,
2560*38fd1498Szrj wi::to_wide (build_int_cst (type, vf)),
2561*38fd1498Szrj wi::to_wide (TYPE_MAX_VALUE (type)));
2562*38fd1498Szrj
2563*38fd1498Szrj /* Prolog iterates at most bound_prolog times, latch iterates at
2564*38fd1498Szrj most bound_prolog - 1 times. */
2565*38fd1498Szrj record_niter_bound (prolog, bound_prolog - 1, false, true);
2566*38fd1498Szrj delete_update_ssa ();
2567*38fd1498Szrj adjust_vec_debug_stmts ();
2568*38fd1498Szrj scev_reset ();
2569*38fd1498Szrj }
2570*38fd1498Szrj
2571*38fd1498Szrj if (epilog_peeling)
2572*38fd1498Szrj {
2573*38fd1498Szrj e = single_exit (loop);
2574*38fd1498Szrj if (!slpeel_can_duplicate_loop_p (loop, e))
2575*38fd1498Szrj {
2576*38fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2577*38fd1498Szrj "loop can't be duplicated to exit edge.\n");
2578*38fd1498Szrj gcc_unreachable ();
2579*38fd1498Szrj }
2580*38fd1498Szrj /* Peel epilog and put it on exit edge of loop. */
2581*38fd1498Szrj epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2582*38fd1498Szrj if (!epilog)
2583*38fd1498Szrj {
2584*38fd1498Szrj dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2585*38fd1498Szrj "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2586*38fd1498Szrj gcc_unreachable ();
2587*38fd1498Szrj }
2588*38fd1498Szrj slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2589*38fd1498Szrj
2590*38fd1498Szrj /* Scalar version loop may be preferred. In this case, add guard
2591*38fd1498Szrj and skip to epilog. Note this only happens when the number of
2592*38fd1498Szrj iterations of loop is unknown at compile time, otherwise this
2593*38fd1498Szrj won't be vectorized. */
2594*38fd1498Szrj if (skip_vector)
2595*38fd1498Szrj {
2596*38fd1498Szrj /* Additional epilogue iteration is peeled if gap exists. */
2597*38fd1498Szrj tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2598*38fd1498Szrj bound_prolog, bound_epilog,
2599*38fd1498Szrj th, &bound_scalar,
2600*38fd1498Szrj check_profitability);
2601*38fd1498Szrj /* Build guard against NITERSM1 since NITERS may overflow. */
2602*38fd1498Szrj guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2603*38fd1498Szrj guard_bb = anchor;
2604*38fd1498Szrj guard_to = split_edge (loop_preheader_edge (epilog));
2605*38fd1498Szrj guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2606*38fd1498Szrj guard_to, guard_bb,
2607*38fd1498Szrj prob_vector.invert (),
2608*38fd1498Szrj irred_flag);
2609*38fd1498Szrj e = EDGE_PRED (guard_to, 0);
2610*38fd1498Szrj e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2611*38fd1498Szrj slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2612*38fd1498Szrj
2613*38fd1498Szrj /* Simply propagate profile info from guard_bb to guard_to which is
2614*38fd1498Szrj a merge point of control flow. */
2615*38fd1498Szrj guard_to->count = guard_bb->count;
2616*38fd1498Szrj
2617*38fd1498Szrj /* Scale probability of epilog loop back.
2618*38fd1498Szrj FIXME: We should avoid scaling down and back up. Profile may
2619*38fd1498Szrj get lost if we scale down to 0. */
2620*38fd1498Szrj basic_block *bbs = get_loop_body (epilog);
2621*38fd1498Szrj for (unsigned int i = 0; i < epilog->num_nodes; i++)
2622*38fd1498Szrj bbs[i]->count = bbs[i]->count.apply_scale
2623*38fd1498Szrj (bbs[i]->count,
2624*38fd1498Szrj bbs[i]->count.apply_probability
2625*38fd1498Szrj (prob_vector));
2626*38fd1498Szrj free (bbs);
2627*38fd1498Szrj }
2628*38fd1498Szrj
2629*38fd1498Szrj basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2630*38fd1498Szrj tree niters_vector_mult_vf;
2631*38fd1498Szrj /* If loop is peeled for non-zero constant times, now niters refers to
2632*38fd1498Szrj orig_niters - prolog_peeling, it won't overflow even the orig_niters
2633*38fd1498Szrj overflows. */
2634*38fd1498Szrj niters_no_overflow |= (prolog_peeling > 0);
2635*38fd1498Szrj vect_gen_vector_loop_niters (loop_vinfo, niters,
2636*38fd1498Szrj niters_vector, step_vector,
2637*38fd1498Szrj niters_no_overflow);
2638*38fd1498Szrj if (!integer_onep (*step_vector))
2639*38fd1498Szrj {
2640*38fd1498Szrj /* On exit from the loop we will have an easy way of calcalating
2641*38fd1498Szrj NITERS_VECTOR / STEP * STEP. Install a dummy definition
2642*38fd1498Szrj until then. */
2643*38fd1498Szrj niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2644*38fd1498Szrj SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2645*38fd1498Szrj *niters_vector_mult_vf_var = niters_vector_mult_vf;
2646*38fd1498Szrj }
2647*38fd1498Szrj else
2648*38fd1498Szrj vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2649*38fd1498Szrj &niters_vector_mult_vf);
2650*38fd1498Szrj /* Update IVs of original loop as if they were advanced by
2651*38fd1498Szrj niters_vector_mult_vf steps. */
2652*38fd1498Szrj gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2653*38fd1498Szrj edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2654*38fd1498Szrj vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2655*38fd1498Szrj update_e);
2656*38fd1498Szrj
2657*38fd1498Szrj if (skip_epilog)
2658*38fd1498Szrj {
2659*38fd1498Szrj guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2660*38fd1498Szrj niters, niters_vector_mult_vf);
2661*38fd1498Szrj guard_bb = single_exit (loop)->dest;
2662*38fd1498Szrj guard_to = split_edge (single_exit (epilog));
2663*38fd1498Szrj guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2664*38fd1498Szrj skip_vector ? anchor : guard_bb,
2665*38fd1498Szrj prob_epilog.invert (),
2666*38fd1498Szrj irred_flag);
2667*38fd1498Szrj slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2668*38fd1498Szrj single_exit (epilog));
2669*38fd1498Szrj /* Only need to handle basic block before epilog loop if it's not
2670*38fd1498Szrj the guard_bb, which is the case when skip_vector is true. */
2671*38fd1498Szrj if (guard_bb != bb_before_epilog)
2672*38fd1498Szrj {
2673*38fd1498Szrj prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2674*38fd1498Szrj
2675*38fd1498Szrj scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2676*38fd1498Szrj }
2677*38fd1498Szrj scale_loop_profile (epilog, prob_epilog, 0);
2678*38fd1498Szrj }
2679*38fd1498Szrj else
2680*38fd1498Szrj slpeel_update_phi_nodes_for_lcssa (epilog);
2681*38fd1498Szrj
2682*38fd1498Szrj unsigned HOST_WIDE_INT bound;
2683*38fd1498Szrj if (bound_scalar.is_constant (&bound))
2684*38fd1498Szrj {
2685*38fd1498Szrj gcc_assert (bound != 0);
2686*38fd1498Szrj /* -1 to convert loop iterations to latch iterations. */
2687*38fd1498Szrj record_niter_bound (epilog, bound - 1, false, true);
2688*38fd1498Szrj }
2689*38fd1498Szrj
2690*38fd1498Szrj delete_update_ssa ();
2691*38fd1498Szrj adjust_vec_debug_stmts ();
2692*38fd1498Szrj scev_reset ();
2693*38fd1498Szrj }
2694*38fd1498Szrj adjust_vec.release ();
2695*38fd1498Szrj free_original_copy_tables ();
2696*38fd1498Szrj
2697*38fd1498Szrj return epilog;
2698*38fd1498Szrj }
2699*38fd1498Szrj
2700*38fd1498Szrj /* Function vect_create_cond_for_niters_checks.
2701*38fd1498Szrj
2702*38fd1498Szrj Create a conditional expression that represents the run-time checks for
2703*38fd1498Szrj loop's niter. The loop is guaranteed to terminate if the run-time
2704*38fd1498Szrj checks hold.
2705*38fd1498Szrj
2706*38fd1498Szrj Input:
2707*38fd1498Szrj COND_EXPR - input conditional expression. New conditions will be chained
2708*38fd1498Szrj with logical AND operation. If it is NULL, then the function
2709*38fd1498Szrj is used to return the number of alias checks.
2710*38fd1498Szrj LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2711*38fd1498Szrj to be checked.
2712*38fd1498Szrj
2713*38fd1498Szrj Output:
2714*38fd1498Szrj COND_EXPR - conditional expression.
2715*38fd1498Szrj
2716*38fd1498Szrj The returned COND_EXPR is the conditional expression to be used in the
2717*38fd1498Szrj if statement that controls which version of the loop gets executed at
2718*38fd1498Szrj runtime. */
2719*38fd1498Szrj
2720*38fd1498Szrj static void
vect_create_cond_for_niters_checks(loop_vec_info loop_vinfo,tree * cond_expr)2721*38fd1498Szrj vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2722*38fd1498Szrj {
2723*38fd1498Szrj tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2724*38fd1498Szrj
2725*38fd1498Szrj if (*cond_expr)
2726*38fd1498Szrj *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2727*38fd1498Szrj *cond_expr, part_cond_expr);
2728*38fd1498Szrj else
2729*38fd1498Szrj *cond_expr = part_cond_expr;
2730*38fd1498Szrj }
2731*38fd1498Szrj
2732*38fd1498Szrj /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2733*38fd1498Szrj and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2734*38fd1498Szrj
2735*38fd1498Szrj static void
chain_cond_expr(tree * cond_expr,tree part_cond_expr)2736*38fd1498Szrj chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2737*38fd1498Szrj {
2738*38fd1498Szrj if (*cond_expr)
2739*38fd1498Szrj *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2740*38fd1498Szrj *cond_expr, part_cond_expr);
2741*38fd1498Szrj else
2742*38fd1498Szrj *cond_expr = part_cond_expr;
2743*38fd1498Szrj }
2744*38fd1498Szrj
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of data references (array element references) whose alignment must be
   checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
		LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
		LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
			 expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero and that this
	test can be done as a&(n-1) == 0.  For example, for 16
	byte vectors the test is a&0xf == 0.  */

static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
				   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  vec<gimple *> may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  gimple *ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;		/* Running OR of all addresses.  */
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));

  /* Pointer-sized signed integer type: the addresses are converted to it
     before being OR'd and masked.  */
  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
     of the first vector of the i'th data reference. */

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt;
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
      tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
      /* For a reference with a negative step, offset the address so that
	 it points at the lowest element the first vector access touches.  */
      bool negative = tree_int_cst_compare
	(DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
      tree offset = negative
	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
	vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
					      offset);
      /* Statements that compute the address go ahead of the check.  */
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "addr2int%d", i);
      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are OR together.  */

      if (or_tmp_name != NULL_TREE)
	{
	  /* create: or_tmp = or_tmp | addr_tmp */
	  sprintf (tmp_name, "orptrs%d", i);
	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
					 or_tmp_name, addr_tmp_name);
	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
	  or_tmp_name = new_or_tmp_name;
	}
      else
	/* First address: seed the accumulator.  */
	or_tmp_name = addr_tmp_name;

    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
				  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     if and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  /* AND the alignment check onto the caller's accumulated condition.  */
  chain_cond_expr (cond_expr, part_cond_expr);
}
2857*38fd1498Szrj
2858*38fd1498Szrj /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2859*38fd1498Szrj create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2860*38fd1498Szrj Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2861*38fd1498Szrj and this new condition are true. Treat a null *COND_EXPR as "true". */
2862*38fd1498Szrj
2863*38fd1498Szrj static void
vect_create_cond_for_unequal_addrs(loop_vec_info loop_vinfo,tree * cond_expr)2864*38fd1498Szrj vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2865*38fd1498Szrj {
2866*38fd1498Szrj vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2867*38fd1498Szrj unsigned int i;
2868*38fd1498Szrj vec_object_pair *pair;
2869*38fd1498Szrj FOR_EACH_VEC_ELT (pairs, i, pair)
2870*38fd1498Szrj {
2871*38fd1498Szrj tree addr1 = build_fold_addr_expr (pair->first);
2872*38fd1498Szrj tree addr2 = build_fold_addr_expr (pair->second);
2873*38fd1498Szrj tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2874*38fd1498Szrj addr1, addr2);
2875*38fd1498Szrj chain_cond_expr (cond_expr, part_cond_expr);
2876*38fd1498Szrj }
2877*38fd1498Szrj }
2878*38fd1498Szrj
/* Create an expression that is true when all lower-bound conditions for
   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */

static void
vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    {
      tree expr = lower_bounds[i].expr;
      /* The comparison is done in the unsigned variant of EXPR's type.  */
      tree type = unsigned_type_for (TREE_TYPE (expr));
      expr = fold_convert (type, expr);
      poly_uint64 bound = lower_bounds[i].min_value;
      if (!lower_bounds[i].unsigned_p)
	{
	  /* Signed case: test EXPR + (BOUND - 1) >= 2 * BOUND - 1 instead
	     of EXPR >= BOUND, all in unsigned arithmetic.  NOTE(review):
	     presumably this maps the signed range test onto a single
	     unsigned comparison (values just below zero wrap and fail) —
	     confirm against the vectorizer's runtime-check derivation.  */
	  expr = fold_build2 (PLUS_EXPR, type, expr,
			      build_int_cstu (type, bound - 1));
	  bound += bound - 1;
	}
      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
					 build_int_cstu (type, bound));
      /* AND the new lower-bound check onto the accumulated condition.  */
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
2903*38fd1498Szrj
2904*38fd1498Szrj /* Function vect_create_cond_for_alias_checks.
2905*38fd1498Szrj
2906*38fd1498Szrj Create a conditional expression that represents the run-time checks for
2907*38fd1498Szrj overlapping of address ranges represented by a list of data references
2908*38fd1498Szrj relations passed as input.
2909*38fd1498Szrj
2910*38fd1498Szrj Input:
2911*38fd1498Szrj COND_EXPR - input conditional expression. New conditions will be chained
2912*38fd1498Szrj with logical AND operation. If it is NULL, then the function
2913*38fd1498Szrj is used to return the number of alias checks.
2914*38fd1498Szrj LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2915*38fd1498Szrj to be checked.
2916*38fd1498Szrj
2917*38fd1498Szrj Output:
2918*38fd1498Szrj COND_EXPR - conditional expression.
2919*38fd1498Szrj
2920*38fd1498Szrj The returned COND_EXPR is the conditional expression to be used in the if
2921*38fd1498Szrj statement that controls which version of the loop gets executed at runtime.
2922*38fd1498Szrj */
2923*38fd1498Szrj
2924*38fd1498Szrj void
vect_create_cond_for_alias_checks(loop_vec_info loop_vinfo,tree * cond_expr)2925*38fd1498Szrj vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2926*38fd1498Szrj {
2927*38fd1498Szrj vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2928*38fd1498Szrj LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2929*38fd1498Szrj
2930*38fd1498Szrj if (comp_alias_ddrs.is_empty ())
2931*38fd1498Szrj return;
2932*38fd1498Szrj
2933*38fd1498Szrj create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2934*38fd1498Szrj &comp_alias_ddrs, cond_expr);
2935*38fd1498Szrj if (dump_enabled_p ())
2936*38fd1498Szrj dump_printf_loc (MSG_NOTE, vect_location,
2937*38fd1498Szrj "created %u versioning for alias checks.\n",
2938*38fd1498Szrj comp_alias_ddrs.length ());
2939*38fd1498Szrj }
2940*38fd1498Szrj
2941*38fd1498Szrj
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned or/and
   has data reference relations whose independence was not proven then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pairs of DDRs whose
   independence was not proven.  The vectorized version of loop is
   executed only if both alias and alignment tests are passed.

   The test generated to check which version of loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   The versioning precondition(s) are placed in *COND_EXPR and
   *COND_EXPR_STMT_LIST.  */

void
vect_loop_versioning (loop_vec_info loop_vinfo,
		      unsigned int th, bool check_profitability,
		      poly_uint64 versioning_threshold)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  /* NITERSM1 is the iteration count minus one; thresholds below are
     adjusted by -1 to compare against it.  */
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);

  /* Cost-model check: only enter the vector loop when the scalar
     iteration count reaches TH (tested as NITERSM1 >= TH - 1).  */
  if (check_profitability)
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));
  /* Additional static versioning threshold, combined with the
     profitability check (BIT_AND_EXPR keeps it one flat condition).  */
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  /* Chain on the niter assumptions, if the loop needs them checked.  */
  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
					is_gimple_condexpr, NULL_TREE);

  /* Alignment checks may also emit address-computation statements into
     COND_EXPR_STMT_LIST.  */
  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  /* Gimplify the combined condition so it is valid for a GIMPLE_COND.  */
  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  initialize_original_copy_tables ();
  if (scalar_loop)
    {
      edge scalar_e;
      basic_block preheader, scalar_preheader;

      /* We don't want to scale SCALAR_LOOP's frequencies, we need to
	 scale LOOP's frequencies instead.  */
      nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
			    prob, prob.invert (), prob, prob.invert (), true);
      scale_loop_frequencies (loop, prob);
      /* CONDITION_BB was created above SCALAR_LOOP's preheader,
	 while we need to move it above LOOP's preheader.  */
      e = loop_preheader_edge (loop);
      scalar_e = loop_preheader_edge (scalar_loop);
      /* Both preheaders must be empty forwarder blocks for the edge
	 redirections below to be safe.  */
      gcc_assert (empty_block_p (e->src)
		  && single_pred_p (e->src));
      gcc_assert (empty_block_p (scalar_e->src)
		  && single_pred_p (scalar_e->src));
      gcc_assert (single_pred_p (condition_bb));
      preheader = e->src;
      scalar_preheader = scalar_e->src;
      scalar_e = find_edge (condition_bb, scalar_preheader);
      e = single_pred_edge (preheader);
      /* Rewire: predecessor of CONDITION_BB -> scalar preheader,
	 CONDITION_BB's scalar edge -> LOOP's preheader, and the old
	 incoming edge of LOOP's preheader -> CONDITION_BB.  */
      redirect_edge_and_branch_force (single_pred_edge (condition_bb),
				      scalar_preheader);
      redirect_edge_and_branch_force (scalar_e, preheader);
      redirect_edge_and_branch_force (e, condition_bb);
      /* Fix dominators after the manual edge surgery.  */
      set_immediate_dominator (CDI_DOMINATORS, condition_bb,
			       single_pred (condition_bb));
      set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
			       single_pred (scalar_preheader));
      set_immediate_dominator (CDI_DOMINATORS, preheader,
			       condition_bb);
    }
  else
    nloop = loop_version (loop, cond_expr, &condition_bb,
			  prob, prob.invert (), prob, prob.invert (), true);

  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
	 niter information which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }

  /* Report why the loop was versioned, for -fopt-info consumers.  */
  if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
			 "loop versioned for vectorization to enhance "
			 "alignment\n");

    }
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.
     If loop versioning wasn't done from loop, but scalar_loop instead,
     merge_bb will have already just a single successor.  */

  merge_bb = single_exit (loop)->dest;
  if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
    {
      gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
      new_exit_bb = split_edge (single_exit (loop));
      new_exit_e = single_exit (loop);
      e = EDGE_SUCC (new_exit_bb, 0);

      /* For every PHI in the old merge block, create a forwarding PHI in
	 the new exit block and make the old PHI use its result, keeping
	 the loop in loop-closed SSA form.  */
      for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  tree new_res;
	  orig_phi = gsi.phi ();
	  new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  new_phi = create_phi_node (new_res, new_exit_bb);
	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	  add_phi_arg (new_phi, arg, new_exit_e,
		       gimple_phi_arg_location_from_edge (orig_phi, e));
	  adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	}
    }

  /* End loop-exit-fixes after versioning.  */

  /* Emit the statements computing the condition right before the final
     conditional jump of CONDITION_BB.  */
  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
			     GSI_SAME_STMT);
    }
  update_ssa (TODO_update_ssa);
}
3129