/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}


/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  struct loop *loop = bb->loop_father;
  struct loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has two inner loops, allow there to
		     be an extra basic block which decides which of the
		     two loops to use via LOOP_VECTORIZED.  */
		  if (!single_pred_p (e->src)
		      || single_pred (e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}


struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}

/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
	adjust_vec.safe_push (ai);
      else
	adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
			gimple_bb (update_phi));
}

/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */
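/* For example (hypothetical SSA names), the result is a loop-header PHI
   of the form:

     mask_1 = PHI <init_mask_2 (preheader), next_mask_3 (latch)>  */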

static void
vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
		    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}

/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */
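/* For example, assuming NELTS is 8 and OFFSET is 0, the selector built
   below encodes { 0, 8, 1, 9, 2, 10, ... } and extrapolates to
   { 0, 8, 1, 9, 2, 10, 3, 11 }, i.e. it interleaves the low halves of
   the two input vectors; OFFSET == 1 starts at BASE == 4 and gives
   { 4, 12, 5, 13, 6, 14, 7, 15 } for the high halves.  */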

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
			unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */
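/* For example (hypothetical rgroups), if SRC_RGM contains a single mask
   M and DEST_RGM needs two masks D0 and D1, the unpacking strategy
   below emits D0 = VEC_UNPACK_LO_EXPR <M> and D1 = VEC_UNPACK_HI_EXPR <M>
   (with LO and HI swapped on big-endian targets), while the interleaving
   fallback permutes M with itself so that each mask bit is duplicated.  */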

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
			       rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.  Each iteration
   of the vectorized loop handles VF iterations of the scalar loop.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */
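/* For example, assume COMPARE_TYPE has 16 bits, the IV step
   VF * RGM->max_nscalars_per_iter is 16 and the bound
   (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter is 65535.
   A 0-based IV only takes multiples of 16, so it peaks at 65520 and
   then wraps to 0 without ever reaching 65535; MIGHT_WRAP_P describes
   exactly this situation.  */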

static tree
vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
			      gimple_seq *preheader_seq,
			      gimple_stmt_iterator loop_cond_gsi,
			      rgroup_masks *rgm, tree vf,
			      tree niters, tree niters_skip,
			      bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = vf;
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
	 multiplications don't overflow.  */
      tree factor = build_int_cst (compare_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				     nscalars_total, factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				    nscalars_step, factor);
      if (nscalars_skip)
	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				      nscalars_skip, factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree zero_index = build_int_cst (compare_type, 0);
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
	     insert_after, &index_before_incr, &index_after_incr);

  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[infinite-prec] NSCALARS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[sat] NSCALARS_STEP

	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
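      /* For example, with NSCALARS_TOTAL == 100, NSCALARS_SKIP == 3 and
	 NSCALARS_STEP == 16, ADJUST below is 13 and the limit is
	 MAX (100, 13) - 13 == 87 == (100 + 3) - 16.  If instead
	 NSCALARS_TOTAL were 5, MAX (5, 13) - 13 == 0 would give the
	 required saturated result.  */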
      test_index = index_before_incr;
      tree adjust = nscalars_step;
      if (nscalars_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
	{
	  /* The first vector iteration can handle at most NSCALARS_STEP
	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
	     NSCALARS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nscalars_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nscalars_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nscalars_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NSCALARS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
	 next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
	 to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial mask.  First include all scalars that
	 are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of scalars handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this mask.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
	  tmp_stmt = vect_gen_while (init_mask, start, end);
	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
	}

      /* Now AND out the bits that are within the number of skipped
	 scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
						    bias_tree, nscalars_skip);
	  if (init_mask)
	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
				      init_mask, unskipped_mask);
	  else
	    init_mask = unskipped_mask;
	}

      if (!init_mask)
	/* First iteration is full.  */
	init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}
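
/* The code built by vect_set_loop_masks_directly has the following
   shape for a single-mask rgroup (hypothetical SSA names; vect_gen_while
   emits an IFN_WHILE_ULT call):

     preheader:
       max_mask_1 = WHILE_ULT (<start>, <first limit>);
     header:
       mask = PHI <max_mask_1 (preheader), next_mask_2 (latch)>;
       ...
       next_mask_2 = WHILE_ULT (<test index>, <test limit>);  */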

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
				tree niters, tree final_iv,
				bool niters_maybe_zero,
				gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
	 count is zero then it must have wrapped.  Cope with this by
	 subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
			     niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
			     niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);
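
  /* For example, if NITERS has an unsigned 8-bit type and is 0, the loop
     must have executed 256 iterations: the compensation above computes
     (0 - 1) == 255 in the narrow type, widens that to COMPARE_TYPE and
     adds 1, giving 256.  */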

  /* Convert skip_niters to the right type.  */
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Now calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit;
  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
  if (known_max_iters)
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
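
  /* For example, with IV_LIMIT == 102, VF == 16 and MAX_VF == 16, the
     calculation above rounds 102 down to 96 and then adds 16, giving an
     IV_LIMIT of 112.  */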

  /* Get the vectorization factor in tree form.  */
  tree vf = build_int_cst (compare_type,
			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
	/* First try using permutes.  This adds a single vector
	   instruction to the loop for each mask, but needs no extra
	   loop invariants or IVs.  */
	unsigned int nmasks = i + 1;
	if ((nmasks & 1) == 0)
	  {
	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
	    if (!half_rgm->masks.is_empty ()
		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
	      continue;
	  }

	/* See whether zero-based IV would ever generate all-false masks
	   before wrapping around.  */
	bool might_wrap_p
	  = (!known_max_iters
	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
				    UNSIGNED)
		 > compare_precision));

	/* Set up all masks for this group.  */
	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
						  &preheader_seq,
						  loop_cond_gsi, rgm, vf,
						  niters, niters_skip,
						  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
					NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
				       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
				     niters_minus_one, step);
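
  /* For example, NITERS == 100 and VF == 16 give a latch count of
     (100 - 1) / 16 == 6, i.e. (100 - 1) / 16 + 1 == 7 executions of
     the vector loop body.  */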

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
				  tree step, tree final_iv,
				  bool niters_maybe_zero,
				  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

	 A:
	   x = 0;
	   do
	     {
	       ...
	       x += 1;
	     }
	   while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

	 B:
	   x = 0;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x <= NITERS - STEP);

	 so that the loop continues to iterate if x + STEP - 1 < NITERS
	 but stops if x + STEP - 1 >= NITERS.

	 However, if NITERS is zero, x never hits a value above NITERS - STEP
	 before wrapping around.  There are two obvious ways of dealing with
	 this:

	 - start at STEP - 1 and compare x before incrementing it
	 - start at -1 and compare x after incrementing it

	 The latter is simpler and is what we use.  The loop in this case
	 looks like:

	 C:
	   x = -1;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x < NITERS - STEP);

	 In both cases the loop limit is NITERS - STEP.  */
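      /* For example, NITERS == 10 and STEP == 4 use case B with
	 LIMIT == 6: x takes the values 0 and 4 at the top of the loop,
	 the latch test 4 <= 6 allows a second iteration and 8 <= 6
	 fails, giving (10 - 4) / 4 + 1 == 2 iterations in total.  */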
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
	{
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
      if (niters_maybe_zero)
	{
	  /* Case C.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
	  init = build_all_ones_cst (niters_type);
	}
      else
	{
	  /* Case B.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
	  init = build_zero_cst (niters_type);
	}
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
				     true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
				 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
				       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
					     indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
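
/* For example, with NITERS == 100 and STEP == 16, a fully-masked loop
   iterates (100 - 1) / 16 + 1 == 7 times, while an unmasked loop
   iterates (100 - 16) / 16 + 1 == 6 times, the remaining scalar
   iterations being handled elsewhere (e.g. by an epilogue).  */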

void
vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
			 tree niters, tree step, tree final_iv,
			 bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
						final_iv, niters_maybe_zero,
						loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
						  final_iv, niters_maybe_zero,
						  loop_cond_gsi);

  /* Remove old loop exit test.  */
  gsi_remove (&loop_cond_gsi, true);
  free_stmt_vec_info (orig_cond);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
    }
}

/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For all corresponding PHI arguments in FROM->dest and TO->dest
   coming in on those edges, ensure that each TO->dest argument has
   its current_def set to that of the matching FROM->dest argument.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
	{
	  gsi_next (&gsi_from);
	  continue;
	}
      if (virtual_operand_p (to_arg))
	{
	  gsi_next (&gsi_to);
	  continue;
	}
      if (TREE_CODE (from_arg) != SSA_NAME)
	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else
	{
	  if (get_current_def (to_arg) == NULL_TREE)
	    set_current_def (to_arg, get_current_def (from_arg));
	}
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}


/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */
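
/* For example, when this is used to peel a vector epilogue, E is the
   exit edge of LOOP and the copy runs after it; when peeling a
   prologue (e.g. for alignment), E is the preheader edge and the copy
   runs before LOOP.  */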

struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
					struct loop *scalar_loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
					  exit_dest) == loop->header ?
		 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
	    &exit, 1, &new_exit, NULL,
	    at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
	 header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
						exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
							   0),
						EDGE_SUCC (loop->latch, 0));
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
	{
	  gphi_iterator gsi;
	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);

	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      gphi *phi = gsi.phi ();
	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
	      location_t orig_locus
		= gimple_phi_arg_location_from_edge (phi, e);

	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
	    }
	}
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the unnecessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
			  single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
			       loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
	{
	  /* Remove the unnecessary forwarder of scalar_loop again.  */
	  redirect_edge_pred (single_succ_edge (preheader),
			      single_pred (preheader));
	  delete_basic_block (preheader);
	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
				   loop_preheader_edge (scalar_loop)->src);
	  preheader = split_edge (loop_preheader_edge (loop));
	  entry_e = single_pred_edge (preheader);
	}

      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

1128*38fd1498Szrj       /* And remove the unnecessary forwarder again.  Keep the other
1129*38fd1498Szrj          one so we have a proper pre-header for the loop at the exit edge.  */
1130*38fd1498Szrj       redirect_edge_pred (single_succ_edge (new_preheader),
1131*38fd1498Szrj 			  single_pred (new_preheader));
1132*38fd1498Szrj       delete_basic_block (new_preheader);
1133*38fd1498Szrj       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1134*38fd1498Szrj 			       loop_preheader_edge (new_loop)->src);
1135*38fd1498Szrj     }
1136*38fd1498Szrj 
1137*38fd1498Szrj   /* Skip the new preheader since it is deleted if the loop copy is added at entry.  */
1138*38fd1498Szrj   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1139*38fd1498Szrj     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1140*38fd1498Szrj 
1141*38fd1498Szrj   if (scalar_loop != loop)
1142*38fd1498Szrj     {
1143*38fd1498Szrj       /* Update new_loop->header PHIs, so that on the preheader
1144*38fd1498Szrj 	 edge they are the ones from loop rather than scalar_loop.  */
1145*38fd1498Szrj       gphi_iterator gsi_orig, gsi_new;
1146*38fd1498Szrj       edge orig_e = loop_preheader_edge (loop);
1147*38fd1498Szrj       edge new_e = loop_preheader_edge (new_loop);
1148*38fd1498Szrj 
1149*38fd1498Szrj       for (gsi_orig = gsi_start_phis (loop->header),
1150*38fd1498Szrj 	   gsi_new = gsi_start_phis (new_loop->header);
1151*38fd1498Szrj 	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1152*38fd1498Szrj 	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
1153*38fd1498Szrj 	{
1154*38fd1498Szrj 	  gphi *orig_phi = gsi_orig.phi ();
1155*38fd1498Szrj 	  gphi *new_phi = gsi_new.phi ();
1156*38fd1498Szrj 	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1157*38fd1498Szrj 	  location_t orig_locus
1158*38fd1498Szrj 	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1159*38fd1498Szrj 
1160*38fd1498Szrj 	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1161*38fd1498Szrj 	}
1162*38fd1498Szrj     }
1163*38fd1498Szrj 
1164*38fd1498Szrj   free (new_bbs);
1165*38fd1498Szrj   free (bbs);
1166*38fd1498Szrj 
1167*38fd1498Szrj   checking_verify_dominators (CDI_DOMINATORS);
1168*38fd1498Szrj 
1169*38fd1498Szrj   return new_loop;
1170*38fd1498Szrj }
1171*38fd1498Szrj 
1172*38fd1498Szrj 
1173*38fd1498Szrj /* Given the condition expression COND, put it as the last statement of
1174*38fd1498Szrj    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1175*38fd1498Szrj    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1176*38fd1498Szrj    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1177*38fd1498Szrj    new edge as irreducible if IRREDUCIBLE_P is true.  */
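/* A sketch of the result (illustrative; names as in the parameters above):

       guard_bb:
	 if (COND)
	   goto GUARD_TO;      ;; new skip edge, probability PROBABILITY
	 else
	   fall through;       ;; enter_e, probability inverted

   GUARD_TO's immediate dominator becomes DOM_BB.  */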
1178*38fd1498Szrj 
1179*38fd1498Szrj static edge
1180*38fd1498Szrj slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1181*38fd1498Szrj 		       basic_block guard_to, basic_block dom_bb,
1182*38fd1498Szrj 		       profile_probability probability, bool irreducible_p)
1183*38fd1498Szrj {
1184*38fd1498Szrj   gimple_stmt_iterator gsi;
1185*38fd1498Szrj   edge new_e, enter_e;
1186*38fd1498Szrj   gcond *cond_stmt;
1187*38fd1498Szrj   gimple_seq gimplify_stmt_list = NULL;
1188*38fd1498Szrj 
1189*38fd1498Szrj   enter_e = EDGE_SUCC (guard_bb, 0);
1190*38fd1498Szrj   enter_e->flags &= ~EDGE_FALLTHRU;
1191*38fd1498Szrj   enter_e->flags |= EDGE_FALSE_VALUE;
1192*38fd1498Szrj   gsi = gsi_last_bb (guard_bb);
1193*38fd1498Szrj 
1194*38fd1498Szrj   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1195*38fd1498Szrj 				 NULL_TREE);
1196*38fd1498Szrj   if (gimplify_stmt_list)
1197*38fd1498Szrj     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1198*38fd1498Szrj 
1199*38fd1498Szrj   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1200*38fd1498Szrj   gsi = gsi_last_bb (guard_bb);
1201*38fd1498Szrj   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1202*38fd1498Szrj 
1203*38fd1498Szrj   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1204*38fd1498Szrj   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1205*38fd1498Szrj 
1206*38fd1498Szrj   new_e->probability = probability;
1207*38fd1498Szrj   if (irreducible_p)
1208*38fd1498Szrj     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1209*38fd1498Szrj 
1210*38fd1498Szrj   enter_e->probability = probability.invert ();
1211*38fd1498Szrj   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1212*38fd1498Szrj 
1213*38fd1498Szrj   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1214*38fd1498Szrj   if (enter_e->dest->loop_father->header == enter_e->dest)
1215*38fd1498Szrj     split_edge (enter_e);
1216*38fd1498Szrj 
1217*38fd1498Szrj   return new_e;
1218*38fd1498Szrj }
1219*38fd1498Szrj 
1220*38fd1498Szrj 
1221*38fd1498Szrj /* This function verifies that the following restrictions apply to LOOP:
1222*38fd1498Szrj    (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1223*38fd1498Szrj        for an innermost loop, or of 5 basic blocks for an outer loop.
1224*38fd1498Szrj    (2) it is single entry, single exit
1225*38fd1498Szrj    (3) its exit condition is the last stmt in the header
1226*38fd1498Szrj    (4) E is the entry/exit edge of LOOP.
1227*38fd1498Szrj  */
1228*38fd1498Szrj 
1229*38fd1498Szrj bool
1230*38fd1498Szrj slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1231*38fd1498Szrj {
1232*38fd1498Szrj   edge exit_e = single_exit (loop);
1233*38fd1498Szrj   edge entry_e = loop_preheader_edge (loop);
1234*38fd1498Szrj   gcond *orig_cond = get_loop_exit_condition (loop);
1235*38fd1498Szrj   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1236*38fd1498Szrj   unsigned int num_bb = loop->inner ? 5 : 2;
1237*38fd1498Szrj 
1238*38fd1498Szrj   /* All loops have an outer scope; the only case in which loop->outer is
1239*38fd1498Szrj      NULL is for the function itself.  */
1240*38fd1498Szrj   if (!loop_outer (loop)
1241*38fd1498Szrj       || loop->num_nodes != num_bb
1242*38fd1498Szrj       || !empty_block_p (loop->latch)
1243*38fd1498Szrj       || !single_exit (loop)
1244*38fd1498Szrj       /* Verify that new loop exit condition can be trivially modified.  */
1245*38fd1498Szrj       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1246*38fd1498Szrj       || (e != exit_e && e != entry_e))
1247*38fd1498Szrj     return false;
1248*38fd1498Szrj 
1249*38fd1498Szrj   return true;
1250*38fd1498Szrj }
1251*38fd1498Szrj 
1252*38fd1498Szrj /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1253*38fd1498Szrj    in the exit bb and rename all the uses after the loop.  This simplifies
1254*38fd1498Szrj    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1255*38fd1498Szrj    (but normally loop closed SSA form doesn't require virtual PHIs to be
1256*38fd1498Szrj    in the same form).  Doing this early simplifies checking which
1257*38fd1498Szrj    uses should be renamed.  */
1258*38fd1498Szrj 
1259*38fd1498Szrj static void
1260*38fd1498Szrj create_lcssa_for_virtual_phi (struct loop *loop)
1261*38fd1498Szrj {
1262*38fd1498Szrj   gphi_iterator gsi;
1263*38fd1498Szrj   edge exit_e = single_exit (loop);
1264*38fd1498Szrj 
1265*38fd1498Szrj   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1266*38fd1498Szrj     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1267*38fd1498Szrj       {
1268*38fd1498Szrj 	gphi *phi = gsi.phi ();
1269*38fd1498Szrj 	for (gsi = gsi_start_phis (exit_e->dest);
1270*38fd1498Szrj 	     !gsi_end_p (gsi); gsi_next (&gsi))
1271*38fd1498Szrj 	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1272*38fd1498Szrj 	    break;
1273*38fd1498Szrj 	if (gsi_end_p (gsi))
1274*38fd1498Szrj 	  {
1275*38fd1498Szrj 	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1276*38fd1498Szrj 	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1277*38fd1498Szrj 	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1278*38fd1498Szrj 	    imm_use_iterator imm_iter;
1279*38fd1498Szrj 	    gimple *stmt;
1280*38fd1498Szrj 	    use_operand_p use_p;
1281*38fd1498Szrj 
1282*38fd1498Szrj 	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1283*38fd1498Szrj 	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1284*38fd1498Szrj 	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1285*38fd1498Szrj 	    gimple_phi_set_result (new_phi, new_vop);
1286*38fd1498Szrj 	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1287*38fd1498Szrj 	      if (stmt != new_phi
1288*38fd1498Szrj 		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1289*38fd1498Szrj 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1290*38fd1498Szrj 		  SET_USE (use_p, new_vop);
1291*38fd1498Szrj 	  }
1292*38fd1498Szrj 	break;
1293*38fd1498Szrj       }
1294*38fd1498Szrj 
1295*38fd1498Szrj }
1296*38fd1498Szrj 
1297*38fd1498Szrj /* Function vect_get_loop_location.
1298*38fd1498Szrj 
1299*38fd1498Szrj    Extract the location of the loop in the source code.
1300*38fd1498Szrj    If the loop is not well formed for vectorization, an estimated
1301*38fd1498Szrj    location is calculated.
1302*38fd1498Szrj    Return the loop location if found and UNKNOWN_LOCATION if not.  */
1303*38fd1498Szrj 
1304*38fd1498Szrj source_location
1305*38fd1498Szrj find_loop_location (struct loop *loop)
1306*38fd1498Szrj {
1307*38fd1498Szrj   gimple *stmt = NULL;
1308*38fd1498Szrj   basic_block bb;
1309*38fd1498Szrj   gimple_stmt_iterator si;
1310*38fd1498Szrj 
1311*38fd1498Szrj   if (!loop)
1312*38fd1498Szrj     return UNKNOWN_LOCATION;
1313*38fd1498Szrj 
1314*38fd1498Szrj   stmt = get_loop_exit_condition (loop);
1315*38fd1498Szrj 
1316*38fd1498Szrj   if (stmt
1317*38fd1498Szrj       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318*38fd1498Szrj     return gimple_location (stmt);
1319*38fd1498Szrj 
1320*38fd1498Szrj   /* If we got here the loop is probably not "well formed",
1321*38fd1498Szrj      try to estimate the loop location.  */
1322*38fd1498Szrj 
1323*38fd1498Szrj   if (!loop->header)
1324*38fd1498Szrj     return UNKNOWN_LOCATION;
1325*38fd1498Szrj 
1326*38fd1498Szrj   bb = loop->header;
1327*38fd1498Szrj 
1328*38fd1498Szrj   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329*38fd1498Szrj     {
1330*38fd1498Szrj       stmt = gsi_stmt (si);
1331*38fd1498Szrj       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332*38fd1498Szrj         return gimple_location (stmt);
1333*38fd1498Szrj     }
1334*38fd1498Szrj 
1335*38fd1498Szrj   return UNKNOWN_LOCATION;
1336*38fd1498Szrj }
1337*38fd1498Szrj 
1338*38fd1498Szrj /* Return true if PHI defines an IV of the loop to be vectorized.  */
1339*38fd1498Szrj 
1340*38fd1498Szrj static bool
1341*38fd1498Szrj iv_phi_p (gphi *phi)
1342*38fd1498Szrj {
1343*38fd1498Szrj   if (virtual_operand_p (PHI_RESULT (phi)))
1344*38fd1498Szrj     return false;
1345*38fd1498Szrj 
1346*38fd1498Szrj   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1347*38fd1498Szrj   gcc_assert (stmt_info != NULL);
1348*38fd1498Szrj   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349*38fd1498Szrj       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350*38fd1498Szrj     return false;
1351*38fd1498Szrj 
1352*38fd1498Szrj   return true;
1353*38fd1498Szrj }
1354*38fd1498Szrj 
1355*38fd1498Szrj /* Function vect_can_advance_ivs_p
1356*38fd1498Szrj 
1357*38fd1498Szrj    In case the number of iterations that LOOP iterates is unknown at compile
1358*38fd1498Szrj    time, an epilog loop will be generated, and the loop induction variables
1359*38fd1498Szrj    (IVs) will be "advanced" to the value they are supposed to take just before
1360*38fd1498Szrj    the epilog loop.  Here we check that the access function of the loop IVs
1361*38fd1498Szrj    and the expression that represents the loop bound are simple enough.
1362*38fd1498Szrj    These restrictions will be relaxed in the future.  */
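/* For example (hypothetical GIMPLE), an IV such as

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + step_3;   ;; step_3 loop-invariant

   can be advanced, whereas an IV whose evolution part is itself a chrec,
   e.g. the step of {0, +, {1, +, 1}}_1 (a polynomial of degree 2), is
   rejected by the checks below.  */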
1363*38fd1498Szrj 
1364*38fd1498Szrj bool
1365*38fd1498Szrj vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366*38fd1498Szrj {
1367*38fd1498Szrj   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368*38fd1498Szrj   basic_block bb = loop->header;
1369*38fd1498Szrj   gphi_iterator gsi;
1370*38fd1498Szrj 
1371*38fd1498Szrj   /* Analyze phi functions of the loop header.  */
1372*38fd1498Szrj 
1373*38fd1498Szrj   if (dump_enabled_p ())
1374*38fd1498Szrj     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375*38fd1498Szrj   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376*38fd1498Szrj     {
1377*38fd1498Szrj       tree evolution_part;
1378*38fd1498Szrj 
1379*38fd1498Szrj       gphi *phi = gsi.phi ();
1380*38fd1498Szrj       if (dump_enabled_p ())
1381*38fd1498Szrj 	{
1382*38fd1498Szrj           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1383*38fd1498Szrj           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1384*38fd1498Szrj 	}
1385*38fd1498Szrj 
1386*38fd1498Szrj       /* Skip virtual phis.  The data dependences that are associated with
1387*38fd1498Szrj 	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1388*38fd1498Szrj 
1389*38fd1498Szrj 	 Skip reduction phis.  */
1390*38fd1498Szrj       if (!iv_phi_p (phi))
1391*38fd1498Szrj 	{
1392*38fd1498Szrj 	  if (dump_enabled_p ())
1393*38fd1498Szrj 	    dump_printf_loc (MSG_NOTE, vect_location,
1394*38fd1498Szrj 			     "reduc or virtual phi. skip.\n");
1395*38fd1498Szrj 	  continue;
1396*38fd1498Szrj 	}
1397*38fd1498Szrj 
1398*38fd1498Szrj       /* Analyze the evolution function.  */
1399*38fd1498Szrj 
1400*38fd1498Szrj       evolution_part
1401*38fd1498Szrj 	= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1402*38fd1498Szrj       if (evolution_part == NULL_TREE)
1403*38fd1498Szrj         {
1404*38fd1498Szrj 	  if (dump_enabled_p ())
1405*38fd1498Szrj 	    dump_printf (MSG_MISSED_OPTIMIZATION,
1406*38fd1498Szrj 			 "No access function or evolution.\n");
1407*38fd1498Szrj 	  return false;
1408*38fd1498Szrj         }
1409*38fd1498Szrj 
1410*38fd1498Szrj       /* FORNOW: We do not transform initial conditions of IVs
1411*38fd1498Szrj 	 whose evolution functions are not invariant in the loop.  */
1412*38fd1498Szrj 
1413*38fd1498Szrj       if (!expr_invariant_in_loop_p (loop, evolution_part))
1414*38fd1498Szrj 	{
1415*38fd1498Szrj 	  if (dump_enabled_p ())
1416*38fd1498Szrj 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417*38fd1498Szrj 			     "evolution not invariant in loop.\n");
1418*38fd1498Szrj 	  return false;
1419*38fd1498Szrj 	}
1420*38fd1498Szrj 
1421*38fd1498Szrj       /* FORNOW: We do not transform initial conditions of IVs
1422*38fd1498Szrj 	 whose evolution functions are polynomials of degree >= 2.  */
1423*38fd1498Szrj 
1424*38fd1498Szrj       if (tree_is_chrec (evolution_part))
1425*38fd1498Szrj 	{
1426*38fd1498Szrj 	  if (dump_enabled_p ())
1427*38fd1498Szrj 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428*38fd1498Szrj 			     "evolution is chrec.\n");
1429*38fd1498Szrj 	  return false;
1430*38fd1498Szrj 	}
1431*38fd1498Szrj     }
1432*38fd1498Szrj 
1433*38fd1498Szrj   return true;
1434*38fd1498Szrj }
1435*38fd1498Szrj 
1436*38fd1498Szrj 
1437*38fd1498Szrj /*   Function vect_update_ivs_after_vectorizer.
1438*38fd1498Szrj 
1439*38fd1498Szrj      "Advance" the induction variables of LOOP to the value they should take
1440*38fd1498Szrj      after the execution of LOOP.  This is currently necessary because the
1441*38fd1498Szrj      vectorizer does not handle induction variables that are used after the
1442*38fd1498Szrj      loop.  Such a situation occurs when the last iterations of LOOP are
1443*38fd1498Szrj      peeled, because:
1444*38fd1498Szrj      1. We introduced new uses after LOOP for IVs that were not originally used
1445*38fd1498Szrj         after LOOP: the IVs of LOOP are now used by an epilog loop.
1446*38fd1498Szrj      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1447*38fd1498Szrj         times, whereas the loop IVs should be bumped N times.
1448*38fd1498Szrj 
1449*38fd1498Szrj      Input:
1450*38fd1498Szrj      - LOOP - a loop that is going to be vectorized. The last few iterations
1451*38fd1498Szrj               of LOOP were peeled.
1452*38fd1498Szrj      - NITERS - the number of iterations that LOOP executes (before it is
1453*38fd1498Szrj                 vectorized), i.e., the number of times the IVs should be bumped.
1454*38fd1498Szrj      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1455*38fd1498Szrj                   coming out from LOOP on which there are uses of the LOOP ivs
1456*38fd1498Szrj 		  (this is the path from LOOP->exit to epilog_loop->preheader).
1457*38fd1498Szrj 
1458*38fd1498Szrj                   The new definitions of the ivs are placed in LOOP->exit.
1459*38fd1498Szrj                   The phi args associated with the edge UPDATE_E in the bb
1460*38fd1498Szrj                   UPDATE_E->dest are updated accordingly.
1461*38fd1498Szrj 
1462*38fd1498Szrj      Assumption 1: Like the rest of the vectorizer, this function assumes
1463*38fd1498Szrj      a single loop exit that has a single predecessor.
1464*38fd1498Szrj 
1465*38fd1498Szrj      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1466*38fd1498Szrj      organized in the same order.
1467*38fd1498Szrj 
1468*38fd1498Szrj      Assumption 3: The access function of the ivs is simple enough (see
1469*38fd1498Szrj      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1470*38fd1498Szrj 
1471*38fd1498Szrj      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1472*38fd1498Szrj      coming out of LOOP on which the ivs of LOOP are used (this is the path
1473*38fd1498Szrj      that leads to the epilog loop; other paths skip the epilog loop).  This
1474*38fd1498Szrj      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1475*38fd1498Szrj      needs to have its phis updated.
1476*38fd1498Szrj  */
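/* For example (illustrative values), for an IV with initial value init_4
   and loop-invariant step step_5, the code below emits

     ni = init_4 + NITERS * step_5

   in the exit bb and makes it the PHI argument on UPDATE_E, so the epilog
   loop resumes the IV where the peeled iterations should start.  */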
1477*38fd1498Szrj 
1478*38fd1498Szrj static void
1479*38fd1498Szrj vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1480*38fd1498Szrj 				  tree niters, edge update_e)
1481*38fd1498Szrj {
1482*38fd1498Szrj   gphi_iterator gsi, gsi1;
1483*38fd1498Szrj   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1484*38fd1498Szrj   basic_block update_bb = update_e->dest;
1485*38fd1498Szrj   basic_block exit_bb = single_exit (loop)->dest;
1486*38fd1498Szrj 
1487*38fd1498Szrj   /* Make sure there exists a single-predecessor exit bb:  */
1488*38fd1498Szrj   gcc_assert (single_pred_p (exit_bb));
1489*38fd1498Szrj   gcc_assert (single_succ_edge (exit_bb) == update_e);
1490*38fd1498Szrj 
1491*38fd1498Szrj   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1492*38fd1498Szrj        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1493*38fd1498Szrj        gsi_next (&gsi), gsi_next (&gsi1))
1494*38fd1498Szrj     {
1495*38fd1498Szrj       tree init_expr;
1496*38fd1498Szrj       tree step_expr, off;
1497*38fd1498Szrj       tree type;
1498*38fd1498Szrj       tree var, ni, ni_name;
1499*38fd1498Szrj       gimple_stmt_iterator last_gsi;
1500*38fd1498Szrj 
1501*38fd1498Szrj       gphi *phi = gsi.phi ();
1502*38fd1498Szrj       gphi *phi1 = gsi1.phi ();
1503*38fd1498Szrj       if (dump_enabled_p ())
1504*38fd1498Szrj 	{
1505*38fd1498Szrj 	  dump_printf_loc (MSG_NOTE, vect_location,
1506*38fd1498Szrj 			   "vect_update_ivs_after_vectorizer: phi: ");
1507*38fd1498Szrj 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1508*38fd1498Szrj 	}
1509*38fd1498Szrj 
1510*38fd1498Szrj       /* Skip reduction and virtual phis.  */
1511*38fd1498Szrj       if (!iv_phi_p (phi))
1512*38fd1498Szrj 	{
1513*38fd1498Szrj 	  if (dump_enabled_p ())
1514*38fd1498Szrj 	    dump_printf_loc (MSG_NOTE, vect_location,
1515*38fd1498Szrj 			     "reduc or virtual phi. skip.\n");
1516*38fd1498Szrj 	  continue;
1517*38fd1498Szrj 	}
1518*38fd1498Szrj 
1519*38fd1498Szrj       type = TREE_TYPE (gimple_phi_result (phi));
1520*38fd1498Szrj       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1521*38fd1498Szrj       step_expr = unshare_expr (step_expr);
1522*38fd1498Szrj 
1523*38fd1498Szrj       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1524*38fd1498Szrj          of degree >= 2 or exponential.  */
1525*38fd1498Szrj       gcc_assert (!tree_is_chrec (step_expr));
1526*38fd1498Szrj 
1527*38fd1498Szrj       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1528*38fd1498Szrj 
1529*38fd1498Szrj       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1530*38fd1498Szrj 			 fold_convert (TREE_TYPE (step_expr), niters),
1531*38fd1498Szrj 			 step_expr);
1532*38fd1498Szrj       if (POINTER_TYPE_P (type))
1533*38fd1498Szrj 	ni = fold_build_pointer_plus (init_expr, off);
1534*38fd1498Szrj       else
1535*38fd1498Szrj 	ni = fold_build2 (PLUS_EXPR, type,
1536*38fd1498Szrj 			  init_expr, fold_convert (type, off));
1537*38fd1498Szrj 
1538*38fd1498Szrj       var = create_tmp_var (type, "tmp");
1539*38fd1498Szrj 
1540*38fd1498Szrj       last_gsi = gsi_last_bb (exit_bb);
1541*38fd1498Szrj       gimple_seq new_stmts = NULL;
1542*38fd1498Szrj       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1543*38fd1498Szrj       /* Exit_bb shouldn't be empty.  */
1544*38fd1498Szrj       if (!gsi_end_p (last_gsi))
1545*38fd1498Szrj 	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1546*38fd1498Szrj       else
1547*38fd1498Szrj 	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1548*38fd1498Szrj 
1549*38fd1498Szrj       /* Fix phi expressions in the successor bb.  */
1550*38fd1498Szrj       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1551*38fd1498Szrj     }
1552*38fd1498Szrj }
1553*38fd1498Szrj 
1554*38fd1498Szrj /* Return a gimple value containing the misalignment (measured in vector
1555*38fd1498Szrj    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1556*38fd1498Szrj    it is away from a perfectly aligned address.  Add any new statements
1557*38fd1498Szrj    to SEQ.  */
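/* For example (illustrative numbers): with 4-byte elements, a target
   alignment of 16 bytes and a start address whose low bits are 0x8,

     misalign_in_bytes = addr & (16 - 1) = 8
     misalign_in_elems = 8 >> log2 (4) = 2.  */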
1558*38fd1498Szrj 
1559*38fd1498Szrj static tree
1560*38fd1498Szrj get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1561*38fd1498Szrj {
1562*38fd1498Szrj   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1563*38fd1498Szrj   gimple *dr_stmt = DR_STMT (dr);
1564*38fd1498Szrj   stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1565*38fd1498Szrj   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1566*38fd1498Szrj 
1567*38fd1498Szrj   unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1568*38fd1498Szrj   gcc_assert (target_align != 0);
1569*38fd1498Szrj 
1570*38fd1498Szrj   bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1571*38fd1498Szrj   tree offset = (negative
1572*38fd1498Szrj 		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1573*38fd1498Szrj 		 : size_zero_node);
1574*38fd1498Szrj   tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
1575*38fd1498Szrj 							  offset);
1576*38fd1498Szrj   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1577*38fd1498Szrj   tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1578*38fd1498Szrj   HOST_WIDE_INT elem_size
1579*38fd1498Szrj     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1580*38fd1498Szrj   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1581*38fd1498Szrj 
1582*38fd1498Szrj   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1583*38fd1498Szrj   tree int_start_addr = fold_convert (type, start_addr);
1584*38fd1498Szrj   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1585*38fd1498Szrj 					target_align_minus_1);
1586*38fd1498Szrj 
1587*38fd1498Szrj   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1588*38fd1498Szrj   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1589*38fd1498Szrj 					elem_size_log);
1590*38fd1498Szrj 
1591*38fd1498Szrj   return misalign_in_elems;
1592*38fd1498Szrj }
1593*38fd1498Szrj 
1594*38fd1498Szrj /* Function vect_gen_prolog_loop_niters
1595*38fd1498Szrj 
1596*38fd1498Szrj    Generate the number of iterations which should be peeled as prolog for the
1597*38fd1498Szrj    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1598*38fd1498Szrj    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1599*38fd1498Szrj    As a result, after the execution of this loop, the data reference DR will
1600*38fd1498Szrj    refer to an aligned location.  The following computation is generated:
1601*38fd1498Szrj 
1602*38fd1498Szrj    If the misalignment of DR is known at compile time:
1603*38fd1498Szrj      addr_mis = int mis = DR_MISALIGNMENT (dr);
1604*38fd1498Szrj    Else, compute address misalignment in bytes:
1605*38fd1498Szrj      addr_mis = addr & (target_align - 1)
1606*38fd1498Szrj 
1607*38fd1498Szrj    prolog_niters = ((VF - addr_mis / elem_size) & (VF - 1)) / step
1608*38fd1498Szrj 
1609*38fd1498Szrj    (elem_size = element type size; an element is the scalar element whose type
1610*38fd1498Szrj    is the inner type of the vectype)
1611*38fd1498Szrj 
1612*38fd1498Szrj    The computations will be emitted at the end of BB.  We also compute and
1613*38fd1498Szrj    store the upper bound (inclusive) of the result in BOUND.
1614*38fd1498Szrj 
1615*38fd1498Szrj    When the step of the data-ref in the loop is not 1 (as in interleaved data
1616*38fd1498Szrj    and SLP), the number of iterations of the prolog must be divided by the step
1617*38fd1498Szrj    (which is equal to the size of the interleaved group).
1618*38fd1498Szrj 
1619*38fd1498Szrj    The above formulas assume that VF == number of elements in the vector. This
1620*38fd1498Szrj    may not hold when there are multiple types in the loop.
1621*38fd1498Szrj    In this case, for some data-references in the loop the VF does not represent
1622*38fd1498Szrj    the number of elements that fit in the vector.  Therefore, instead of VF we
1623*38fd1498Szrj    use TYPE_VECTOR_SUBPARTS.  */
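/* As a worked instance of the formula above (illustrative numbers):
   for VF == 4, elem_size == 4, step == 1 and addr_mis == 8 bytes,

     prolog_niters = ((4 - 8/4) & (4-1)) / 1 = 2

   i.e. peeling two scalar iterations aligns DR.  */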
1624*38fd1498Szrj 
1625*38fd1498Szrj static tree
1626*38fd1498Szrj vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1627*38fd1498Szrj 			     basic_block bb, int *bound)
1628*38fd1498Szrj {
1629*38fd1498Szrj   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1630*38fd1498Szrj   tree var;
1631*38fd1498Szrj   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1632*38fd1498Szrj   gimple_seq stmts = NULL, new_stmts = NULL;
1633*38fd1498Szrj   tree iters, iters_name;
1634*38fd1498Szrj   gimple *dr_stmt = DR_STMT (dr);
1635*38fd1498Szrj   stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1636*38fd1498Szrj   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1637*38fd1498Szrj   unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1638*38fd1498Szrj 
1639*38fd1498Szrj   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1640*38fd1498Szrj     {
1641*38fd1498Szrj       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1642*38fd1498Szrj 
1643*38fd1498Szrj       if (dump_enabled_p ())
1644*38fd1498Szrj         dump_printf_loc (MSG_NOTE, vect_location,
1645*38fd1498Szrj                          "known peeling = %d.\n", npeel);
1646*38fd1498Szrj 
1647*38fd1498Szrj       iters = build_int_cst (niters_type, npeel);
1648*38fd1498Szrj       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649*38fd1498Szrj     }
1650*38fd1498Szrj   else
1651*38fd1498Szrj     {
1652*38fd1498Szrj       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1653*38fd1498Szrj       tree type = TREE_TYPE (misalign_in_elems);
1654*38fd1498Szrj       HOST_WIDE_INT elem_size
1655*38fd1498Szrj 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1656*38fd1498Szrj       HOST_WIDE_INT align_in_elems = target_align / elem_size;
1657*38fd1498Szrj       tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1658*38fd1498Szrj       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1659*38fd1498Szrj 
1660*38fd1498Szrj       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1661*38fd1498Szrj 				 & (align_in_elems - 1)).  */
1662*38fd1498Szrj       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1663*38fd1498Szrj       if (negative)
1664*38fd1498Szrj 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1665*38fd1498Szrj 			     align_in_elems_tree);
1666*38fd1498Szrj       else
1667*38fd1498Szrj 	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1668*38fd1498Szrj 			     misalign_in_elems);
1669*38fd1498Szrj       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1670*38fd1498Szrj       iters = fold_convert (niters_type, iters);
1671*38fd1498Szrj       *bound = align_in_elems - 1;
1672*38fd1498Szrj     }
1673*38fd1498Szrj 
1674*38fd1498Szrj   if (dump_enabled_p ())
1675*38fd1498Szrj     {
1676*38fd1498Szrj       dump_printf_loc (MSG_NOTE, vect_location,
1677*38fd1498Szrj                        "niters for prolog loop: ");
1678*38fd1498Szrj       dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1679*38fd1498Szrj       dump_printf (MSG_NOTE, "\n");
1680*38fd1498Szrj     }
1681*38fd1498Szrj 
1682*38fd1498Szrj   var = create_tmp_var (niters_type, "prolog_loop_niters");
1683*38fd1498Szrj   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1684*38fd1498Szrj 
1685*38fd1498Szrj   if (new_stmts)
1686*38fd1498Szrj     gimple_seq_add_seq (&stmts, new_stmts);
1687*38fd1498Szrj   if (stmts)
1688*38fd1498Szrj     {
1689*38fd1498Szrj       gcc_assert (single_succ_p (bb));
1690*38fd1498Szrj       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1691*38fd1498Szrj       if (gsi_end_p (gsi))
1692*38fd1498Szrj 	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1693*38fd1498Szrj       else
1694*38fd1498Szrj 	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1695*38fd1498Szrj     }
1696*38fd1498Szrj   return iters_name;
1697*38fd1498Szrj }
1698*38fd1498Szrj 
1699*38fd1498Szrj 
1700*38fd1498Szrj /* Function vect_update_init_of_dr
1701*38fd1498Szrj 
1702*38fd1498Szrj    If CODE is PLUS, the vector loop starts NITERS iterations after the
1703*38fd1498Szrj    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1704*38fd1498Szrj    iterations before the scalar one (using masking to skip inactive
1705*38fd1498Szrj    elements).  This function updates the information recorded in DR to
1706*38fd1498Szrj    account for the difference.  Specifically, it updates the OFFSET
1707*38fd1498Szrj    field of DR.  */
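/* For example (illustrative values): with DR_STEP 4 and NITERS 2, a PLUS
   update rewrites DR_OFFSET from off to off + 2 * 4, accounting for the
   8 bytes already covered by the peeled iterations.  */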
1708*38fd1498Szrj 
1709*38fd1498Szrj static void
1710*38fd1498Szrj vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1711*38fd1498Szrj {
1712*38fd1498Szrj   tree offset = DR_OFFSET (dr);
1713*38fd1498Szrj 
1714*38fd1498Szrj   niters = fold_build2 (MULT_EXPR, sizetype,
1715*38fd1498Szrj 			fold_convert (sizetype, niters),
1716*38fd1498Szrj 			fold_convert (sizetype, DR_STEP (dr)));
1717*38fd1498Szrj   offset = fold_build2 (code, sizetype,
1718*38fd1498Szrj 			fold_convert (sizetype, offset), niters);
1719*38fd1498Szrj   DR_OFFSET (dr) = offset;
1720*38fd1498Szrj }
1721*38fd1498Szrj 
1722*38fd1498Szrj 
1723*38fd1498Szrj /* Function vect_update_inits_of_drs
1724*38fd1498Szrj 
1725*38fd1498Szrj    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1726*38fd1498Szrj    CODE and NITERS are as for vect_update_init_of_dr.  */
1727*38fd1498Szrj 
1728*38fd1498Szrj static void
1729*38fd1498Szrj vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1730*38fd1498Szrj 			  tree_code code)
1731*38fd1498Szrj {
1732*38fd1498Szrj   unsigned int i;
1733*38fd1498Szrj   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1734*38fd1498Szrj   struct data_reference *dr;
1735*38fd1498Szrj 
1736*38fd1498Szrj   if (dump_enabled_p ())
1737*38fd1498Szrj     dump_printf_loc (MSG_NOTE, vect_location,
1738*38fd1498Szrj 		     "=== vect_update_inits_of_drs ===\n");
1739*38fd1498Szrj 
1740*38fd1498Szrj   /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
1741*38fd1498Szrj   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1742*38fd1498Szrj     {
1743*38fd1498Szrj       gimple_seq seq;
1744*38fd1498Szrj       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1745*38fd1498Szrj       tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1746*38fd1498Szrj 
1747*38fd1498Szrj       niters = fold_convert (sizetype, niters);
1748*38fd1498Szrj       niters = force_gimple_operand (niters, &seq, false, var);
1749*38fd1498Szrj       if (seq)
1750*38fd1498Szrj 	{
1751*38fd1498Szrj 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1752*38fd1498Szrj 	  gcc_assert (!new_bb);
1753*38fd1498Szrj 	}
1754*38fd1498Szrj     }
1755*38fd1498Szrj 
1756*38fd1498Szrj   FOR_EACH_VEC_ELT (datarefs, i, dr)
1757*38fd1498Szrj     vect_update_init_of_dr (dr, niters, code);
1758*38fd1498Szrj }
1759*38fd1498Szrj 
1760*38fd1498Szrj /* Using the information recorded in LOOP_VINFO, prepare the loop for peeling
1761*38fd1498Szrj    by masking.  This involves calculating the number of iterations to
1762*38fd1498Szrj    be peeled and then aligning all memory references appropriately.  */
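/* For example (illustrative values): with VF == 8 and a known peeling
   amount of 3, misalign_in_elems == 8 - 3 == 5, so the data references
   are moved back by 5 elements to the aligned boundary and, in effect,
   the first 5 lanes of the initial vector iteration are masked off.  */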
1763*38fd1498Szrj 
1764*38fd1498Szrj void
1765*38fd1498Szrj vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1766*38fd1498Szrj {
1767*38fd1498Szrj   tree misalign_in_elems;
1768*38fd1498Szrj   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1769*38fd1498Szrj 
1770*38fd1498Szrj   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1771*38fd1498Szrj 
1772*38fd1498Szrj   /* From the information recorded in LOOP_VINFO get the number of iterations
1773*38fd1498Szrj      that need to be skipped via masking.  */
1774*38fd1498Szrj   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1775*38fd1498Szrj     {
1776*38fd1498Szrj       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1777*38fd1498Szrj 			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1778*38fd1498Szrj       misalign_in_elems = build_int_cst (type, misalign);
1779*38fd1498Szrj     }
1780*38fd1498Szrj   else
1781*38fd1498Szrj     {
1782*38fd1498Szrj       gimple_seq seq1 = NULL, seq2 = NULL;
1783*38fd1498Szrj       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1784*38fd1498Szrj       misalign_in_elems = fold_convert (type, misalign_in_elems);
1785*38fd1498Szrj       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1786*38fd1498Szrj 						&seq2, true, NULL_TREE);
1787*38fd1498Szrj       gimple_seq_add_seq (&seq1, seq2);
1788*38fd1498Szrj       if (seq1)
1789*38fd1498Szrj 	{
1790*38fd1498Szrj 	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1791*38fd1498Szrj 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1792*38fd1498Szrj 	  gcc_assert (!new_bb);
1793*38fd1498Szrj 	}
1794*38fd1498Szrj     }
1795*38fd1498Szrj 
1796*38fd1498Szrj   if (dump_enabled_p ())
1797*38fd1498Szrj     {
1798*38fd1498Szrj       dump_printf_loc (MSG_NOTE, vect_location,
1799*38fd1498Szrj 		       "misalignment for fully-masked loop: ");
1800*38fd1498Szrj       dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1801*38fd1498Szrj       dump_printf (MSG_NOTE, "\n");
1802*38fd1498Szrj     }
1803*38fd1498Szrj 
1804*38fd1498Szrj   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1805*38fd1498Szrj 
1806*38fd1498Szrj   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1807*38fd1498Szrj }
1808*38fd1498Szrj 
1809*38fd1498Szrj /* This function builds ni_name = number of iterations.  Statements
1810*38fd1498Szrj    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1811*38fd1498Szrj    it to TRUE if a new ssa_var is generated.  */
1812*38fd1498Szrj 
1813*38fd1498Szrj tree
1814*38fd1498Szrj vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1815*38fd1498Szrj {
1816*38fd1498Szrj   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1817*38fd1498Szrj   if (TREE_CODE (ni) == INTEGER_CST)
1818*38fd1498Szrj     return ni;
1819*38fd1498Szrj   else
1820*38fd1498Szrj     {
1821*38fd1498Szrj       tree ni_name, var;
1822*38fd1498Szrj       gimple_seq stmts = NULL;
1823*38fd1498Szrj       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1824*38fd1498Szrj 
1825*38fd1498Szrj       var = create_tmp_var (TREE_TYPE (ni), "niters");
1826*38fd1498Szrj       ni_name = force_gimple_operand (ni, &stmts, false, var);
1827*38fd1498Szrj       if (stmts)
1828*38fd1498Szrj 	{
1829*38fd1498Szrj 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1830*38fd1498Szrj 	  if (new_var_p != NULL)
1831*38fd1498Szrj 	    *new_var_p = true;
1832*38fd1498Szrj 	}
1833*38fd1498Szrj 
1834*38fd1498Szrj       return ni_name;
1835*38fd1498Szrj     }
1836*38fd1498Szrj }
1837*38fd1498Szrj 
1838*38fd1498Szrj /* Calculate the number of iterations above which the vectorized loop will
1839*38fd1498Szrj    be preferred over the scalar loop.  NITERS_PROLOG is the number of
1840*38fd1498Szrj    iterations of the prolog loop.  If it is an integer const, it is also passed
1841*38fd1498Szrj    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1842*38fd1498Szrj    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1843*38fd1498Szrj    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1844*38fd1498Szrj    threshold below which the scalar (rather than vectorized) loop will be
1845*38fd1498Szrj    executed.  This function stores the upper bound (inclusive) of the result
1846*38fd1498Szrj    in BOUND_SCALAR.  */
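/* A hypothetical instance of the clamping below: with NITERS_PROLOG == 3
   (constant), BOUND_EPILOG == 7 and TH == 16, the result is
   max (3 + 7, 16 - 1) == 15, i.e. loops of up to 15 iterations stay
   scalar.  */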
1847*38fd1498Szrj 
1848*38fd1498Szrj static tree
1849*38fd1498Szrj vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1850*38fd1498Szrj 			     int bound_prolog, poly_int64 bound_epilog, int th,
1851*38fd1498Szrj 			     poly_uint64 *bound_scalar,
1852*38fd1498Szrj 			     bool check_profitability)
1853*38fd1498Szrj {
1854*38fd1498Szrj   tree type = TREE_TYPE (niters_prolog);
1855*38fd1498Szrj   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1856*38fd1498Szrj 			     build_int_cst (type, bound_epilog));
1857*38fd1498Szrj 
1858*38fd1498Szrj   *bound_scalar = bound_prolog + bound_epilog;
1859*38fd1498Szrj   if (check_profitability)
1860*38fd1498Szrj     {
1861*38fd1498Szrj       /* TH indicates the minimum niters of the vectorized loop, while we
1862*38fd1498Szrj 	 compute the maximum niters of the scalar loop.  */
1863*38fd1498Szrj       th--;
1864*38fd1498Szrj       /* Peeling a constant number of times.  */
1865*38fd1498Szrj       if (int_niters_prolog >= 0)
1866*38fd1498Szrj 	{
1867*38fd1498Szrj 	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1868*38fd1498Szrj 	  return build_int_cst (type, *bound_scalar);
1869*38fd1498Szrj 	}
1870*38fd1498Szrj       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1871*38fd1498Szrj 	 and BOUND_EPILOG are inclusive upper bounds.  */
1872*38fd1498Szrj       if (known_ge (th, bound_prolog + bound_epilog))
1873*38fd1498Szrj 	{
1874*38fd1498Szrj 	  *bound_scalar = th;
1875*38fd1498Szrj 	  return build_int_cst (type, th);
1876*38fd1498Szrj 	}
1877*38fd1498Szrj       /* Need to do runtime comparison.  */
1878*38fd1498Szrj       else if (maybe_gt (th, bound_epilog))
1879*38fd1498Szrj 	{
1880*38fd1498Szrj 	  *bound_scalar = upper_bound (*bound_scalar, th);
1881*38fd1498Szrj 	  return fold_build2 (MAX_EXPR, type,
1882*38fd1498Szrj 			      build_int_cst (type, th), niters);
1883*38fd1498Szrj 	}
1884*38fd1498Szrj     }
1885*38fd1498Szrj   return niters;
1886*38fd1498Szrj }
1887*38fd1498Szrj 
1888*38fd1498Szrj /* NITERS is the number of times that the original scalar loop executes
1889*38fd1498Szrj    after peeling.  Work out the maximum number of iterations N that can
1890*38fd1498Szrj    be handled by the vectorized form of the loop and then either:
1891*38fd1498Szrj 
1892*38fd1498Szrj    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1893*38fd1498Szrj 
1894*38fd1498Szrj 	niters_vector = N
1895*38fd1498Szrj 
1896*38fd1498Szrj    b) set *STEP_VECTOR_PTR to one and generate:
1897*38fd1498Szrj 
1898*38fd1498Szrj         niters_vector = N / vf
1899*38fd1498Szrj 
1900*38fd1498Szrj    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1901*38fd1498Szrj    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1902*38fd1498Szrj    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
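/* For example (illustrative numbers): with NITERS == 103 and VF == 8,
   case b) computes either

     niters_vector = 103 >> 3 = 12

   when NITERS cannot overflow, or the overflow-safe

     niters_vector = ((103 - 8) >> 3) + 1 = 12

   otherwise; 12 vector iterations cover 96 of the 103 scalar ones.  */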
1903*38fd1498Szrj 
1904*38fd1498Szrj void
1905*38fd1498Szrj vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1906*38fd1498Szrj 			     tree *niters_vector_ptr, tree *step_vector_ptr,
1907*38fd1498Szrj 			     bool niters_no_overflow)
1908*38fd1498Szrj {
1909*38fd1498Szrj   tree ni_minus_gap, var;
1910*38fd1498Szrj   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1911*38fd1498Szrj   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1912*38fd1498Szrj   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1913*38fd1498Szrj   tree log_vf = NULL_TREE;
1914*38fd1498Szrj 
1915*38fd1498Szrj   /* If an epilogue loop is required because of data accesses with gaps, we
1916*38fd1498Szrj      subtract one iteration from the total number of iterations here for
1917*38fd1498Szrj      correct calculation of RATIO.  */
1918*38fd1498Szrj   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1919*38fd1498Szrj     {
1920*38fd1498Szrj       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1921*38fd1498Szrj 				  build_one_cst (type));
1922*38fd1498Szrj       if (!is_gimple_val (ni_minus_gap))
1923*38fd1498Szrj 	{
1924*38fd1498Szrj 	  var = create_tmp_var (type, "ni_gap");
1925*38fd1498Szrj 	  gimple *stmts = NULL;
1926*38fd1498Szrj 	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1927*38fd1498Szrj 					       true, var);
1928*38fd1498Szrj 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1929*38fd1498Szrj 	}
1930*38fd1498Szrj     }
1931*38fd1498Szrj   else
1932*38fd1498Szrj     ni_minus_gap = niters;
1933*38fd1498Szrj 
1934*38fd1498Szrj   unsigned HOST_WIDE_INT const_vf;
1935*38fd1498Szrj   if (vf.is_constant (&const_vf)
1936*38fd1498Szrj       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1937*38fd1498Szrj     {
1938*38fd1498Szrj       /* Create: niters >> log2(vf) */
1939*38fd1498Szrj       /* If it's known that niters == number of latch executions + 1 doesn't
1940*38fd1498Szrj 	 overflow, we can generate niters >> log2(vf); otherwise we generate
1941*38fd1498Szrj 	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1942*38fd1498Szrj 	 will be at least one.  */
1943*38fd1498Szrj       log_vf = build_int_cst (type, exact_log2 (const_vf));
1944*38fd1498Szrj       if (niters_no_overflow)
1945*38fd1498Szrj 	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1946*38fd1498Szrj       else
1947*38fd1498Szrj 	niters_vector
1948*38fd1498Szrj 	  = fold_build2 (PLUS_EXPR, type,
1949*38fd1498Szrj 			 fold_build2 (RSHIFT_EXPR, type,
1950*38fd1498Szrj 				      fold_build2 (MINUS_EXPR, type,
1951*38fd1498Szrj 						   ni_minus_gap,
1952*38fd1498Szrj 						   build_int_cst (type, vf)),
1953*38fd1498Szrj 				      log_vf),
1954*38fd1498Szrj 			 build_int_cst (type, 1));
1955*38fd1498Szrj       step_vector = build_one_cst (type);
1956*38fd1498Szrj     }
1957*38fd1498Szrj   else
1958*38fd1498Szrj     {
1959*38fd1498Szrj       niters_vector = ni_minus_gap;
1960*38fd1498Szrj       step_vector = build_int_cst (type, vf);
1961*38fd1498Szrj     }
1962*38fd1498Szrj 
1963*38fd1498Szrj   if (!is_gimple_val (niters_vector))
1964*38fd1498Szrj     {
1965*38fd1498Szrj       var = create_tmp_var (type, "bnd");
1966*38fd1498Szrj       gimple_seq stmts = NULL;
1967*38fd1498Szrj       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1968*38fd1498Szrj       gsi_insert_seq_on_edge_immediate (pe, stmts);
1969*38fd1498Szrj       /* The peeling algorithm guarantees that the vector loop bound is at least
1970*38fd1498Szrj 	 ONE, so we set range information to make the niters analyzer's life easier.  */
1971*38fd1498Szrj       if (stmts != NULL && log_vf)
1972*38fd1498Szrj 	set_range_info (niters_vector, VR_RANGE,
1973*38fd1498Szrj 			wi::to_wide (build_int_cst (type, 1)),
1974*38fd1498Szrj 			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1975*38fd1498Szrj 						  TYPE_MAX_VALUE (type),
1976*38fd1498Szrj 						  log_vf)));
1977*38fd1498Szrj     }
1978*38fd1498Szrj   *niters_vector_ptr = niters_vector;
1979*38fd1498Szrj   *step_vector_ptr = step_vector;
1980*38fd1498Szrj 
1981*38fd1498Szrj   return;
1982*38fd1498Szrj }
1983*38fd1498Szrj 
1984*38fd1498Szrj /* Given NITERS_VECTOR which is the number of iterations for vectorized
1985*38fd1498Szrj    loop specified by LOOP_VINFO after vectorization, compute the number
1986*38fd1498Szrj    of iterations before vectorization (niters_vector * vf) and store it
1987*38fd1498Szrj    to NITERS_VECTOR_MULT_VF_PTR.  */
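/* E.g. (illustrative numbers): for NITERS_VECTOR == 12 and VF == 8 the
   shift below computes 12 << log2 (8) == 96 scalar iterations.  */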
1988*38fd1498Szrj 
1989*38fd1498Szrj static void
1990*38fd1498Szrj vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1991*38fd1498Szrj 				     tree niters_vector,
1992*38fd1498Szrj 				     tree *niters_vector_mult_vf_ptr)
1993*38fd1498Szrj {
1994*38fd1498Szrj   /* We should be using a step_vector of VF if VF is variable.  */
1995*38fd1498Szrj   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1996*38fd1498Szrj   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1997*38fd1498Szrj   tree type = TREE_TYPE (niters_vector);
1998*38fd1498Szrj   tree log_vf = build_int_cst (type, exact_log2 (vf));
1999*38fd1498Szrj   basic_block exit_bb = single_exit (loop)->dest;
2000*38fd1498Szrj 
2001*38fd1498Szrj   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2002*38fd1498Szrj   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2003*38fd1498Szrj 					    niters_vector, log_vf);
2004*38fd1498Szrj   if (!is_gimple_val (niters_vector_mult_vf))
2005*38fd1498Szrj     {
2006*38fd1498Szrj       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2007*38fd1498Szrj       gimple_seq stmts = NULL;
2008*38fd1498Szrj       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2009*38fd1498Szrj 						    &stmts, true, var);
2010*38fd1498Szrj       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2011*38fd1498Szrj       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2012*38fd1498Szrj     }
2013*38fd1498Szrj   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2014*38fd1498Szrj }
2015*38fd1498Szrj 
2016*38fd1498Szrj /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2017*38fd1498Szrj    from SECOND/FIRST and puts it at the original loop's preheader/exit
2018*38fd1498Szrj    edge; the two loops are arranged as below:
2019*38fd1498Szrj 
2020*38fd1498Szrj        preheader_a:
2021*38fd1498Szrj      first_loop:
2022*38fd1498Szrj        header_a:
2023*38fd1498Szrj 	 i_1 = PHI<i_0, i_2>;
2024*38fd1498Szrj 	 ...
2025*38fd1498Szrj 	 i_2 = i_1 + 1;
2026*38fd1498Szrj 	 if (cond_a)
2027*38fd1498Szrj 	   goto latch_a;
2028*38fd1498Szrj 	 else
2029*38fd1498Szrj 	   goto between_bb;
2030*38fd1498Szrj        latch_a:
2031*38fd1498Szrj 	 goto header_a;
2032*38fd1498Szrj 
2033*38fd1498Szrj        between_bb:
2034*38fd1498Szrj 	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST.
2035*38fd1498Szrj 
2036*38fd1498Szrj      second_loop:
2037*38fd1498Szrj        header_b:
2038*38fd1498Szrj 	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2039*38fd1498Szrj 				 or with i_2 if no LCSSA phi is created
2040*38fd1498Szrj 				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2041*38fd1498Szrj 	 ...
2042*38fd1498Szrj 	 i_4 = i_3 + 1;
2043*38fd1498Szrj 	 if (cond_b)
2044*38fd1498Szrj 	   goto latch_b;
2045*38fd1498Szrj 	 else
2046*38fd1498Szrj 	   goto exit_bb;
2047*38fd1498Szrj        latch_b:
2048*38fd1498Szrj 	 goto header_b;
2049*38fd1498Szrj 
2050*38fd1498Szrj        exit_bb:
2051*38fd1498Szrj 
2052*38fd1498Szrj    This function creates loop closed SSA for the first loop; it updates the
2053*38fd1498Szrj    second loop's PHI nodes by replacing the argument on the incoming edge with
2054*38fd1498Szrj    the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2055*38fd1498Szrj    is false, loop closed SSA phis will only be created for non-IV phis of
2056*38fd1498Szrj    the first loop.
2057*38fd1498Szrj 
2058*38fd1498Szrj    This function assumes the exit bb of the first loop is the preheader bb
2059*38fd1498Szrj    of the second loop, i.e., between_bb in the example code.  With the PHIs
2060*38fd1498Szrj    updated, the second loop will execute the remaining iterations of the first.  */
2061*38fd1498Szrj 
2062*38fd1498Szrj static void
2063*38fd1498Szrj slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2064*38fd1498Szrj 				   struct loop *first, struct loop *second,
2065*38fd1498Szrj 				   bool create_lcssa_for_iv_phis)
2066*38fd1498Szrj {
2067*38fd1498Szrj   gphi_iterator gsi_update, gsi_orig;
2068*38fd1498Szrj   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2069*38fd1498Szrj 
2070*38fd1498Szrj   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2071*38fd1498Szrj   edge second_preheader_e = loop_preheader_edge (second);
2072*38fd1498Szrj   basic_block between_bb = single_exit (first)->dest;
2073*38fd1498Szrj 
2074*38fd1498Szrj   gcc_assert (between_bb == second_preheader_e->src);
2075*38fd1498Szrj   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2076*38fd1498Szrj   /* Either the first loop or the second is the loop to be vectorized.  */
2077*38fd1498Szrj   gcc_assert (loop == first || loop == second);
2078*38fd1498Szrj 
2079*38fd1498Szrj   for (gsi_orig = gsi_start_phis (first->header),
2080*38fd1498Szrj        gsi_update = gsi_start_phis (second->header);
2081*38fd1498Szrj        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2082*38fd1498Szrj        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2083*38fd1498Szrj     {
2084*38fd1498Szrj       gphi *orig_phi = gsi_orig.phi ();
2085*38fd1498Szrj       gphi *update_phi = gsi_update.phi ();
2086*38fd1498Szrj 
2087*38fd1498Szrj       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2088*38fd1498Szrj       /* Generate lcssa PHI node for the first loop.  */
2089*38fd1498Szrj       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2090*38fd1498Szrj       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
2091*38fd1498Szrj 	{
2092*38fd1498Szrj 	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2093*38fd1498Szrj 	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2094*38fd1498Szrj 	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2095*38fd1498Szrj 	  arg = new_res;
2096*38fd1498Szrj 	}
2097*38fd1498Szrj 
2098*38fd1498Szrj       /* Update PHI node in the second loop by replacing arg on the loop's
2099*38fd1498Szrj 	 incoming edge.  */
2100*38fd1498Szrj       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2101*38fd1498Szrj     }
2102*38fd1498Szrj }
2103*38fd1498Szrj 
2104*38fd1498Szrj /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2105*38fd1498Szrj    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2106*38fd1498Szrj    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2107*38fd1498Szrj    appear like below:
2108*38fd1498Szrj 
2109*38fd1498Szrj        guard_bb:
2110*38fd1498Szrj 	 if (cond)
2111*38fd1498Szrj 	   goto merge_bb;
2112*38fd1498Szrj 	 else
2113*38fd1498Szrj 	   goto skip_loop;
2114*38fd1498Szrj 
2115*38fd1498Szrj      skip_loop:
2116*38fd1498Szrj        header_a:
2117*38fd1498Szrj 	 i_1 = PHI<i_0, i_2>;
2118*38fd1498Szrj 	 ...
2119*38fd1498Szrj 	 i_2 = i_1 + 1;
2120*38fd1498Szrj 	 if (cond_a)
2121*38fd1498Szrj 	   goto latch_a;
2122*38fd1498Szrj 	 else
2123*38fd1498Szrj 	   goto exit_a;
2124*38fd1498Szrj        latch_a:
2125*38fd1498Szrj 	 goto header_a;
2126*38fd1498Szrj 
2127*38fd1498Szrj        exit_a:
2128*38fd1498Szrj 	 i_5 = PHI<i_2>;
2129*38fd1498Szrj 
2130*38fd1498Szrj        merge_bb:
2131*38fd1498Szrj 	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2132*38fd1498Szrj 
2133*38fd1498Szrj      update_loop:
2134*38fd1498Szrj        header_b:
2135*38fd1498Szrj 	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2136*38fd1498Szrj 	 ...
2137*38fd1498Szrj 	 i_4 = i_3 + 1;
2138*38fd1498Szrj 	 if (cond_b)
2139*38fd1498Szrj 	   goto latch_b;
2140*38fd1498Szrj 	 else
2141*38fd1498Szrj 	   goto exit_bb;
2142*38fd1498Szrj        latch_b:
2143*38fd1498Szrj 	 goto header_b;
2144*38fd1498Szrj 
2145*38fd1498Szrj        exit_bb:
2146*38fd1498Szrj 
2147*38fd1498Szrj    This function creates PHI nodes at merge_bb and replaces the use of i_5
2148*38fd1498Szrj    in the update_loop's PHI node with the result of the new PHI node.  */
2149*38fd1498Szrj 
2150*38fd1498Szrj static void
2151*38fd1498Szrj slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2152*38fd1498Szrj 				    struct loop *update_loop,
2153*38fd1498Szrj 				    edge guard_edge, edge merge_edge)
2154*38fd1498Szrj {
2155*38fd1498Szrj   source_location merge_loc, guard_loc;
2156*38fd1498Szrj   edge orig_e = loop_preheader_edge (skip_loop);
2157*38fd1498Szrj   edge update_e = loop_preheader_edge (update_loop);
2158*38fd1498Szrj   gphi_iterator gsi_orig, gsi_update;
2159*38fd1498Szrj 
2160*38fd1498Szrj   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2161*38fd1498Szrj 	gsi_update = gsi_start_phis (update_loop->header));
2162*38fd1498Szrj        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2163*38fd1498Szrj        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2164*38fd1498Szrj     {
2165*38fd1498Szrj       gphi *orig_phi = gsi_orig.phi ();
2166*38fd1498Szrj       gphi *update_phi = gsi_update.phi ();
2167*38fd1498Szrj 
2168*38fd1498Szrj       /* Generate new phi node at merge bb of the guard.  */
2169*38fd1498Szrj       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2170*38fd1498Szrj       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2171*38fd1498Szrj 
2172*38fd1498Szrj       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2173*38fd1498Szrj 	 args in NEW_PHI for these edges.  */
2174*38fd1498Szrj       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2175*38fd1498Szrj       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2176*38fd1498Szrj       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2177*38fd1498Szrj       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2178*38fd1498Szrj       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2179*38fd1498Szrj       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2180*38fd1498Szrj 
2181*38fd1498Szrj       /* Update phi in UPDATE_PHI.  */
2182*38fd1498Szrj       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2183*38fd1498Szrj     }
2184*38fd1498Szrj }
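
/* For instance (a sketch of the diagram above): after this function runs,
   merge_bb gains

     i_x = PHI<i_0, i_5>;   ;; i_0 along GUARD_EDGE, i_5 along MERGE_EDGE

   and update_loop's header PHI becomes

     i_3 = PHI<i_x, i_4>;   ;; the former use of i_5 now flows through i_x.  */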

/* LCSSA_PHI is a lcssa phi of the EPILOG loop, which is copied from LOOP;
   this function searches for the corresponding lcssa phi node in the exit
   bb of LOOP.  If it is found, return the phi result; otherwise return
   NULL.  */

static tree
find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
		gphi *lcssa_phi)
{
  gphi_iterator gsi;
  edge e = single_exit (loop);

  gcc_assert (single_pred_p (e->dest));
  for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
			   PHI_ARG_DEF (lcssa_phi, 0), 0))
	return PHI_RESULT (phi);
    }
  return NULL_TREE;
}

/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
   from LOOP.  Function slpeel_add_loop_guard adds a guard skipping from a
   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
   The CFG looks like:

     loop:
       header_a:
	 i_1 = PHI<i_0, i_2>;
	 ...
	 i_2 = i_1 + 1;
	 if (cond_a)
	   goto latch_a;
	 else
	   goto exit_a;
       latch_a:
	 goto header_a;

       exit_a:

       guard_bb:
	 if (cond)
	   goto merge_bb;
	 else
	   goto epilog_loop;

       ;; fall_through_bb

     epilog_loop:
       header_b:
	 i_3 = PHI<i_2, i_4>;
	 ...
	 i_4 = i_3 + 1;
	 if (cond_b)
	   goto latch_b;
	 else
	   goto merge_bb;
       latch_b:
	 goto header_b;

       merge_bb:
	 ;; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.

       exit_bb:
	 i_x = PHI<i_4>;  ;; Use of i_4 to be replaced with i_y in merge_bb.

   For each name used outside EPILOG (i.e., for each name that has a lcssa
   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
   args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE
   is the arg of the original PHI in exit_bb; the arg for GUARD_EDGE is
   defined by LOOP and is found in the exit bb of LOOP.  The arg of the
   original PHI in exit_bb will also be updated.  */

static void
slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
				    edge guard_edge, edge merge_edge)
{
  gphi_iterator gsi;
  basic_block merge_bb = guard_edge->dest;

  gcc_assert (single_succ_p (merge_bb));
  edge e = single_succ_edge (merge_bb);
  basic_block exit_bb = e->dest;
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);

  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *update_phi = gsi.phi ();
      tree old_arg = PHI_ARG_DEF (update_phi, 0);
      /* This loop-closed-phi actually doesn't represent a use out of the
	 loop - the phi arg is a constant.  */
      if (TREE_CODE (old_arg) != SSA_NAME)
	continue;

      tree merge_arg = get_current_def (old_arg);
      if (!merge_arg)
	merge_arg = old_arg;

      tree guard_arg = find_guard_arg (loop, epilog, update_phi);
      /* If the var is live after the loop but is not a reduction, we
	 simply use the old arg.  */
      if (!guard_arg)
	guard_arg = old_arg;

      /* Create new phi node in MERGE_BB:  */
      tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
      gphi *merge_phi = create_phi_node (new_res, merge_bb);

      /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
	 the two PHI args in merge_phi for these edges.  */
      add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
      add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);

      /* Update the original phi in exit_bb.  */
      adjust_phi_and_debug_stmts (update_phi, e, new_res);
    }
}
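
/* Continuing the sketch in the comment above: after this function runs,
   merge_bb gains

     i_y = PHI<i_2, i_4>;   ;; i_4 along MERGE_EDGE, the LOOP-defined
			    ;; value (i_2, via its lcssa phi) along
			    ;; GUARD_EDGE

   and the lcssa phi in exit_bb becomes

     i_x = PHI<i_y>;	    ;; the former use of i_4 now flows through i_y.  */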

/* The EPILOG loop is duplicated from the original loop for vectorizing;
   the args of its loop-closed ssa PHIs need to be updated.  */

static void
slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
{
  gphi_iterator gsi;
  basic_block exit_bb = single_exit (epilog)->dest;

  gcc_assert (single_pred_p (exit_bb));
  edge e = EDGE_PRED (exit_bb, 0);
  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
}
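
/* For example (a hypothetical sketch): if the duplicated loop body
   defines i_2' as the current def of the original i_2 (so
   get_current_def (i_2) == i_2'), an exit phi copied as

     res = PHI<i_2>;	 becomes     res = PHI<i_2'>;

   via rename_use_op above.  */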

/* Function vect_do_peeling.

   Input:
   - LOOP_VINFO: Represents a loop to be vectorized, which looks like:

       preheader:
     LOOP:
       header_bb:
	 loop_body
	 if (exit_loop_cond) goto exit_bb
	 else                goto header_bb
       exit_bb:

   - NITERS: The number of iterations of the loop.
   - NITERSM1: The number of iterations of the loop's latch.
   - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
   - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
			      CHECK_PROFITABILITY is true.
   Output:
   - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
     iterate after vectorization; see vect_set_loop_condition for details.
   - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
     should be set to the number of scalar iterations handled by the
     vector loop.  The SSA name is only used on exit from the loop.

   This function peels the prolog and epilog from the loop and adds guards
   skipping PROLOG and EPILOG under various conditions.  As a result, the
   changed CFG would look like:

       guard_bb_1:
	 if (prefer_scalar_loop) goto merge_bb_1
	 else                    goto guard_bb_2

       guard_bb_2:
	 if (skip_prolog) goto merge_bb_2
	 else             goto prolog_preheader

       prolog_preheader:
     PROLOG:
       prolog_header_bb:
	 prolog_body
	 if (exit_prolog_cond) goto prolog_exit_bb
	 else                  goto prolog_header_bb
       prolog_exit_bb:

       merge_bb_2:

       vector_preheader:
     VECTOR LOOP:
       vector_header_bb:
	 vector_body
	 if (exit_vector_cond) goto vector_exit_bb
	 else                  goto vector_header_bb
       vector_exit_bb:

       guard_bb_3:
	 if (skip_epilog) goto merge_bb_3
	 else             goto epilog_preheader

       merge_bb_1:

       epilog_preheader:
     EPILOG:
       epilog_header_bb:
	 epilog_body
	 if (exit_epilog_cond) goto merge_bb_3
	 else                  goto epilog_header_bb

       merge_bb_3:

   Note this function peels the prolog and epilog, and adds the guards,
   only when necessary.
   Returns the created epilogue or NULL.

   TODO: Guard for prefer_scalar_loop should be emitted along with
   versioning conditions if loop versioning is needed.  */
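
/* As a worked example (hypothetical numbers): with VF = 4 and a prolog
   peeled for alignment, a loop with NITERS = 10 whose first access needs
   2 peeled iterations might execute as

     prolog:  niters_prolog = 2 scalar iterations	(align the access)
     vector:  (10 - 2) / 4 = 2 vector iterations	(8 scalar iterations)
     epilog:  10 - 2 - 8 = 0 remaining iterations	(epilog skipped)

   whereas NITERS = 9 would leave 9 - 2 - 4 = 3 iterations for the epilog
   after a single vector iteration.  */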

struct loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
		 tree *niters_vector, tree *step_vector,
		 tree *niters_vector_mult_vf_var, int th,
		 bool check_profitability, bool niters_no_overflow)
{
  edge e, guard_e;
  tree type = TREE_TYPE (niters), guard_cond;
  basic_block guard_bb, guard_to;
  profile_probability prob_prolog, prob_vector, prob_epilog;
  int estimated_vf;
  int prolog_peeling = 0;
  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  poly_uint64 bound_epilog = 0;
  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    bound_epilog += vf - 1;
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    bound_epilog += 1;
  bool epilog_peeling = maybe_ne (bound_epilog, 0U);
  poly_uint64 bound_scalar = bound_epilog;

  if (!prolog_peeling && !epilog_peeling)
    return NULL;

  prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
  estimated_vf = vect_vf_for_cost (loop_vinfo);
  if (estimated_vf == 2)
    estimated_vf = 3;
  prob_prolog = prob_epilog = profile_probability::guessed_always ()
			.apply_scale (estimated_vf - 1, estimated_vf);

  struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *first_loop = loop;
  bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
  create_lcssa_for_virtual_phi (loop);
  update_ssa (TODO_update_ssa_only_virtuals);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    {
      gcc_assert (!adjust_vec.exists ());
      adjust_vec.create (32);
    }
  initialize_original_copy_tables ();

  /* Record the anchor bb at which the guard should be placed if the scalar
     loop might be preferred.  */
  basic_block anchor = loop_preheader_edge (loop)->src;

  /* Generate the number of iterations for the prolog loop.  We do this here
     so that we can also get the upper bound on the number of iterations.  */
  tree niters_prolog;
  int bound_prolog = 0;
  if (prolog_peeling)
    niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
						 &bound_prolog);
  else
    niters_prolog = build_int_cst (type, 0);

  /* The prolog loop may be skipped.  */
  bool skip_prolog = (prolog_peeling != 0);
  /* Skip to the epilog if the scalar loop may be preferred.  This is only
     needed when we peel for the epilog loop and when the choice hasn't
     already been checked by loop versioning.  */
  bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
				  bound_prolog + bound_epilog)
		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
  /* The epilog loop must be executed if the number of iterations for the
     epilog loop is known at compile time; otherwise we need to add a check
     at the end of the vector loop and skip to the end of the epilog loop.  */
  bool skip_epilog = (prolog_peeling < 0
		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      || !vf.is_constant ());
  /* PEELING_FOR_GAPS is special because the epilog loop must be executed.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    skip_epilog = false;

  if (skip_vector)
    {
      split_edge (loop_preheader_edge (loop));

      /* Due to the order in which we peel prolog and epilog, we first
	 propagate probability to the whole loop.  The purpose is to
	 avoid adjusting probabilities of both prolog and vector loops
	 separately.  Note in this case, the probability of epilog loop
	 needs to be scaled back later.  */
      basic_block bb_before_loop = loop_preheader_edge (loop)->src;
      if (prob_vector.initialized_p ())
	{
	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
	  scale_loop_profile (loop, prob_vector, 0);
	}
    }

  source_location loop_loc = find_loop_location (loop);
  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  if (prolog_peeling)
    {
      e = loop_preheader_edge (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to preheader edge.\n");
	  gcc_unreachable ();
	}
      /* Peel prolog and put it on preheader edge of loop.  */
      prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
      if (!prolog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
      first_loop = prolog;
      reset_original_copy_tables ();

      /* Update the number of iterations for the prolog loop.  */
      tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
      vect_set_loop_condition (prolog, NULL, niters_prolog,
			       step_prolog, NULL_TREE, false);

      /* Skip the prolog loop.  */
      if (skip_prolog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters_prolog, build_int_cst (type, 0));
	  guard_bb = loop_preheader_edge (prolog)->src;
	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
	  guard_to = split_edge (loop_preheader_edge (loop));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_prolog.invert (),
					   irred_flag);
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);

	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
	}
      /* Update the init addresses of the DRs.  */
      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
      /* Update niters for the vector loop.  */
      LOOP_VINFO_NITERS (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
      LOOP_VINFO_NITERSM1 (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type,
		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
      bool new_var_p = false;
      niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
      /* It's guaranteed that the vector loop bound before vectorization is
	 at least VF, so set range information for the newly generated var.  */
      if (new_var_p)
	set_range_info (niters, VR_RANGE,
			wi::to_wide (build_int_cst (type, vf)),
			wi::to_wide (TYPE_MAX_VALUE (type)));

      /* The prolog iterates at most bound_prolog times, so its latch
	 iterates at most bound_prolog - 1 times.  */
      record_niter_bound (prolog, bound_prolog - 1, false, true);
      delete_update_ssa ();
      adjust_vec_debug_stmts ();
      scev_reset ();
    }

  if (epilog_peeling)
    {
      e = single_exit (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to exit edge.\n");
	  gcc_unreachable ();
	}
      /* Peel epilog and put it on exit edge of loop.  */
      epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
      if (!epilog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);

      /* The scalar version loop may be preferred.  In this case, add a
	 guard and skip to the epilog.  Note this only happens when the
	 number of iterations of the loop is unknown at compile time;
	 otherwise the loop won't be vectorized.  */
      if (skip_vector)
	{
	  /* An additional epilogue iteration is peeled if a gap exists.  */
	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
						bound_prolog, bound_epilog,
						th, &bound_scalar,
						check_profitability);
	  /* Build the guard against NITERSM1 since NITERS may overflow.  */
	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
	  guard_bb = anchor;
	  guard_to = split_edge (loop_preheader_edge (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_vector.invert (),
					   irred_flag);
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);

	  /* Simply propagate profile info from guard_bb to guard_to, which
	     is a merge point of control flow.  */
	  guard_to->count = guard_bb->count;

	  /* Scale the probability of the epilog loop back.
	     FIXME: We should avoid scaling down and back up.  Profile may
	     get lost if we scale down to 0.  */
	  basic_block *bbs = get_loop_body (epilog);
	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
	    bbs[i]->count = bbs[i]->count.apply_scale
				 (bbs[i]->count,
				  bbs[i]->count.apply_probability
				    (prob_vector));
	  free (bbs);
	}

      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
      tree niters_vector_mult_vf;
      /* If the loop is peeled a non-zero constant number of times, NITERS
	 now refers to orig_niters - prolog_peeling; it won't overflow even
	 if orig_niters does.  */
      niters_no_overflow |= (prolog_peeling > 0);
      vect_gen_vector_loop_niters (loop_vinfo, niters,
				   niters_vector, step_vector,
				   niters_no_overflow);
      if (!integer_onep (*step_vector))
	{
	  /* On exit from the loop we will have an easy way of calculating
	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
	     until then.  */
	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
	}
      else
	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
					     &niters_vector_mult_vf);
      /* Update IVs of the original loop as if they were advanced by
	 niters_vector_mult_vf steps.  */
      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
      edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
					update_e);

      if (skip_epilog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters, niters_vector_mult_vf);
	  guard_bb = single_exit (loop)->dest;
	  guard_to = split_edge (single_exit (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
					   skip_vector ? anchor : guard_bb,
					   prob_epilog.invert (),
					   irred_flag);
	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
					      single_exit (epilog));
	  /* We only need to handle the basic block before the epilog loop
	     if it's not the guard_bb, which is the case when skip_vector
	     is true.  */
	  if (guard_bb != bb_before_epilog)
	    {
	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();

	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
	    }
	  scale_loop_profile (epilog, prob_epilog, 0);
	}
      else
	slpeel_update_phi_nodes_for_lcssa (epilog);

      unsigned HOST_WIDE_INT bound;
      if (bound_scalar.is_constant (&bound))
	{
	  gcc_assert (bound != 0);
	  /* -1 to convert loop iterations to latch iterations.  */
	  record_niter_bound (epilog, bound - 1, false, true);
	}

      delete_update_ssa ();
      adjust_vec_debug_stmts ();
      scev_reset ();
    }
  adjust_vec.release ();
  free_original_copy_tables ();

  return epilog;
}

/* Function vect_create_cond_for_niters_checks.

   Create a conditional expression that represents the run-time checks for
   the loop's niter.  The loop is guaranteed to terminate if the run-time
   checks hold.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
		under which the niter information is valid.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the
   if statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
{
  tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);

  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}

/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */

static void
chain_cond_expr (tree *cond_expr, tree part_cond_expr)
{
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
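
/* For example (a hypothetical sketch): starting from a null *COND_EXPR,

     chain_cond_expr (&cond, a);   // cond: a
     chain_cond_expr (&cond, b);   // cond: a && b

   so callers can accumulate checks without special-casing the first one.  */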

/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment
   must be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero and that this
        test can be done as a&(n-1) == 0.  For example, for 16
        byte vectors the test is a&0xf == 0.  */
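
/* For instance (a sketch of the generated sequence, assuming two data
   references and a 16-byte mask):

     addr2int0 = (intptr) addr_of_first_vector_of_dr_0;
     addr2int1 = (intptr) addr_of_first_vector_of_dr_1;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;
     cond = cond && (andmask == 0);   ;; all addresses 16-byte aligned

   since OR-ing the addresses leaves a low bit set iff some address has
   that bit set.  */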

static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  vec<gimple *> may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  gimple *ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));

  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the
     address of the first vector of the i'th data reference.  */

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt;
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
      tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
      bool negative = tree_int_cst_compare
	(DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
      tree offset = negative
	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
	vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
					      offset);
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "addr2int%d", i);
      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
        {
          /* create: or_tmp = or_tmp | addr_tmp */
          sprintf (tmp_name, "orptrs%d", i);
	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
					 or_tmp_name, addr_tmp_name);
	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
          or_tmp_name = new_or_tmp_name;
        }
      else
        or_tmp_name = addr_tmp_name;

    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
				  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  chain_cond_expr (cond_expr, part_cond_expr);
}

/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
   create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
   Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and this new condition are true.  Treat a null *COND_EXPR as "true".  */

static void
vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  unsigned int i;
  vec_object_pair *pair;
  FOR_EACH_VEC_ELT (pairs, i, pair)
    {
      tree addr1 = build_fold_addr_expr (pair->first);
      tree addr2 = build_fold_addr_expr (pair->second);
      tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
					 addr1, addr2);
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}

/* Create an expression that is true when all lower-bound conditions for
   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */

static void
vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    {
      tree expr = lower_bounds[i].expr;
      tree type = unsigned_type_for (TREE_TYPE (expr));
      expr = fold_convert (type, expr);
      poly_uint64 bound = lower_bounds[i].min_value;
      if (!lower_bounds[i].unsigned_p)
	{
	  expr = fold_build2 (PLUS_EXPR, type, expr,
			      build_int_cstu (type, bound - 1));
	  bound += bound - 1;
	}
      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
					 build_int_cstu (type, bound));
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
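
/* A worked example of the signed adjustment above (hypothetical numbers,
   8-bit arithmetic, bound = 4): the generated check becomes

     (unsigned) expr + 3 >= 7

   expr =  5:    5 + 3 =   8 >= 7      true
   expr =  2:    2 + 3 =   5 <  7      false
   expr = -4:  252 + 3 = 255 >= 7      true
   expr = -3:  253 + 3 wraps to 0      false

   i.e., the single unsigned comparison tests expr <= -4 || expr >= 4.  */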

/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.  If it is NULL, then the function
                is used to return the number of alias checks.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
	        to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
{
  vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);

  if (comp_alias_ddrs.is_empty ())
    return;

  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
			       &comp_alias_ddrs, cond_expr);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "created %u versioning for alias checks.\n",
		     comp_alias_ddrs.length ());
}


/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and the alignment tests pass.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   The versioning precondition(s) are placed in *COND_EXPR and
   *COND_EXPR_STMT_LIST.  */
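
/* As a sketch (a hypothetical source-level view of the result), versioning
   a loop with profitability, alignment and alias checks produces

     if (niters >= th && all_ptrs_aligned && ranges_do_not_overlap)
       ... vectorized loop ...
     else
       ... scalar (unvectorized) loop ...

   with the individual conditions chained together by chain_cond_expr
   above.  */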

void
vect_loop_versioning (loop_vec_info loop_vinfo,
		      unsigned int th, bool check_profitability,
		      poly_uint64 versioning_threshold)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);

  if (check_profitability)
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
					is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  initialize_original_copy_tables ();
  if (scalar_loop)
    {
      edge scalar_e;
      basic_block preheader, scalar_preheader;

      /* We don't want to scale SCALAR_LOOP's frequencies; we need to
	 scale LOOP's frequencies instead.  */
      nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
			    prob, prob.invert (), prob, prob.invert (), true);
      scale_loop_frequencies (loop, prob);
      /* CONDITION_BB was created above SCALAR_LOOP's preheader,
	 while we need to move it above LOOP's preheader.  */
      e = loop_preheader_edge (loop);
      scalar_e = loop_preheader_edge (scalar_loop);
      gcc_assert (empty_block_p (e->src)
		  && single_pred_p (e->src));
      gcc_assert (empty_block_p (scalar_e->src)
		  && single_pred_p (scalar_e->src));
      gcc_assert (single_pred_p (condition_bb));
      preheader = e->src;
      scalar_preheader = scalar_e->src;
      scalar_e = find_edge (condition_bb, scalar_preheader);
      e = single_pred_edge (preheader);
      redirect_edge_and_branch_force (single_pred_edge (condition_bb),
				      scalar_preheader);
      redirect_edge_and_branch_force (scalar_e, preheader);
      redirect_edge_and_branch_force (e, condition_bb);
      set_immediate_dominator (CDI_DOMINATORS, condition_bb,
			       single_pred (condition_bb));
      set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
			       single_pred (scalar_preheader));
      set_immediate_dominator (CDI_DOMINATORS, preheader,
			       condition_bb);
    }
  else
    nloop = loop_version (loop, cond_expr, &condition_bb,
			  prob, prob.invert (), prob, prob.invert (), true);

  if (version_niter)
    {
      /* The versioned loop could be infinite; we need to clear existing
	 niter information, which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for the niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }

  if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                         "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                         "loop versioned for vectorization to enhance "
			 "alignment\n");
    }
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.
     If loop versioning wasn't done from LOOP but from SCALAR_LOOP instead,
     merge_bb may already have just a single predecessor.  */

  merge_bb = single_exit (loop)->dest;
  if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
    {
      gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
      new_exit_bb = split_edge (single_exit (loop));
      new_exit_e = single_exit (loop);
      e = EDGE_SUCC (new_exit_bb, 0);

      for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  tree new_res;
	  orig_phi = gsi.phi ();
	  new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  new_phi = create_phi_node (new_res, new_exit_bb);
	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	  add_phi_arg (new_phi, arg, new_exit_e,
		       gimple_phi_arg_location_from_edge (orig_phi, e));
	  adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	}
    }

  /* End loop-exit-fixes after versioning.  */

  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
			     GSI_SAME_STMT);
    }
  update_ssa (TODO_update_ssa);
}