/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"
#include "insn-config.h"
#include "rtl.h"
#include "recog.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}


/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP
   is true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has two inner loops, allow there to
		     be an extra basic block which decides, using
		     LOOP_VECTORIZED, which of the two loops to use.  */
		  if (!single_pred_p (e->src)
		      || single_pred (e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}


struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}

/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is in use, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
	adjust_vec.safe_push (ai);
      else
	adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
			gimple_bb (update_phi));
}

/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */
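/* For example (a sketch of the resulting gimple; the SSA names are
   invented), the PHI built here looks like:

     # mask_5 = PHI <init_mask_3(preheader), next_mask_8(latch)>

   so MASK is INIT_MASK on entry to the loop and NEXT_MASK on each
   later iteration.  */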

static void
vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
		    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}

/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */
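/* For example, with 8-element vectors, OFFSET 0 encodes the selector
   { 0, 8, 1, 9, 2, 10, ... } and OFFSET 1 encodes
   { 4, 12, 5, 13, 6, 14, ... }: the usual lo/hi interleaves of two
   vectors, described to vec_perm_builder as two series (npatterns == 2)
   with three explicit elements each.  */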

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
			unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
			       rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
				  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
				  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
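	  /* Even-indexed destination masks correspond to the first half
	     of each source mask and odd-indexed ones to the second half;
	     which of VEC_UNPACK_HI/LO yields a given half depends on the
	     target's endianness, hence the parity test below.  */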
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */
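/* An illustrative example of the wrapping case: with an 8-bit
   comparison type, VF == 16 and RGM->max_nscalars_per_iter == 4, the
   IV (as seen in the comparison type) steps through 0, 64, 128, 192,
   0, ...  If (NITERS + NITERS_SKIP) * 4 can be as high as 256, the IV
   never reaches the limit, so it cannot by itself produce an
   all-false mask.  */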

static tree
vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
			      gimple_seq *preheader_seq,
			      gimple_stmt_iterator loop_cond_gsi,
			      rgroup_masks *rgm, tree niters, tree niters_skip,
			      bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = build_int_cst (iv_type, vf);
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
	 multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				     nscalars_total, compare_factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
				    nscalars_step, iv_factor);
      if (nscalars_skip)
	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				      nscalars_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[infinite-prec] NSCALARS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[sat] NSCALARS_STEP

	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
				    nscalars_step);
      if (nscalars_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
	{
	  /* The first vector iteration can handle at most NSCALARS_STEP
	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
	     NSCALARS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nscalars_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nscalars_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nscalars_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NSCALARS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
	 next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
	 to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial mask.  First include all scalars that
	 are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of scalars handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this mask.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
	  tmp_stmt = vect_gen_while (init_mask, start, end);
	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
	}

      /* Now AND out the bits that are within the number of skipped
	 scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
						    bias_tree, nscalars_skip);
	  if (init_mask)
	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
				      init_mask, unskipped_mask);
	  else
	    init_mask = unskipped_mask;
	}

      if (!init_mask)
	/* First iteration is full.  */
	init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
				tree niters, tree final_iv,
				bool niters_maybe_zero,
				gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
	 count is zero then it must have wrapped.  Cope with this by
	 subtracting 1 before the conversion and adding 1 to the result.  */
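      /* For example, if NITERS has an 8-bit type, a value of zero really
	 means 256 iterations: 256 - 1 == 255 still fits in 8 bits, the
	 conversion widens it to 255, and adding 1 in the wider type
	 recovers 256.  */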
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
			     niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
			     niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);

  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
	/* First try using permutes.  This adds a single vector
	   instruction to the loop for each mask, but needs no extra
	   loop invariants or IVs.  */
	unsigned int nmasks = i + 1;
	if ((nmasks & 1) == 0)
	  {
	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
	    if (!half_rgm->masks.is_empty ()
		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
	      continue;
	  }

	/* See whether a zero-based IV would ever generate all-false masks
	   before wrapping around.  */
	bool might_wrap_p
	  = (iv_limit == -1
	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
				    UNSIGNED)
		 > compare_precision));

	/* Set up all masks for this group.  */
	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
						  &preheader_seq,
						  loop_cond_gsi, rgm,
						  niters, niters_skip,
						  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
					NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
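  /* For example, with NITERS == 10 and VF == 4 the loop iterates
     (10 - 1) / 4 + 1 == 3 times, so the latch count stored below is 2.  */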
  tree step = build_int_cst (compare_type,
			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
				       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
				     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (class loop *loop, tree niters,
				  tree step, tree final_iv,
				  bool niters_maybe_zero,
				  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

	 A:
	   x = 0;
	   do
	     {
	       ...
	       x += 1;
	     }
	   while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

	 B:
	   x = 0;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x <= NITERS - STEP);

	 so that the loop continues to iterate if x + STEP - 1 < NITERS
	 but stops if x + STEP - 1 >= NITERS.

	 However, if NITERS is zero, x never hits a value above NITERS - STEP
	 before wrapping around.  There are two obvious ways of dealing with
	 this:

	 - start at STEP - 1 and compare x before incrementing it
	 - start at -1 and compare x after incrementing it

	 The latter is simpler and is what we use.  The loop in this case
	 looks like:

	 C:
	   x = -1;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x < NITERS - STEP);

	 In both cases the loop limit is NITERS - STEP.  */
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
	{
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
      if (niters_maybe_zero)
	{
	  /* Case C.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
	  init = build_all_ones_cst (niters_type);
	}
      else
	{
	  /* Case B.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
	  init = build_zero_cst (niters_type);
	}
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
				     true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
				 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
				       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
					     indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
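/* For example, with NITERS == 10 and STEP == 4, a fully-masked loop
   runs (10 - 1) / 4 + 1 == 3 times and handles all ten scalar
   iterations itself, whereas an unmasked loop runs 10 / 4 == 2 times,
   with the leftover scalar iterations typically handled by an
   epilogue loop.  */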

void
vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
			 tree niters, tree step, tree final_iv,
			 bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
						final_iv, niters_maybe_zero,
						loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
						  final_iv, niters_maybe_zero,
						  loop_cond_gsi);

  /* Remove old loop exit test.  */
  stmt_vec_info orig_cond_info;
  if (loop_vinfo
      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
    loop_vinfo->remove_stmt (orig_cond_info);
  else
    gsi_remove (&loop_cond_gsi, true);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
		     cond_stmt);
}

/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For each pair of corresponding PHI arguments in FROM->dest and
   TO->dest coming from those edges, ensure that the TO->dest argument
   has the same current_def as the FROM->dest argument.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
	{
	  gsi_next (&gsi_from);
	  continue;
	}
      if (virtual_operand_p (to_arg))
	{
	  gsi_next (&gsi_to);
	  continue;
	}
      if (TREE_CODE (from_arg) != SSA_NAME)
	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else if (TREE_CODE (to_arg) == SSA_NAME
	       && from_arg != to_arg)
	{
	  if (get_current_def (to_arg) == NULL_TREE)
	    {
	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
					      TREE_TYPE (get_current_def
							   (from_arg))));
	      set_current_def (to_arg, get_current_def (from_arg));
	    }
	}
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}


/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */
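/* A rough sketch of the result: peeling at the exit edge E produces

     preheader -> LOOP -> new preheader -> copy of LOOP -> E->dest

   (an epilogue), while peeling at the entry edge places the copy
   before LOOP instead (a prologue).  */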
99510d565efSmrg 
996*ec02198aSmrg class loop *
slpeel_tree_duplicate_loop_to_edge_cfg(class loop * loop,class loop * scalar_loop,edge e)997*ec02198aSmrg slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998*ec02198aSmrg 					class loop *scalar_loop, edge e)
99910d565efSmrg {
1000*ec02198aSmrg   class loop *new_loop;
100110d565efSmrg   basic_block *new_bbs, *bbs, *pbbs;
100210d565efSmrg   bool at_exit;
100310d565efSmrg   bool was_imm_dom;
100410d565efSmrg   basic_block exit_dest;
100510d565efSmrg   edge exit, new_exit;
100610d565efSmrg   bool duplicate_outer_loop = false;
100710d565efSmrg 
100810d565efSmrg   exit = single_exit (loop);
100910d565efSmrg   at_exit = (e == exit);
101010d565efSmrg   if (!at_exit && e != loop_preheader_edge (loop))
101110d565efSmrg     return NULL;
101210d565efSmrg 
101310d565efSmrg   if (scalar_loop == NULL)
101410d565efSmrg     scalar_loop = loop;
101510d565efSmrg 
101610d565efSmrg   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
101710d565efSmrg   pbbs = bbs + 1;
101810d565efSmrg   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
101910d565efSmrg   /* Allow duplication of outer loops.  */
102010d565efSmrg   if (scalar_loop->inner)
102110d565efSmrg     duplicate_outer_loop = true;
102210d565efSmrg   /* Check whether duplication is possible.  */
102310d565efSmrg   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
102410d565efSmrg     {
102510d565efSmrg       free (bbs);
102610d565efSmrg       return NULL;
102710d565efSmrg     }
102810d565efSmrg 
102910d565efSmrg   /* Generate new loop structure.  */
103010d565efSmrg   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
103110d565efSmrg   duplicate_subloops (scalar_loop, new_loop);
103210d565efSmrg 
103310d565efSmrg   exit_dest = exit->dest;
103410d565efSmrg   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
103510d565efSmrg 					  exit_dest) == loop->header ?
103610d565efSmrg 		 true : false);
103710d565efSmrg 
103810d565efSmrg   /* Also copy the pre-header, this avoids jumping through hoops to
103910d565efSmrg      duplicate the loop entry PHI arguments.  Create an empty
104010d565efSmrg      pre-header unconditionally for this.  */
104110d565efSmrg   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
104210d565efSmrg   edge entry_e = single_pred_edge (preheader);
104310d565efSmrg   bbs[0] = preheader;
104410d565efSmrg   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
104510d565efSmrg 
104610d565efSmrg   exit = single_exit (scalar_loop);
104710d565efSmrg   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
104810d565efSmrg 	    &exit, 1, &new_exit, NULL,
104910d565efSmrg 	    at_exit ? loop->latch : e->src, true);
105010d565efSmrg   exit = single_exit (loop);
105110d565efSmrg   basic_block new_preheader = new_bbs[0];
105210d565efSmrg 
105310d565efSmrg   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
105410d565efSmrg 
1055*ec02198aSmrg   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1056*ec02198aSmrg   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1057*ec02198aSmrg     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1058*ec02198aSmrg 
105910d565efSmrg   if (scalar_loop != loop)
106010d565efSmrg     {
106110d565efSmrg       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
106210d565efSmrg 	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
106310d565efSmrg 	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
106410d565efSmrg 	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
106510d565efSmrg 	 header) to have current_def set, so copy them over.  */
106610d565efSmrg       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
106710d565efSmrg 						exit);
106810d565efSmrg       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
106910d565efSmrg 							   0),
107010d565efSmrg 						EDGE_SUCC (loop->latch, 0));
107110d565efSmrg     }
107210d565efSmrg 
107310d565efSmrg   if (at_exit) /* Add the loop copy at exit.  */
107410d565efSmrg     {
107510d565efSmrg       if (scalar_loop != loop)
107610d565efSmrg 	{
107710d565efSmrg 	  gphi_iterator gsi;
107810d565efSmrg 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
107910d565efSmrg 
108010d565efSmrg 	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
108110d565efSmrg 	       gsi_next (&gsi))
108210d565efSmrg 	    {
108310d565efSmrg 	      gphi *phi = gsi.phi ();
108410d565efSmrg 	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
108510d565efSmrg 	      location_t orig_locus
108610d565efSmrg 		= gimple_phi_arg_location_from_edge (phi, e);
108710d565efSmrg 
108810d565efSmrg 	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
108910d565efSmrg 	    }
109010d565efSmrg 	}
109110d565efSmrg       redirect_edge_and_branch_force (e, new_preheader);
109210d565efSmrg       flush_pending_stmts (e);
109310d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
109410d565efSmrg       if (was_imm_dom || duplicate_outer_loop)
109510d565efSmrg 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
109610d565efSmrg 
109710d565efSmrg       /* And remove the non-necessary forwarder again.  Keep the other
109810d565efSmrg          one so we have a proper pre-header for the loop at the exit edge.  */
109910d565efSmrg       redirect_edge_pred (single_succ_edge (preheader),
110010d565efSmrg 			  single_pred (preheader));
110110d565efSmrg       delete_basic_block (preheader);
110210d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
110310d565efSmrg 			       loop_preheader_edge (scalar_loop)->src);
110410d565efSmrg     }
110510d565efSmrg   else /* Add the copy at entry.  */
110610d565efSmrg     {
110710d565efSmrg       if (scalar_loop != loop)
110810d565efSmrg 	{
110910d565efSmrg 	  /* Remove the non-necessary forwarder of scalar_loop again.  */
111010d565efSmrg 	  redirect_edge_pred (single_succ_edge (preheader),
111110d565efSmrg 			      single_pred (preheader));
111210d565efSmrg 	  delete_basic_block (preheader);
111310d565efSmrg 	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
111410d565efSmrg 				   loop_preheader_edge (scalar_loop)->src);
111510d565efSmrg 	  preheader = split_edge (loop_preheader_edge (loop));
111610d565efSmrg 	  entry_e = single_pred_edge (preheader);
111710d565efSmrg 	}
111810d565efSmrg 
111910d565efSmrg       redirect_edge_and_branch_force (entry_e, new_preheader);
112010d565efSmrg       flush_pending_stmts (entry_e);
112110d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
112210d565efSmrg 
112310d565efSmrg       redirect_edge_and_branch_force (new_exit, preheader);
112410d565efSmrg       flush_pending_stmts (new_exit);
112510d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
112610d565efSmrg 
112710d565efSmrg       /* And remove the non-necessary forwarder again.  Keep the other
112810d565efSmrg          one so we have a proper pre-header for the loop at the exit edge.  */
112910d565efSmrg       redirect_edge_pred (single_succ_edge (new_preheader),
113010d565efSmrg 			  single_pred (new_preheader));
113110d565efSmrg       delete_basic_block (new_preheader);
113210d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
113310d565efSmrg 			       loop_preheader_edge (new_loop)->src);
113410d565efSmrg     }
113510d565efSmrg 
113610d565efSmrg   if (scalar_loop != loop)
113710d565efSmrg     {
113810d565efSmrg       /* Update new_loop->header PHIs, so that on the preheader
113910d565efSmrg 	 edge they are the ones from loop rather than scalar_loop.  */
114010d565efSmrg       gphi_iterator gsi_orig, gsi_new;
114110d565efSmrg       edge orig_e = loop_preheader_edge (loop);
114210d565efSmrg       edge new_e = loop_preheader_edge (new_loop);
114310d565efSmrg 
114410d565efSmrg       for (gsi_orig = gsi_start_phis (loop->header),
114510d565efSmrg 	   gsi_new = gsi_start_phis (new_loop->header);
114610d565efSmrg 	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
114710d565efSmrg 	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
114810d565efSmrg 	{
114910d565efSmrg 	  gphi *orig_phi = gsi_orig.phi ();
115010d565efSmrg 	  gphi *new_phi = gsi_new.phi ();
115110d565efSmrg 	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
115210d565efSmrg 	  location_t orig_locus
115310d565efSmrg 	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
115410d565efSmrg 
115510d565efSmrg 	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
115610d565efSmrg 	}
115710d565efSmrg     }
115810d565efSmrg 
115910d565efSmrg   free (new_bbs);
116010d565efSmrg   free (bbs);
116110d565efSmrg 
116210d565efSmrg   checking_verify_dominators (CDI_DOMINATORS);
116310d565efSmrg 
116410d565efSmrg   return new_loop;
116510d565efSmrg }
116610d565efSmrg 
116710d565efSmrg 
116810d565efSmrg /* Given the condition expression COND, put it as the last statement of
116910d565efSmrg    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
117010d565efSmrg    DOM_BB; return the skip edge.  GUARD_TO is the basic block that is
117110d565efSmrg    jumped to when skipping the loop.  PROBABILITY is the skip edge's
117210d565efSmrg    probability.  Mark the new edge as irreducible if IRREDUCIBLE_P is true.  */
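
/* For illustration (a sketch, with hypothetical names): for COND
   "niters < 8" the code below produces

       guard_bb:
	 if (niters < 8)
	   goto guard_to;	<-- new skip edge, probability PROBABILITY
	 else
	   goto enter;		<-- old fall-through, now the FALSE edge

   and returns the new skip edge.  */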
117310d565efSmrg 
117410d565efSmrg static edge
117510d565efSmrg slpeel_add_loop_guard (basic_block guard_bb, tree cond,
117610d565efSmrg 		       basic_block guard_to, basic_block dom_bb,
1177c7a68eb7Smrg 		       profile_probability probability, bool irreducible_p)
117810d565efSmrg {
117910d565efSmrg   gimple_stmt_iterator gsi;
118010d565efSmrg   edge new_e, enter_e;
118110d565efSmrg   gcond *cond_stmt;
118210d565efSmrg   gimple_seq gimplify_stmt_list = NULL;
118310d565efSmrg 
118410d565efSmrg   enter_e = EDGE_SUCC (guard_bb, 0);
118510d565efSmrg   enter_e->flags &= ~EDGE_FALLTHRU;
118610d565efSmrg   enter_e->flags |= EDGE_FALSE_VALUE;
118710d565efSmrg   gsi = gsi_last_bb (guard_bb);
118810d565efSmrg 
118910d565efSmrg   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
119010d565efSmrg 				 NULL_TREE);
119110d565efSmrg   if (gimplify_stmt_list)
119210d565efSmrg     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
119310d565efSmrg 
119410d565efSmrg   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
119510d565efSmrg   gsi = gsi_last_bb (guard_bb);
119610d565efSmrg   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
119710d565efSmrg 
119810d565efSmrg   /* Add new edge to connect guard block to the merge/loop-exit block.  */
119910d565efSmrg   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
120010d565efSmrg 
120110d565efSmrg   new_e->probability = probability;
120210d565efSmrg   if (irreducible_p)
120310d565efSmrg     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
120410d565efSmrg 
1205c7a68eb7Smrg   enter_e->probability = probability.invert ();
120610d565efSmrg   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
120710d565efSmrg 
120810d565efSmrg   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
120910d565efSmrg   if (enter_e->dest->loop_father->header == enter_e->dest)
121010d565efSmrg     split_edge (enter_e);
121110d565efSmrg 
121210d565efSmrg   return new_e;
121310d565efSmrg }
121410d565efSmrg 
121510d565efSmrg 
121610d565efSmrg /* This function verifies that the following restrictions apply to LOOP:
121710d565efSmrg    (1) it consists of exactly 2 basic blocks (header and an empty latch)
121810d565efSmrg        for an innermost loop, and of exactly 5 basic blocks for an outer loop.
121910d565efSmrg    (2) it is single entry, single exit
122010d565efSmrg    (3) its exit condition is the last stmt in the header
122110d565efSmrg    (4) E is the entry/exit edge of LOOP.
122210d565efSmrg  */
122310d565efSmrg 
122410d565efSmrg bool
1225*ec02198aSmrg slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
122610d565efSmrg {
122710d565efSmrg   edge exit_e = single_exit (loop);
122810d565efSmrg   edge entry_e = loop_preheader_edge (loop);
122910d565efSmrg   gcond *orig_cond = get_loop_exit_condition (loop);
123010d565efSmrg   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
123110d565efSmrg   unsigned int num_bb = loop->inner ? 5 : 2;
123210d565efSmrg 
123310d565efSmrg   /* All loops have an outer scope; the only case loop_outer (loop) is
123410d565efSmrg      NULL is for the pseudo loop that represents the function itself.  */
123510d565efSmrg   if (!loop_outer (loop)
123610d565efSmrg       || loop->num_nodes != num_bb
123710d565efSmrg       || !empty_block_p (loop->latch)
123810d565efSmrg       || !single_exit (loop)
123910d565efSmrg       /* Verify that the new loop exit condition can be trivially modified.  */
124010d565efSmrg       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
124110d565efSmrg       || (e != exit_e && e != entry_e))
124210d565efSmrg     return false;
124310d565efSmrg 
124410d565efSmrg   return true;
124510d565efSmrg }
124610d565efSmrg 
124710d565efSmrg /* If the loop has a virtual PHI, but the exit bb doesn't, create a virtual
124810d565efSmrg    PHI in the exit bb and rename all the uses after the loop.  This
124910d565efSmrg    simplifies the *guard[12] routines, which assume loop closed SSA form
125010d565efSmrg    for all PHIs (even though loop closed SSA form doesn't normally
125110d565efSmrg    require virtual PHIs to be in that form).  Doing this early simplifies
1252*ec02198aSmrg    checking which uses should be renamed.
125310d565efSmrg 
1254*ec02198aSmrg    If we create a new phi after the loop, return the definition that
1255*ec02198aSmrg    applies on entry to the loop, otherwise return null.  */
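
/* A sketch of the transformation (SSA names are illustrative): if the
   loop header contains

     .MEM_1 = PHI <.MEM_0 (preheader), .MEM_2 (latch)>

   but the exit bb has no virtual PHI, the code below creates

     .MEM_3 = PHI <.MEM_2 (exit)>

   in the exit bb, rewrites uses of .MEM_2 outside the loop to .MEM_3,
   and returns .MEM_0, the definition on entry to the loop.  */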
1256*ec02198aSmrg 
1257*ec02198aSmrg static tree
1258*ec02198aSmrg create_lcssa_for_virtual_phi (class loop *loop)
125910d565efSmrg {
126010d565efSmrg   gphi_iterator gsi;
126110d565efSmrg   edge exit_e = single_exit (loop);
126210d565efSmrg 
126310d565efSmrg   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
126410d565efSmrg     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
126510d565efSmrg       {
126610d565efSmrg 	gphi *phi = gsi.phi ();
126710d565efSmrg 	for (gsi = gsi_start_phis (exit_e->dest);
126810d565efSmrg 	     !gsi_end_p (gsi); gsi_next (&gsi))
126910d565efSmrg 	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
127010d565efSmrg 	    break;
127110d565efSmrg 	if (gsi_end_p (gsi))
127210d565efSmrg 	  {
127310d565efSmrg 	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
127410d565efSmrg 	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
127510d565efSmrg 	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
127610d565efSmrg 	    imm_use_iterator imm_iter;
127710d565efSmrg 	    gimple *stmt;
127810d565efSmrg 	    use_operand_p use_p;
127910d565efSmrg 
128010d565efSmrg 	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
128110d565efSmrg 	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
128210d565efSmrg 	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
128310d565efSmrg 	    gimple_phi_set_result (new_phi, new_vop);
128410d565efSmrg 	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
128510d565efSmrg 	      if (stmt != new_phi
128610d565efSmrg 		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
128710d565efSmrg 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
128810d565efSmrg 		  SET_USE (use_p, new_vop);
1289*ec02198aSmrg 
1290*ec02198aSmrg 	    return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
129110d565efSmrg 	  }
129210d565efSmrg 	break;
129310d565efSmrg       }
1294*ec02198aSmrg   return NULL_TREE;
129510d565efSmrg }
129610d565efSmrg 
129710d565efSmrg /* Function find_loop_location.
129810d565efSmrg 
129910d565efSmrg    Extract the location of the loop in the source code.
130010d565efSmrg    If the loop is not well formed for vectorization, an estimated
130110d565efSmrg    location is calculated.
130210d565efSmrg    Return the loop location if found, or an unknown location otherwise.  */
130310d565efSmrg 
13040fc04c29Smrg dump_user_location_t
1305*ec02198aSmrg find_loop_location (class loop *loop)
130610d565efSmrg {
130710d565efSmrg   gimple *stmt = NULL;
130810d565efSmrg   basic_block bb;
130910d565efSmrg   gimple_stmt_iterator si;
131010d565efSmrg 
131110d565efSmrg   if (!loop)
13120fc04c29Smrg     return dump_user_location_t ();
131310d565efSmrg 
131410d565efSmrg   stmt = get_loop_exit_condition (loop);
131510d565efSmrg 
131610d565efSmrg   if (stmt
131710d565efSmrg       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
13180fc04c29Smrg     return stmt;
131910d565efSmrg 
132010d565efSmrg   /* If we got here the loop is probably not "well formed";
132110d565efSmrg      try to estimate the loop location.  */
132210d565efSmrg 
132310d565efSmrg   if (!loop->header)
13240fc04c29Smrg     return dump_user_location_t ();
132510d565efSmrg 
132610d565efSmrg   bb = loop->header;
132710d565efSmrg 
132810d565efSmrg   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
132910d565efSmrg     {
133010d565efSmrg       stmt = gsi_stmt (si);
133110d565efSmrg       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
13320fc04c29Smrg         return stmt;
133310d565efSmrg     }
133410d565efSmrg 
13350fc04c29Smrg   return dump_user_location_t ();
133610d565efSmrg }
133710d565efSmrg 
13380fc04c29Smrg /* Return true if the phi described by STMT_INFO defines an IV of the
13390fc04c29Smrg    loop to be vectorized.  */
134010d565efSmrg 
134110d565efSmrg static bool
13420fc04c29Smrg iv_phi_p (stmt_vec_info stmt_info)
134310d565efSmrg {
13440fc04c29Smrg   gphi *phi = as_a <gphi *> (stmt_info->stmt);
134510d565efSmrg   if (virtual_operand_p (PHI_RESULT (phi)))
134610d565efSmrg     return false;
134710d565efSmrg 
134810d565efSmrg   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
134910d565efSmrg       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
135010d565efSmrg     return false;
135110d565efSmrg 
135210d565efSmrg   return true;
135310d565efSmrg }
135410d565efSmrg 
135510d565efSmrg /* Function vect_can_advance_ivs_p
135610d565efSmrg 
135710d565efSmrg    In case the number of iterations of LOOP is unknown at compile
135810d565efSmrg    time, an epilog loop will be generated, and the loop induction variables
135910d565efSmrg    (IVs) will be "advanced" to the value they are supposed to take just before
136010d565efSmrg    the epilog loop.  Here we check that the access function of the loop IVs
136110d565efSmrg    and the expression that represents the loop bound are simple enough.
136210d565efSmrg    These restrictions will be relaxed in the future.  */
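
/* As an illustration (a sketch, not from a real dump): for

     i_1 = PHI <0 (preheader), i_2 (latch)>;
     i_2 = i_1 + 4;

   the recorded evolution part is the loop-invariant constant 4, so this
   IV can be advanced; an IV whose evolution part is itself a chrec
   (a polynomial of degree >= 2) is rejected below.  */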
136310d565efSmrg 
136410d565efSmrg bool
136510d565efSmrg vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
136610d565efSmrg {
1367*ec02198aSmrg   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
136810d565efSmrg   basic_block bb = loop->header;
136910d565efSmrg   gphi_iterator gsi;
137010d565efSmrg 
137110d565efSmrg   /* Analyze phi functions of the loop header.  */
137210d565efSmrg 
137310d565efSmrg   if (dump_enabled_p ())
137410d565efSmrg     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
137510d565efSmrg   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
137610d565efSmrg     {
137710d565efSmrg       tree evolution_part;
137810d565efSmrg 
137910d565efSmrg       gphi *phi = gsi.phi ();
13800fc04c29Smrg       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
138110d565efSmrg       if (dump_enabled_p ())
13820fc04c29Smrg 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
13830fc04c29Smrg 			 phi_info->stmt);
138410d565efSmrg 
138510d565efSmrg       /* Skip virtual phis.  The data dependences that are associated with
138610d565efSmrg 	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
138710d565efSmrg 
138810d565efSmrg 	 Skip reduction phis.  */
13890fc04c29Smrg       if (!iv_phi_p (phi_info))
139010d565efSmrg 	{
139110d565efSmrg 	  if (dump_enabled_p ())
139210d565efSmrg 	    dump_printf_loc (MSG_NOTE, vect_location,
139310d565efSmrg 			     "reduc or virtual phi. skip.\n");
139410d565efSmrg 	  continue;
139510d565efSmrg 	}
139610d565efSmrg 
139710d565efSmrg       /* Analyze the evolution function.  */
139810d565efSmrg 
13990fc04c29Smrg       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
140010d565efSmrg       if (evolution_part == NULL_TREE)
140110d565efSmrg         {
140210d565efSmrg 	  if (dump_enabled_p ())
140310d565efSmrg 	    dump_printf (MSG_MISSED_OPTIMIZATION,
140410d565efSmrg 			 "No access function or evolution.\n");
140510d565efSmrg 	  return false;
140610d565efSmrg         }
140710d565efSmrg 
140810d565efSmrg       /* FORNOW: We do not transform initial conditions of IVs
140910d565efSmrg 	 whose evolution function is not invariant in the loop.  */
141010d565efSmrg 
141110d565efSmrg       if (!expr_invariant_in_loop_p (loop, evolution_part))
141210d565efSmrg 	{
141310d565efSmrg 	  if (dump_enabled_p ())
141410d565efSmrg 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
141510d565efSmrg 			     "evolution not invariant in loop.\n");
141610d565efSmrg 	  return false;
141710d565efSmrg 	}
141810d565efSmrg 
141910d565efSmrg       /* FORNOW: We do not transform initial conditions of IVs
142010d565efSmrg 	 whose evolution function is a polynomial of degree >= 2.  */
142110d565efSmrg 
142210d565efSmrg       if (tree_is_chrec (evolution_part))
142310d565efSmrg 	{
142410d565efSmrg 	  if (dump_enabled_p ())
142510d565efSmrg 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
142610d565efSmrg 			     "evolution is chrec.\n");
142710d565efSmrg 	  return false;
142810d565efSmrg 	}
142910d565efSmrg     }
143010d565efSmrg 
143110d565efSmrg   return true;
143210d565efSmrg }
143310d565efSmrg 
143410d565efSmrg 
143510d565efSmrg /*   Function vect_update_ivs_after_vectorizer.
143610d565efSmrg 
143710d565efSmrg      "Advance" the induction variables of LOOP to the value they should take
143810d565efSmrg      after the execution of LOOP.  This is currently necessary because the
143910d565efSmrg      vectorizer does not handle induction variables that are used after the
144010d565efSmrg      loop.  Such a situation occurs when the last iterations of LOOP are
144110d565efSmrg      peeled, because:
144210d565efSmrg      1. We introduced new uses after LOOP for IVs that were not originally used
144310d565efSmrg         after LOOP: the IVs of LOOP are now used by an epilog loop.
144410d565efSmrg      2. LOOP is going to be vectorized; this means that it will iterate N/VF
144510d565efSmrg         times, whereas the loop IVs should be bumped N times.
144610d565efSmrg 
144710d565efSmrg      Input:
144810d565efSmrg      - LOOP - a loop that is going to be vectorized. The last few iterations
144910d565efSmrg               of LOOP were peeled.
145010d565efSmrg      - NITERS - the number of iterations that LOOP executes (before it is
145110d565efSmrg                 vectorized), i.e., the number of times the ivs should be bumped.
145210d565efSmrg      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
145310d565efSmrg                   coming out from LOOP on which there are uses of the LOOP ivs
145410d565efSmrg 		  (this is the path from LOOP->exit to epilog_loop->preheader).
145510d565efSmrg 
145610d565efSmrg                   The new definitions of the ivs are placed in LOOP->exit.
145710d565efSmrg                   The phi args associated with the edge UPDATE_E in the bb
145810d565efSmrg                   UPDATE_E->dest are updated accordingly.
145910d565efSmrg 
146010d565efSmrg      Assumption 1: Like the rest of the vectorizer, this function assumes
146110d565efSmrg      a single loop exit that has a single predecessor.
146210d565efSmrg 
146310d565efSmrg      Assumption 2: The phi nodes in the LOOP header and in update_bb are
146410d565efSmrg      organized in the same order.
146510d565efSmrg 
146610d565efSmrg      Assumption 3: The access function of the ivs is simple enough (see
146710d565efSmrg      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
146810d565efSmrg 
146910d565efSmrg      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
147010d565efSmrg      coming out of LOOP on which the ivs of LOOP are used (this is the path
147110d565efSmrg      that leads to the epilog loop; other paths skip the epilog loop).  This
147210d565efSmrg      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
147310d565efSmrg      needs to have its phis updated.
147410d565efSmrg  */
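
/* For example (a sketch with made-up values): for an IV with initial
   value init_0 and step 4, the value the IV must have on entry to the
   epilog loop after NITERS scalar iterations is

     ni = init_0 + NITERS * 4

   which is what the code below builds (using a pointer-plus when the
   IV has pointer type) and feeds to the epilog's PHIs via UPDATE_E.  */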
147510d565efSmrg 
147610d565efSmrg static void
147710d565efSmrg vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
147810d565efSmrg 				  tree niters, edge update_e)
147910d565efSmrg {
148010d565efSmrg   gphi_iterator gsi, gsi1;
1481*ec02198aSmrg   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148210d565efSmrg   basic_block update_bb = update_e->dest;
148310d565efSmrg   basic_block exit_bb = single_exit (loop)->dest;
148410d565efSmrg 
148510d565efSmrg   /* Make sure there exists a single-predecessor exit bb:  */
148610d565efSmrg   gcc_assert (single_pred_p (exit_bb));
148710d565efSmrg   gcc_assert (single_succ_edge (exit_bb) == update_e);
148810d565efSmrg 
148910d565efSmrg   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
149010d565efSmrg        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
149110d565efSmrg        gsi_next (&gsi), gsi_next (&gsi1))
149210d565efSmrg     {
149310d565efSmrg       tree init_expr;
149410d565efSmrg       tree step_expr, off;
149510d565efSmrg       tree type;
149610d565efSmrg       tree var, ni, ni_name;
149710d565efSmrg       gimple_stmt_iterator last_gsi;
149810d565efSmrg 
149910d565efSmrg       gphi *phi = gsi.phi ();
150010d565efSmrg       gphi *phi1 = gsi1.phi ();
15010fc04c29Smrg       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
150210d565efSmrg       if (dump_enabled_p ())
150310d565efSmrg 	dump_printf_loc (MSG_NOTE, vect_location,
15040fc04c29Smrg 			 "vect_update_ivs_after_vectorizer: phi: %G", phi);
150510d565efSmrg 
150610d565efSmrg       /* Skip reduction and virtual phis.  */
15070fc04c29Smrg       if (!iv_phi_p (phi_info))
150810d565efSmrg 	{
150910d565efSmrg 	  if (dump_enabled_p ())
151010d565efSmrg 	    dump_printf_loc (MSG_NOTE, vect_location,
151110d565efSmrg 			     "reduc or virtual phi. skip.\n");
151210d565efSmrg 	  continue;
151310d565efSmrg 	}
151410d565efSmrg 
151510d565efSmrg       type = TREE_TYPE (gimple_phi_result (phi));
15160fc04c29Smrg       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
151710d565efSmrg       step_expr = unshare_expr (step_expr);
151810d565efSmrg 
151910d565efSmrg       /* FORNOW: We do not support IVs whose evolution function is a polynomial
152010d565efSmrg          of degree >= 2 or exponential.  */
152110d565efSmrg       gcc_assert (!tree_is_chrec (step_expr));
152210d565efSmrg 
152310d565efSmrg       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
152410d565efSmrg 
152510d565efSmrg       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
152610d565efSmrg 			 fold_convert (TREE_TYPE (step_expr), niters),
152710d565efSmrg 			 step_expr);
152810d565efSmrg       if (POINTER_TYPE_P (type))
152910d565efSmrg 	ni = fold_build_pointer_plus (init_expr, off);
153010d565efSmrg       else
153110d565efSmrg 	ni = fold_build2 (PLUS_EXPR, type,
153210d565efSmrg 			  init_expr, fold_convert (type, off));
153310d565efSmrg 
153410d565efSmrg       var = create_tmp_var (type, "tmp");
153510d565efSmrg 
153610d565efSmrg       last_gsi = gsi_last_bb (exit_bb);
153710d565efSmrg       gimple_seq new_stmts = NULL;
153810d565efSmrg       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
153910d565efSmrg       /* Insert the stmts at the end of EXIT_BB, which may be empty.  */
154010d565efSmrg       if (!gsi_end_p (last_gsi))
154110d565efSmrg 	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
154210d565efSmrg       else
154310d565efSmrg 	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
154410d565efSmrg 
154510d565efSmrg       /* Fix phi expressions in the successor bb.  */
154610d565efSmrg       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
154710d565efSmrg     }
154810d565efSmrg }
154910d565efSmrg 
1550c7a68eb7Smrg /* Return a gimple value containing the misalignment (measured in vector
1551c7a68eb7Smrg    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552c7a68eb7Smrg    it is away from a perfectly aligned address.  Add any new statements
1553c7a68eb7Smrg    to SEQ.  */
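
/* For instance (a sketch with made-up numbers): with a 16-byte target
   alignment and 4-byte elements, an address ending in ...8 yields

     misalign_in_bytes = addr & 15 = 8
     misalign_in_elems = 8 >> log2(4) = 2

   i.e. the access is two elements past an aligned boundary.  */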
1554c7a68eb7Smrg 
1555c7a68eb7Smrg static tree
1556c7a68eb7Smrg get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557c7a68eb7Smrg {
15580fc04c29Smrg   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
15590fc04c29Smrg   stmt_vec_info stmt_info = dr_info->stmt;
1560c7a68eb7Smrg   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561c7a68eb7Smrg 
15620fc04c29Smrg   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
15630fc04c29Smrg   unsigned HOST_WIDE_INT target_align_c;
15640fc04c29Smrg   tree target_align_minus_1;
1565c7a68eb7Smrg 
15660fc04c29Smrg   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
15670fc04c29Smrg 					size_zero_node) < 0;
1568c7a68eb7Smrg   tree offset = (negative
1569c7a68eb7Smrg 		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570c7a68eb7Smrg 		 : size_zero_node);
15710fc04c29Smrg   tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1572c7a68eb7Smrg 							  offset);
1573c7a68eb7Smrg   tree type = unsigned_type_for (TREE_TYPE (start_addr));
15740fc04c29Smrg   if (target_align.is_constant (&target_align_c))
15750fc04c29Smrg     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
15760fc04c29Smrg   else
15770fc04c29Smrg     {
15780fc04c29Smrg       tree vla = build_int_cst (type, target_align);
15790fc04c29Smrg       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
15800fc04c29Smrg 				    fold_build2 (MINUS_EXPR, type,
15810fc04c29Smrg 						 build_int_cst (type, 0), vla));
15820fc04c29Smrg       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
15830fc04c29Smrg 					  build_int_cst (type, 1));
15840fc04c29Smrg     }
15850fc04c29Smrg 
1586c7a68eb7Smrg   HOST_WIDE_INT elem_size
1587c7a68eb7Smrg     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1588c7a68eb7Smrg   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589c7a68eb7Smrg 
1590c7a68eb7Smrg   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1591c7a68eb7Smrg   tree int_start_addr = fold_convert (type, start_addr);
1592c7a68eb7Smrg   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593c7a68eb7Smrg 					target_align_minus_1);
1594c7a68eb7Smrg 
1595c7a68eb7Smrg   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1596c7a68eb7Smrg   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597c7a68eb7Smrg 					elem_size_log);
1598c7a68eb7Smrg 
1599c7a68eb7Smrg   return misalign_in_elems;
1600c7a68eb7Smrg }
1601c7a68eb7Smrg 
160210d565efSmrg /* Function vect_gen_prolog_loop_niters
160310d565efSmrg 
160410d565efSmrg    Generate the number of iterations which should be peeled as prolog for the
160510d565efSmrg    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
160610d565efSmrg    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
160710d565efSmrg    As a result, after the execution of this loop, the data reference DR will
160810d565efSmrg    refer to an aligned location.  The following computation is generated:
160910d565efSmrg 
161010d565efSmrg    If the misalignment of DR is known at compile time:
161110d565efSmrg      addr_mis = DR_MISALIGNMENT (dr);
161210d565efSmrg    Else, compute address misalignment in bytes:
1613c7a68eb7Smrg      addr_mis = addr & (target_align - 1)
161410d565efSmrg 
161510d565efSmrg    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
161610d565efSmrg 
161710d565efSmrg    (elem_size = element type size; an element is the scalar element whose type
161810d565efSmrg    is the inner type of the vectype)
161910d565efSmrg 
162010d565efSmrg    The computations will be emitted at the end of BB.  We also compute and
162110d565efSmrg    store upper bound (included) of the result in BOUND.
162210d565efSmrg 
162310d565efSmrg    When the step of the data-ref in the loop is not 1 (as in interleaved data
162410d565efSmrg    and SLP), the number of iterations of the prolog must be divided by the step
162510d565efSmrg    (which is equal to the size of interleaved group).
162610d565efSmrg 
162710d565efSmrg    The above formulas assume that VF == number of elements in the vector. This
162810d565efSmrg    may not hold when there are multiple-types in the loop.
162910d565efSmrg    In this case, for some data-references in the loop the VF does not represent
163010d565efSmrg    the number of elements that fit in the vector.  Therefore, instead of VF we
163110d565efSmrg    use TYPE_VECTOR_SUBPARTS.  */
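
/* A worked example (a sketch with made-up numbers): with VF == 4,
   4-byte elements, step 1 and an access one element past a 16-byte
   boundary (addr_mis == 4 bytes), the formula above gives

     prolog_niters = ((4 - 4/4) & (4 - 1)) / 1 = 3

   i.e. three scalar iterations are peeled before the access is aligned.  */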
163210d565efSmrg 
163310d565efSmrg static tree
163410d565efSmrg vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
163510d565efSmrg 			     basic_block bb, int *bound)
163610d565efSmrg {
16370fc04c29Smrg   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
163810d565efSmrg   tree var;
163910d565efSmrg   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
164010d565efSmrg   gimple_seq stmts = NULL, new_stmts = NULL;
164110d565efSmrg   tree iters, iters_name;
16420fc04c29Smrg   stmt_vec_info stmt_info = dr_info->stmt;
164310d565efSmrg   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
16440fc04c29Smrg   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
164510d565efSmrg 
164610d565efSmrg   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
164710d565efSmrg     {
164810d565efSmrg       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
164910d565efSmrg 
165010d565efSmrg       if (dump_enabled_p ())
165110d565efSmrg         dump_printf_loc (MSG_NOTE, vect_location,
165210d565efSmrg                          "known peeling = %d.\n", npeel);
165310d565efSmrg 
165410d565efSmrg       iters = build_int_cst (niters_type, npeel);
165510d565efSmrg       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
165610d565efSmrg     }
165710d565efSmrg   else
165810d565efSmrg     {
1659c7a68eb7Smrg       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660c7a68eb7Smrg       tree type = TREE_TYPE (misalign_in_elems);
1661c7a68eb7Smrg       HOST_WIDE_INT elem_size
1662c7a68eb7Smrg 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
16630fc04c29Smrg       /* We only do prolog peeling if the target alignment is known at compile
16640fc04c29Smrg          time.  */
16650fc04c29Smrg       poly_uint64 align_in_elems
16660fc04c29Smrg 	= exact_div (target_align, elem_size);
16670fc04c29Smrg       tree align_in_elems_minus_1
16680fc04c29Smrg 	= build_int_cst (type, align_in_elems - 1);
1669c7a68eb7Smrg       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1670c7a68eb7Smrg 
1671c7a68eb7Smrg       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1672c7a68eb7Smrg 				 & (align_in_elems - 1)).  */
16730fc04c29Smrg       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
16740fc04c29Smrg 					    size_zero_node) < 0;
167510d565efSmrg       if (negative)
1676c7a68eb7Smrg 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677c7a68eb7Smrg 			     align_in_elems_tree);
167810d565efSmrg       else
1679c7a68eb7Smrg 	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680c7a68eb7Smrg 			     misalign_in_elems);
1681c7a68eb7Smrg       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
168210d565efSmrg       iters = fold_convert (niters_type, iters);
16830fc04c29Smrg       unsigned HOST_WIDE_INT align_in_elems_c;
16840fc04c29Smrg       if (align_in_elems.is_constant (&align_in_elems_c))
16850fc04c29Smrg 	*bound = align_in_elems_c - 1;
16860fc04c29Smrg       else
16870fc04c29Smrg 	*bound = -1;
168810d565efSmrg     }
168910d565efSmrg 
169010d565efSmrg   if (dump_enabled_p ())
169110d565efSmrg     dump_printf_loc (MSG_NOTE, vect_location,
16920fc04c29Smrg 		     "niters for prolog loop: %T\n", iters);
169310d565efSmrg 
169410d565efSmrg   var = create_tmp_var (niters_type, "prolog_loop_niters");
169510d565efSmrg   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
169610d565efSmrg 
169710d565efSmrg   if (new_stmts)
169810d565efSmrg     gimple_seq_add_seq (&stmts, new_stmts);
169910d565efSmrg   if (stmts)
170010d565efSmrg     {
170110d565efSmrg       gcc_assert (single_succ_p (bb));
170210d565efSmrg       gimple_stmt_iterator gsi = gsi_last_bb (bb);
170310d565efSmrg       if (gsi_end_p (gsi))
170410d565efSmrg 	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
170510d565efSmrg       else
170610d565efSmrg 	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
170710d565efSmrg     }
170810d565efSmrg   return iters_name;
170910d565efSmrg }
171010d565efSmrg 
171110d565efSmrg 
171210d565efSmrg /* Function vect_update_init_of_dr
171310d565efSmrg 
1714c7a68eb7Smrg    If CODE is PLUS, the vector loop starts NITERS iterations after the
1715c7a68eb7Smrg    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716c7a68eb7Smrg    iterations before the scalar one (using masking to skip inactive
1717c7a68eb7Smrg    elements).  This function updates the information recorded in DR to
1718c7a68eb7Smrg    account for the difference.  Specifically, it updates the OFFSET
1719*ec02198aSmrg    field of DR_INFO.  */
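
/* For example (a sketch): peeling NITERS == 3 prolog iterations of a
   data reference with DR_STEP == 8 under CODE == PLUS_EXPR turns a
   recorded offset OFF into OFF + 3 * 8, so that address generation for
   the vector loop starts where the scalar prolog left off.  */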
172010d565efSmrg 
172110d565efSmrg static void
1722*ec02198aSmrg vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
172310d565efSmrg {
1724*ec02198aSmrg   struct data_reference *dr = dr_info->dr;
1725*ec02198aSmrg   tree offset = dr_info->offset;
1726*ec02198aSmrg   if (!offset)
1727*ec02198aSmrg     offset = build_zero_cst (sizetype);
172810d565efSmrg 
172910d565efSmrg   niters = fold_build2 (MULT_EXPR, sizetype,
173010d565efSmrg 			fold_convert (sizetype, niters),
173110d565efSmrg 			fold_convert (sizetype, DR_STEP (dr)));
1732c7a68eb7Smrg   offset = fold_build2 (code, sizetype,
173310d565efSmrg 			fold_convert (sizetype, offset), niters);
1734*ec02198aSmrg   dr_info->offset = offset;
173510d565efSmrg }
173610d565efSmrg 
173710d565efSmrg 
173810d565efSmrg /* Function vect_update_inits_of_drs
173910d565efSmrg 
1740c7a68eb7Smrg    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1741c7a68eb7Smrg    CODE and NITERS are as for vect_update_init_of_dr.  */
174210d565efSmrg 
1743*ec02198aSmrg void
1744c7a68eb7Smrg vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1745c7a68eb7Smrg 			  tree_code code)
174610d565efSmrg {
174710d565efSmrg   unsigned int i;
174810d565efSmrg   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
174910d565efSmrg   struct data_reference *dr;
175010d565efSmrg 
17510fc04c29Smrg   DUMP_VECT_SCOPE ("vect_update_inits_of_drs");
175210d565efSmrg 
1753*ec02198aSmrg   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1754*ec02198aSmrg      here, but since we might use these niters to update the epilogues niters
1755*ec02198aSmrg      and data references we can't insert them here as this definition might not
1756*ec02198aSmrg      always dominate its uses.  */
175710d565efSmrg   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
175810d565efSmrg     niters = fold_convert (sizetype, niters);
175910d565efSmrg 
176010d565efSmrg   FOR_EACH_VEC_ELT (datarefs, i, dr)
17610fc04c29Smrg     {
17620fc04c29Smrg       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1763*ec02198aSmrg       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1764*ec02198aSmrg 	  && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1765*ec02198aSmrg 	vect_update_init_of_dr (dr_info, niters, code);
176610d565efSmrg     }
17670fc04c29Smrg }
176810d565efSmrg 
1769c7a68eb7Smrg /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770c7a68eb7Smrg    by masking.  This involves calculating the number of iterations to
1771c7a68eb7Smrg    be peeled and then aligning all memory references appropriately.  */
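
/* For example (a sketch with made-up numbers): with VF == 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT == 3, the skip count is 8 - 3 == 5:
   every data reference is moved back by five elements (MINUS_EXPR) and
   the first five lanes of the first vector iteration are masked off, so
   lane 5 lines up with the first scalar iteration.  */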
1772c7a68eb7Smrg 
1773c7a68eb7Smrg void
1774c7a68eb7Smrg vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1775c7a68eb7Smrg {
1776c7a68eb7Smrg   tree misalign_in_elems;
1777c7a68eb7Smrg   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1778c7a68eb7Smrg 
1779c7a68eb7Smrg   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1780c7a68eb7Smrg 
1781c7a68eb7Smrg   /* From the information recorded in LOOP_VINFO get the number of iterations
1782c7a68eb7Smrg      that need to be skipped via masking.  */
1783c7a68eb7Smrg   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1784c7a68eb7Smrg     {
1785c7a68eb7Smrg       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786c7a68eb7Smrg 			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787c7a68eb7Smrg       misalign_in_elems = build_int_cst (type, misalign);
1788c7a68eb7Smrg     }
1789c7a68eb7Smrg   else
1790c7a68eb7Smrg     {
1791c7a68eb7Smrg       gimple_seq seq1 = NULL, seq2 = NULL;
1792c7a68eb7Smrg       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793c7a68eb7Smrg       misalign_in_elems = fold_convert (type, misalign_in_elems);
1794c7a68eb7Smrg       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795c7a68eb7Smrg 						&seq2, true, NULL_TREE);
1796c7a68eb7Smrg       gimple_seq_add_seq (&seq1, seq2);
1797c7a68eb7Smrg       if (seq1)
1798c7a68eb7Smrg 	{
1799c7a68eb7Smrg 	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800c7a68eb7Smrg 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801c7a68eb7Smrg 	  gcc_assert (!new_bb);
1802c7a68eb7Smrg 	}
1803c7a68eb7Smrg     }
1804c7a68eb7Smrg 
1805c7a68eb7Smrg   if (dump_enabled_p ())
1806c7a68eb7Smrg     dump_printf_loc (MSG_NOTE, vect_location,
18070fc04c29Smrg 		     "misalignment for fully-masked loop: %T\n",
18080fc04c29Smrg 		     misalign_in_elems);
1809c7a68eb7Smrg 
1810c7a68eb7Smrg   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1811c7a68eb7Smrg 
1812c7a68eb7Smrg   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1813c7a68eb7Smrg }
181410d565efSmrg 
181510d565efSmrg /* This function builds ni_name = number of iterations.  Statements
1816c7a68eb7Smrg    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1817c7a68eb7Smrg    it to TRUE if a new ssa_var is generated.  */
181810d565efSmrg 
181910d565efSmrg tree
1820c7a68eb7Smrg vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
182110d565efSmrg {
182210d565efSmrg   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
182310d565efSmrg   if (TREE_CODE (ni) == INTEGER_CST)
182410d565efSmrg     return ni;
182510d565efSmrg   else
182610d565efSmrg     {
182710d565efSmrg       tree ni_name, var;
182810d565efSmrg       gimple_seq stmts = NULL;
182910d565efSmrg       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
183010d565efSmrg 
183110d565efSmrg       var = create_tmp_var (TREE_TYPE (ni), "niters");
183210d565efSmrg       ni_name = force_gimple_operand (ni, &stmts, false, var);
183310d565efSmrg       if (stmts)
1834c7a68eb7Smrg 	{
183510d565efSmrg 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1836c7a68eb7Smrg 	  if (new_var_p != NULL)
1837c7a68eb7Smrg 	    *new_var_p = true;
1838c7a68eb7Smrg 	}
183910d565efSmrg 
184010d565efSmrg       return ni_name;
184110d565efSmrg     }
184210d565efSmrg }
184310d565efSmrg 
184410d565efSmrg /* Calculate the number of iterations above which the vectorized loop is
184510d565efSmrg    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
184610d565efSmrg    of the prolog loop.  If it is an integer constant, the number is also passed
1847c7a68eb7Smrg    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1848c7a68eb7Smrg    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1849c7a68eb7Smrg    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1850c7a68eb7Smrg    threshold below which the scalar (rather than vectorized) loop will be
1851c7a68eb7Smrg    executed.  This function stores the upper bound (inclusive) of the result
1852c7a68eb7Smrg    in BOUND_SCALAR.  */
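
/* For example (a sketch with made-up numbers): with TH == 10, an
   unknown NITERS_PROLOG, BOUND_PROLOG == 3 and BOUND_EPILOG == 4, the
   scalar loop runs at most 3 + 4 == 7 iterations, which is below the
   decremented threshold 9, so *BOUND_SCALAR is set to 9 and the
   constant 9 is returned.  */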
185310d565efSmrg 
185410d565efSmrg static tree
185510d565efSmrg vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856c7a68eb7Smrg 			     int bound_prolog, poly_int64 bound_epilog, int th,
1857c7a68eb7Smrg 			     poly_uint64 *bound_scalar,
1858c7a68eb7Smrg 			     bool check_profitability)
185910d565efSmrg {
186010d565efSmrg   tree type = TREE_TYPE (niters_prolog);
186110d565efSmrg   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862c7a68eb7Smrg 			     build_int_cst (type, bound_epilog));
186310d565efSmrg 
1864c7a68eb7Smrg   *bound_scalar = bound_prolog + bound_epilog;
186510d565efSmrg   if (check_profitability)
186610d565efSmrg     {
186710d565efSmrg       /* TH indicates the minimum niters of vectorized loop, while we
186810d565efSmrg 	 compute the maximum niters of scalar loop.  */
186910d565efSmrg       th--;
187010d565efSmrg       /* Peeling a known constant number of times.  */
187110d565efSmrg       if (int_niters_prolog >= 0)
187210d565efSmrg 	{
1873c7a68eb7Smrg 	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
187410d565efSmrg 	  return build_int_cst (type, *bound_scalar);
187510d565efSmrg 	}
1876c7a68eb7Smrg       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1877c7a68eb7Smrg 	 and BOUND_EPILOG are inclusive upper bounds.  */
1878c7a68eb7Smrg       if (known_ge (th, bound_prolog + bound_epilog))
187910d565efSmrg 	{
188010d565efSmrg 	  *bound_scalar = th;
188110d565efSmrg 	  return build_int_cst (type, th);
188210d565efSmrg 	}
1883c7a68eb7Smrg       /* Need to do runtime comparison.  */
1884c7a68eb7Smrg       else if (maybe_gt (th, bound_epilog))
1885c7a68eb7Smrg 	{
1886c7a68eb7Smrg 	  *bound_scalar = upper_bound (*bound_scalar, th);
1887c7a68eb7Smrg 	  return fold_build2 (MAX_EXPR, type,
1888c7a68eb7Smrg 			      build_int_cst (type, th), niters);
1889c7a68eb7Smrg 	}
189010d565efSmrg     }
189110d565efSmrg   return niters;
189210d565efSmrg }
189310d565efSmrg 
1894c7a68eb7Smrg /* NITERS is the number of times that the original scalar loop executes
1895c7a68eb7Smrg    after peeling.  Work out the maximum number of iterations N that can
1896c7a68eb7Smrg    be handled by the vectorized form of the loop and then either:
189710d565efSmrg 
1898c7a68eb7Smrg    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
189910d565efSmrg 
1900c7a68eb7Smrg 	niters_vector = N
1901c7a68eb7Smrg 
1902c7a68eb7Smrg    b) set *STEP_VECTOR_PTR to one and generate:
1903c7a68eb7Smrg 
1904c7a68eb7Smrg         niters_vector = N / vf
1905c7a68eb7Smrg 
1906c7a68eb7Smrg    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907c7a68eb7Smrg    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1908c7a68eb7Smrg    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
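
/* For example (a sketch with made-up numbers): with a constant VF == 8,
   no full masking and NITERS == 100 known not to overflow, the code
   below generates niters_vector = 100 >> 3 == 12 and sets
   *STEP_VECTOR_PTR to 1; the remaining 100 - 96 == 4 iterations are
   left for the epilog.  With a variable VF, niters_vector stays NITERS
   and the step becomes VF.  */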
190910d565efSmrg 
191010d565efSmrg void
191110d565efSmrg vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912c7a68eb7Smrg 			     tree *niters_vector_ptr, tree *step_vector_ptr,
1913c7a68eb7Smrg 			     bool niters_no_overflow)
191410d565efSmrg {
191510d565efSmrg   tree ni_minus_gap, var;
1916c7a68eb7Smrg   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917c7a68eb7Smrg   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
191810d565efSmrg   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919c7a68eb7Smrg   tree log_vf = NULL_TREE;
192010d565efSmrg 
192110d565efSmrg   /* If epilogue loop is required because of data accesses with gaps, we
192210d565efSmrg      subtract one iteration from the total number of iterations here for
192310d565efSmrg      correct calculation of the vector loop bound.  */
192410d565efSmrg   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
192510d565efSmrg     {
1926c7a68eb7Smrg       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927c7a68eb7Smrg 				  build_one_cst (type));
192810d565efSmrg       if (!is_gimple_val (ni_minus_gap))
192910d565efSmrg 	{
1930c7a68eb7Smrg 	  var = create_tmp_var (type, "ni_gap");
193110d565efSmrg 	  gimple *stmts = NULL;
193210d565efSmrg 	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
193310d565efSmrg 					       true, var);
193410d565efSmrg 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
193510d565efSmrg 	}
193610d565efSmrg     }
193710d565efSmrg   else
193810d565efSmrg     ni_minus_gap = niters;
193910d565efSmrg 
1940c7a68eb7Smrg   unsigned HOST_WIDE_INT const_vf;
1941c7a68eb7Smrg   if (vf.is_constant (&const_vf)
1942c7a68eb7Smrg       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1943c7a68eb7Smrg     {
194410d565efSmrg       /* Create: niters >> log2(vf) */
194510d565efSmrg       /* If it's known that niters == number of latch executions + 1 doesn't
194610d565efSmrg 	 overflow, we can generate niters >> log2(vf); otherwise we generate
194710d565efSmrg 	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
194810d565efSmrg 	 will be at least one.  */
1949c7a68eb7Smrg       log_vf = build_int_cst (type, exact_log2 (const_vf));
195010d565efSmrg       if (niters_no_overflow)
1951c7a68eb7Smrg 	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
195210d565efSmrg       else
195310d565efSmrg 	niters_vector
1954c7a68eb7Smrg 	  = fold_build2 (PLUS_EXPR, type,
1955c7a68eb7Smrg 			 fold_build2 (RSHIFT_EXPR, type,
1956c7a68eb7Smrg 				      fold_build2 (MINUS_EXPR, type,
195710d565efSmrg 						   ni_minus_gap,
1958c7a68eb7Smrg 						   build_int_cst (type, vf)),
195910d565efSmrg 				      log_vf),
1960c7a68eb7Smrg 			 build_int_cst (type, 1));
1961c7a68eb7Smrg       step_vector = build_one_cst (type);
1962c7a68eb7Smrg     }
1963c7a68eb7Smrg   else
1964c7a68eb7Smrg     {
1965c7a68eb7Smrg       niters_vector = ni_minus_gap;
1966c7a68eb7Smrg       step_vector = build_int_cst (type, vf);
1967c7a68eb7Smrg     }
196810d565efSmrg 
196910d565efSmrg   if (!is_gimple_val (niters_vector))
197010d565efSmrg     {
1971c7a68eb7Smrg       var = create_tmp_var (type, "bnd");
1972c7a68eb7Smrg       gimple_seq stmts = NULL;
197310d565efSmrg       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
197410d565efSmrg       gsi_insert_seq_on_edge_immediate (pe, stmts);
1975c7a68eb7Smrg       /* The peeling algorithm guarantees that the vector loop bound is at
1976*ec02198aSmrg 	 least ONE; we set range information to make the niters analyzer's
1977*ec02198aSmrg 	 life easier.  Note the latch iteration count can be TYPE_MAX_VALUE,
1978*ec02198aSmrg 	 so we have to represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf.  */
1979c7a68eb7Smrg       if (stmts != NULL && log_vf)
1980*ec02198aSmrg 	{
1981*ec02198aSmrg 	  if (niters_no_overflow)
1982c7a68eb7Smrg 	    set_range_info (niters_vector, VR_RANGE,
1983*ec02198aSmrg 			    wi::one (TYPE_PRECISION (type)),
1984*ec02198aSmrg 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1985*ec02198aSmrg 						       TYPE_SIGN (type)),
1986*ec02198aSmrg 					exact_log2 (const_vf),
1987*ec02198aSmrg 					TYPE_SIGN (type)));
1988*ec02198aSmrg 	  /* For VF == 1 the vector IV might also overflow so we cannot
1989*ec02198aSmrg 	     assert a minimum value of 1.  */
1990*ec02198aSmrg 	  else if (const_vf > 1)
1991*ec02198aSmrg 	    set_range_info (niters_vector, VR_RANGE,
1992*ec02198aSmrg 			    wi::one (TYPE_PRECISION (type)),
1993*ec02198aSmrg 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1994*ec02198aSmrg 						       TYPE_SIGN (type))
1995*ec02198aSmrg 					- (const_vf - 1),
1996*ec02198aSmrg 					exact_log2 (const_vf), TYPE_SIGN (type))
1997*ec02198aSmrg 			    + 1);
1998*ec02198aSmrg 	}
199910d565efSmrg     }
200010d565efSmrg   *niters_vector_ptr = niters_vector;
2001c7a68eb7Smrg   *step_vector_ptr = step_vector;
200210d565efSmrg 
200310d565efSmrg   return;
200410d565efSmrg }
200510d565efSmrg 
200610d565efSmrg /* Given NITERS_VECTOR which is the number of iterations for vectorized
200710d565efSmrg    loop specified by LOOP_VINFO after vectorization, compute the number
200810d565efSmrg    of iterations before vectorization (niters_vector * vf) and store it
200910d565efSmrg    to NITERS_VECTOR_MULT_VF_PTR.  */
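
/* For example (a sketch): with NITERS_VECTOR == 12 and a constant
   VF == 8, the code below emits niters_vector_mult_vf = 12 << 3 == 96,
   the number of scalar iterations covered by the vector loop.  */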
201010d565efSmrg 
201110d565efSmrg static void
201210d565efSmrg vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
201310d565efSmrg 				     tree niters_vector,
201410d565efSmrg 				     tree *niters_vector_mult_vf_ptr)
201510d565efSmrg {
2016c7a68eb7Smrg   /* We should be using a step_vector of VF if VF is variable.  */
2017c7a68eb7Smrg   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2018*ec02198aSmrg   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
201910d565efSmrg   tree type = TREE_TYPE (niters_vector);
202010d565efSmrg   tree log_vf = build_int_cst (type, exact_log2 (vf));
202110d565efSmrg   basic_block exit_bb = single_exit (loop)->dest;
202210d565efSmrg 
202310d565efSmrg   gcc_assert (niters_vector_mult_vf_ptr != NULL);
202410d565efSmrg   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
202510d565efSmrg 					    niters_vector, log_vf);
202610d565efSmrg   if (!is_gimple_val (niters_vector_mult_vf))
202710d565efSmrg     {
202810d565efSmrg       tree var = create_tmp_var (type, "niters_vector_mult_vf");
202910d565efSmrg       gimple_seq stmts = NULL;
203010d565efSmrg       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
203110d565efSmrg 						    &stmts, true, var);
203210d565efSmrg       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
203310d565efSmrg       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
203410d565efSmrg     }
203510d565efSmrg   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
203610d565efSmrg }
203710d565efSmrg 
2038*ec02198aSmrg /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2039*ec02198aSmrg    This function searches for the corresponding lcssa phi node in the exit
2040*ec02198aSmrg    bb of LOOP.  If it is found, return the phi result; otherwise return
2041*ec02198aSmrg    NULL.  */
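
/* For instance (a sketch): if the epilog's exit has i_18 = PHI <i_10>
   and LOOP's exit bb contains i_15 = PHI <i_10>, both fed by the same
   argument i_10, then i_15 is returned.  */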
2042*ec02198aSmrg 
2043*ec02198aSmrg static tree
2044*ec02198aSmrg find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2045*ec02198aSmrg 		gphi *lcssa_phi)
2046*ec02198aSmrg {
2047*ec02198aSmrg   gphi_iterator gsi;
2048*ec02198aSmrg   edge e = single_exit (loop);
2049*ec02198aSmrg 
2050*ec02198aSmrg   gcc_assert (single_pred_p (e->dest));
2051*ec02198aSmrg   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2052*ec02198aSmrg     {
2053*ec02198aSmrg       gphi *phi = gsi.phi ();
2054*ec02198aSmrg       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2055*ec02198aSmrg 			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2056*ec02198aSmrg 	return PHI_RESULT (phi);
2057*ec02198aSmrg     }
2058*ec02198aSmrg   return NULL_TREE;
2059*ec02198aSmrg }
2060*ec02198aSmrg 
206110d565efSmrg /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
206210d565efSmrg    from SECOND/FIRST and puts it at the original loop's preheader/exit
206310d565efSmrg    edge, the two loops are arranged as below:
206410d565efSmrg 
206510d565efSmrg        preheader_a:
206610d565efSmrg      first_loop:
206710d565efSmrg        header_a:
206810d565efSmrg 	 i_1 = PHI<i_0, i_2>;
206910d565efSmrg 	 ...
207010d565efSmrg 	 i_2 = i_1 + 1;
207110d565efSmrg 	 if (cond_a)
207210d565efSmrg 	   goto latch_a;
207310d565efSmrg 	 else
207410d565efSmrg 	   goto between_bb;
207510d565efSmrg        latch_a:
207610d565efSmrg 	 goto header_a;
207710d565efSmrg 
207810d565efSmrg        between_bb:
207910d565efSmrg 	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
208010d565efSmrg 
208110d565efSmrg      second_loop:
208210d565efSmrg        header_b:
208310d565efSmrg 	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
208410d565efSmrg 				 or with i_2 if no LCSSA phi is created
208510d565efSmrg 				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
208610d565efSmrg 	 ...
208710d565efSmrg 	 i_4 = i_3 + 1;
208810d565efSmrg 	 if (cond_b)
208910d565efSmrg 	   goto latch_b;
209010d565efSmrg 	 else
209110d565efSmrg 	   goto exit_bb;
209210d565efSmrg        latch_b:
209310d565efSmrg 	 goto header_b;
209410d565efSmrg 
209510d565efSmrg        exit_bb:
209610d565efSmrg 
209710d565efSmrg    This function creates loop closed SSA for the first loop; update the
209810d565efSmrg    second loop's PHI nodes by replacing argument on incoming edge with the
209910d565efSmrg    result of newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
210010d565efSmrg    is false, loop closed SSA phis will only be created for non-IV phis of
210110d565efSmrg    the first loop.
210210d565efSmrg 
210310d565efSmrg    This function assumes the exit bb of the first loop is the preheader bb
210410d565efSmrg    of the second loop, i.e., between_bb in the example code.  With PHIs
210510d565efSmrg    updated, the second loop executes the remaining iterations of the first.  */
210610d565efSmrg 
210710d565efSmrg static void
210810d565efSmrg slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2109*ec02198aSmrg 				   class loop *first, class loop *second,
211010d565efSmrg 				   bool create_lcssa_for_iv_phis)
211110d565efSmrg {
211210d565efSmrg   gphi_iterator gsi_update, gsi_orig;
2113*ec02198aSmrg   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211410d565efSmrg 
211510d565efSmrg   edge first_latch_e = EDGE_SUCC (first->latch, 0);
211610d565efSmrg   edge second_preheader_e = loop_preheader_edge (second);
211710d565efSmrg   basic_block between_bb = single_exit (first)->dest;
211810d565efSmrg 
211910d565efSmrg   gcc_assert (between_bb == second_preheader_e->src);
212010d565efSmrg   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
212110d565efSmrg   /* Either the first loop or the second is the loop to be vectorized.  */
212210d565efSmrg   gcc_assert (loop == first || loop == second);
212310d565efSmrg 
212410d565efSmrg   for (gsi_orig = gsi_start_phis (first->header),
212510d565efSmrg        gsi_update = gsi_start_phis (second->header);
212610d565efSmrg        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
212710d565efSmrg        gsi_next (&gsi_orig), gsi_next (&gsi_update))
212810d565efSmrg     {
212910d565efSmrg       gphi *orig_phi = gsi_orig.phi ();
213010d565efSmrg       gphi *update_phi = gsi_update.phi ();
213110d565efSmrg 
213210d565efSmrg       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
213310d565efSmrg       /* Generate lcssa PHI node for the first loop.  */
213410d565efSmrg       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
21350fc04c29Smrg       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
21360fc04c29Smrg       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
213710d565efSmrg 	{
213810d565efSmrg 	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
213910d565efSmrg 	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
214010d565efSmrg 	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
214110d565efSmrg 	  arg = new_res;
214210d565efSmrg 	}
214310d565efSmrg 
214410d565efSmrg       /* Update PHI node in the second loop by replacing arg on the loop's
214510d565efSmrg 	 incoming edge.  */
214610d565efSmrg       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
214710d565efSmrg     }
2148*ec02198aSmrg 
2149*ec02198aSmrg   /* For epilogue peeling we have to make sure to copy all LC PHIs
2150*ec02198aSmrg      for correct vectorization of live stmts.  */
2151*ec02198aSmrg   if (loop == first)
2152*ec02198aSmrg     {
2153*ec02198aSmrg       basic_block orig_exit = single_exit (second)->dest;
2154*ec02198aSmrg       for (gsi_orig = gsi_start_phis (orig_exit);
2155*ec02198aSmrg 	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2156*ec02198aSmrg 	{
2157*ec02198aSmrg 	  gphi *orig_phi = gsi_orig.phi ();
2158*ec02198aSmrg 	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2159*ec02198aSmrg 	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2160*ec02198aSmrg 	    continue;
2161*ec02198aSmrg 
2162*ec02198aSmrg 	  /* Already created in the above loop.  */
2163*ec02198aSmrg 	  if (find_guard_arg (first, second, orig_phi))
2164*ec02198aSmrg 	    continue;
2165*ec02198aSmrg 
2166*ec02198aSmrg 	  tree new_res = copy_ssa_name (orig_arg);
2167*ec02198aSmrg 	  gphi *lcphi = create_phi_node (new_res, between_bb);
2168*ec02198aSmrg 	  add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2169*ec02198aSmrg 	}
2170*ec02198aSmrg     }
217110d565efSmrg }
217210d565efSmrg 
217310d565efSmrg /* Function slpeel_add_loop_guard adds guard skipping from the beginning
217410d565efSmrg    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
217510d565efSmrg    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
217610d565efSmrg    appear like below:
217710d565efSmrg 
217810d565efSmrg        guard_bb:
217910d565efSmrg 	 if (cond)
218010d565efSmrg 	   goto merge_bb;
218110d565efSmrg 	 else
218210d565efSmrg 	   goto skip_loop;
218310d565efSmrg 
218410d565efSmrg      skip_loop:
218510d565efSmrg        header_a:
218610d565efSmrg 	 i_1 = PHI<i_0, i_2>;
218710d565efSmrg 	 ...
218810d565efSmrg 	 i_2 = i_1 + 1;
218910d565efSmrg 	 if (cond_a)
219010d565efSmrg 	   goto latch_a;
219110d565efSmrg 	 else
219210d565efSmrg 	   goto exit_a;
219310d565efSmrg        latch_a:
219410d565efSmrg 	 goto header_a;
219510d565efSmrg 
219610d565efSmrg        exit_a:
219710d565efSmrg 	 i_5 = PHI<i_2>;
219810d565efSmrg 
219910d565efSmrg        merge_bb:
220010d565efSmrg 	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
220110d565efSmrg 
220210d565efSmrg      update_loop:
220310d565efSmrg        header_b:
220410d565efSmrg 	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
220510d565efSmrg 	 ...
220610d565efSmrg 	 i_4 = i_3 + 1;
220710d565efSmrg 	 if (cond_b)
220810d565efSmrg 	   goto latch_b;
220910d565efSmrg 	 else
221010d565efSmrg 	   goto exit_bb;
221110d565efSmrg        latch_b:
221210d565efSmrg 	 goto header_b;
221310d565efSmrg 
221410d565efSmrg        exit_bb:
221510d565efSmrg 
221610d565efSmrg    This function creates PHI nodes at merge_bb and replaces the use of i_5
221710d565efSmrg    in the update_loop's PHI node with the result of the new PHI node.  */
221810d565efSmrg 
221910d565efSmrg static void
2220*ec02198aSmrg slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2221*ec02198aSmrg 				    class loop *update_loop,
222210d565efSmrg 				    edge guard_edge, edge merge_edge)
222310d565efSmrg {
22240fc04c29Smrg   location_t merge_loc, guard_loc;
222510d565efSmrg   edge orig_e = loop_preheader_edge (skip_loop);
222610d565efSmrg   edge update_e = loop_preheader_edge (update_loop);
222710d565efSmrg   gphi_iterator gsi_orig, gsi_update;
222810d565efSmrg 
222910d565efSmrg   for ((gsi_orig = gsi_start_phis (skip_loop->header),
223010d565efSmrg 	gsi_update = gsi_start_phis (update_loop->header));
223110d565efSmrg        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
223210d565efSmrg        gsi_next (&gsi_orig), gsi_next (&gsi_update))
223310d565efSmrg     {
223410d565efSmrg       gphi *orig_phi = gsi_orig.phi ();
223510d565efSmrg       gphi *update_phi = gsi_update.phi ();
223610d565efSmrg 
223710d565efSmrg       /* Generate new phi node at merge bb of the guard.  */
223810d565efSmrg       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
223910d565efSmrg       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
224010d565efSmrg 
224110d565efSmrg       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
224210d565efSmrg 	 args in NEW_PHI for these edges.  */
224310d565efSmrg       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
224410d565efSmrg       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
224510d565efSmrg       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
224610d565efSmrg       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
224710d565efSmrg       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
224810d565efSmrg       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
224910d565efSmrg 
225010d565efSmrg       /* Update phi in UPDATE_PHI.  */
225110d565efSmrg       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
225210d565efSmrg     }
225310d565efSmrg }
225410d565efSmrg 
225510d565efSmrg /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
225610d565efSmrg    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
225710d565efSmrg    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
225810d565efSmrg    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
225910d565efSmrg    The CFG looks like:
226010d565efSmrg 
226110d565efSmrg      loop:
226210d565efSmrg        header_a:
226310d565efSmrg 	 i_1 = PHI<i_0, i_2>;
226410d565efSmrg 	 ...
226510d565efSmrg 	 i_2 = i_1 + 1;
226610d565efSmrg 	 if (cond_a)
226710d565efSmrg 	   goto latch_a;
226810d565efSmrg 	 else
226910d565efSmrg 	   goto exit_a;
227010d565efSmrg        latch_a:
227110d565efSmrg 	 goto header_a;
227210d565efSmrg 
227310d565efSmrg        exit_a:
227410d565efSmrg 
227510d565efSmrg        guard_bb:
227610d565efSmrg 	 if (cond)
227710d565efSmrg 	   goto merge_bb;
227810d565efSmrg 	 else
227910d565efSmrg 	   goto epilog_loop;
228010d565efSmrg 
228110d565efSmrg        ;; fall_through_bb
228210d565efSmrg 
228310d565efSmrg      epilog_loop:
228410d565efSmrg        header_b:
228510d565efSmrg 	 i_3 = PHI<i_2, i_4>;
228610d565efSmrg 	 ...
228710d565efSmrg 	 i_4 = i_3 + 1;
228810d565efSmrg 	 if (cond_b)
228910d565efSmrg 	   goto latch_b;
229010d565efSmrg 	 else
229110d565efSmrg 	   goto merge_bb;
229210d565efSmrg        latch_b:
229310d565efSmrg 	 goto header_b;
229410d565efSmrg 
229510d565efSmrg        merge_bb:
229610d565efSmrg 	 ;; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
229710d565efSmrg 
229810d565efSmrg        exit_bb:
229910d565efSmrg 	 i_x = PHI<i_4>;  ;; Use of i_4 to be replaced with i_y in merge_bb.
230010d565efSmrg 
230110d565efSmrg    For each name used outside EPILOG (i.e. for each name that has an lcssa
230210d565efSmrg    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
230310d565efSmrg    args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE
230410d565efSmrg    is the arg of the original PHI in exit_bb; the arg for GUARD_EDGE is
230510d565efSmrg    defined by LOOP and is found in the exit bb of LOOP.  The arg of the
230610d565efSmrg    original PHI in exit_bb will also be updated.  */
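
/* Illustrative GIMPLE for the diagram above (a sketch, not taken from an
   actual dump): the PHI created at merge_bb is
       i_y = PHI <i_4 (merge_edge), i_2 (guard_edge)>
   and the lcssa PHI in exit_bb is updated to i_x = PHI <i_y>.  */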
230710d565efSmrg 
230810d565efSmrg static void
2309*ec02198aSmrg slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
231010d565efSmrg 				    edge guard_edge, edge merge_edge)
231110d565efSmrg {
231210d565efSmrg   gphi_iterator gsi;
231310d565efSmrg   basic_block merge_bb = guard_edge->dest;
231410d565efSmrg 
231510d565efSmrg   gcc_assert (single_succ_p (merge_bb));
231610d565efSmrg   edge e = single_succ_edge (merge_bb);
231710d565efSmrg   basic_block exit_bb = e->dest;
231810d565efSmrg   gcc_assert (single_pred_p (exit_bb));
231910d565efSmrg   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
232010d565efSmrg 
232110d565efSmrg   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
232210d565efSmrg     {
232310d565efSmrg       gphi *update_phi = gsi.phi ();
232410d565efSmrg       tree old_arg = PHI_ARG_DEF (update_phi, 0);
232510d565efSmrg 
2326*ec02198aSmrg       tree merge_arg = NULL_TREE;
2327*ec02198aSmrg 
2328*ec02198aSmrg       /* If the old argument is an SSA_NAME, use its current_def.  */
2329*ec02198aSmrg       if (TREE_CODE (old_arg) == SSA_NAME)
2330*ec02198aSmrg 	merge_arg = get_current_def (old_arg);
2331*ec02198aSmrg       /* If it's a constant or doesn't have a current_def, just use the old
2332*ec02198aSmrg 	 argument.  */
233310d565efSmrg       if (!merge_arg)
233410d565efSmrg 	merge_arg = old_arg;
233510d565efSmrg 
233610d565efSmrg       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
233710d565efSmrg       /* If the var is live after loop but not a reduction, we simply
233810d565efSmrg 	 use the old arg.  */
233910d565efSmrg       if (!guard_arg)
234010d565efSmrg 	guard_arg = old_arg;
234110d565efSmrg 
234210d565efSmrg       /* Create new phi node in MERGE_BB:  */
234310d565efSmrg       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
234410d565efSmrg       gphi *merge_phi = create_phi_node (new_res, merge_bb);
234510d565efSmrg 
234610d565efSmrg       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
234710d565efSmrg 	 the two PHI args in merge_phi for these edges.  */
234810d565efSmrg       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
234910d565efSmrg       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
235010d565efSmrg 
235110d565efSmrg       /* Update the original phi in exit_bb.  */
235210d565efSmrg       adjust_phi_and_debug_stmts (update_phi, e, new_res);
235310d565efSmrg     }
235410d565efSmrg }
235510d565efSmrg 
235610d565efSmrg /* EPILOG loop is duplicated from the original loop for vectorizing,
235710d565efSmrg    the arg of its loop closed ssa PHI needs to be updated.  */
235810d565efSmrg 
235910d565efSmrg static void
2360*ec02198aSmrg slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
236110d565efSmrg {
236210d565efSmrg   gphi_iterator gsi;
236310d565efSmrg   basic_block exit_bb = single_exit (epilog)->dest;
236410d565efSmrg 
236510d565efSmrg   gcc_assert (single_pred_p (exit_bb));
236610d565efSmrg   edge e = EDGE_PRED (exit_bb, 0);
236710d565efSmrg   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
236810d565efSmrg     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
236910d565efSmrg }
237010d565efSmrg 
237110d565efSmrg /* Function vect_do_peeling.
237210d565efSmrg 
237310d565efSmrg    Input:
237410d565efSmrg    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
237510d565efSmrg 
237610d565efSmrg        preheader:
237710d565efSmrg      LOOP:
237810d565efSmrg        header_bb:
237910d565efSmrg 	 loop_body
238010d565efSmrg 	 if (exit_loop_cond) goto exit_bb
238110d565efSmrg 	 else                goto header_bb
238210d565efSmrg        exit_bb:
238310d565efSmrg 
238410d565efSmrg    - NITERS: The number of iterations of the loop.
238510d565efSmrg    - NITERSM1: The number of iterations of the loop's latch.
238610d565efSmrg    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
238710d565efSmrg    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
238810d565efSmrg 			      CHECK_PROFITABILITY is true.
238910d565efSmrg    Output:
2390c7a68eb7Smrg    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2391c7a68eb7Smrg      iterate after vectorization; see vect_set_loop_condition for details.
2392c7a68eb7Smrg    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2393c7a68eb7Smrg      should be set to the number of scalar iterations handled by the
2394c7a68eb7Smrg      vector loop.  The SSA name is only used on exit from the loop.
239510d565efSmrg 
239610d565efSmrg    This function peels prolog and epilog from the loop, adds guards skipping
239710d565efSmrg    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
239810d565efSmrg    would look like:
239910d565efSmrg 
240010d565efSmrg        guard_bb_1:
240110d565efSmrg 	 if (prefer_scalar_loop) goto merge_bb_1
240210d565efSmrg 	 else                    goto guard_bb_2
240310d565efSmrg 
240410d565efSmrg        guard_bb_2:
240510d565efSmrg          if (skip_prolog) goto merge_bb_2
240610d565efSmrg          else             goto prolog_preheader
240710d565efSmrg 
240810d565efSmrg        prolog_preheader:
240910d565efSmrg      PROLOG:
241010d565efSmrg        prolog_header_bb:
241110d565efSmrg 	 prolog_body
241210d565efSmrg 	 if (exit_prolog_cond) goto prolog_exit_bb
241310d565efSmrg 	 else                  goto prolog_header_bb
241410d565efSmrg        prolog_exit_bb:
241510d565efSmrg 
241610d565efSmrg        merge_bb_2:
241710d565efSmrg 
241810d565efSmrg        vector_preheader:
241910d565efSmrg      VECTOR LOOP:
242010d565efSmrg        vector_header_bb:
242110d565efSmrg 	 vector_body
242210d565efSmrg 	 if (exit_vector_cond) goto vector_exit_bb
242310d565efSmrg 	 else                  goto vector_header_bb
242410d565efSmrg        vector_exit_bb:
242510d565efSmrg 
242610d565efSmrg        guard_bb_3:
242710d565efSmrg 	 if (skip_epilog) goto merge_bb_3
242810d565efSmrg 	 else             goto epilog_preheader
242910d565efSmrg 
243010d565efSmrg        merge_bb_1:
243110d565efSmrg 
243210d565efSmrg        epilog_preheader:
243310d565efSmrg      EPILOG:
243410d565efSmrg        epilog_header_bb:
243510d565efSmrg 	 epilog_body
243610d565efSmrg 	 if (exit_epilog_cond) goto merge_bb_3
243710d565efSmrg 	 else                  goto epilog_header_bb
243810d565efSmrg 
243910d565efSmrg        merge_bb_3:
244010d565efSmrg 
244110d565efSmrg    Note this function peels prolog and epilog only if it's necessary,
244210d565efSmrg    as well as guards.
2443*ec02198aSmrg    This function returns the epilogue loop if a decision was made to vectorize
2444*ec02198aSmrg    it, otherwise NULL.
2445*ec02198aSmrg 
2446*ec02198aSmrg    The analysis resulting in this epilogue loop's loop_vec_info was performed
2447*ec02198aSmrg    in the same vect_analyze_loop call as the main loop's.  At that time
2448*ec02198aSmrg    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2449*ec02198aSmrg    vectorization factors than the main loop.  This list is stored in the main
2450*ec02198aSmrg    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we
2451*ec02198aSmrg    decide to vectorize the epilogue loop for a lower vectorization factor, the
2452*ec02198aSmrg    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2453*ec02198aSmrg    updated and linked to the epilogue loop.  This is later used to vectorize
2454*ec02198aSmrg    the epilogue.  The reason the loop_vec_info needs updating is that it was
2455*ec02198aSmrg    constructed based on the original main loop, and the epilogue loop is a
2456*ec02198aSmrg    copy of this loop, so all links pointing to statements in the original loop
2457*ec02198aSmrg    need updating.  Furthermore, these loop_vec_infos share the
2458*ec02198aSmrg    data_reference's records, which will also need to be updated.
245910d565efSmrg 
246010d565efSmrg    TODO: Guard for prefer_scalar_loop should be emitted along with
246110d565efSmrg    versioning conditions if loop versioning is needed.  */
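
/* Illustrative sketch (not part of the sources): with a hypothetical VF
   of 4 and 3 prolog iterations peeled for alignment, the structure above
   corresponds roughly to

     for (i = 0; i < 3; i++)	     ;; PROLOG, scalar
       a[i] += 1;
     for (; i + 4 <= n; i += 4)	     ;; VECTOR LOOP, 4 lanes per iteration
       a[i..i+3] += 1;		     ;; (vector pseudo-code)
     for (; i < n; i++)		     ;; EPILOG, scalar unless vectorized
       a[i] += 1;

   with guard_bb_1 skipping straight to the epilogue when the scalar loop
   is preferred, guard_bb_2 skipping PROLOG when no peeling is needed, and
   guard_bb_3 skipping EPILOG when the vector loop covered all iterations.  */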
246210d565efSmrg 
246310d565efSmrg 
2464*ec02198aSmrg class loop *
246510d565efSmrg vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2466c7a68eb7Smrg 		 tree *niters_vector, tree *step_vector,
2467c7a68eb7Smrg 		 tree *niters_vector_mult_vf_var, int th,
2468*ec02198aSmrg 		 bool check_profitability, bool niters_no_overflow,
2469*ec02198aSmrg 		 tree *advance)
247010d565efSmrg {
247110d565efSmrg   edge e, guard_e;
247210d565efSmrg   tree type = TREE_TYPE (niters), guard_cond;
247310d565efSmrg   basic_block guard_bb, guard_to;
2474c7a68eb7Smrg   profile_probability prob_prolog, prob_vector, prob_epilog;
2475c7a68eb7Smrg   int estimated_vf;
2476c7a68eb7Smrg   int prolog_peeling = 0;
2477*ec02198aSmrg   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
24780fc04c29Smrg   /* We currently do not support prolog peeling if the target alignment is not
24790fc04c29Smrg      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
24800fc04c29Smrg      target alignment being constant.  */
24810fc04c29Smrg   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
24820fc04c29Smrg   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
24830fc04c29Smrg     return NULL;
24840fc04c29Smrg 
2485c7a68eb7Smrg   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2486c7a68eb7Smrg     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2487c7a68eb7Smrg 
2488c7a68eb7Smrg   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2489c7a68eb7Smrg   poly_uint64 bound_epilog = 0;
2490c7a68eb7Smrg   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2491c7a68eb7Smrg       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2492c7a68eb7Smrg     bound_epilog += vf - 1;
2493c7a68eb7Smrg   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2494c7a68eb7Smrg     bound_epilog += 1;
2495c7a68eb7Smrg   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2496c7a68eb7Smrg   poly_uint64 bound_scalar = bound_epilog;
249710d565efSmrg 
249810d565efSmrg   if (!prolog_peeling && !epilog_peeling)
249910d565efSmrg     return NULL;
250010d565efSmrg 
2501*ec02198aSmrg   /* Before doing any peeling, make sure to reset debug binds outside of
2502*ec02198aSmrg      the loop referring to defs not in LC SSA.  */
2503*ec02198aSmrg   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2504*ec02198aSmrg   for (unsigned i = 0; i < loop->num_nodes; ++i)
2505*ec02198aSmrg     {
2506*ec02198aSmrg       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2507*ec02198aSmrg       imm_use_iterator ui;
2508*ec02198aSmrg       gimple *use_stmt;
2509*ec02198aSmrg       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2510*ec02198aSmrg 	   gsi_next (&gsi))
2511*ec02198aSmrg 	{
2512*ec02198aSmrg 	  FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2513*ec02198aSmrg 	    if (gimple_debug_bind_p (use_stmt)
2514*ec02198aSmrg 		&& loop != gimple_bb (use_stmt)->loop_father
2515*ec02198aSmrg 		&& !flow_loop_nested_p (loop,
2516*ec02198aSmrg 					gimple_bb (use_stmt)->loop_father))
2517*ec02198aSmrg 	      {
2518*ec02198aSmrg 		gimple_debug_bind_reset_value (use_stmt);
2519*ec02198aSmrg 		update_stmt (use_stmt);
2520*ec02198aSmrg 	      }
2521*ec02198aSmrg 	}
2522*ec02198aSmrg       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2523*ec02198aSmrg 	   gsi_next (&gsi))
2524*ec02198aSmrg 	{
2525*ec02198aSmrg 	  ssa_op_iter op_iter;
2526*ec02198aSmrg 	  def_operand_p def_p;
2527*ec02198aSmrg 	  FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2528*ec02198aSmrg 	    FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2529*ec02198aSmrg 	      if (gimple_debug_bind_p (use_stmt)
2530*ec02198aSmrg 		  && loop != gimple_bb (use_stmt)->loop_father
2531*ec02198aSmrg 		  && !flow_loop_nested_p (loop,
2532*ec02198aSmrg 					  gimple_bb (use_stmt)->loop_father))
2533*ec02198aSmrg 		{
2534*ec02198aSmrg 		  gimple_debug_bind_reset_value (use_stmt);
2535*ec02198aSmrg 		  update_stmt (use_stmt);
2536*ec02198aSmrg 		}
2537*ec02198aSmrg 	}
2538*ec02198aSmrg     }
2539*ec02198aSmrg 
2540c7a68eb7Smrg   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2541c7a68eb7Smrg   estimated_vf = vect_vf_for_cost (loop_vinfo);
2542c7a68eb7Smrg   if (estimated_vf == 2)
2543c7a68eb7Smrg     estimated_vf = 3;
2544c7a68eb7Smrg   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2545c7a68eb7Smrg 			.apply_scale (estimated_vf - 1, estimated_vf);
254610d565efSmrg 
2547*ec02198aSmrg   class loop *prolog, *epilog = NULL;
2548*ec02198aSmrg   class loop *first_loop = loop;
254910d565efSmrg   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2550*ec02198aSmrg 
2551*ec02198aSmrg   /* We might have a queued need to update virtual SSA form.  As we
2552*ec02198aSmrg      delete the update SSA machinery below after doing a regular
2553*ec02198aSmrg      incremental SSA update during loop copying, make sure we don't
2554*ec02198aSmrg      lose that fact.
2555*ec02198aSmrg      ???  Needing to update virtual SSA form by renaming is unfortunate
2556*ec02198aSmrg      but not all of the vectorizer code inserting new loads / stores
2557*ec02198aSmrg      properly assigns virtual operands to those statements.  */
255810d565efSmrg   update_ssa (TODO_update_ssa_only_virtuals);
255910d565efSmrg 
2560*ec02198aSmrg   create_lcssa_for_virtual_phi (loop);
2561*ec02198aSmrg 
2562*ec02198aSmrg   /* If we're vectorizing an epilogue loop, the update_ssa above will
2563*ec02198aSmrg      have ensured that the virtual operand is in SSA form throughout the
2564*ec02198aSmrg      vectorized main loop.  Normally it is possible to trace the updated
2565*ec02198aSmrg      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2566*ec02198aSmrg      back to scalar-stmt vuses, meaning that the effect of the SSA update
2567*ec02198aSmrg      remains local to the main loop.  However, there are rare cases in
2568*ec02198aSmrg      which the vectorized loop has vdefs even when the original scalar
2569*ec02198aSmrg      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2570*ec02198aSmrg      introduces clobbers of the temporary vector array, which in turn
2571*ec02198aSmrg      needs new vdefs.  If the scalar loop doesn't write to memory, these
2572*ec02198aSmrg      new vdefs will be the only ones in the vector loop.
2573*ec02198aSmrg 
2574*ec02198aSmrg      In that case, update_ssa will have added a new virtual phi to the
2575*ec02198aSmrg      main loop, which previously didn't need one.  Ensure that we (locally)
2576*ec02198aSmrg      maintain LCSSA form for the virtual operand, just as we would have
2577*ec02198aSmrg      done if the virtual phi had existed from the outset.  This makes it
2578*ec02198aSmrg      easier to duplicate the scalar epilogue loop below.  */
2579*ec02198aSmrg   tree vop_to_rename = NULL_TREE;
2580*ec02198aSmrg   if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2581*ec02198aSmrg     {
2582*ec02198aSmrg       class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2583*ec02198aSmrg       vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2584*ec02198aSmrg     }
2585*ec02198aSmrg 
2586c7a68eb7Smrg   if (MAY_HAVE_DEBUG_BIND_STMTS)
258710d565efSmrg     {
258810d565efSmrg       gcc_assert (!adjust_vec.exists ());
258910d565efSmrg       adjust_vec.create (32);
259010d565efSmrg     }
259110d565efSmrg   initialize_original_copy_tables ();
259210d565efSmrg 
2593c7a68eb7Smrg   /* Record the anchor bb at which the guard should be placed if the scalar
2594c7a68eb7Smrg      loop might be preferred.  */
2595c7a68eb7Smrg   basic_block anchor = loop_preheader_edge (loop)->src;
2596c7a68eb7Smrg 
2597c7a68eb7Smrg   /* Generate the number of iterations for the prolog loop.  We do this here
2598c7a68eb7Smrg      so that we can also get the upper bound on the number of iterations.  */
2599c7a68eb7Smrg   tree niters_prolog;
2600c7a68eb7Smrg   int bound_prolog = 0;
2601c7a68eb7Smrg   if (prolog_peeling)
2602c7a68eb7Smrg     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2603c7a68eb7Smrg 						  &bound_prolog);
2604c7a68eb7Smrg   else
2605c7a68eb7Smrg     niters_prolog = build_int_cst (type, 0);
2606c7a68eb7Smrg 
2607*ec02198aSmrg   loop_vec_info epilogue_vinfo = NULL;
2608*ec02198aSmrg   if (vect_epilogues)
2609*ec02198aSmrg     {
2610*ec02198aSmrg       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2611*ec02198aSmrg       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2612*ec02198aSmrg     }
2613*ec02198aSmrg 
2614*ec02198aSmrg   tree niters_vector_mult_vf = NULL_TREE;
2615*ec02198aSmrg   /* Save NITERS before the loop, as it may be changed by the prologue.  */
2616*ec02198aSmrg   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2617*ec02198aSmrg   edge update_e = NULL, skip_e = NULL;
2618*ec02198aSmrg   unsigned int lowest_vf = constant_lower_bound (vf);
2619*ec02198aSmrg   /* If we know the number of scalar iterations for the main loop, we should
2620*ec02198aSmrg      check whether after the main loop there are enough iterations left over
2621*ec02198aSmrg      for the epilogue.  */
2622*ec02198aSmrg   if (vect_epilogues
2623*ec02198aSmrg       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624*ec02198aSmrg       && prolog_peeling >= 0
2625*ec02198aSmrg       && known_eq (vf, lowest_vf))
2626*ec02198aSmrg     {
2627*ec02198aSmrg       unsigned HOST_WIDE_INT eiters
2628*ec02198aSmrg 	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
2629*ec02198aSmrg 	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2630*ec02198aSmrg 
2631*ec02198aSmrg       eiters -= prolog_peeling;
2632*ec02198aSmrg       eiters
2633*ec02198aSmrg 	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2634*ec02198aSmrg 
2635*ec02198aSmrg       unsigned int ratio;
2636*ec02198aSmrg       unsigned int epilogue_gaps
2637*ec02198aSmrg 	= LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2638*ec02198aSmrg       while (!(constant_multiple_p
2639*ec02198aSmrg 	       (GET_MODE_SIZE (loop_vinfo->vector_mode),
2640*ec02198aSmrg 		GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2641*ec02198aSmrg 	       && eiters >= lowest_vf / ratio + epilogue_gaps))
2642*ec02198aSmrg 	{
2643*ec02198aSmrg 	  delete epilogue_vinfo;
2644*ec02198aSmrg 	  epilogue_vinfo = NULL;
2645*ec02198aSmrg 	  if (loop_vinfo->epilogue_vinfos.length () == 0)
2646*ec02198aSmrg 	    {
2647*ec02198aSmrg 	      vect_epilogues = false;
2648*ec02198aSmrg 	      break;
2649*ec02198aSmrg 	    }
2650*ec02198aSmrg 	  epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2651*ec02198aSmrg 	  loop_vinfo->epilogue_vinfos.ordered_remove (0);
2652*ec02198aSmrg 	  epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2653*ec02198aSmrg 	}
2654*ec02198aSmrg     }
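  /* Worked example (illustrative, with assumed numbers): NITERS = 23,
     PEELING_FOR_GAPS = 0, prolog_peeling = 3 and lowest_vf = 8 give
     eiters = (23 - 3) % 8 = 4.  An epilogue whose vector mode is half
     the size of the main one (ratio = 2) needs lowest_vf / ratio = 4
     iterations plus its own PEELING_FOR_GAPS, so here it is kept; with
     fewer leftover iterations the next epilogue_vinfo would be tried,
     or epilogue vectorization abandoned.  */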
265510d565efSmrg   /* Prolog loop may be skipped.  */
265610d565efSmrg   bool skip_prolog = (prolog_peeling != 0);
2657*ec02198aSmrg   /* Skip this loop to epilog when there are not enough iterations to
2658*ec02198aSmrg      enter this vectorized loop.  If true we should perform runtime
2659*ec02198aSmrg      checks on the NITERS to check whether we should skip the current
2660*ec02198aSmrg      vectorized loop.  If we know the number of scalar iterations, we
2661*ec02198aSmrg      add such a runtime check only when that number may be smaller than
2662*ec02198aSmrg      the number of iterations required to enter the vectorized loop;
2663*ec02198aSmrg      for this we use the upper bounds on the prolog and epilog peeling.
2664*ec02198aSmrg      When we don't know the number of iterations and don't require
2665*ec02198aSmrg      versioning, it is because we have asserted that there are enough
2666*ec02198aSmrg      scalar iterations to enter the main loop, so this skip is not
2667*ec02198aSmrg      necessary.  When we are versioning, we only add such a skip
2668*ec02198aSmrg      if we have chosen to vectorize the epilogue.  */
2669c7a68eb7Smrg   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2670c7a68eb7Smrg 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2671c7a68eb7Smrg 				  bound_prolog + bound_epilog)
2672*ec02198aSmrg 		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2673*ec02198aSmrg 			 || vect_epilogues));
267410d565efSmrg   /* Epilog loop must be executed if the number of iterations for epilog
267510d565efSmrg      loop is known at compile time, otherwise we need to add a check at
267610d565efSmrg      the end of vector loop and skip to the end of epilog loop.  */
267710d565efSmrg   bool skip_epilog = (prolog_peeling < 0
2678c7a68eb7Smrg 		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2679c7a68eb7Smrg 		      || !vf.is_constant ());
268010d565efSmrg   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
268110d565efSmrg   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
268210d565efSmrg     skip_epilog = false;
268310d565efSmrg 
268410d565efSmrg   if (skip_vector)
268510d565efSmrg     {
268610d565efSmrg       split_edge (loop_preheader_edge (loop));
268710d565efSmrg 
268810d565efSmrg       /* Due to the order in which we peel prolog and epilog, we first
268910d565efSmrg 	 propagate probability to the whole loop.  The purpose is to
269010d565efSmrg 	 avoid adjusting probabilities of both prolog and vector loops
269110d565efSmrg 	 separately.  Note in this case, the probability of epilog loop
269210d565efSmrg 	 needs to be scaled back later.  */
269310d565efSmrg       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2694c7a68eb7Smrg       if (prob_vector.initialized_p ())
2695c7a68eb7Smrg 	{
2696c7a68eb7Smrg 	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2697c7a68eb7Smrg 	  scale_loop_profile (loop, prob_vector, 0);
2698c7a68eb7Smrg 	}
269910d565efSmrg     }
270010d565efSmrg 
27010fc04c29Smrg   dump_user_location_t loop_loc = find_loop_location (loop);
2702*ec02198aSmrg   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2703*ec02198aSmrg   if (vect_epilogues)
2704*ec02198aSmrg     /* Make sure to set the epilogue's scalar loop, such that we can
2705*ec02198aSmrg        use the original scalar loop as remaining epilogue if necessary.  */
2706*ec02198aSmrg     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2707*ec02198aSmrg       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2708*ec02198aSmrg 
270910d565efSmrg   if (prolog_peeling)
271010d565efSmrg     {
271110d565efSmrg       e = loop_preheader_edge (loop);
271210d565efSmrg       if (!slpeel_can_duplicate_loop_p (loop, e))
271310d565efSmrg 	{
271410d565efSmrg 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
271510d565efSmrg 			   "loop can't be duplicated to preheader edge.\n");
271610d565efSmrg 	  gcc_unreachable ();
271710d565efSmrg 	}
271810d565efSmrg       /* Peel prolog and put it on preheader edge of loop.  */
271910d565efSmrg       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
272010d565efSmrg       if (!prolog)
272110d565efSmrg 	{
272210d565efSmrg 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
272310d565efSmrg 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
272410d565efSmrg 	  gcc_unreachable ();
272510d565efSmrg 	}
27260fc04c29Smrg       prolog->force_vectorize = false;
272710d565efSmrg       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
272810d565efSmrg       first_loop = prolog;
272910d565efSmrg       reset_original_copy_tables ();
273010d565efSmrg 
2731c7a68eb7Smrg       /* Update the number of iterations for prolog loop.  */
2732c7a68eb7Smrg       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2733c7a68eb7Smrg       vect_set_loop_condition (prolog, NULL, niters_prolog,
2734c7a68eb7Smrg 			       step_prolog, NULL_TREE, false);
273510d565efSmrg 
273610d565efSmrg       /* Skip the prolog loop.  */
273710d565efSmrg       if (skip_prolog)
273810d565efSmrg 	{
273910d565efSmrg 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
274010d565efSmrg 				    niters_prolog, build_int_cst (type, 0));
274110d565efSmrg 	  guard_bb = loop_preheader_edge (prolog)->src;
274210d565efSmrg 	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
274310d565efSmrg 	  guard_to = split_edge (loop_preheader_edge (loop));
274410d565efSmrg 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
274510d565efSmrg 					   guard_to, guard_bb,
2746c7a68eb7Smrg 					   prob_prolog.invert (),
274710d565efSmrg 					   irred_flag);
274810d565efSmrg 	  e = EDGE_PRED (guard_to, 0);
274910d565efSmrg 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
275010d565efSmrg 	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
275110d565efSmrg 
2752c7a68eb7Smrg 	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
275310d565efSmrg 	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
275410d565efSmrg 	}
2755*ec02198aSmrg 
275610d565efSmrg       /* Update init address of DRs.  */
2757c7a68eb7Smrg       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
275810d565efSmrg       /* Update niters for vector loop.  */
275910d565efSmrg       LOOP_VINFO_NITERS (loop_vinfo)
276010d565efSmrg 	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
276110d565efSmrg       LOOP_VINFO_NITERSM1 (loop_vinfo)
276210d565efSmrg 	= fold_build2 (MINUS_EXPR, type,
276310d565efSmrg 		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2764c7a68eb7Smrg       bool new_var_p = false;
2765c7a68eb7Smrg       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2766c7a68eb7Smrg       /* It's guaranteed that vector loop bound before vectorization is at
2767c7a68eb7Smrg 	 least VF, so set range information for newly generated var.  */
2768c7a68eb7Smrg       if (new_var_p)
2769c7a68eb7Smrg 	set_range_info (niters, VR_RANGE,
2770c7a68eb7Smrg 			wi::to_wide (build_int_cst (type, vf)),
2771c7a68eb7Smrg 			wi::to_wide (TYPE_MAX_VALUE (type)));
277210d565efSmrg 
277310d565efSmrg       /* Prolog iterates at most bound_prolog times, latch iterates at
277410d565efSmrg 	 most bound_prolog - 1 times.  */
277510d565efSmrg       record_niter_bound (prolog, bound_prolog - 1, false, true);
277610d565efSmrg       delete_update_ssa ();
277710d565efSmrg       adjust_vec_debug_stmts ();
277810d565efSmrg       scev_reset ();
277910d565efSmrg     }
278010d565efSmrg 
278110d565efSmrg   if (epilog_peeling)
278210d565efSmrg     {
278310d565efSmrg       e = single_exit (loop);
278410d565efSmrg       if (!slpeel_can_duplicate_loop_p (loop, e))
278510d565efSmrg 	{
278610d565efSmrg 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
278710d565efSmrg 			   "loop can't be duplicated to exit edge.\n");
278810d565efSmrg 	  gcc_unreachable ();
278910d565efSmrg 	}
2790*ec02198aSmrg       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2791*ec02198aSmrg 	 said epilog then we should use a copy of the main loop as a starting
2792*ec02198aSmrg 	 point.  This loop may have already had some preliminary transformations
2793*ec02198aSmrg 	 to allow for more optimal vectorization, for example if-conversion.
2794*ec02198aSmrg 	 If we are not vectorizing the epilog then we should use the scalar loop,
2795*ec02198aSmrg 	 as the transformations mentioned above make little or no sense when not
2796*ec02198aSmrg 	 vectorizing.  */
2797*ec02198aSmrg       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2798*ec02198aSmrg       if (vop_to_rename)
2799*ec02198aSmrg 	{
2800*ec02198aSmrg 	  /* Vectorizing the main loop can sometimes introduce a vdef to
2801*ec02198aSmrg 	     a loop that previously didn't have one; see the comment above
2802*ec02198aSmrg 	     the definition of VOP_TO_RENAME for details.  The definition
2803*ec02198aSmrg 	     D that holds on E will then be different from the definition
2804*ec02198aSmrg 	     VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2805*ec02198aSmrg 	     rename VOP_TO_RENAME to D when copying the loop.
2806*ec02198aSmrg 
2807*ec02198aSmrg 	     The virtual operand is in LCSSA form for the main loop,
2808*ec02198aSmrg 	     and no stmt between the main loop and E needs a vdef,
2809*ec02198aSmrg 	     so we know that D is provided by a phi rather than by a
2810*ec02198aSmrg 	     vdef on a normal gimple stmt.  */
2811*ec02198aSmrg 	  basic_block vdef_bb = e->src;
2812*ec02198aSmrg 	  gphi *vphi;
2813*ec02198aSmrg 	  while (!(vphi = get_virtual_phi (vdef_bb)))
2814*ec02198aSmrg 	    vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2815*ec02198aSmrg 	  gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2816*ec02198aSmrg 	  set_current_def (vop_to_rename, gimple_phi_result (vphi));
2817*ec02198aSmrg 	}
2818*ec02198aSmrg       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
281910d565efSmrg       if (!epilog)
282010d565efSmrg 	{
282110d565efSmrg 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
282210d565efSmrg 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
282310d565efSmrg 	  gcc_unreachable ();
282410d565efSmrg 	}
28250fc04c29Smrg       epilog->force_vectorize = false;
282610d565efSmrg       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
282710d565efSmrg 
282810d565efSmrg       /* Scalar version loop may be preferred.  In this case, add guard
282910d565efSmrg 	 and skip to epilog.  Note this only happens when the number of
283010d565efSmrg 	 iterations of the loop is unknown at compile time, otherwise this
283110d565efSmrg 	 won't be vectorized.  */
283210d565efSmrg       if (skip_vector)
283310d565efSmrg 	{
283410d565efSmrg 	  /* Additional epilogue iteration is peeled if gap exists.  */
283510d565efSmrg 	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2836c7a68eb7Smrg 						bound_prolog, bound_epilog,
283710d565efSmrg 						th, &bound_scalar,
283810d565efSmrg 						check_profitability);
283910d565efSmrg 	  /* Build guard against NITERSM1 since NITERS may overflow.  */
284010d565efSmrg 	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
284110d565efSmrg 	  guard_bb = anchor;
284210d565efSmrg 	  guard_to = split_edge (loop_preheader_edge (epilog));
284310d565efSmrg 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
284410d565efSmrg 					   guard_to, guard_bb,
2845c7a68eb7Smrg 					   prob_vector.invert (),
284610d565efSmrg 					   irred_flag);
2847*ec02198aSmrg 	  skip_e = guard_e;
284810d565efSmrg 	  e = EDGE_PRED (guard_to, 0);
284910d565efSmrg 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
285010d565efSmrg 	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
285110d565efSmrg 
285210d565efSmrg 	  /* Simply propagate profile info from guard_bb to guard_to which is
285310d565efSmrg 	     a merge point of control flow.  */
285410d565efSmrg 	  guard_to->count = guard_bb->count;
2855c7a68eb7Smrg 
2856c7a68eb7Smrg 	  /* Scale probability of epilog loop back.
2857c7a68eb7Smrg 	     FIXME: We should avoid scaling down and back up.  Profile may
2858c7a68eb7Smrg 	     get lost if we scale down to 0.  */
2859c7a68eb7Smrg 	  basic_block *bbs = get_loop_body (epilog);
2860c7a68eb7Smrg 	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2861c7a68eb7Smrg 	    bbs[i]->count = bbs[i]->count.apply_scale
2862c7a68eb7Smrg 				 (bbs[i]->count,
2863c7a68eb7Smrg 				  bbs[i]->count.apply_probability
2864c7a68eb7Smrg 				    (prob_vector));
2865c7a68eb7Smrg 	  free (bbs);
286610d565efSmrg 	}
286710d565efSmrg 
286810d565efSmrg       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
286910d565efSmrg       /* If the loop is peeled a non-zero constant number of times, niters now
287010d565efSmrg 	 refers to orig_niters - prolog_peeling; it won't overflow even if
287110d565efSmrg 	 orig_niters overflows.  */
287210d565efSmrg       niters_no_overflow |= (prolog_peeling > 0);
287310d565efSmrg       vect_gen_vector_loop_niters (loop_vinfo, niters,
2874c7a68eb7Smrg 				   niters_vector, step_vector,
2875c7a68eb7Smrg 				   niters_no_overflow);
2876c7a68eb7Smrg       if (!integer_onep (*step_vector))
2877c7a68eb7Smrg 	{
2878c7a68eb7Smrg 	  /* On exit from the loop we will have an easy way of calculating
2879c7a68eb7Smrg 	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2880c7a68eb7Smrg 	     until then.  */
2881c7a68eb7Smrg 	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2882c7a68eb7Smrg 	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2883c7a68eb7Smrg 	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
2884c7a68eb7Smrg 	}
2885c7a68eb7Smrg       else
288610d565efSmrg 	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
288710d565efSmrg 					     &niters_vector_mult_vf);
288810d565efSmrg       /* Update IVs of original loop as if they were advanced by
288910d565efSmrg 	 niters_vector_mult_vf steps.  */
289010d565efSmrg       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2891*ec02198aSmrg       update_e = skip_vector ? e : loop_preheader_edge (epilog);
289210d565efSmrg       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
289310d565efSmrg 					update_e);
289410d565efSmrg 
289510d565efSmrg       if (skip_epilog)
289610d565efSmrg 	{
289710d565efSmrg 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
289810d565efSmrg 				    niters, niters_vector_mult_vf);
289910d565efSmrg 	  guard_bb = single_exit (loop)->dest;
290010d565efSmrg 	  guard_to = split_edge (single_exit (epilog));
290110d565efSmrg 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
290210d565efSmrg 					   skip_vector ? anchor : guard_bb,
2903c7a68eb7Smrg 					   prob_epilog.invert (),
290410d565efSmrg 					   irred_flag);
290510d565efSmrg 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
290610d565efSmrg 					      single_exit (epilog));
290710d565efSmrg 	  /* Only need to handle basic block before epilog loop if it's not
290810d565efSmrg 	     the guard_bb, which is the case when skip_vector is true.  */
290910d565efSmrg 	  if (guard_bb != bb_before_epilog)
291010d565efSmrg 	    {
2911c7a68eb7Smrg 	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
291210d565efSmrg 
2913c7a68eb7Smrg 	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
291410d565efSmrg 	    }
2915c7a68eb7Smrg 	  scale_loop_profile (epilog, prob_epilog, 0);
291610d565efSmrg 	}
291710d565efSmrg       else
291810d565efSmrg 	slpeel_update_phi_nodes_for_lcssa (epilog);
291910d565efSmrg 
2920c7a68eb7Smrg       unsigned HOST_WIDE_INT bound;
2921c7a68eb7Smrg       if (bound_scalar.is_constant (&bound))
2922c7a68eb7Smrg 	{
2923c7a68eb7Smrg 	  gcc_assert (bound != 0);
2924c7a68eb7Smrg 	  /* -1 to convert loop iterations to latch iterations.  */
2925c7a68eb7Smrg 	  record_niter_bound (epilog, bound - 1, false, true);
2926c7a68eb7Smrg 	}
292710d565efSmrg 
292810d565efSmrg       delete_update_ssa ();
292910d565efSmrg       adjust_vec_debug_stmts ();
293010d565efSmrg       scev_reset ();
293110d565efSmrg     }
2932*ec02198aSmrg 
2933*ec02198aSmrg   if (vect_epilogues)
2934*ec02198aSmrg     {
2935*ec02198aSmrg       epilog->aux = epilogue_vinfo;
2936*ec02198aSmrg       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2937*ec02198aSmrg 
2938*ec02198aSmrg       loop_constraint_clear (epilog, LOOP_C_INFINITE);
2939*ec02198aSmrg 
2940*ec02198aSmrg       /* We now must calculate the number NITERS of iterations performed by
2941*ec02198aSmrg 	 the previous loop, and EPILOGUE_NITERS to be performed by the epilogue.  */
2942*ec02198aSmrg       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2943*ec02198aSmrg 				 niters_prolog, niters_vector_mult_vf);
2944*ec02198aSmrg 
2945*ec02198aSmrg       /* If skip_vector, the previous loop may be skipped, so we insert a
2946*ec02198aSmrg 	 phi-node to determine whether we are coming from the previous
2947*ec02198aSmrg 	 vectorized loop via the update_e edge or from the skip_vector basic
2948*ec02198aSmrg 	 block via the skip_e edge.  */
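      /* In other words (illustrative GIMPLE, assuming skip_vector):
	     niters = PHI <niters (update_e), 0 (skip_e)>
	 so the epilogue sees zero completed iterations whenever the
	 vector loop was skipped.  */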
2949*ec02198aSmrg       if (skip_vector)
2950*ec02198aSmrg 	{
2951*ec02198aSmrg 	  gcc_assert (update_e != NULL && skip_e != NULL);
2952*ec02198aSmrg 	  gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2953*ec02198aSmrg 					   update_e->dest);
2954*ec02198aSmrg 	  tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2955*ec02198aSmrg 	  gimple *stmt = gimple_build_assign (new_ssa, niters);
2956*ec02198aSmrg 	  gimple_stmt_iterator gsi;
2957*ec02198aSmrg 	  if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2958*ec02198aSmrg 	      && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2959*ec02198aSmrg 	    {
2960*ec02198aSmrg 	      gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2961*ec02198aSmrg 	      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2962*ec02198aSmrg 	    }
2963*ec02198aSmrg 	  else
2964*ec02198aSmrg 	    {
2965*ec02198aSmrg 	      gsi = gsi_last_bb (update_e->src);
2966*ec02198aSmrg 	      gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2967*ec02198aSmrg 	    }
2968*ec02198aSmrg 
2969*ec02198aSmrg 	  niters = new_ssa;
2970*ec02198aSmrg 	  add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2971*ec02198aSmrg 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2972*ec02198aSmrg 		       UNKNOWN_LOCATION);
2973*ec02198aSmrg 	  niters = PHI_RESULT (new_phi);
2974*ec02198aSmrg 	}
2975*ec02198aSmrg 
2976*ec02198aSmrg       /* Subtract the number of iterations performed by the vectorized loop
2977*ec02198aSmrg 	 from the number of total iterations.  */
2978*ec02198aSmrg       tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2979*ec02198aSmrg 					  before_loop_niters,
2980*ec02198aSmrg 					  niters);
2981*ec02198aSmrg 
2982*ec02198aSmrg       LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2983*ec02198aSmrg       LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2984*ec02198aSmrg 	= fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2985*ec02198aSmrg 		       epilogue_niters,
2986*ec02198aSmrg 		       build_one_cst (TREE_TYPE (epilogue_niters)));
2987*ec02198aSmrg 
2988*ec02198aSmrg       /* Set ADVANCE to the number of iterations performed by the previous
2989*ec02198aSmrg 	 loop and its prologue.  */
2990*ec02198aSmrg       *advance = niters;
2991*ec02198aSmrg 
2992*ec02198aSmrg       /* Redo the peeling for niter analysis as the NITERs and alignment
2993*ec02198aSmrg 	 may have been updated to take the main loop into account.  */
2994*ec02198aSmrg       determine_peel_for_niter (epilogue_vinfo);
2995*ec02198aSmrg     }
2996*ec02198aSmrg 
299710d565efSmrg   adjust_vec.release ();
299810d565efSmrg   free_original_copy_tables ();
299910d565efSmrg 
3000*ec02198aSmrg   return vect_epilogues ? epilog : NULL;
300110d565efSmrg }
300210d565efSmrg 
300310d565efSmrg /* Function vect_create_cond_for_niters_checks.
300410d565efSmrg 
300510d565efSmrg    Create a conditional expression that represents the run-time checks for
3006c7a68eb7Smrg    the loop's niter.  The loop is guaranteed to terminate if the run-time
300710d565efSmrg    checks hold.
300810d565efSmrg 
300910d565efSmrg    Input:
301010d565efSmrg    COND_EXPR  - input conditional expression.  New conditions will be chained
301110d565efSmrg 		with logical AND operation.  If it is NULL, the niter condition
301210d565efSmrg 		is used on its own.
301310d565efSmrg    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
301410d565efSmrg 		under which the niter expression is valid.
301510d565efSmrg 
301610d565efSmrg    Output:
301710d565efSmrg    COND_EXPR - conditional expression.
301810d565efSmrg 
301910d565efSmrg    The returned COND_EXPR is the conditional expression to be used in the
302010d565efSmrg    if statement that controls which version of the loop gets executed at
302110d565efSmrg    runtime.  */
302210d565efSmrg 
302310d565efSmrg static void
302410d565efSmrg vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
302510d565efSmrg {
302610d565efSmrg   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
302710d565efSmrg 
302810d565efSmrg   if (*cond_expr)
302910d565efSmrg     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
303010d565efSmrg 			      *cond_expr, part_cond_expr);
303110d565efSmrg   else
303210d565efSmrg     *cond_expr = part_cond_expr;
303310d565efSmrg }
303410d565efSmrg 
3035c7a68eb7Smrg /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3036c7a68eb7Smrg    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3037c7a68eb7Smrg 
3038c7a68eb7Smrg static void
3039c7a68eb7Smrg chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3040c7a68eb7Smrg {
3041c7a68eb7Smrg   if (*cond_expr)
3042c7a68eb7Smrg     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3043c7a68eb7Smrg 			      *cond_expr, part_cond_expr);
3044c7a68eb7Smrg   else
3045c7a68eb7Smrg     *cond_expr = part_cond_expr;
3046c7a68eb7Smrg }
3047c7a68eb7Smrg 
304810d565efSmrg /* Function vect_create_cond_for_align_checks.
304910d565efSmrg 
305010d565efSmrg    Create a conditional expression that represents the alignment checks for
305110d565efSmrg    all of data references (array element references) whose alignment must be
305210d565efSmrg    checked at runtime.
305310d565efSmrg 
305410d565efSmrg    Input:
305510d565efSmrg    COND_EXPR  - input conditional expression.  New conditions will be chained
305610d565efSmrg                 with logical AND operation.
305710d565efSmrg    LOOP_VINFO - two fields of the loop information are used.
305810d565efSmrg                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
305910d565efSmrg                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
306010d565efSmrg 
306110d565efSmrg    Output:
306210d565efSmrg    COND_EXPR_STMT_LIST - statements needed to construct the conditional
306310d565efSmrg                          expression.
306410d565efSmrg    The returned value is the conditional expression to be used in the if
306510d565efSmrg    statement that controls which version of the loop gets executed at runtime.
306610d565efSmrg 
306710d565efSmrg    The algorithm makes two assumptions:
306810d565efSmrg      1) The number of bytes "n" in a vector is a power of 2.
306910d565efSmrg      2) An address "a" is aligned if a%n is zero, and this
307010d565efSmrg         test can be done as a&(n-1) == 0.  For example, for 16
307110d565efSmrg         byte vectors the test is a&0xf == 0.  */
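
/* Worked example (illustrative): for 16-byte vectors, mask = 0xf.  With
   addresses 0x1000 and 0x2010, (0x1000 | 0x2010) & 0xf == 0, so both are
   aligned and the vectorized loop may run.  If the first address were
   0x1008 instead, (0x1008 | 0x2010) & 0xf == 0x8, so the scalar version
   runs.  ORing the addresses first is what lets a single AND test them
   all at once.  */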
307210d565efSmrg 
307310d565efSmrg static void
307410d565efSmrg vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
307510d565efSmrg                                    tree *cond_expr,
307610d565efSmrg 				   gimple_seq *cond_expr_stmt_list)
307710d565efSmrg {
30780fc04c29Smrg   vec<stmt_vec_info> may_misalign_stmts
307910d565efSmrg     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
30800fc04c29Smrg   stmt_vec_info stmt_info;
308110d565efSmrg   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
308210d565efSmrg   tree mask_cst;
308310d565efSmrg   unsigned int i;
308410d565efSmrg   tree int_ptrsize_type;
308510d565efSmrg   char tmp_name[20];
308610d565efSmrg   tree or_tmp_name = NULL_TREE;
308710d565efSmrg   tree and_tmp_name;
308810d565efSmrg   gimple *and_stmt;
308910d565efSmrg   tree ptrsize_zero;
309010d565efSmrg   tree part_cond_expr;
309110d565efSmrg 
309210d565efSmrg   /* Check that mask is one less than a power of 2, i.e., mask is
309310d565efSmrg      all zeros followed by all ones.  */
309410d565efSmrg   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
309510d565efSmrg 
309610d565efSmrg   int_ptrsize_type = signed_type_for (ptr_type_node);
309710d565efSmrg 
309810d565efSmrg   /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
309910d565efSmrg      of the first vector of the i'th data reference.  */
310010d565efSmrg 
31010fc04c29Smrg   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
310210d565efSmrg     {
310310d565efSmrg       gimple_seq new_stmt_list = NULL;
310410d565efSmrg       tree addr_base;
310510d565efSmrg       tree addr_tmp_name;
310610d565efSmrg       tree new_or_tmp_name;
310710d565efSmrg       gimple *addr_stmt, *or_stmt;
31080fc04c29Smrg       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
310910d565efSmrg       bool negative = tree_int_cst_compare
31100fc04c29Smrg 	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
311110d565efSmrg       tree offset = negative
311210d565efSmrg 	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
311310d565efSmrg 
311410d565efSmrg       /* create: addr_tmp = (int)(address_of_first_vector) */
311510d565efSmrg       addr_base =
31160fc04c29Smrg 	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3117c7a68eb7Smrg 					      offset);
311810d565efSmrg       if (new_stmt_list != NULL)
311910d565efSmrg 	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
312010d565efSmrg 
312110d565efSmrg       sprintf (tmp_name, "addr2int%d", i);
312210d565efSmrg       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
312310d565efSmrg       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
312410d565efSmrg       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
312510d565efSmrg 
312610d565efSmrg       /* The addresses are ORed together.  */
312710d565efSmrg 
312810d565efSmrg       if (or_tmp_name != NULL_TREE)
312910d565efSmrg         {
313010d565efSmrg           /* create: or_tmp = or_tmp | addr_tmp */
313110d565efSmrg           sprintf (tmp_name, "orptrs%d", i);
313210d565efSmrg 	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
313310d565efSmrg 	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
313410d565efSmrg 					 or_tmp_name, addr_tmp_name);
313510d565efSmrg 	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
313610d565efSmrg           or_tmp_name = new_or_tmp_name;
313710d565efSmrg         }
313810d565efSmrg       else
313910d565efSmrg         or_tmp_name = addr_tmp_name;
314010d565efSmrg 
314110d565efSmrg     } /* end for i */
314210d565efSmrg 
314310d565efSmrg   mask_cst = build_int_cst (int_ptrsize_type, mask);
314410d565efSmrg 
314510d565efSmrg   /* create: and_tmp = or_tmp & mask  */
314610d565efSmrg   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
314710d565efSmrg 
314810d565efSmrg   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
314910d565efSmrg 				  or_tmp_name, mask_cst);
315010d565efSmrg   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
315110d565efSmrg 
315210d565efSmrg   /* Make and_tmp the left operand of the conditional test against zero.
315310d565efSmrg      If and_tmp has a nonzero bit then some address is unaligned.  */
315410d565efSmrg   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
315510d565efSmrg   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
315610d565efSmrg 				and_tmp_name, ptrsize_zero);
3157c7a68eb7Smrg   chain_cond_expr (cond_expr, part_cond_expr);
315810d565efSmrg }
315910d565efSmrg 
3160c7a68eb7Smrg /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3161c7a68eb7Smrg    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3162c7a68eb7Smrg    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3163c7a68eb7Smrg    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
316410d565efSmrg 
316510d565efSmrg static void
3166c7a68eb7Smrg vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
316710d565efSmrg {
3168c7a68eb7Smrg   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3169c7a68eb7Smrg   unsigned int i;
3170c7a68eb7Smrg   vec_object_pair *pair;
3171c7a68eb7Smrg   FOR_EACH_VEC_ELT (pairs, i, pair)
317210d565efSmrg     {
3173c7a68eb7Smrg       tree addr1 = build_fold_addr_expr (pair->first);
3174c7a68eb7Smrg       tree addr2 = build_fold_addr_expr (pair->second);
3175c7a68eb7Smrg       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3176c7a68eb7Smrg 					 addr1, addr2);
3177c7a68eb7Smrg       chain_cond_expr (cond_expr, part_cond_expr);
3178c7a68eb7Smrg     }
317910d565efSmrg }
318010d565efSmrg 
3181c7a68eb7Smrg /* Create an expression that is true when all lower-bound conditions for
3182c7a68eb7Smrg    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3183c7a68eb7Smrg 
3184c7a68eb7Smrg static void
3185c7a68eb7Smrg vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
318610d565efSmrg {
3187c7a68eb7Smrg   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3188c7a68eb7Smrg   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3189c7a68eb7Smrg     {
3190c7a68eb7Smrg       tree expr = lower_bounds[i].expr;
3191c7a68eb7Smrg       tree type = unsigned_type_for (TREE_TYPE (expr));
3192c7a68eb7Smrg       expr = fold_convert (type, expr);
3193c7a68eb7Smrg       poly_uint64 bound = lower_bounds[i].min_value;
3194c7a68eb7Smrg       if (!lower_bounds[i].unsigned_p)
3195c7a68eb7Smrg 	{
3196c7a68eb7Smrg 	  expr = fold_build2 (PLUS_EXPR, type, expr,
3197c7a68eb7Smrg 			      build_int_cstu (type, bound - 1));
3198c7a68eb7Smrg 	  bound += bound - 1;
319910d565efSmrg 	}
3200c7a68eb7Smrg       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3201c7a68eb7Smrg 					 build_int_cstu (type, bound));
3202c7a68eb7Smrg       chain_cond_expr (cond_expr, part_cond_expr);
3203c7a68eb7Smrg     }
320410d565efSmrg }
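
/* Note on the signed case above (illustrative): the transformed test
       (unsigned) EXPR + (BOUND - 1) >= 2 * BOUND - 1
   is a single unsigned comparison that holds exactly when EXPR >= BOUND
   or EXPR <= -BOUND, i.e. when the magnitude of EXPR is at least BOUND.
   E.g. for BOUND = 4: EXPR = 4 gives 7 >= 7 (true), EXPR = 3 gives
   6 >= 7 (false), and EXPR = -4 wraps to UINT_MAX (true).  */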
320510d565efSmrg 
320610d565efSmrg /* Function vect_create_cond_for_alias_checks.
320710d565efSmrg 
320810d565efSmrg    Create a conditional expression that represents the run-time checks for
320910d565efSmrg    overlapping of address ranges represented by a list of data references
321010d565efSmrg    relations passed as input.
321110d565efSmrg 
321210d565efSmrg    Input:
321310d565efSmrg    COND_EXPR  - input conditional expression.  New conditions will be chained
321410d565efSmrg                 with logical AND operation.  If it is NULL, then the function
321510d565efSmrg                 is used to return the number of alias checks.
321610d565efSmrg    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
321710d565efSmrg 	        to be checked.
321810d565efSmrg 
321910d565efSmrg    Output:
322010d565efSmrg    COND_EXPR - conditional expression.
322110d565efSmrg 
322210d565efSmrg    The returned COND_EXPR is the conditional expression to be used in the if
322310d565efSmrg    statement that controls which version of the loop gets executed at runtime.
322410d565efSmrg */
322510d565efSmrg 
322610d565efSmrg void
322710d565efSmrg vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
322810d565efSmrg {
322910d565efSmrg   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
323010d565efSmrg     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
323110d565efSmrg 
323210d565efSmrg   if (comp_alias_ddrs.is_empty ())
323310d565efSmrg     return;
323410d565efSmrg 
3235c7a68eb7Smrg   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3236c7a68eb7Smrg 			       &comp_alias_ddrs, cond_expr);
323710d565efSmrg   if (dump_enabled_p ())
323810d565efSmrg     dump_printf_loc (MSG_NOTE, vect_location,
323910d565efSmrg 		     "created %u versioning for alias checks.\n",
324010d565efSmrg 		     comp_alias_ddrs.length ());
324110d565efSmrg }
324210d565efSmrg 
324310d565efSmrg 
324410d565efSmrg /* Function vect_loop_versioning.
324510d565efSmrg 
324610d565efSmrg    If the loop has data references that may or may not be aligned and/or
324710d565efSmrg    has data reference relations whose independence was not proven, then
324810d565efSmrg    two versions of the loop need to be generated, one which is vectorized
324910d565efSmrg    and one which isn't.  A test is then generated to control which of the
325010d565efSmrg    loops is executed.  The test checks for the alignment of all of the
325110d565efSmrg    data references that may or may not be aligned.  An additional
325210d565efSmrg    sequence of runtime tests is generated for each pair of DDRs whose
325310d565efSmrg    independence was not proven.  The vectorized version of the loop is
325410d565efSmrg    executed only if both alias and alignment tests are passed.
325510d565efSmrg 
325610d565efSmrg    The test generated to check which version of loop is executed
325710d565efSmrg    is modified to also check for profitability as indicated by the
325810d565efSmrg    cost model threshold TH.
325910d565efSmrg 
326010d565efSmrg    The versioning precondition(s) are placed in *COND_EXPR and
326110d565efSmrg    *COND_EXPR_STMT_LIST.  */
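
/* A sketch of the CFG this function produces, assuming a single
   versioning condition COND was required:

     if (COND)
       loop to be vectorized
     else
       scalar (unvectorized) copy of the loop

   COND conjoins the profitability, niters, alignment, alias and simd
   `if' checks created below.  */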

class loop *
vect_loop_versioning (loop_vec_info loop_vinfo,
		      gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
  poly_uint64 versioning_threshold
    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
  tree version_simd_if_cond
    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

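  /* SCALAR_LOOP_ITERS is the iteration count minus one (NITERSM1), so
     "niters >= TH" is tested as SCALAR_LOOP_ITERS >= TH - 1.  The
     cost-model check is emitted separately only when it cannot be
     folded into the versioning threshold below (i.e. when the two
     values are not ordered).  */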
  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
					&cond_expr_stmt_list,
					is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

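  /* VERSION_SIMD_IF_COND, if set, is the condition of an `if' clause
     on an OpenMP simd construct (e.g. #pragma omp simd if (c)); the
     vectorized version must only run when that condition is nonzero
     at runtime.  */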
  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      if (flag_checking)
	if (basic_block bb
	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
	  gcc_assert (bb != loop->header
		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
		      && (scalar_loop == NULL
			  || (bb != scalar_loop->header
			      && dominated_by_p (CDI_DOMINATORS,
						 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
			    version_simd_if_cond, zero);
      if (cond_expr)
	cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
				 c, cond_expr);
      else
	cond_expr = c;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "created versioning for simd if condition check.\n");
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop that cond_expr and cond_expr_stmt_list
     are invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
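  /* Statements in COND_EXPR_STMT_LIST may use SSA names defined inside
     the current OUTERMOST candidate; each such use restricts hoisting
     to the subloop of the defining loop that contains LOOP, so narrow
     OUTERMOST accordingly.  */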
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
	    && flow_bb_inside_loop_p (outermost, def_bb))
	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
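  /* Roughly: each enclosing loop considered must have a simple single
     exit and contain no sibling loops next to the one being versioned,
     except for the scalar copy that if-conversion itself created.  */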
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "trying to apply versioning to outer loop %d\n",
			 outermost->num);
      if (outermost->num == 0)
	outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
	     && (e = single_exit (loop_outer (loop_to_version)))
	     && !(e->flags & EDGE_COMPLEX)
	     && (!loop_outer (loop_to_version)->inner->next
		 || vect_loop_vectorized_call (loop_to_version))
	     && (!loop_outer (loop_to_version)->inner->next
		 || !loop_outer (loop_to_version)->inner->next->next))
	loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion, re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (loop_vectorized_call);
      cond = as_a <gcond *> (last_stmt (condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* if-conversion uses profile_probability::always () for both paths,
	 so reset the probabilities of both paths appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale the loop's counts immediately but have to postpone
	 scaling the scalar loop because we re-use it during peeling.  */
      scale_loop_frequencies (loop_to_version, te->probability);
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;

      nloop = scalar_loop;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reusing %sloop version created by if conversion\n",
			 loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
	  && dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "applying loop versioning to outer loop %d\n",
			 loop_to_version->num);

      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
			    prob, prob.invert (), prob, prob.invert (), true);
      gcc_assert (nloop);
      nloop = get_loop_copy (loop);

      /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
	 reap them otherwise, and they also refer to the original
	 loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
	{
	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
	  fold_loop_internal_call (call, boolean_false_node);
	  l = loop_outer (l);
	}
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_last_bb (condition_bb);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* Loop versioning violates an assumption we try to maintain during
	 vectorization - that the loop exit block has a single predecessor.
	 After versioning, the exit block of both loop versions is the same
	 basic block (i.e. it has two predecessors).  Just in order to simplify
	 following transformations in the vectorizer, we fix this situation
	 here by adding a new (empty) block on the exit-edge of the loop,
	 with the proper loop-exit phis to maintain loop-closed-form.
	 If loop versioning wasn't done from loop, but scalar_loop instead,
	 merge_bb will already have just a single predecessor.  */

      merge_bb = single_exit (loop_to_version)->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
	{
	  new_exit_bb = split_edge (single_exit (loop_to_version));
	  new_exit_e = single_exit (loop_to_version);
	  e = EDGE_SUCC (new_exit_bb, 0);

	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      tree new_res;
	      orig_phi = gsi.phi ();
	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	      new_phi = create_phi_node (new_res, new_exit_bb);
	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	      add_phi_arg (new_phi, arg, new_exit_e,
			   gimple_phi_arg_location_from_edge (orig_phi, e));
	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	    }
	}

      update_ssa (TODO_update_ssa);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite; we need to clear existing
	 niter information, which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for the niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization to enhance "
			 "alignment\n");
    }

  return nloop;
}