1 /* Vectorizer Specific Loop Manipulations
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 
51 /*************************************************************************
52   Simple Loop Peeling Utilities
53 
54   Utilities to support loop peeling for vectorization purposes.
55  *************************************************************************/
56 
57 
58 /* Renames the use *OP_P.  */
59 
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63   tree new_name;
64 
65   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66     return;
67 
68   new_name = get_current_def (USE_FROM_PTR (op_p));
69 
70   /* Something defined outside of the loop.  */
71   if (!new_name)
72     return;
73 
74   /* An ordinary ssa name defined in the loop.  */
75 
76   SET_USE (op_p, new_name);
77 }
78 
79 
80 /* Renames the variables in basic block BB.  Allow renaming of PHI arguments
81    on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
82    true.  */
83 
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87   gimple *stmt;
88   use_operand_p use_p;
89   ssa_op_iter iter;
90   edge e;
91   edge_iterator ei;
92   struct loop *loop = bb->loop_father;
93   struct loop *outer_loop = NULL;
94 
95   if (rename_from_outer_loop)
96     {
97       gcc_assert (loop);
98       outer_loop = loop_outer (loop);
99     }
100 
101   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102        gsi_next (&gsi))
103     {
104       stmt = gsi_stmt (gsi);
105       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 	rename_use_op (use_p);
107     }
108 
109   FOR_EACH_EDGE (e, ei, bb->preds)
110     {
111       if (!flow_bb_inside_loop_p (loop, e->src))
112 	{
113 	  if (!rename_from_outer_loop)
114 	    continue;
115 	  if (e->src != outer_loop->header)
116 	    {
117 	      if (outer_loop->inner->next)
118 		{
119 		  /* If outer_loop has 2 inner loops, allow there to
120 		     be an extra basic block which uses LOOP_VECTORIZED
121 		     to decide which of the two loop versions to run.  */
122 		  if (!single_pred_p (e->src)
123 		      || single_pred (e->src) != outer_loop->header)
124 		    continue;
125 		}
126 	    }
127 	}
128       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 	   gsi_next (&gsi))
130         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131     }
132 }
133 
134 
135 struct adjust_info
136 {
137   tree from, to;
138   basic_block bb;
139 };
140 
141 /* A stack of values to be adjusted in debug stmts.  We have to
142    process them LIFO, so that the closest substitution applies.  If we
143    processed them FIFO, without the stack, we might substitute uses
144    with a PHI DEF that would soon become non-dominant, and when we got
145    to the suitable one, it wouldn't have anything to substitute any
146    more.  */
147 static vec<adjust_info, va_heap> adjust_vec;
148 
149 /* Adjust any debug stmts that referenced AI->from values to use the
150    loop-closed AI->to, if the references are dominated by AI->bb and
151    not by the definition of AI->from.  */
152 
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156   basic_block bbphi = ai->bb;
157   tree orig_def = ai->from;
158   tree new_def = ai->to;
159   imm_use_iterator imm_iter;
160   gimple *stmt;
161   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162 
163   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164 
165   /* Adjust any debug stmts that held onto non-loop-closed
166      references.  */
167   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168     {
169       use_operand_p use_p;
170       basic_block bbuse;
171 
172       if (!is_gimple_debug (stmt))
173 	continue;
174 
175       gcc_assert (gimple_debug_bind_p (stmt));
176 
177       bbuse = gimple_bb (stmt);
178 
179       if ((bbuse == bbphi
180 	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 	  && !(bbuse == bbdef
182 	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 	{
184 	  if (new_def)
185 	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 	      SET_USE (use_p, new_def);
187 	  else
188 	    {
189 	      gimple_debug_bind_reset_value (stmt);
190 	      update_stmt (stmt);
191 	    }
192 	}
193     }
194 }
195 
196 /* Adjust debug stmts as scheduled before.  */
197 
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201   if (!MAY_HAVE_DEBUG_BIND_STMTS)
202     return;
203 
204   gcc_assert (adjust_vec.exists ());
205 
206   while (!adjust_vec.is_empty ())
207     {
208       adjust_debug_stmts_now (&adjust_vec.last ());
209       adjust_vec.pop ();
210     }
211 }
212 
213 /* Adjust any debug stmts that referenced FROM values to use the
214    loop-closed TO, if the references are dominated by BB and not by
215    the definition of FROM.  If adjust_vec has been created, the
216    adjustments will be postponed until adjust_vec_debug_stmts is called.  */
217 
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221   adjust_info ai;
222 
223   if (MAY_HAVE_DEBUG_BIND_STMTS
224       && TREE_CODE (from) == SSA_NAME
225       && ! SSA_NAME_IS_DEFAULT_DEF (from)
226       && ! virtual_operand_p (from))
227     {
228       ai.from = from;
229       ai.to = to;
230       ai.bb = bb;
231 
232       if (adjust_vec.exists ())
233 	adjust_vec.safe_push (ai);
234       else
235 	adjust_debug_stmts_now (&ai);
236     }
237 }
238 
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240    to adjust any debug stmts that referenced the old phi arg,
241    presumably non-loop-closed references left over from other
242    transformations.  */
243 
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248 
249   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250 
251   if (MAY_HAVE_DEBUG_BIND_STMTS)
252     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 			gimple_bb (update_phi));
254 }
255 
256 /* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
257    the mask should have during the first iteration and NEXT_MASK is the
258    value that it should have on subsequent iterations.  */
259 
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 		    tree next_mask)
263 {
264   gphi *phi = create_phi_node (mask, loop->header);
265   add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266   add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
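
/* As an illustrative sketch (SSA names invented for exposition):
   vect_set_loop_mask for a mask loop_mask_10 with INIT_MASK init_mask_5
   and NEXT_MASK next_mask_33 gives LOOP's header the PHI

     loop_mask_10 = PHI <init_mask_5 (preheader), next_mask_33 (latch)>

   so the mask is INIT_MASK on the first iteration and NEXT_MASK on all
   subsequent ones.  */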
268 
269 /* Add SEQ to the end of LOOP's preheader block.  */
270 
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
273 {
274   if (seq)
275     {
276       edge pe = loop_preheader_edge (loop);
277       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278       gcc_assert (!new_bb);
279     }
280 }
281 
282 /* Add SEQ to the beginning of LOOP's header block.  */
283 
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
286 {
287   if (seq)
288     {
289       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291     }
292 }
293 
294 /* Return true if the target can interleave elements of two vectors.
295    OFFSET is 0 if the first half of the vectors should be interleaved
296    or 1 if the second half should.  When returning true, store the
297    associated permutation in INDICES.  */
298 
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 			unsigned int offset)
302 {
303   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304   poly_uint64 base = exact_div (nelts, 2) * offset;
305   vec_perm_builder sel (nelts, 2, 3);
306   for (unsigned int i = 0; i < 3; ++i)
307     {
308       sel.quick_push (base + i);
309       sel.quick_push (base + i + nelts);
310     }
311   indices->new_vector (sel, 2, nelts);
312   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
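
/* For example (values purely illustrative): with 8-element vectors,
   OFFSET == 1 gives BASE == 4, so the encoded selector expands to
   { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. the interleave of the second
   halves of the two input vectors.  */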
314 
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316    in SRC_RGM, given that the former has twice as many masks as the
317    latter.  Return true on success, adding any new statements to SEQ.  */
318 
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 			       rgroup_masks *src_rgm)
322 {
323   tree src_masktype = src_rgm->mask_type;
324   tree dest_masktype = dest_rgm->mask_type;
325   machine_mode src_mode = TYPE_MODE (src_masktype);
326   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327       && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328       && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329     {
330       /* Unpacking the source masks gives at least as many mask bits as
331 	 we need.  We can then VIEW_CONVERT any excess bits away.  */
332       tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 	{
335 	  tree src = src_rgm->masks[i / 2];
336 	  tree dest = dest_rgm->masks[i];
337 	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 			    ? VEC_UNPACK_HI_EXPR
339 			    : VEC_UNPACK_LO_EXPR);
340 	  gassign *stmt;
341 	  if (dest_masktype == unpack_masktype)
342 	    stmt = gimple_build_assign (dest, code, src);
343 	  else
344 	    {
345 	      tree temp = make_ssa_name (unpack_masktype);
346 	      stmt = gimple_build_assign (temp, code, src);
347 	      gimple_seq_add_stmt (seq, stmt);
348 	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 					  build1 (VIEW_CONVERT_EXPR,
350 						  dest_masktype, temp));
351 	    }
352 	  gimple_seq_add_stmt (seq, stmt);
353 	}
354       return true;
355     }
356   vec_perm_indices indices[2];
357   if (dest_masktype == src_masktype
358       && interleave_supported_p (&indices[0], src_masktype, 0)
359       && interleave_supported_p (&indices[1], src_masktype, 1))
360     {
361       /* The destination requires twice as many mask bits as the source, so
362 	 we can use interleaving permutes to double up the number of bits.  */
363       tree masks[2];
364       for (unsigned int i = 0; i < 2; ++i)
365 	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367 	{
368 	  tree src = src_rgm->masks[i / 2];
369 	  tree dest = dest_rgm->masks[i];
370 	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 					      src, src, masks[i & 1]);
372 	  gimple_seq_add_stmt (seq, stmt);
373 	}
374       return true;
375     }
376   return false;
377 }
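
/* A rough sketch of the second (interleaving) strategy above, with made-up
   names: if SRC_RGM contains a single 8-element mask src_1 and DEST_RGM
   needs two masks of the same type, the statements added to SEQ are

     dest_1 = VEC_PERM_EXPR <src_1, src_1, { 0, 8, 1, 9, ... }>;
     dest_2 = VEC_PERM_EXPR <src_1, src_1, { 4, 12, 5, 13, ... }>;

   so each bit of src_1 is duplicated, and dest_1/dest_2 together provide
   twice as many mask bits per scalar iteration.  */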
378 
379 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
380    all the masks in RGM and return a mask that is nonzero when the loop
381    needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
382    Use LOOP_COND_GSI to insert code before the exit gcond.
383 
384    RGM belongs to loop LOOP.  The loop originally iterated NITERS
385    times and has been vectorized according to LOOP_VINFO.  Each iteration
386    of the vectorized loop handles VF iterations of the scalar loop.
387 
388    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389    starts with NITERS_SKIP dummy iterations of the scalar loop before
390    the real work starts.  The mask elements for these dummy iterations
391    must be 0, to ensure that the extra iterations do not have an effect.
392 
393    It is known that:
394 
395      NITERS * RGM->max_nscalars_per_iter
396 
397    does not overflow.  However, MIGHT_WRAP_P says whether an induction
398    variable that starts at 0 and has step:
399 
400      VF * RGM->max_nscalars_per_iter
401 
402    might overflow before hitting a value above:
403 
404      (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405 
406    This means that we cannot guarantee that such an induction variable
407    would ever hit a value that produces a set of all-false masks for RGM.  */
408 
409 static tree
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 			      gimple_seq *preheader_seq,
412 			      gimple_stmt_iterator loop_cond_gsi,
413 			      rgroup_masks *rgm, tree vf,
414 			      tree niters, tree niters_skip,
415 			      bool might_wrap_p)
416 {
417   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418   tree mask_type = rgm->mask_type;
419   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421 
422   /* Calculate the maximum number of scalar values that the rgroup
423      handles in total, the number that it handles for each iteration
424      of the vector loop, and the number that it should skip during the
425      first iteration of the vector loop.  */
426   tree nscalars_total = niters;
427   tree nscalars_step = vf;
428   tree nscalars_skip = niters_skip;
429   if (nscalars_per_iter != 1)
430     {
431       /* We checked before choosing to use a fully-masked loop that these
432 	 multiplications don't overflow.  */
433       tree factor = build_int_cst (compare_type, nscalars_per_iter);
434       nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
435 				     nscalars_total, factor);
436       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 				    nscalars_step, factor);
438       if (nscalars_skip)
439 	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
440 				      nscalars_skip, factor);
441     }
442 
443   /* Create an induction variable that counts the number of scalars
444      processed.  */
445   tree index_before_incr, index_after_incr;
446   gimple_stmt_iterator incr_gsi;
447   bool insert_after;
448   tree zero_index = build_int_cst (compare_type, 0);
449   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450   create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
451 	     insert_after, &index_before_incr, &index_after_incr);
452 
453   tree test_index, test_limit, first_limit;
454   gimple_stmt_iterator *test_gsi;
455   if (might_wrap_p)
456     {
457       /* In principle the loop should stop iterating once the incremented
458 	 IV reaches a value greater than or equal to:
459 
460 	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
461 
462 	 However, there's no guarantee that this addition doesn't overflow
463 	 the comparison type, or that the IV hits a value above it before
464 	 wrapping around.  We therefore adjust the limit down by one
465 	 IV step:
466 
467 	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
468 	   -[infinite-prec] NSCALARS_STEP
469 
470 	 and compare the IV against this limit _before_ incrementing it.
471 	 Since the comparison type is unsigned, we actually want the
472 	 subtraction to saturate at zero:
473 
474 	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
475 	   -[sat] NSCALARS_STEP
476 
477 	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
478 
479 	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
480 
481 	 where the rightmost subtraction can be done directly in
482 	 COMPARE_TYPE.  */
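
      /* A small worked example (numbers chosen purely for illustration):
	 with NSCALARS_TOTAL == 20, NSCALARS_SKIP == 3 and NSCALARS_STEP == 16,
	 the limit computed below is MAX (20, 16 - 3) - (16 - 3) == 7.  The
	 pre-increment IV values are 0 and 16, so the WHILE_ULT against 7 is
	 nonzero for the first vector iteration and all-false for the second,
	 which is exactly what the 3 + 20 masked scalar positions require.  */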
483       test_index = index_before_incr;
484       tree adjust = nscalars_step;
485       if (nscalars_skip)
486 	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
487 			       adjust, nscalars_skip);
488       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
489 				 nscalars_total, adjust);
490       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
491 				 test_limit, adjust);
492       test_gsi = &incr_gsi;
493 
494       /* Get a safe limit for the first iteration.  */
495       if (nscalars_skip)
496 	{
497 	  /* The first vector iteration can handle at most NSCALARS_STEP
498 	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
499 	     NSCALARS_SKIP to that cannot overflow.  */
500 	  tree const_limit = build_int_cst (compare_type,
501 					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
502 					    * nscalars_per_iter);
503 	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
504 				      nscalars_total, const_limit);
505 	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
506 				      first_limit, nscalars_skip);
507 	}
508       else
509 	/* For the first iteration it doesn't matter whether the IV hits
510 	   a value above NSCALARS_TOTAL.  That only matters for the latch
511 	   condition.  */
512 	first_limit = nscalars_total;
513     }
514   else
515     {
516       /* Test the incremented IV, which will always hit a value above
517 	 the bound before wrapping.  */
518       test_index = index_after_incr;
519       test_limit = nscalars_total;
520       if (nscalars_skip)
521 	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
522 				   test_limit, nscalars_skip);
523       test_gsi = &loop_cond_gsi;
524 
525       first_limit = test_limit;
526     }
527 
528   /* Provide a definition of each mask in the group.  */
529   tree next_mask = NULL_TREE;
530   tree mask;
531   unsigned int i;
532   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
533     {
534       /* Previous masks will cover BIAS scalars.  This mask covers the
535 	 next batch.  */
536       poly_uint64 bias = nscalars_per_mask * i;
537       tree bias_tree = build_int_cst (compare_type, bias);
538       gimple *tmp_stmt;
539 
540       /* See whether the first iteration of the vector loop is known
541 	 to have a full mask.  */
542       poly_uint64 const_limit;
543       bool first_iteration_full
544 	= (poly_int_tree_p (first_limit, &const_limit)
545 	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));
546 
547       /* Rather than have a new IV that starts at BIAS and goes up to
548 	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
549 	 and adjust the bound down by BIAS.  */
550       tree this_test_limit = test_limit;
551       if (i != 0)
552 	{
553 	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
554 					  compare_type, this_test_limit,
555 					  bias_tree);
556 	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
557 					  compare_type, this_test_limit,
558 					  bias_tree);
559 	}
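
      /* For instance (illustrative only): with two 8-element masks and
	 TEST_LIMIT == 12, the second mask (I == 1, BIAS == 8) uses the limit
	 MAX (12, 8) - 8 == 4, so it enables lanes for scalar indices 8..11
	 while sharing the same zero-based IV as the first mask.  */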
560 
561       /* Create the initial mask.  First include all scalars that
562 	 are within the loop limit.  */
563       tree init_mask = NULL_TREE;
564       if (!first_iteration_full)
565 	{
566 	  tree start, end;
567 	  if (first_limit == test_limit)
568 	    {
569 	      /* Use a natural test between zero (the initial IV value)
570 		 and the loop limit.  The "else" block would be valid too,
571 		 but this choice can avoid the need to load BIAS_TREE into
572 		 a register.  */
573 	      start = zero_index;
574 	      end = this_test_limit;
575 	    }
576 	  else
577 	    {
578 	      /* FIRST_LIMIT is the maximum number of scalars handled by the
579 		 first iteration of the vector loop.  Test the portion
580 		 associated with this mask.  */
581 	      start = bias_tree;
582 	      end = first_limit;
583 	    }
584 
585 	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
586 	  tmp_stmt = vect_gen_while (init_mask, start, end);
587 	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
588 	}
589 
590       /* Now AND out the bits that are within the number of skipped
591 	 scalars.  */
592       poly_uint64 const_skip;
593       if (nscalars_skip
594 	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
595 	       && known_le (const_skip, bias)))
596 	{
597 	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
598 						    bias_tree, nscalars_skip);
599 	  if (init_mask)
600 	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
601 				      init_mask, unskipped_mask);
602 	  else
603 	    init_mask = unskipped_mask;
604 	}
605 
606       if (!init_mask)
607 	/* First iteration is full.  */
608 	init_mask = build_minus_one_cst (mask_type);
609 
610       /* Get the mask value for the next iteration of the loop.  */
611       next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
612       gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
613       gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
614 
615       vect_set_loop_mask (loop, mask, init_mask, next_mask);
616     }
617   return next_mask;
618 }
619 
620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621    LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
622    number of iterations of the original scalar loop that should be
623    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
624    as for vect_set_loop_condition.
625 
626    Insert the branch-back condition before LOOP_COND_GSI and return the
627    final gcond.  */
628 
629 static gcond *
630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631 				tree niters, tree final_iv,
632 				bool niters_maybe_zero,
633 				gimple_stmt_iterator loop_cond_gsi)
634 {
635   gimple_seq preheader_seq = NULL;
636   gimple_seq header_seq = NULL;
637 
638   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639   unsigned int compare_precision = TYPE_PRECISION (compare_type);
640   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641   tree orig_niters = niters;
642 
643   /* Type of the initial value of NITERS.  */
644   tree ni_actual_type = TREE_TYPE (niters);
645   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
646 
647   /* Convert NITERS to the same size as the compare type.  */
648   if (compare_precision > ni_actual_precision
649       && niters_maybe_zero)
650     {
651       /* We know that there is always at least one iteration, so if the
652 	 count is zero then it must have wrapped.  Cope with this by
653 	 subtracting 1 before the conversion and adding 1 to the result.  */
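
      /* Illustrative example: if NITERS is an 8-bit count that has wrapped
	 to 0 after 256 iterations, then 0 - 1 == 255 in the narrow type,
	 and converting 255 to the wider COMPARE_TYPE before adding 1 back
	 gives the intended value 256.  */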
654       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656 			     niters, build_minus_one_cst (ni_actual_type));
657       niters = gimple_convert (&preheader_seq, compare_type, niters);
658       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659 			     niters, build_one_cst (compare_type));
660     }
661   else
662     niters = gimple_convert (&preheader_seq, compare_type, niters);
663 
664   /* Convert skip_niters to the right type.  */
665   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
666 
667   /* Now calculate the value that the induction variable must be able
668      to hit in order to ensure that we end the loop with an all-false mask.
669      This involves adding the maximum number of inactive trailing scalar
670      iterations.  */
671   widest_int iv_limit;
672   bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673   if (known_max_iters)
674     {
675       if (niters_skip)
676 	{
677 	  /* Add the maximum number of skipped iterations to the
678 	     maximum iteration count.  */
679 	  if (TREE_CODE (niters_skip) == INTEGER_CST)
680 	    iv_limit += wi::to_widest (niters_skip);
681 	  else
682 	    iv_limit += max_vf - 1;
683 	}
684       /* IV_LIMIT is the maximum number of latch iterations, which is also
685 	 the maximum in-range IV value.  Round this value down to the previous
686 	 vector alignment boundary and then add an extra full iteration.  */
687       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
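
      /* Using made-up numbers: with at most 1000 latch iterations, a
	 variable NITERS_SKIP and MAX_VF == 16, IV_LIMIT becomes 1000 + 15,
	 is rounded down to 1008 (a multiple of the constant VF of 16), and
	 then becomes 1008 + 16 == 1024.  */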
689     }
690 
691   /* Get the vectorization factor in tree form.  */
692   tree vf = build_int_cst (compare_type,
693 			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
694 
695   /* Iterate over all the rgroups and fill in their masks.  We could use
696      the first mask from any rgroup for the loop condition; here we
697      arbitrarily pick the last.  */
698   tree test_mask = NULL_TREE;
699   rgroup_masks *rgm;
700   unsigned int i;
701   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702   FOR_EACH_VEC_ELT (*masks, i, rgm)
703     if (!rgm->masks.is_empty ())
704       {
705 	/* First try using permutes.  This adds a single vector
706 	   instruction to the loop for each mask, but needs no extra
707 	   loop invariants or IVs.  */
708 	unsigned int nmasks = i + 1;
709 	if ((nmasks & 1) == 0)
710 	  {
711 	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712 	    if (!half_rgm->masks.is_empty ()
713 		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714 	      continue;
715 	  }
716 
717 	/* See whether zero-based IV would ever generate all-false masks
718 	   before wrapping around.  */
719 	bool might_wrap_p
720 	  = (!known_max_iters
721 	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722 				    UNSIGNED)
723 		 > compare_precision));
724 
725 	/* Set up all masks for this group.  */
726 	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727 						  &preheader_seq,
728 						  loop_cond_gsi, rgm, vf,
729 						  niters, niters_skip,
730 						  might_wrap_p);
731       }
732 
733   /* Emit all accumulated statements.  */
734   add_preheader_seq (loop, preheader_seq);
735   add_header_seq (loop, header_seq);
736 
737   /* Get a boolean result that tells us whether to iterate.  */
738   edge exit_edge = single_exit (loop);
739   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740   tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741   gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742 					NULL_TREE, NULL_TREE);
743   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
744 
745   /* The loop iterates (NITERS - 1) / VF + 1 times.
746      Subtract one from this to get the latch count.  */
747   tree step = build_int_cst (compare_type,
748 			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750 				       build_minus_one_cst (compare_type));
751   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752 				     niters_minus_one, step);
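
  /* For example (illustration only): NITERS == 20 with a vectorization
     factor of 16 gives nb_iterations == (20 - 1) / 16 == 1 latch iteration,
     i.e. two iterations of the vector loop.  */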
753 
754   if (final_iv)
755     {
756       gassign *assign = gimple_build_assign (final_iv, orig_niters);
757       gsi_insert_on_edge_immediate (single_exit (loop), assign);
758     }
759 
760   return cond_stmt;
761 }
762 
763 /* Like vect_set_loop_condition, but handle the case in which there
764    are no loop masks.  */
765 
766 static gcond *
767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
768 				  tree step, tree final_iv,
769 				  bool niters_maybe_zero,
770 				  gimple_stmt_iterator loop_cond_gsi)
771 {
772   tree indx_before_incr, indx_after_incr;
773   gcond *cond_stmt;
774   gcond *orig_cond;
775   edge pe = loop_preheader_edge (loop);
776   edge exit_edge = single_exit (loop);
777   gimple_stmt_iterator incr_gsi;
778   bool insert_after;
779   enum tree_code code;
780   tree niters_type = TREE_TYPE (niters);
781 
782   orig_cond = get_loop_exit_condition (loop);
783   gcc_assert (orig_cond);
784   loop_cond_gsi = gsi_for_stmt (orig_cond);
785 
786   tree init, limit;
787   if (!niters_maybe_zero && integer_onep (step))
788     {
789       /* In this case we can use a simple 0-based IV:
790 
791 	 A:
792 	   x = 0;
793 	   do
794 	     {
795 	       ...
796 	       x += 1;
797 	     }
798 	   while (x < NITERS);  */
799       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
800       init = build_zero_cst (niters_type);
801       limit = niters;
802     }
803   else
804     {
805       /* The following works for all values of NITERS except 0:
806 
807 	 B:
808 	   x = 0;
809 	   do
810 	     {
811 	       ...
812 	       x += STEP;
813 	     }
814 	   while (x <= NITERS - STEP);
815 
816 	 so that the loop continues to iterate if x + STEP - 1 < NITERS
817 	 but stops if x + STEP - 1 >= NITERS.
818 
819 	 However, if NITERS is zero, x never hits a value above NITERS - STEP
820 	 before wrapping around.  There are two obvious ways of dealing with
821 	 this:
822 
823 	 - start at STEP - 1 and compare x before incrementing it
824 	 - start at -1 and compare x after incrementing it
825 
826 	 The latter is simpler and is what we use.  The loop in this case
827 	 looks like:
828 
829 	 C:
830 	   x = -1;
831 	   do
832 	     {
833 	       ...
834 	       x += STEP;
835 	     }
836 	   while (x < NITERS - STEP);
837 
838 	 In both cases the loop limit is NITERS - STEP.  */
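
      /* Concretely (numbers for illustration only): in case B with
	 NITERS == 20 and STEP == 4, the limit is 16 and x takes the values
	 0, 4, 8, 12 and 16, so the body runs the expected 20 / 4 == 5
	 times before the while condition fails.  */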
839       gimple_seq seq = NULL;
840       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
841       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
842       if (seq)
843 	{
844 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
845 	  gcc_assert (!new_bb);
846 	}
847       if (niters_maybe_zero)
848 	{
849 	  /* Case C.  */
850 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
851 	  init = build_all_ones_cst (niters_type);
852 	}
853       else
854 	{
855 	  /* Case B.  */
856 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
857 	  init = build_zero_cst (niters_type);
858 	}
859     }
860 
861   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
862   create_iv (init, step, NULL_TREE, loop,
863              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
864   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
865 					      true, NULL_TREE, true,
866 					      GSI_SAME_STMT);
867   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
868 				     true, GSI_SAME_STMT);
869 
870   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
871 				 NULL_TREE);
872 
873   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
874 
875   /* Record the number of latch iterations.  */
876   if (limit == niters)
877     /* Case A: the loop iterates NITERS times.  Subtract one to get the
878        latch count.  */
879     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
880 				       build_int_cst (niters_type, 1));
881   else
882     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
883        Subtract one from this to get the latch count.  */
884     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
885 				       limit, step);
886 
887   if (final_iv)
888     {
889       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
890 					     indx_after_incr, init);
891       gsi_insert_on_edge_immediate (single_exit (loop), assign);
892     }
893 
894   return cond_stmt;
895 }
896 
897 /* If we're using fully-masked loops, make LOOP iterate:
898 
899       N == (NITERS - 1) / STEP + 1
900 
901    times.  When NITERS is zero, this is equivalent to making the loop
902    execute (1 << M) / STEP times, where M is the precision of NITERS.
903    NITERS_MAYBE_ZERO is true if this last case might occur.
904 
905    If we're not using fully-masked loops, make LOOP iterate:
906 
907       N == (NITERS - STEP) / STEP + 1
908 
909    times, where NITERS is known to be outside the range [1, STEP - 1].
910    This is equivalent to making the loop execute NITERS / STEP times
911    when NITERS is nonzero and (1 << M) / STEP times otherwise.
912    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
913 
914    If FINAL_IV is nonnull, it is an SSA name that should be set to
915    N * STEP on exit from the loop.
916 
917    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
918 
919 void
920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921 			 tree niters, tree step, tree final_iv,
922 			 bool niters_maybe_zero)
923 {
924   gcond *cond_stmt;
925   gcond *orig_cond = get_loop_exit_condition (loop);
926   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
927 
928   if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930 						final_iv, niters_maybe_zero,
931 						loop_cond_gsi);
932   else
933     cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934 						  final_iv, niters_maybe_zero,
935 						  loop_cond_gsi);
936 
937   /* Remove old loop exit test.  */
938   gsi_remove (&loop_cond_gsi, true);
939   free_stmt_vec_info (orig_cond);
940 
941   if (dump_enabled_p ())
942     {
943       dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
944       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
945     }
946 }
947 
948 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
949    For each pair of PHI arguments in FROM->dest and TO->dest coming in
950    on those edges, ensure that the current_def of the TO->dest argument
951    is set to the current_def of the FROM->dest argument.  */
952 
953 static void
954 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
955 {
956   gimple_stmt_iterator gsi_from, gsi_to;
957 
958   for (gsi_from = gsi_start_phis (from->dest),
959        gsi_to = gsi_start_phis (to->dest);
960        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
961     {
962       gimple *from_phi = gsi_stmt (gsi_from);
963       gimple *to_phi = gsi_stmt (gsi_to);
964       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
965       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
966       if (virtual_operand_p (from_arg))
967 	{
968 	  gsi_next (&gsi_from);
969 	  continue;
970 	}
971       if (virtual_operand_p (to_arg))
972 	{
973 	  gsi_next (&gsi_to);
974 	  continue;
975 	}
976       if (TREE_CODE (from_arg) != SSA_NAME)
977 	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
978       else
979 	{
980 	  if (get_current_def (to_arg) == NULL_TREE)
981 	    set_current_def (to_arg, get_current_def (from_arg));
982 	}
983       gsi_next (&gsi_from);
984       gsi_next (&gsi_to);
985     }
986 
987   gphi *from_phi = get_virtual_phi (from->dest);
988   gphi *to_phi = get_virtual_phi (to->dest);
989   if (from_phi && to_phi)
990     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
991 		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
992 }
993 
994 
995 /* Given LOOP this function generates a new copy of it and puts it
996    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
997    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
998    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
999    entry or exit of LOOP.  */
1000 
1001 struct loop *
1002 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1003 					struct loop *scalar_loop, edge e)
1004 {
1005   struct loop *new_loop;
1006   basic_block *new_bbs, *bbs, *pbbs;
1007   bool at_exit;
1008   bool was_imm_dom;
1009   basic_block exit_dest;
1010   edge exit, new_exit;
1011   bool duplicate_outer_loop = false;
1012 
1013   exit = single_exit (loop);
1014   at_exit = (e == exit);
1015   if (!at_exit && e != loop_preheader_edge (loop))
1016     return NULL;
1017 
1018   if (scalar_loop == NULL)
1019     scalar_loop = loop;
1020 
1021   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1022   pbbs = bbs + 1;
1023   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1024   /* Allow duplication of outer loops.  */
1025   if (scalar_loop->inner)
1026     duplicate_outer_loop = true;
1027   /* Check whether duplication is possible.  */
1028   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1029     {
1030       free (bbs);
1031       return NULL;
1032     }
1033 
1034   /* Generate new loop structure.  */
1035   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1036   duplicate_subloops (scalar_loop, new_loop);
1037 
1038   exit_dest = exit->dest;
1039   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1040 					  exit_dest) == loop->header ?
1041 		 true : false);
1042 
1043   /* Also copy the pre-header, this avoids jumping through hoops to
1044      duplicate the loop entry PHI arguments.  Create an empty
1045      pre-header unconditionally for this.  */
1046   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1047   edge entry_e = single_pred_edge (preheader);
1048   bbs[0] = preheader;
1049   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1050 
1051   exit = single_exit (scalar_loop);
1052   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1053 	    &exit, 1, &new_exit, NULL,
1054 	    at_exit ? loop->latch : e->src, true);
1055   exit = single_exit (loop);
1056   basic_block new_preheader = new_bbs[0];
1057 
1058   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1059 
1060   if (scalar_loop != loop)
1061     {
1062       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1063 	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1064 	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1065 	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1066 	 header) to have current_def set, so copy them over.  */
1067       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1068 						exit);
1069       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1070 							   0),
1071 						EDGE_SUCC (loop->latch, 0));
1072     }
1073 
1074   if (at_exit) /* Add the loop copy at exit.  */
1075     {
1076       if (scalar_loop != loop)
1077 	{
1078 	  gphi_iterator gsi;
1079 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1080 
1081 	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1082 	       gsi_next (&gsi))
1083 	    {
1084 	      gphi *phi = gsi.phi ();
1085 	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1086 	      location_t orig_locus
1087 		= gimple_phi_arg_location_from_edge (phi, e);
1088 
1089 	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1090 	    }
1091 	}
1092       redirect_edge_and_branch_force (e, new_preheader);
1093       flush_pending_stmts (e);
1094       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1095       if (was_imm_dom || duplicate_outer_loop)
1096 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1097 
1098       /* And remove the now unnecessary forwarder again.  Keep the other
1099          one so we have a proper pre-header for the loop at the exit edge.  */
1100       redirect_edge_pred (single_succ_edge (preheader),
1101 			  single_pred (preheader));
1102       delete_basic_block (preheader);
1103       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104 			       loop_preheader_edge (scalar_loop)->src);
1105     }
1106   else /* Add the copy at entry.  */
1107     {
1108       if (scalar_loop != loop)
1109 	{
1110 	  /* Remove the now unnecessary forwarder of scalar_loop again.  */
1111 	  redirect_edge_pred (single_succ_edge (preheader),
1112 			      single_pred (preheader));
1113 	  delete_basic_block (preheader);
1114 	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1115 				   loop_preheader_edge (scalar_loop)->src);
1116 	  preheader = split_edge (loop_preheader_edge (loop));
1117 	  entry_e = single_pred_edge (preheader);
1118 	}
1119 
1120       redirect_edge_and_branch_force (entry_e, new_preheader);
1121       flush_pending_stmts (entry_e);
1122       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1123 
1124       redirect_edge_and_branch_force (new_exit, preheader);
1125       flush_pending_stmts (new_exit);
1126       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1127 
1128       /* And remove the now unnecessary forwarder again.  Keep the other
1129          one so we have a proper pre-header for the loop at the exit edge.  */
1130       redirect_edge_pred (single_succ_edge (new_preheader),
1131 			  single_pred (new_preheader));
1132       delete_basic_block (new_preheader);
1133       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1134 			       loop_preheader_edge (new_loop)->src);
1135     }
1136 
1137   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1138   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1139     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1140 
1141   if (scalar_loop != loop)
1142     {
1143       /* Update new_loop->header PHIs, so that on the preheader
1144 	 edge they are the ones from loop rather than scalar_loop.  */
1145       gphi_iterator gsi_orig, gsi_new;
1146       edge orig_e = loop_preheader_edge (loop);
1147       edge new_e = loop_preheader_edge (new_loop);
1148 
1149       for (gsi_orig = gsi_start_phis (loop->header),
1150 	   gsi_new = gsi_start_phis (new_loop->header);
1151 	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1152 	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
1153 	{
1154 	  gphi *orig_phi = gsi_orig.phi ();
1155 	  gphi *new_phi = gsi_new.phi ();
1156 	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1157 	  location_t orig_locus
1158 	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1159 
1160 	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1161 	}
1162     }
1163 
1164   free (new_bbs);
1165   free (bbs);
1166 
1167   checking_verify_dominators (CDI_DOMINATORS);
1168 
1169   return new_loop;
1170 }
1171 
1172 
1173 /* Given the condition expression COND, put it as the last statement of
1174    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1175    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1176    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1177    new edge as irreducible if IRREDUCIBLE_P is true.  */
1178 
1179 static edge
1180 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1181 		       basic_block guard_to, basic_block dom_bb,
1182 		       profile_probability probability, bool irreducible_p)
1183 {
1184   gimple_stmt_iterator gsi;
1185   edge new_e, enter_e;
1186   gcond *cond_stmt;
1187   gimple_seq gimplify_stmt_list = NULL;
1188 
1189   enter_e = EDGE_SUCC (guard_bb, 0);
1190   enter_e->flags &= ~EDGE_FALLTHRU;
1191   enter_e->flags |= EDGE_FALSE_VALUE;
1192   gsi = gsi_last_bb (guard_bb);
1193 
1194   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1195 				 NULL_TREE);
1196   if (gimplify_stmt_list)
1197     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1198 
1199   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1200   gsi = gsi_last_bb (guard_bb);
1201   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1202 
1203   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1204   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1205 
1206   new_e->probability = probability;
1207   if (irreducible_p)
1208     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1209 
1210   enter_e->probability = probability.invert ();
1211   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1212 
1213   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1214   if (enter_e->dest->loop_father->header == enter_e->dest)
1215     split_edge (enter_e);
1216 
1217   return new_e;
1218 }
1219 
1220 
1221 /* This function verifies that the following restrictions apply to LOOP:
1222    (1) it consists of exactly 2 basic blocks (a header and an empty latch)
1223        if it is an innermost loop, or of 5 basic blocks if it is an outer loop
1224    (2) it is single entry, single exit
1225    (3) its exit condition is the last stmt in the header
1226    (4) E is the entry or exit edge of LOOP.
1227  */
1228 
1229 bool
1230 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1231 {
1232   edge exit_e = single_exit (loop);
1233   edge entry_e = loop_preheader_edge (loop);
1234   gcond *orig_cond = get_loop_exit_condition (loop);
1235   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1236   unsigned int num_bb = loop->inner ? 5 : 2;
1237 
1238   /* All loops have an outer scope; the only loop for which loop_outer ()
1239      returns NULL is the pseudo loop that represents the function itself.  */
1240   if (!loop_outer (loop)
1241       || loop->num_nodes != num_bb
1242       || !empty_block_p (loop->latch)
1243       || !single_exit (loop)
1244       /* Verify that new loop exit condition can be trivially modified.  */
1245       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1246       || (e != exit_e && e != entry_e))
1247     return false;
1248 
1249   return true;
1250 }
1251 
1252 /* If the loop has a virtual PHI, but the exit bb doesn't, create a virtual
1253    PHI in the exit bb and rename all the uses after the loop.  This
1254    simplifies the *guard[12] routines, which assume loop closed SSA form
1255    for all PHIs (but normally loop closed SSA form doesn't require virtual
1256    PHIs to be in that form).  Doing this early simplifies checking which
1257    uses should be renamed.  */
1258 
1259 static void
1260 create_lcssa_for_virtual_phi (struct loop *loop)
1261 {
1262   gphi_iterator gsi;
1263   edge exit_e = single_exit (loop);
1264 
1265   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1266     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1267       {
1268 	gphi *phi = gsi.phi ();
1269 	for (gsi = gsi_start_phis (exit_e->dest);
1270 	     !gsi_end_p (gsi); gsi_next (&gsi))
1271 	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1272 	    break;
1273 	if (gsi_end_p (gsi))
1274 	  {
1275 	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1276 	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1277 	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1278 	    imm_use_iterator imm_iter;
1279 	    gimple *stmt;
1280 	    use_operand_p use_p;
1281 
1282 	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1283 	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1284 	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1285 	    gimple_phi_set_result (new_phi, new_vop);
1286 	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1287 	      if (stmt != new_phi
1288 		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1289 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1290 		  SET_USE (use_p, new_vop);
1291 	  }
1292 	break;
1293       }
1294 
1295 }
1296 
1297 /* Function find_loop_location.
1298 
1299    Extract the location of the loop in the source code.
1300    If the loop is not well formed for vectorization, an estimated
1301    location is calculated.
1302    Return the loop location on success and UNKNOWN_LOCATION otherwise.  */
1303 
1304 source_location
1305 find_loop_location (struct loop *loop)
1306 {
1307   gimple *stmt = NULL;
1308   basic_block bb;
1309   gimple_stmt_iterator si;
1310 
1311   if (!loop)
1312     return UNKNOWN_LOCATION;
1313 
1314   stmt = get_loop_exit_condition (loop);
1315 
1316   if (stmt
1317       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318     return gimple_location (stmt);
1319 
1320   /* If we got here the loop is probably not "well formed";
1321      try to estimate the loop location.  */
1322 
1323   if (!loop->header)
1324     return UNKNOWN_LOCATION;
1325 
1326   bb = loop->header;
1327 
1328   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329     {
1330       stmt = gsi_stmt (si);
1331       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332         return gimple_location (stmt);
1333     }
1334 
1335   return UNKNOWN_LOCATION;
1336 }
1337 
1338 /* Return true if PHI defines an IV of the loop to be vectorized.  */
1339 
1340 static bool
1341 iv_phi_p (gphi *phi)
1342 {
1343   if (virtual_operand_p (PHI_RESULT (phi)))
1344     return false;
1345 
1346   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1347   gcc_assert (stmt_info != NULL);
1348   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350     return false;
1351 
1352   return true;
1353 }
1354 
1355 /* Function vect_can_advance_ivs_p
1356 
1357    In case the number of iterations that LOOP iterates is unknown at compile
1358    time, an epilog loop will be generated, and the loop induction variables
1359    (IVs) will be "advanced" to the value they are supposed to take just before
1360    the epilog loop.  Here we check that the access function of the loop IVs
1361    and the expression that represents the loop bound are simple enough.
1362    These restrictions will be relaxed in the future.  */
1363 
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366 {
1367   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368   basic_block bb = loop->header;
1369   gphi_iterator gsi;
1370 
1371   /* Analyze phi functions of the loop header.  */
1372 
1373   if (dump_enabled_p ())
1374     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376     {
1377       tree evolution_part;
1378 
1379       gphi *phi = gsi.phi ();
1380       if (dump_enabled_p ())
1381 	{
1382           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1383           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1384 	}
1385 
1386       /* Skip virtual phi's. The data dependences that are associated with
1387 	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1388 
1389 	 Skip reduction phis.  */
1390       if (!iv_phi_p (phi))
1391 	{
1392 	  if (dump_enabled_p ())
1393 	    dump_printf_loc (MSG_NOTE, vect_location,
1394 			     "reduc or virtual phi. skip.\n");
1395 	  continue;
1396 	}
1397 
1398       /* Analyze the evolution function.  */
1399 
1400       evolution_part
1401 	= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1402       if (evolution_part == NULL_TREE)
1403         {
1404 	  if (dump_enabled_p ())
1405 	    dump_printf (MSG_MISSED_OPTIMIZATION,
1406 			 "No access function or evolution.\n");
1407 	  return false;
1408         }
1409 
1410       /* FORNOW: We do not transform initial conditions of IVs
1411 	 which evolution functions are not invariants in the loop.  */
1412 
1413       if (!expr_invariant_in_loop_p (loop, evolution_part))
1414 	{
1415 	  if (dump_enabled_p ())
1416 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417 			     "evolution not invariant in loop.\n");
1418 	  return false;
1419 	}
1420 
1421       /* FORNOW: We do not transform initial conditions of IVs
1422 	 which evolution functions are a polynomial of degree >= 2.  */
1423 
1424       if (tree_is_chrec (evolution_part))
1425 	{
1426 	  if (dump_enabled_p ())
1427 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428 			     "evolution is chrec.\n");
1429 	  return false;
1430 	}
1431     }
1432 
1433   return true;
1434 }
1435 
1436 
1437 /*   Function vect_update_ivs_after_vectorizer.
1438 
1439      "Advance" the induction variables of LOOP to the value they should take
1440      after the execution of LOOP.  This is currently necessary because the
1441      vectorizer does not handle induction variables that are used after the
1442      loop.  Such a situation occurs when the last iterations of LOOP are
1443      peeled, because:
1444      1. We introduced new uses after LOOP for IVs that were not originally used
1445         after LOOP: the IVs of LOOP are now used by an epilog loop.
1446      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1447         times, whereas the loop IVs should be bumped N times.
1448 
1449      Input:
1450      - LOOP - a loop that is going to be vectorized. The last few iterations
1451               of LOOP were peeled.
1452      - NITERS - the number of iterations that LOOP executes (before it is
1453                 vectorized). i.e, the number of times the ivs should be bumped.
1454      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1455                   coming out from LOOP on which there are uses of the LOOP ivs
1456 		  (this is the path from LOOP->exit to epilog_loop->preheader).
1457 
1458                   The new definitions of the ivs are placed in LOOP->exit.
1459                   The phi args associated with the edge UPDATE_E in the bb
1460                   UPDATE_E->dest are updated accordingly.
1461 
1462      Assumption 1: Like the rest of the vectorizer, this function assumes
1463      a single loop exit that has a single predecessor.
1464 
1465      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1466      organized in the same order.
1467 
1468      Assumption 3: The access function of the ivs is simple enough (see
1469      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1470 
1471      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1472      coming out of LOOP on which the ivs of LOOP are used (this is the path
1473      that leads to the epilog loop; other paths skip the epilog loop).  This
1474      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1475      needs to have its phis updated.
1476  */
1477 
1478 static void
1479 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1480 				  tree niters, edge update_e)
1481 {
1482   gphi_iterator gsi, gsi1;
1483   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1484   basic_block update_bb = update_e->dest;
1485   basic_block exit_bb = single_exit (loop)->dest;
1486 
1487   /* Make sure there exists a single-predecessor exit bb:  */
1488   gcc_assert (single_pred_p (exit_bb));
1489   gcc_assert (single_succ_edge (exit_bb) == update_e);
1490 
1491   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1492        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1493        gsi_next (&gsi), gsi_next (&gsi1))
1494     {
1495       tree init_expr;
1496       tree step_expr, off;
1497       tree type;
1498       tree var, ni, ni_name;
1499       gimple_stmt_iterator last_gsi;
1500 
1501       gphi *phi = gsi.phi ();
1502       gphi *phi1 = gsi1.phi ();
1503       if (dump_enabled_p ())
1504 	{
1505 	  dump_printf_loc (MSG_NOTE, vect_location,
1506 			   "vect_update_ivs_after_vectorizer: phi: ");
1507 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1508 	}
1509 
1510       /* Skip reduction and virtual phis.  */
1511       if (!iv_phi_p (phi))
1512 	{
1513 	  if (dump_enabled_p ())
1514 	    dump_printf_loc (MSG_NOTE, vect_location,
1515 			     "reduc or virtual phi. skip.\n");
1516 	  continue;
1517 	}
1518 
1519       type = TREE_TYPE (gimple_phi_result (phi));
1520       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1521       step_expr = unshare_expr (step_expr);
1522 
1523       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1524          of degree >= 2 or exponential.  */
1525       gcc_assert (!tree_is_chrec (step_expr));
1526 
1527       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1528 
1529       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1530 			 fold_convert (TREE_TYPE (step_expr), niters),
1531 			 step_expr);
1532       if (POINTER_TYPE_P (type))
1533 	ni = fold_build_pointer_plus (init_expr, off);
1534       else
1535 	ni = fold_build2 (PLUS_EXPR, type,
1536 			  init_expr, fold_convert (type, off));
1537 
1538       var = create_tmp_var (type, "tmp");
1539 
1540       last_gsi = gsi_last_bb (exit_bb);
1541       gimple_seq new_stmts = NULL;
1542       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1543       /* Exit_bb shouldn't be empty.  */
1544       if (!gsi_end_p (last_gsi))
1545 	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1546       else
1547 	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1548 
1549       /* Fix phi expressions in the successor bb.  */
1550       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1551     }
1552 }
1553 
1554 /* Return a gimple value containing the misalignment (measured in vector
1555    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1556    it is away from a perfectly aligned address.  Add any new statements
1557    to SEQ.  */
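/* For instance (illustrative numbers only): with a target alignment of
   16 bytes, 4-byte elements and a start address whose low bits are 0xc,
   the expressions built below evaluate to

     misalign_in_bytes = addr & (16 - 1) = 12
     misalign_in_elems = 12 >> log2 (4) = 3

   i.e. the access is 3 elements away from an aligned address.  */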
1558 
1559 static tree
1560 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1561 {
1562   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1563   gimple *dr_stmt = DR_STMT (dr);
1564   stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1565   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1566 
1567   unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1568   gcc_assert (target_align != 0);
1569 
1570   bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1571   tree offset = (negative
1572 		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1573 		 : size_zero_node);
1574   tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
1575 							  offset);
1576   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1577   tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1578   HOST_WIDE_INT elem_size
1579     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1580   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1581 
1582   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1583   tree int_start_addr = fold_convert (type, start_addr);
1584   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1585 					target_align_minus_1);
1586 
1587   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1588   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1589 					elem_size_log);
1590 
1591   return misalign_in_elems;
1592 }
1593 
1594 /* Function vect_gen_prolog_loop_niters
1595 
1596    Generate the number of iterations which should be peeled as prolog for the
1597    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1598    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1599    As a result, after the execution of this loop, the data reference DR will
1600    refer to an aligned location.  The following computation is generated:
1601 
1602    If the misalignment of DR is known at compile time:
1603      addr_mis = DR_MISALIGNMENT (dr);
1604    Else, compute address misalignment in bytes:
1605      addr_mis = addr & (target_align - 1)
1606 
1607    prolog_niters = ((VF - addr_mis / elem_size) & (VF - 1)) / step
1608 
1609    (elem_size = element type size; an element is the scalar element whose type
1610    is the inner type of the vectype)
1611 
1612    The computations will be emitted at the end of BB.  We also compute and
1613    store the upper bound (inclusive) of the result in BOUND.
1614 
1615    When the step of the data-ref in the loop is not 1 (as in interleaved data
1616    and SLP), the number of iterations of the prolog must be divided by the step
1617    (which is equal to the size of the interleaved group).
1618 
1619    The above formulas assume that VF == number of elements in the vector. This
1620    may not hold when there are multiple-types in the loop.
1621    In this case, for some data-references in the loop the VF does not represent
1622    the number of elements that fit in the vector.  Therefore, instead of VF we
1623    use TYPE_VECTOR_SUBPARTS.  */
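/* A hypothetical instance of the formula above: with VF = 4, 4-byte
   elements, step 1 and a misalignment of 8 bytes (addr_mis / elem_size
   = 2 elements), the generated prolog executes

     ((4 - 2) & (4 - 1)) / 1 = 2

   scalar iterations, after which DR refers to an aligned address.  */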
1624 
1625 static tree
1626 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1627 			     basic_block bb, int *bound)
1628 {
1629   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1630   tree var;
1631   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1632   gimple_seq stmts = NULL, new_stmts = NULL;
1633   tree iters, iters_name;
1634   gimple *dr_stmt = DR_STMT (dr);
1635   stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1636   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1637   unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1638 
1639   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1640     {
1641       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1642 
1643       if (dump_enabled_p ())
1644         dump_printf_loc (MSG_NOTE, vect_location,
1645                          "known peeling = %d.\n", npeel);
1646 
1647       iters = build_int_cst (niters_type, npeel);
1648       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649     }
1650   else
1651     {
1652       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1653       tree type = TREE_TYPE (misalign_in_elems);
1654       HOST_WIDE_INT elem_size
1655 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1656       HOST_WIDE_INT align_in_elems = target_align / elem_size;
1657       tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1658       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1659 
1660       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1661 				 & (align_in_elems - 1)).  */
1662       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1663       if (negative)
1664 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1665 			     align_in_elems_tree);
1666       else
1667 	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1668 			     misalign_in_elems);
1669       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1670       iters = fold_convert (niters_type, iters);
1671       *bound = align_in_elems - 1;
1672     }
1673 
1674   if (dump_enabled_p ())
1675     {
1676       dump_printf_loc (MSG_NOTE, vect_location,
1677                        "niters for prolog loop: ");
1678       dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1679       dump_printf (MSG_NOTE, "\n");
1680     }
1681 
1682   var = create_tmp_var (niters_type, "prolog_loop_niters");
1683   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1684 
1685   if (new_stmts)
1686     gimple_seq_add_seq (&stmts, new_stmts);
1687   if (stmts)
1688     {
1689       gcc_assert (single_succ_p (bb));
1690       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1691       if (gsi_end_p (gsi))
1692 	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1693       else
1694 	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1695     }
1696   return iters_name;
1697 }
1698 
1699 
1700 /* Function vect_update_init_of_dr
1701 
1702    If CODE is PLUS, the vector loop starts NITERS iterations after the
1703    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1704    iterations before the scalar one (using masking to skip inactive
1705    elements).  This function updates the information recorded in DR to
1706    account for the difference.  Specifically, it updates the OFFSET
1707    field of DR.  */
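/* For example (illustrative only): if DR advances by 4 bytes per scalar
   iteration and NITERS is 3, CODE == PLUS_EXPR adds 3 * 4 = 12 bytes to
   DR_OFFSET, so the first access of the vector loop lands exactly where
   the scalar loop would be after the 3 peeled iterations.  */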
1708 
1709 static void
1710 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1711 {
1712   tree offset = DR_OFFSET (dr);
1713 
1714   niters = fold_build2 (MULT_EXPR, sizetype,
1715 			fold_convert (sizetype, niters),
1716 			fold_convert (sizetype, DR_STEP (dr)));
1717   offset = fold_build2 (code, sizetype,
1718 			fold_convert (sizetype, offset), niters);
1719   DR_OFFSET (dr) = offset;
1720 }
1721 
1722 
1723 /* Function vect_update_inits_of_drs
1724 
1725    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1726    CODE and NITERS are as for vect_update_init_of_dr.  */
1727 
1728 static void
1729 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1730 			  tree_code code)
1731 {
1732   unsigned int i;
1733   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1734   struct data_reference *dr;
1735 
1736   if (dump_enabled_p ())
1737     dump_printf_loc (MSG_NOTE, vect_location,
1738 		     "=== vect_update_inits_of_drs ===\n");
1739 
1740   /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
1741   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1742     {
1743       gimple_seq seq;
1744       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1745       tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1746 
1747       niters = fold_convert (sizetype, niters);
1748       niters = force_gimple_operand (niters, &seq, false, var);
1749       if (seq)
1750 	{
1751 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1752 	  gcc_assert (!new_bb);
1753 	}
1754     }
1755 
1756   FOR_EACH_VEC_ELT (datarefs, i, dr)
1757     vect_update_init_of_dr (dr, niters, code);
1758 }
1759 
1760 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1761    by masking.  This involves calculating the number of iterations to
1762    be peeled and then aligning all memory references appropriately.  */
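/* As an illustrative sketch: with a vectorization factor of 8 and a known
   peeling amount of 3 iterations, the first masked vector iteration must
   skip 8 - 3 = 5 elements, so LOOP_VINFO_MASK_SKIP_NITERS is set to 5 and
   every data reference is shifted back by 5 elements (MINUS_EXPR), so that
   the masked first iteration covers the addresses the peeled prolog would
   otherwise have handled.  */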
1763 
1764 void
1765 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1766 {
1767   tree misalign_in_elems;
1768   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1769 
1770   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1771 
1772   /* From the information recorded in LOOP_VINFO get the number of iterations
1773      that need to be skipped via masking.  */
1774   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1775     {
1776       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1777 			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1778       misalign_in_elems = build_int_cst (type, misalign);
1779     }
1780   else
1781     {
1782       gimple_seq seq1 = NULL, seq2 = NULL;
1783       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1784       misalign_in_elems = fold_convert (type, misalign_in_elems);
1785       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1786 						&seq2, true, NULL_TREE);
1787       gimple_seq_add_seq (&seq1, seq2);
1788       if (seq1)
1789 	{
1790 	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1791 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1792 	  gcc_assert (!new_bb);
1793 	}
1794     }
1795 
1796   if (dump_enabled_p ())
1797     {
1798       dump_printf_loc (MSG_NOTE, vect_location,
1799 		       "misalignment for fully-masked loop: ");
1800       dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1801       dump_printf (MSG_NOTE, "\n");
1802     }
1803 
1804   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1805 
1806   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1807 }
1808 
1809 /* This function builds ni_name = number of iterations.  Statements
1810    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1811    it to TRUE if a new SSA name is generated.  */
1812 
1813 tree
1814 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1815 {
1816   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1817   if (TREE_CODE (ni) == INTEGER_CST)
1818     return ni;
1819   else
1820     {
1821       tree ni_name, var;
1822       gimple_seq stmts = NULL;
1823       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1824 
1825       var = create_tmp_var (TREE_TYPE (ni), "niters");
1826       ni_name = force_gimple_operand (ni, &stmts, false, var);
1827       if (stmts)
1828 	{
1829 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1830 	  if (new_var_p != NULL)
1831 	    *new_var_p = true;
1832 	}
1833 
1834       return ni_name;
1835     }
1836 }
1837 
1838 /* Calculate the number of iterations above which the vectorized loop will
1839    be preferred over the scalar loop.  NITERS_PROLOG is the number of
1840    iterations of the prolog loop.  If it is an integer constant, it is also passed
1841    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1842    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1843    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1844    threshold below which the scalar (rather than vectorized) loop will be
1845    executed.  This function stores the upper bound (inclusive) of the result
1846    in BOUND_SCALAR.  */
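/* A worked example with hypothetical values: with profitability checking
   enabled, TH = 10, a constant prolog of 3 iterations and BOUND_EPILOG = 4,
   the scalar loop runs for at most MAX (3 + 4, 10 - 1) = 9 iterations, so
   *BOUND_SCALAR is set to 9 and the constant 9 is returned.  */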
1847 
1848 static tree
1849 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1850 			     int bound_prolog, poly_int64 bound_epilog, int th,
1851 			     poly_uint64 *bound_scalar,
1852 			     bool check_profitability)
1853 {
1854   tree type = TREE_TYPE (niters_prolog);
1855   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1856 			     build_int_cst (type, bound_epilog));
1857 
1858   *bound_scalar = bound_prolog + bound_epilog;
1859   if (check_profitability)
1860     {
1861       /* TH indicates the minimum niters of vectorized loop, while we
1862 	 compute the maximum niters of scalar loop.  */
1863       th--;
1864       /* Peeling for constant times.  */
1865       if (int_niters_prolog >= 0)
1866 	{
1867 	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1868 	  return build_int_cst (type, *bound_scalar);
1869 	}
1870       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1871 	 and BOUND_EPILOG are inclusive upper bounds.  */
1872       if (known_ge (th, bound_prolog + bound_epilog))
1873 	{
1874 	  *bound_scalar = th;
1875 	  return build_int_cst (type, th);
1876 	}
1877       /* Need to do runtime comparison.  */
1878       else if (maybe_gt (th, bound_epilog))
1879 	{
1880 	  *bound_scalar = upper_bound (*bound_scalar, th);
1881 	  return fold_build2 (MAX_EXPR, type,
1882 			      build_int_cst (type, th), niters);
1883 	}
1884     }
1885   return niters;
1886 }
1887 
1888 /* NITERS is the number of times that the original scalar loop executes
1889    after peeling.  Work out the maximum number of iterations N that can
1890    be handled by the vectorized form of the loop and then either:
1891 
1892    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1893 
1894 	niters_vector = N
1895 
1896    b) set *STEP_VECTOR_PTR to one and generate:
1897 
1898         niters_vector = N / vf
1899 
1900    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1901    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1902    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
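/* For instance (illustrative only): with NITERS = 19, a constant VF of 4,
   no fully-masked loop, no peeling for gaps and NITERS known not to
   overflow, case b) applies and the code below computes

     niters_vector = 19 >> log2 (4) = 4

   with *STEP_VECTOR_PTR set to 1; the vector loop thus covers 16 of the
   19 scalar iterations and the remainder is left to the epilog.  */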
1903 
1904 void
1905 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1906 			     tree *niters_vector_ptr, tree *step_vector_ptr,
1907 			     bool niters_no_overflow)
1908 {
1909   tree ni_minus_gap, var;
1910   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1911   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1912   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1913   tree log_vf = NULL_TREE;
1914 
1915   /* If epilogue loop is required because of data accesses with gaps, we
1916      subtract one iteration from the total number of iterations here for
1917      correct calculation of the vector loop iteration count.  */
1918   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1919     {
1920       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1921 				  build_one_cst (type));
1922       if (!is_gimple_val (ni_minus_gap))
1923 	{
1924 	  var = create_tmp_var (type, "ni_gap");
1925 	  gimple_seq stmts = NULL;
1926 	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1927 					       true, var);
1928 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1929 	}
1930     }
1931   else
1932     ni_minus_gap = niters;
1933 
1934   unsigned HOST_WIDE_INT const_vf;
1935   if (vf.is_constant (&const_vf)
1936       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1937     {
1938       /* Create: niters >> log2(vf) */
1939       /* If it's known that niters == number of latch executions + 1 doesn't
1940 	 overflow, we can generate niters >> log2(vf); otherwise we generate
1941 	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1942 	 will be at least one.  */
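      /* Illustrative example of the overflow-safe form: with an 8-bit
	 NITERS type, 256 scalar iterations are represented as niters == 0;
	 for vf == 4, (0 - 4) >> 2 + 1 evaluates to 63 + 1 == 64, which is
	 the expected 256 / 4.  */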
1943       log_vf = build_int_cst (type, exact_log2 (const_vf));
1944       if (niters_no_overflow)
1945 	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1946       else
1947 	niters_vector
1948 	  = fold_build2 (PLUS_EXPR, type,
1949 			 fold_build2 (RSHIFT_EXPR, type,
1950 				      fold_build2 (MINUS_EXPR, type,
1951 						   ni_minus_gap,
1952 						   build_int_cst (type, vf)),
1953 				      log_vf),
1954 			 build_int_cst (type, 1));
1955       step_vector = build_one_cst (type);
1956     }
1957   else
1958     {
1959       niters_vector = ni_minus_gap;
1960       step_vector = build_int_cst (type, vf);
1961     }
1962 
1963   if (!is_gimple_val (niters_vector))
1964     {
1965       var = create_tmp_var (type, "bnd");
1966       gimple_seq stmts = NULL;
1967       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1968       gsi_insert_seq_on_edge_immediate (pe, stmts);
1969       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1970 	 we set range information to make niters analyzer's life easier.  */
1971       if (stmts != NULL && log_vf)
1972 	set_range_info (niters_vector, VR_RANGE,
1973 			wi::to_wide (build_int_cst (type, 1)),
1974 			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1975 						  TYPE_MAX_VALUE (type),
1976 						  log_vf)));
1977     }
1978   *niters_vector_ptr = niters_vector;
1979   *step_vector_ptr = step_vector;
1980 
1981   return;
1982 }
1983 
1984 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1985    loop specified by LOOP_VINFO after vectorization, compute the number
1986    of iterations before vectorization (niters_vector * vf) and store it
1987    to NITERS_VECTOR_MULT_VF_PTR.  */
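/* E.g. (purely illustrative): a NITERS_VECTOR of 5 with a constant VF of 8
   yields 5 << log2 (8) = 40 scalar iterations covered by the vector loop.  */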
1988 
1989 static void
1990 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1991 				     tree niters_vector,
1992 				     tree *niters_vector_mult_vf_ptr)
1993 {
1994   /* We should be using a step_vector of VF if VF is variable.  */
1995   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1996   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1997   tree type = TREE_TYPE (niters_vector);
1998   tree log_vf = build_int_cst (type, exact_log2 (vf));
1999   basic_block exit_bb = single_exit (loop)->dest;
2000 
2001   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2002   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2003 					    niters_vector, log_vf);
2004   if (!is_gimple_val (niters_vector_mult_vf))
2005     {
2006       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2007       gimple_seq stmts = NULL;
2008       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2009 						    &stmts, true, var);
2010       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2011       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2012     }
2013   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2014 }
2015 
2016 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2017    from SECOND/FIRST and puts it at the original loop's preheader/exit
2018    edge; the two loops are then arranged as below:
2019 
2020        preheader_a:
2021      first_loop:
2022        header_a:
2023 	 i_1 = PHI<i_0, i_2>;
2024 	 ...
2025 	 i_2 = i_1 + 1;
2026 	 if (cond_a)
2027 	   goto latch_a;
2028 	 else
2029 	   goto between_bb;
2030        latch_a:
2031 	 goto header_a;
2032 
2033        between_bb:
2034 	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2035 
2036      second_loop:
2037        header_b:
2038 	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2039 				 or with i_2 if no LCSSA phi is created
2040 				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2041 	 ...
2042 	 i_4 = i_3 + 1;
2043 	 if (cond_b)
2044 	   goto latch_b;
2045 	 else
2046 	   goto exit_bb;
2047        latch_b:
2048 	 goto header_b;
2049 
2050        exit_bb:
2051 
2052    This function creates loop-closed SSA form for the first loop and updates
2053    the second loop's PHI nodes by replacing the argument on the incoming
2054    edge with the result of the newly created lcssa PHI nodes.  If
2055    CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed ssa phis will only be
2056    created for non-iv phis of the first loop.
2057 
2058    This function assumes the exit bb of the first loop is the preheader bb
2059    of the second loop, i.e., between_bb in the example code.  With the PHIs
2060    updated, the second loop will execute the remaining iterations of the first.  */
2061 
2062 static void
2063 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2064 				   struct loop *first, struct loop *second,
2065 				   bool create_lcssa_for_iv_phis)
2066 {
2067   gphi_iterator gsi_update, gsi_orig;
2068   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2069 
2070   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2071   edge second_preheader_e = loop_preheader_edge (second);
2072   basic_block between_bb = single_exit (first)->dest;
2073 
2074   gcc_assert (between_bb == second_preheader_e->src);
2075   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2076   /* Either the first loop or the second is the loop to be vectorized.  */
2077   gcc_assert (loop == first || loop == second);
2078 
2079   for (gsi_orig = gsi_start_phis (first->header),
2080        gsi_update = gsi_start_phis (second->header);
2081        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2082        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2083     {
2084       gphi *orig_phi = gsi_orig.phi ();
2085       gphi *update_phi = gsi_update.phi ();
2086 
2087       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2088       /* Generate lcssa PHI node for the first loop.  */
2089       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2090       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
2091 	{
2092 	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2093 	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2094 	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2095 	  arg = new_res;
2096 	}
2097 
2098       /* Update PHI node in the second loop by replacing arg on the loop's
2099 	 incoming edge.  */
2100       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2101     }
2102 }
2103 
2104 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2105    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2106    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2107    appear like below:
2108 
2109        guard_bb:
2110 	 if (cond)
2111 	   goto merge_bb;
2112 	 else
2113 	   goto skip_loop;
2114 
2115      skip_loop:
2116        header_a:
2117 	 i_1 = PHI<i_0, i_2>;
2118 	 ...
2119 	 i_2 = i_1 + 1;
2120 	 if (cond_a)
2121 	   goto latch_a;
2122 	 else
2123 	   goto exit_a;
2124        latch_a:
2125 	 goto header_a;
2126 
2127        exit_a:
2128 	 i_5 = PHI<i_2>;
2129 
2130        merge_bb:
2131 	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2132 
2133      update_loop:
2134        header_b:
2135 	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2136 	 ...
2137 	 i_4 = i_3 + 1;
2138 	 if (cond_b)
2139 	   goto latch_b;
2140 	 else
2141 	   goto exit_bb;
2142        latch_b:
2143 	 goto header_b;
2144 
2145        exit_bb:
2146 
2147    This function creates PHI nodes at merge_bb and replaces the use of i_5
2148    in the update_loop's PHI node with the result of the new PHI node.  */
2149 
2150 static void
2151 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2152 				    struct loop *update_loop,
2153 				    edge guard_edge, edge merge_edge)
2154 {
2155   source_location merge_loc, guard_loc;
2156   edge orig_e = loop_preheader_edge (skip_loop);
2157   edge update_e = loop_preheader_edge (update_loop);
2158   gphi_iterator gsi_orig, gsi_update;
2159 
2160   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2161 	gsi_update = gsi_start_phis (update_loop->header));
2162        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2163        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2164     {
2165       gphi *orig_phi = gsi_orig.phi ();
2166       gphi *update_phi = gsi_update.phi ();
2167 
2168       /* Generate new phi node at merge bb of the guard.  */
2169       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2170       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2171 
2172       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2173 	 args in NEW_PHI for these edges.  */
2174       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2175       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2176       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2177       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2178       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2179       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2180 
2181       /* Update phi in UPDATE_PHI.  */
2182       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2183     }
2184 }
2185 
2186 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2187    This function searches for the corresponding lcssa phi node in the exit
2188    bb of LOOP.  If it is found, return the phi result; otherwise return
2189    NULL.  */
2190 
2191 static tree
2192 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2193 		gphi *lcssa_phi)
2194 {
2195   gphi_iterator gsi;
2196   edge e = single_exit (loop);
2197 
2198   gcc_assert (single_pred_p (e->dest));
2199   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2200     {
2201       gphi *phi = gsi.phi ();
2202       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2203 			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2204 	return PHI_RESULT (phi);
2205     }
2206   return NULL_TREE;
2207 }
2208 
2209 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2210    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2211    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2212    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2213    The CFG looks like:
2214 
2215      loop:
2216        header_a:
2217 	 i_1 = PHI<i_0, i_2>;
2218 	 ...
2219 	 i_2 = i_1 + 1;
2220 	 if (cond_a)
2221 	   goto latch_a;
2222 	 else
2223 	   goto exit_a;
2224        latch_a:
2225 	 goto header_a;
2226 
2227        exit_a:
2228 
2229        guard_bb:
2230 	 if (cond)
2231 	   goto merge_bb;
2232 	 else
2233 	   goto epilog_loop;
2234 
2235        ;; fall_through_bb
2236 
2237      epilog_loop:
2238        header_b:
2239 	 i_3 = PHI<i_2, i_4>;
2240 	 ...
2241 	 i_4 = i_3 + 1;
2242 	 if (cond_b)
2243 	   goto latch_b;
2244 	 else
2245 	   goto merge_bb;
2246        latch_b:
2247 	 goto header_b;
2248 
2249        merge_bb:
2250 	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2251 
2252        exit_bb:
2253 	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2254 
2255    For each name used outside EPILOG (i.e., for each name that has an lcssa
2256    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2257    args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE
2258    is the arg of the original PHI in exit_bb, the arg for GUARD_EDGE is
2259    defined by LOOP and is found in the exit bb of LOOP.  The arg of the
2260    original PHI in exit_bb will also be updated.  */
2261 
2262 static void
2263 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2264 				    edge guard_edge, edge merge_edge)
2265 {
2266   gphi_iterator gsi;
2267   basic_block merge_bb = guard_edge->dest;
2268 
2269   gcc_assert (single_succ_p (merge_bb));
2270   edge e = single_succ_edge (merge_bb);
2271   basic_block exit_bb = e->dest;
2272   gcc_assert (single_pred_p (exit_bb));
2273   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2274 
2275   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2276     {
2277       gphi *update_phi = gsi.phi ();
2278       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2279       /* This loop-closed-phi actually doesn't represent a use out of the
2280 	 loop - the phi arg is a constant.  */
2281       if (TREE_CODE (old_arg) != SSA_NAME)
2282 	continue;
2283 
2284       tree merge_arg = get_current_def (old_arg);
2285       if (!merge_arg)
2286 	merge_arg = old_arg;
2287 
2288       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2289       /* If the var is live after loop but not a reduction, we simply
2290 	 use the old arg.  */
2291       if (!guard_arg)
2292 	guard_arg = old_arg;
2293 
2294       /* Create new phi node in MERGE_BB:  */
2295       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2296       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2297 
2298       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2299 	 the two PHI args in merge_phi for these edges.  */
2300       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2301       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2302 
2303       /* Update the original phi in exit_bb.  */
2304       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2305     }
2306 }
2307 
2308 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2309    the args of its loop-closed ssa PHIs need to be updated.  */
2310 
2311 static void
2312 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2313 {
2314   gphi_iterator gsi;
2315   basic_block exit_bb = single_exit (epilog)->dest;
2316 
2317   gcc_assert (single_pred_p (exit_bb));
2318   edge e = EDGE_PRED (exit_bb, 0);
2319   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2320     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2321 }
2322 
2323 /* Function vect_do_peeling.
2324 
2325    Input:
2326    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2327 
2328        preheader:
2329      LOOP:
2330        header_bb:
2331 	 loop_body
2332 	 if (exit_loop_cond) goto exit_bb
2333 	 else                goto header_bb
2334        exit_bb:
2335 
2336    - NITERS: The number of iterations of the loop.
2337    - NITERSM1: The number of iterations of the loop's latch.
2338    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2339    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2340 			      CHECK_PROFITABILITY is true.
2341    Output:
2342    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2343      iterate after vectorization; see vect_set_loop_condition for details.
2344    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2345      should be set to the number of scalar iterations handled by the
2346      vector loop.  The SSA name is only used on exit from the loop.
2347 
2348    This function peels prolog and epilog from the loop, adds guards skipping
2349    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2350    would look like:
2351 
2352        guard_bb_1:
2353 	 if (prefer_scalar_loop) goto merge_bb_1
2354 	 else                    goto guard_bb_2
2355 
2356        guard_bb_2:
2357          if (skip_prolog) goto merge_bb_2
2358          else             goto prolog_preheader
2359 
2360        prolog_preheader:
2361      PROLOG:
2362        prolog_header_bb:
2363 	 prolog_body
2364 	 if (exit_prolog_cond) goto prolog_exit_bb
2365 	 else                  goto prolog_header_bb
2366        prolog_exit_bb:
2367 
2368        merge_bb_2:
2369 
2370        vector_preheader:
2371      VECTOR LOOP:
2372        vector_header_bb:
2373 	 vector_body
2374 	 if (exit_vector_cond) goto vector_exit_bb
2375 	 else                  goto vector_header_bb
2376        vector_exit_bb:
2377 
2378        guard_bb_3:
2379 	 if (skip_epilog) goto merge_bb_3
2380 	 else             goto epilog_preheader
2381 
2382        merge_bb_1:
2383 
2384        epilog_preheader:
2385      EPILOG:
2386        epilog_header_bb:
2387 	 epilog_body
2388 	 if (exit_epilog_cond) goto merge_bb_3
2389 	 else                  goto epilog_header_bb
2390 
2391        merge_bb_3:
2392 
2393    Note this function peels the prolog and epilog, and adds the
2394    corresponding guards, only when necessary.
2395    Returns created epilogue or NULL.
2396 
2397    TODO: Guard for prefer_scalar_loop should be emitted along with
2398    versioning conditions if loop versioning is needed.  */
2399 
2400 
2401 struct loop *
2402 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2403 		 tree *niters_vector, tree *step_vector,
2404 		 tree *niters_vector_mult_vf_var, int th,
2405 		 bool check_profitability, bool niters_no_overflow)
2406 {
2407   edge e, guard_e;
2408   tree type = TREE_TYPE (niters), guard_cond;
2409   basic_block guard_bb, guard_to;
2410   profile_probability prob_prolog, prob_vector, prob_epilog;
2411   int estimated_vf;
2412   int prolog_peeling = 0;
2413   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2414     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2415 
2416   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2417   poly_uint64 bound_epilog = 0;
2418   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2419       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2420     bound_epilog += vf - 1;
2421   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2422     bound_epilog += 1;
2423   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2424   poly_uint64 bound_scalar = bound_epilog;
2425 
2426   if (!prolog_peeling && !epilog_peeling)
2427     return NULL;
2428 
2429   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2430   estimated_vf = vect_vf_for_cost (loop_vinfo);
2431   if (estimated_vf == 2)
2432     estimated_vf = 3;
2433   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2434 			.apply_scale (estimated_vf - 1, estimated_vf);
2435 
2436   struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2437   struct loop *first_loop = loop;
2438   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2439   create_lcssa_for_virtual_phi (loop);
2440   update_ssa (TODO_update_ssa_only_virtuals);
2441 
2442   if (MAY_HAVE_DEBUG_BIND_STMTS)
2443     {
2444       gcc_assert (!adjust_vec.exists ());
2445       adjust_vec.create (32);
2446     }
2447   initialize_original_copy_tables ();
2448 
2449   /* Record the anchor bb at which the guard should be placed if the scalar
2450      loop might be preferred.  */
2451   basic_block anchor = loop_preheader_edge (loop)->src;
2452 
2453   /* Generate the number of iterations for the prolog loop.  We do this here
2454      so that we can also get the upper bound on the number of iterations.  */
2455   tree niters_prolog;
2456   int bound_prolog = 0;
2457   if (prolog_peeling)
2458     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2459 						 &bound_prolog);
2460   else
2461     niters_prolog = build_int_cst (type, 0);
2462 
2463   /* Prolog loop may be skipped.  */
2464   bool skip_prolog = (prolog_peeling != 0);
2465   /* Skip to epilog if scalar loop may be preferred.  It's only needed
2466      when we peel for epilog loop and when it hasn't been checked with
2467      loop versioning.  */
2468   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2469 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2470 				  bound_prolog + bound_epilog)
2471 		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2472   /* Epilog loop must be executed if the number of iterations for epilog
2473      loop is known at compile time, otherwise we need to add a check at
2474      the end of vector loop and skip to the end of epilog loop.  */
2475   bool skip_epilog = (prolog_peeling < 0
2476 		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2477 		      || !vf.is_constant ());
2478   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2479   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2480     skip_epilog = false;
2481 
2482   if (skip_vector)
2483     {
2484       split_edge (loop_preheader_edge (loop));
2485 
2486       /* Due to the order in which we peel prolog and epilog, we first
2487 	 propagate probability to the whole loop.  The purpose is to
2488 	 avoid adjusting probabilities of both prolog and vector loops
2489 	 separately.  Note in this case, the probability of epilog loop
2490 	 needs to be scaled back later.  */
2491       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2492       if (prob_vector.initialized_p ())
2493 	{
2494 	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2495 	  scale_loop_profile (loop, prob_vector, 0);
2496 	}
2497     }
2498 
2499   source_location loop_loc = find_loop_location (loop);
2500   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2501   if (prolog_peeling)
2502     {
2503       e = loop_preheader_edge (loop);
2504       if (!slpeel_can_duplicate_loop_p (loop, e))
2505 	{
2506 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2507 			   "loop can't be duplicated to preheader edge.\n");
2508 	  gcc_unreachable ();
2509 	}
2510       /* Peel prolog and put it on preheader edge of loop.  */
2511       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2512       if (!prolog)
2513 	{
2514 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2515 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2516 	  gcc_unreachable ();
2517 	}
2518       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2519       first_loop = prolog;
2520       reset_original_copy_tables ();
2521 
2522       /* Update the number of iterations for prolog loop.  */
2523       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2524       vect_set_loop_condition (prolog, NULL, niters_prolog,
2525 			       step_prolog, NULL_TREE, false);
2526 
2527       /* Skip the prolog loop.  */
2528       if (skip_prolog)
2529 	{
2530 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2531 				    niters_prolog, build_int_cst (type, 0));
2532 	  guard_bb = loop_preheader_edge (prolog)->src;
2533 	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2534 	  guard_to = split_edge (loop_preheader_edge (loop));
2535 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2536 					   guard_to, guard_bb,
2537 					   prob_prolog.invert (),
2538 					   irred_flag);
2539 	  e = EDGE_PRED (guard_to, 0);
2540 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2541 	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2542 
2543 	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2544 	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
2545 	}
2546       /* Update init address of DRs.  */
2547       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2548       /* Update niters for vector loop.  */
2549       LOOP_VINFO_NITERS (loop_vinfo)
2550 	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2551       LOOP_VINFO_NITERSM1 (loop_vinfo)
2552 	= fold_build2 (MINUS_EXPR, type,
2553 		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2554       bool new_var_p = false;
2555       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2556       /* It's guaranteed that the loop's iteration count before vectorization
2557 	 is at least VF, so set range information for the newly generated var.  */
2558       if (new_var_p)
2559 	set_range_info (niters, VR_RANGE,
2560 			wi::to_wide (build_int_cst (type, vf)),
2561 			wi::to_wide (TYPE_MAX_VALUE (type)));
2562 
2563       /* Prolog iterates at most bound_prolog times, latch iterates at
2564 	 most bound_prolog - 1 times.  */
2565       record_niter_bound (prolog, bound_prolog - 1, false, true);
2566       delete_update_ssa ();
2567       adjust_vec_debug_stmts ();
2568       scev_reset ();
2569     }
2570 
2571   if (epilog_peeling)
2572     {
2573       e = single_exit (loop);
2574       if (!slpeel_can_duplicate_loop_p (loop, e))
2575 	{
2576 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2577 			   "loop can't be duplicated to exit edge.\n");
2578 	  gcc_unreachable ();
2579 	}
2580       /* Peel epilog and put it on exit edge of loop.  */
2581       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2582       if (!epilog)
2583 	{
2584 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2585 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2586 	  gcc_unreachable ();
2587 	}
2588       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2589 
2590       /* Scalar version loop may be preferred.  In this case, add guard
2591 	 and skip to epilog.  Note this only happens when the number of
2592 	 iterations of loop is unknown at compile time, otherwise this
2593 	 won't be vectorized.  */
2594       if (skip_vector)
2595 	{
2596 	  /* An additional epilogue iteration is peeled if a gap exists.  */
2597 	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2598 						bound_prolog, bound_epilog,
2599 						th, &bound_scalar,
2600 						check_profitability);
2601 	  /* Build guard against NITERSM1 since NITERS may overflow.  */
2602 	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2603 	  guard_bb = anchor;
2604 	  guard_to = split_edge (loop_preheader_edge (epilog));
2605 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2606 					   guard_to, guard_bb,
2607 					   prob_vector.invert (),
2608 					   irred_flag);
2609 	  e = EDGE_PRED (guard_to, 0);
2610 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2611 	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2612 
2613 	  /* Simply propagate profile info from guard_bb to guard_to which is
2614 	     a merge point of control flow.  */
2615 	  guard_to->count = guard_bb->count;
2616 
2617 	  /* Scale probability of epilog loop back.
2618 	     FIXME: We should avoid scaling down and back up.  Profile may
2619 	     get lost if we scale down to 0.  */
2620 	  basic_block *bbs = get_loop_body (epilog);
2621 	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2622 	    bbs[i]->count = bbs[i]->count.apply_scale
2623 				 (bbs[i]->count,
2624 				  bbs[i]->count.apply_probability
2625 				    (prob_vector));
2626 	  free (bbs);
2627 	}
2628 
2629       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2630       tree niters_vector_mult_vf;
2631       /* If the loop is peeled a non-zero constant number of times, niters now
2632 	 refers to orig_niters - prolog_peeling; it won't overflow even if
2633 	 orig_niters overflows.  */
2634       niters_no_overflow |= (prolog_peeling > 0);
2635       vect_gen_vector_loop_niters (loop_vinfo, niters,
2636 				   niters_vector, step_vector,
2637 				   niters_no_overflow);
2638       if (!integer_onep (*step_vector))
2639 	{
2640 	  /* On exit from the loop we will have an easy way of calculating
2641 	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2642 	     until then.  */
2643 	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2644 	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2645 	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
2646 	}
2647       else
2648 	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2649 					     &niters_vector_mult_vf);
2650       /* Update IVs of original loop as if they were advanced by
2651 	 niters_vector_mult_vf steps.  */
2652       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2653       edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2654       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2655 					update_e);
2656 
2657       if (skip_epilog)
2658 	{
2659 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2660 				    niters, niters_vector_mult_vf);
2661 	  guard_bb = single_exit (loop)->dest;
2662 	  guard_to = split_edge (single_exit (epilog));
2663 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2664 					   skip_vector ? anchor : guard_bb,
2665 					   prob_epilog.invert (),
2666 					   irred_flag);
2667 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2668 					      single_exit (epilog));
2669 	  /* Only need to handle basic block before epilog loop if it's not
2670 	     the guard_bb, which is the case when skip_vector is true.  */
2671 	  if (guard_bb != bb_before_epilog)
2672 	    {
2673 	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2674 
2675 	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2676 	    }
2677 	  scale_loop_profile (epilog, prob_epilog, 0);
2678 	}
2679       else
2680 	slpeel_update_phi_nodes_for_lcssa (epilog);
2681 
2682       unsigned HOST_WIDE_INT bound;
2683       if (bound_scalar.is_constant (&bound))
2684 	{
2685 	  gcc_assert (bound != 0);
2686 	  /* -1 to convert loop iterations to latch iterations.  */
2687 	  record_niter_bound (epilog, bound - 1, false, true);
2688 	}
2689 
2690       delete_update_ssa ();
2691       adjust_vec_debug_stmts ();
2692       scev_reset ();
2693     }
2694   adjust_vec.release ();
2695   free_original_copy_tables ();
2696 
2697   return epilog;
2698 }
2699 
2700 /* Function vect_create_cond_for_niters_checks.
2701 
2702    Create a conditional expression that represents the run-time checks for
2703    the loop's niter.  The loop is guaranteed to terminate if the run-time
2704    checks hold.
2705 
2706    Input:
2707    COND_EXPR  - input conditional expression.  The new condition will be chained
2708 		with it by a logical AND operation.  If it is NULL, the niters
2709 		condition is used on its own.
2710    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
2711 		under which the niter analysis is valid.
2712 
2713    Output:
2714    COND_EXPR - conditional expression.
2715 
2716    The returned COND_EXPR is the conditional expression to be used in the
2717    if statement that controls which version of the loop gets executed at
2718    runtime.  */
2719 
2720 static void
2721 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2722 {
2723   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2724 
2725   if (*cond_expr)
2726     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2727 			      *cond_expr, part_cond_expr);
2728   else
2729     *cond_expr = part_cond_expr;
2730 }
2731 
2732 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2733    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
2734 
2735 static void
2736 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2737 {
2738   if (*cond_expr)
2739     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2740 			      *cond_expr, part_cond_expr);
2741   else
2742     *cond_expr = part_cond_expr;
2743 }
2744 
2745 /* Function vect_create_cond_for_align_checks.
2746 
2747    Create a conditional expression that represents the alignment checks for
2748    all of data references (array element references) whose alignment must be
2749    checked at runtime.
2750 
2751    Input:
2752    COND_EXPR  - input conditional expression.  New conditions will be chained
2753                 with logical AND operation.
2754    LOOP_VINFO - two fields of the loop information are used.
2755                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2756                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2757 
2758    Output:
2759    COND_EXPR_STMT_LIST - statements needed to construct the conditional
2760                          expression.
2761    The returned value is the conditional expression to be used in the if
2762    statement that controls which version of the loop gets executed at runtime.
2763 
2764    The algorithm makes two assumptions:
2765      1) The number of bytes "n" in a vector is a power of 2.
2766      2) An address "a" is aligned if a%n is zero and that this
2767         test can be done as a&(n-1) == 0.  For example, for 16
2768         byte vectors the test is a&0xf == 0.  */
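/* For example (hypothetical addresses): for two data references whose
   first-vector addresses are 0x1010 and 0x2008, and a 16-byte alignment
   mask of 0xf, the generated condition evaluates as

     ((0x1010 | 0x2008) & 0xf) == 0  -->  (0x8 == 0)  -->  false

   so the unvectorized version of the loop is selected at run time.  */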
2769 
2770 static void
2771 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2772                                    tree *cond_expr,
2773 				   gimple_seq *cond_expr_stmt_list)
2774 {
2775   vec<gimple *> may_misalign_stmts
2776     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2777   gimple *ref_stmt;
2778   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2779   tree mask_cst;
2780   unsigned int i;
2781   tree int_ptrsize_type;
2782   char tmp_name[20];
2783   tree or_tmp_name = NULL_TREE;
2784   tree and_tmp_name;
2785   gimple *and_stmt;
2786   tree ptrsize_zero;
2787   tree part_cond_expr;
2788 
2789   /* Check that mask is one less than a power of 2, i.e., mask is
2790      all zeros followed by all ones.  */
2791   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2792 
2793   int_ptrsize_type = signed_type_for (ptr_type_node);
2794 
2795   /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2796      of the first vector of the i'th data reference.  */
2797 
2798   FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
2799     {
2800       gimple_seq new_stmt_list = NULL;
2801       tree addr_base;
2802       tree addr_tmp_name;
2803       tree new_or_tmp_name;
2804       gimple *addr_stmt, *or_stmt;
2805       stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
2806       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2807       bool negative = tree_int_cst_compare
2808 	(DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
2809       tree offset = negative
2810 	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2811 
2812       /* create: addr_tmp = (int)(address_of_first_vector) */
2813       addr_base =
2814 	vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
2815 					      offset);
2816       if (new_stmt_list != NULL)
2817 	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2818 
2819       sprintf (tmp_name, "addr2int%d", i);
2820       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2821       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2822       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2823 
2824       /* The addresses are ORed together.  */
2825 
2826       if (or_tmp_name != NULL_TREE)
2827         {
2828           /* create: or_tmp = or_tmp | addr_tmp */
2829           sprintf (tmp_name, "orptrs%d", i);
2830 	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2831 	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2832 					 or_tmp_name, addr_tmp_name);
2833 	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2834           or_tmp_name = new_or_tmp_name;
2835         }
2836       else
2837         or_tmp_name = addr_tmp_name;
2838 
2839     } /* end for i */
2840 
2841   mask_cst = build_int_cst (int_ptrsize_type, mask);
2842 
2843   /* create: and_tmp = or_tmp & mask  */
2844   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2845 
2846   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2847 				  or_tmp_name, mask_cst);
2848   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2849 
2850   /* Make and_tmp the left operand of the conditional test against zero.
2851      If and_tmp has a nonzero bit then some address is unaligned.  */
2852   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2853   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2854 				and_tmp_name, ptrsize_zero);
2855   chain_cond_expr (cond_expr, part_cond_expr);
2856 }
2857 
2858 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2859    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2860    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2861    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
2862 
2863 static void
2864 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2865 {
2866   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2867   unsigned int i;
2868   vec_object_pair *pair;
2869   FOR_EACH_VEC_ELT (pairs, i, pair)
2870     {
2871       tree addr1 = build_fold_addr_expr (pair->first);
2872       tree addr2 = build_fold_addr_expr (pair->second);
2873       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2874 					 addr1, addr2);
2875       chain_cond_expr (cond_expr, part_cond_expr);
2876     }
2877 }
2878 
2879 /* Create an expression that is true when all lower-bound conditions for
2880    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
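/* As an illustrative sketch of the signed case handled below: for a signed
   EXPR with a minimum value of 4, the emitted test is

     (unsigned) EXPR + 3 >= 7

   which, by modular arithmetic, holds exactly when EXPR >= 4 or EXPR <= -4,
   i.e. when the magnitude of EXPR meets the bound.  */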
2881 
2882 static void
2883 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2884 {
2885   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2886   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2887     {
2888       tree expr = lower_bounds[i].expr;
2889       tree type = unsigned_type_for (TREE_TYPE (expr));
2890       expr = fold_convert (type, expr);
2891       poly_uint64 bound = lower_bounds[i].min_value;
2892       if (!lower_bounds[i].unsigned_p)
2893 	{
2894 	  expr = fold_build2 (PLUS_EXPR, type, expr,
2895 			      build_int_cstu (type, bound - 1));
2896 	  bound += bound - 1;
2897 	}
2898       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2899 					 build_int_cstu (type, bound));
2900       chain_cond_expr (cond_expr, part_cond_expr);
2901     }
2902 }
2903 
2904 /* Function vect_create_cond_for_alias_checks.
2905 
2906    Create a conditional expression that represents the run-time checks for
2907    overlapping of address ranges represented by a list of data references
2908    relations passed as input.
2909 
2910    Input:
2911    COND_EXPR  - input conditional expression.  New conditions will be chained
2912                 with logical AND operation.  If it is NULL, the alias checks
2913                 alone form the resulting condition.
2914    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2915 	        to be checked.
2916 
2917    Output:
2918    COND_EXPR - conditional expression.
2919 
2920    The returned COND_EXPR is the conditional expression to be used in the if
2921    statement that controls which version of the loop gets executed at runtime.
2922 */
2923 
2924 void
2925 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2926 {
2927   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2928     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2929 
2930   if (comp_alias_ddrs.is_empty ())
2931     return;
2932 
2933   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2934 			       &comp_alias_ddrs, cond_expr);
2935   if (dump_enabled_p ())
2936     dump_printf_loc (MSG_NOTE, vect_location,
2937 		     "created %u versioning for alias checks.\n",
2938 		     comp_alias_ddrs.length ());
2939 }
2940 
2941 
2942 /* Function vect_loop_versioning.
2943 
2944    If the loop has data references that may or may not be aligned, and/or
2945    has data reference relations whose independence was not proven, then
2946    two versions of the loop need to be generated, one which is vectorized
2947    and one which isn't.  A test is then generated to control which of the
2948    loops is executed.  The test checks the alignment of all of the
2949    data references that may or may not be aligned.  An additional
2950    sequence of runtime tests is generated for each pair of DDRs whose
2951    independence was not proven.  The vectorized version of the loop is
2952    executed only if both the alias and alignment tests pass.
2953 
2954    The test generated to check which version of the loop is executed
2955    is modified to also check for profitability as indicated by the
2956    cost model threshold TH.
2957 
   The versioning preconditions are gathered in a local COND_EXPR (with
   any statements needed to compute them in COND_EXPR_STMT_LIST) and
   emitted in the basic block that selects between the two versions.  */
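
/* As a rough source-level illustration only (not the exact GIMPLE form),
   versioning a loop such as:

        for (i = 0; i < n; i++)
          a[i] = b[i] + c[i];

   produces:

        if (<alignment/alias/niters checks> && <cost-model threshold met>)
          for (i = 0; i < n; i++)       <-- version to be vectorized
            a[i] = b[i] + c[i];
        else
          for (i = 0; i < n; i++)       <-- scalar fallback
            a[i] = b[i] + c[i];  */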
2960 
2961 void
2962 vect_loop_versioning (loop_vec_info loop_vinfo,
2963 		      unsigned int th, bool check_profitability,
2964 		      poly_uint64 versioning_threshold)
2965 {
2966   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2967   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2968   basic_block condition_bb;
2969   gphi_iterator gsi;
2970   gimple_stmt_iterator cond_exp_gsi;
2971   basic_block merge_bb;
2972   basic_block new_exit_bb;
2973   edge new_exit_e, e;
2974   gphi *orig_phi, *new_phi;
2975   tree cond_expr = NULL_TREE;
2976   gimple_seq cond_expr_stmt_list = NULL;
2977   tree arg;
2978   profile_probability prob = profile_probability::likely ();
2979   gimple_seq gimplify_stmt_list = NULL;
2980   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2981   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2982   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2983   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2984 
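  /* SCALAR_LOOP_ITERS is the number of loop iterations minus one
     (LOOP_VINFO_NITERSM1), so checking it against TH - 1 here, and
     against VERSIONING_THRESHOLD - 1 below, implements the tests
     NITERS >= TH and NITERS >= VERSIONING_THRESHOLD.  */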
2985   if (check_profitability)
2986     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2987 			     build_int_cst (TREE_TYPE (scalar_loop_iters),
2988 					    th - 1));
2989   if (maybe_ne (versioning_threshold, 0U))
2990     {
2991       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2992 			       build_int_cst (TREE_TYPE (scalar_loop_iters),
2993 					      versioning_threshold - 1));
2994       if (cond_expr)
2995 	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2996 				 expr, cond_expr);
2997       else
2998 	cond_expr = expr;
2999     }
3000 
3001   if (version_niter)
3002     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3003 
3004   if (cond_expr)
3005     cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3006 					is_gimple_condexpr, NULL_TREE);
3007 
3008   if (version_align)
3009     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3010 				       &cond_expr_stmt_list);
3011 
3012   if (version_alias)
3013     {
3014       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3015       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3016       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3017     }
3018 
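  /* Gimplify the complete versioning condition; any statements needed to
     compute it are accumulated in COND_EXPR_STMT_LIST and inserted into
     CONDITION_BB at the end of this function.  */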
3019   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3020 				      &gimplify_stmt_list,
3021 				      is_gimple_condexpr, NULL_TREE);
3022   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3023 
3024   initialize_original_copy_tables ();
3025   if (scalar_loop)
3026     {
3027       edge scalar_e;
3028       basic_block preheader, scalar_preheader;
3029 
3030       /* We don't want to scale SCALAR_LOOP's frequencies, we need to
3031 	 scale LOOP's frequencies instead.  */
3032       nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3033 			    prob, prob.invert (), prob, prob.invert (), true);
3034       scale_loop_frequencies (loop, prob);
3035       /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3036 	 while we need to move it above LOOP's preheader.  */
3037       e = loop_preheader_edge (loop);
3038       scalar_e = loop_preheader_edge (scalar_loop);
3039       gcc_assert (empty_block_p (e->src)
3040 		  && single_pred_p (e->src));
3041       gcc_assert (empty_block_p (scalar_e->src)
3042 		  && single_pred_p (scalar_e->src));
3043       gcc_assert (single_pred_p (condition_bb));
3044       preheader = e->src;
3045       scalar_preheader = scalar_e->src;
3046       scalar_e = find_edge (condition_bb, scalar_preheader);
3047       e = single_pred_edge (preheader);
3048       redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3049 				      scalar_preheader);
3050       redirect_edge_and_branch_force (scalar_e, preheader);
3051       redirect_edge_and_branch_force (e, condition_bb);
3052       set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3053 			       single_pred (condition_bb));
3054       set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3055 			       single_pred (scalar_preheader));
3056       set_immediate_dominator (CDI_DOMINATORS, preheader,
3057 			       condition_bb);
3058     }
3059   else
3060     nloop = loop_version (loop, cond_expr, &condition_bb,
3061 			  prob, prob.invert (), prob, prob.invert (), true);
3062 
3063   if (version_niter)
3064     {
      /* The versioned loop could be infinite; we need to clear existing
         niter information that was copied from the original loop.  */
3067       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3068       vect_free_loop_info_assumptions (nloop);
3069       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3070       loop_constraint_set (loop, LOOP_C_INFINITE);
3071     }
3072 
3073   if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
3074       && dump_enabled_p ())
3075     {
3076       if (version_alias)
3077         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3078                          "loop versioned for vectorization because of "
3079 			 "possible aliasing\n");
3080       if (version_align)
3081         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3082                          "loop versioned for vectorization to enhance "
3083 			 "alignment\n");
3084 
3085     }
3086   free_original_copy_tables ();
3087 
3088   /* Loop versioning violates an assumption we try to maintain during
3089      vectorization - that the loop exit block has a single predecessor.
3090      After versioning, the exit block of both loop versions is the same
3091      basic block (i.e. it has two predecessors). Just in order to simplify
3092      following transformations in the vectorizer, we fix this situation
3093      here by adding a new (empty) block on the exit-edge of the loop,
3094      with the proper loop-exit phis to maintain loop-closed-form.
     If loop versioning wasn't done from loop, but scalar_loop instead,
     merge_bb will already have just a single predecessor.  */
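
  /* Illustration only: before the fix both loop versions exit directly
     into MERGE_BB (LOOP exit -> MERGE_BB <- NLOOP exit); after it, LOOP's
     exit goes through the new empty block
     (LOOP exit -> NEW_EXIT_BB -> MERGE_BB), so LOOP's exit destination
     again has a single predecessor.  */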
3097 
3098   merge_bb = single_exit (loop)->dest;
3099   if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3100     {
3101       gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3102       new_exit_bb = split_edge (single_exit (loop));
3103       new_exit_e = single_exit (loop);
3104       e = EDGE_SUCC (new_exit_bb, 0);
3105 
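      /* For each loop-closed PHI in MERGE_BB, create a corresponding PHI
         in NEW_EXIT_BB that receives the value coming from LOOP's exit,
         and make the original PHI take that new result along edge E.  */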
3106       for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3107 	{
3108 	  tree new_res;
3109 	  orig_phi = gsi.phi ();
3110 	  new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3111 	  new_phi = create_phi_node (new_res, new_exit_bb);
3112 	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3113 	  add_phi_arg (new_phi, arg, new_exit_e,
3114 		       gimple_phi_arg_location_from_edge (orig_phi, e));
3115 	  adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3116 	}
3117     }
3118 
3119   /* End loop-exit-fixes after versioning.  */
3120 
3121   if (cond_expr_stmt_list)
3122     {
3123       cond_exp_gsi = gsi_last_bb (condition_bb);
3124       gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3125 			     GSI_SAME_STMT);
3126     }
3127   update_ssa (TODO_update_ssa);
3128 }
3129