1 /* Vectorizer Specific Loop Manipulations
2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53 
54 /*************************************************************************
55   Simple Loop Peeling Utilities
56 
57   Utilities to support loop peeling for vectorization purposes.
58  *************************************************************************/
59 
60 
61 /* Renames the use *OP_P.  */
62 
63 static void
rename_use_op(use_operand_p op_p)64 rename_use_op (use_operand_p op_p)
65 {
66   tree new_name;
67 
68   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69     return;
70 
71   new_name = get_current_def (USE_FROM_PTR (op_p));
72 
73   /* Something defined outside of the loop.  */
74   if (!new_name)
75     return;
76 
77   /* An ordinary ssa name defined in the loop.  */
78 
79   SET_USE (op_p, new_name);
80 }
81 
82 
83 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
84    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85    true.  */
86 
87 static void
rename_variables_in_bb(basic_block bb,bool rename_from_outer_loop)88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90   gimple *stmt;
91   use_operand_p use_p;
92   ssa_op_iter iter;
93   edge e;
94   edge_iterator ei;
95   class loop *loop = bb->loop_father;
96   class loop *outer_loop = NULL;
97 
98   if (rename_from_outer_loop)
99     {
100       gcc_assert (loop);
101       outer_loop = loop_outer (loop);
102     }
103 
104   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105        gsi_next (&gsi))
106     {
107       stmt = gsi_stmt (gsi);
108       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 	rename_use_op (use_p);
110     }
111 
112   FOR_EACH_EDGE (e, ei, bb->preds)
113     {
114       if (!flow_bb_inside_loop_p (loop, e->src))
115 	{
116 	  if (!rename_from_outer_loop)
117 	    continue;
118 	  if (e->src != outer_loop->header)
119 	    {
120 	      if (outer_loop->inner->next)
121 		{
122 		  /* If outer_loop has 2 inner loops, allow there to
123 		     be an extra basic block which decides which of the
124 		     two loops to use using LOOP_VECTORIZED.  */
125 		  if (!single_pred_p (e->src)
126 		      || single_pred (e->src) != outer_loop->header)
127 		    continue;
128 		}
129 	    }
130 	}
131       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 	   gsi_next (&gsi))
133         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134     }
135 }
136 
137 
138 struct adjust_info
139 {
140   tree from, to;
141   basic_block bb;
142 };
143 
144 /* A stack of values to be adjusted in debug stmts.  We have to
145    process them LIFO, so that the closest substitution applies.  If we
146    processed them FIFO, without the stack, we might substitute uses
147    with a PHI DEF that would soon become non-dominant, and when we got
148    to the suitable one, it wouldn't have anything to substitute any
149    more.  */
150 static vec<adjust_info, va_heap> adjust_vec;
151 
152 /* Adjust any debug stmts that referenced AI->from values to use the
153    loop-closed AI->to, if the references are dominated by AI->bb and
154    not by the definition of AI->from.  */
155 
156 static void
adjust_debug_stmts_now(adjust_info * ai)157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159   basic_block bbphi = ai->bb;
160   tree orig_def = ai->from;
161   tree new_def = ai->to;
162   imm_use_iterator imm_iter;
163   gimple *stmt;
164   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165 
166   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167 
168   /* Adjust any debug stmts that held onto non-loop-closed
169      references.  */
170   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171     {
172       use_operand_p use_p;
173       basic_block bbuse;
174 
175       if (!is_gimple_debug (stmt))
176 	continue;
177 
178       gcc_assert (gimple_debug_bind_p (stmt));
179 
180       bbuse = gimple_bb (stmt);
181 
182       if ((bbuse == bbphi
183 	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 	  && !(bbuse == bbdef
185 	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 	{
187 	  if (new_def)
188 	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 	      SET_USE (use_p, new_def);
190 	  else
191 	    {
192 	      gimple_debug_bind_reset_value (stmt);
193 	      update_stmt (stmt);
194 	    }
195 	}
196     }
197 }
198 
199 /* Adjust debug stmts as scheduled before.  */
200 
201 static void
adjust_vec_debug_stmts(void)202 adjust_vec_debug_stmts (void)
203 {
204   if (!MAY_HAVE_DEBUG_BIND_STMTS)
205     return;
206 
207   gcc_assert (adjust_vec.exists ());
208 
209   while (!adjust_vec.is_empty ())
210     {
211       adjust_debug_stmts_now (&adjust_vec.last ());
212       adjust_vec.pop ();
213     }
214 }
215 
216 /* Adjust any debug stmts that referenced FROM values to use the
217    loop-closed TO, if the references are dominated by BB and not by
218    the definition of FROM.  If adjust_vec is non-NULL, adjustments
219    will be postponed until adjust_vec_debug_stmts is called.  */
220 
221 static void
adjust_debug_stmts(tree from,tree to,basic_block bb)222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224   adjust_info ai;
225 
226   if (MAY_HAVE_DEBUG_BIND_STMTS
227       && TREE_CODE (from) == SSA_NAME
228       && ! SSA_NAME_IS_DEFAULT_DEF (from)
229       && ! virtual_operand_p (from))
230     {
231       ai.from = from;
232       ai.to = to;
233       ai.bb = bb;
234 
235       if (adjust_vec.exists ())
236 	adjust_vec.safe_push (ai);
237       else
238 	adjust_debug_stmts_now (&ai);
239     }
240 }
241 
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243    to adjust any debug stmts that referenced the old phi arg,
244    presumably non-loop-closed references left over from other
245    transformations.  */
246 
247 static void
adjust_phi_and_debug_stmts(gimple * update_phi,edge e,tree new_def)248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251 
252   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253 
254   if (MAY_HAVE_DEBUG_BIND_STMTS)
255     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 			gimple_bb (update_phi));
257 }
258 
259 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
260    that the control should have during the first iteration and NEXT_CTRL is the
261    value that it should have on subsequent iterations.  */
262 
263 static void
vect_set_loop_control(class loop * loop,tree ctrl,tree init_ctrl,tree next_ctrl)264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 		       tree next_ctrl)
266 {
267   gphi *phi = create_phi_node (ctrl, loop->header);
268   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271 
272 /* Add SEQ to the end of LOOP's preheader block.  */
273 
274 static void
add_preheader_seq(class loop * loop,gimple_seq seq)275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277   if (seq)
278     {
279       edge pe = loop_preheader_edge (loop);
280       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281       gcc_assert (!new_bb);
282     }
283 }
284 
285 /* Add SEQ to the beginning of LOOP's header block.  */
286 
287 static void
add_header_seq(class loop * loop,gimple_seq seq)288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290   if (seq)
291     {
292       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294     }
295 }
296 
297 /* Return true if the target can interleave elements of two vectors.
298    OFFSET is 0 if the first half of the vectors should be interleaved
299    or 1 if the second half should.  When returning true, store the
300    associated permutation in INDICES.  */
301 
302 static bool
interleave_supported_p(vec_perm_indices * indices,tree vectype,unsigned int offset)303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 			unsigned int offset)
305 {
306   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307   poly_uint64 base = exact_div (nelts, 2) * offset;
308   vec_perm_builder sel (nelts, 2, 3);
309   for (unsigned int i = 0; i < 3; ++i)
310     {
311       sel.quick_push (base + i);
312       sel.quick_push (base + i + nelts);
313     }
314   indices->new_vector (sel, 2, nelts);
315   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317 
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319    in SRC_RGM, given that the former has twice as many masks as the
320    latter.  Return true on success, adding any new statements to SEQ.  */
321 
322 static bool
vect_maybe_permute_loop_masks(gimple_seq * seq,rgroup_controls * dest_rgm,rgroup_controls * src_rgm)323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 			       rgroup_controls *src_rgm)
325 {
326   tree src_masktype = src_rgm->type;
327   tree dest_masktype = dest_rgm->type;
328   machine_mode src_mode = TYPE_MODE (src_masktype);
329   insn_code icode1, icode2;
330   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331       && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 				  src_mode)) != CODE_FOR_nothing
333       && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 				  src_mode)) != CODE_FOR_nothing)
335     {
336       /* Unpacking the source masks gives at least as many mask bits as
337 	 we need.  We can then VIEW_CONVERT any excess bits away.  */
338       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
342 	{
343 	  tree src = src_rgm->controls[i / 2];
344 	  tree dest = dest_rgm->controls[i];
345 	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 			    ? VEC_UNPACK_HI_EXPR
347 			    : VEC_UNPACK_LO_EXPR);
348 	  gassign *stmt;
349 	  if (dest_masktype == unpack_masktype)
350 	    stmt = gimple_build_assign (dest, code, src);
351 	  else
352 	    {
353 	      tree temp = make_ssa_name (unpack_masktype);
354 	      stmt = gimple_build_assign (temp, code, src);
355 	      gimple_seq_add_stmt (seq, stmt);
356 	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 					  build1 (VIEW_CONVERT_EXPR,
358 						  dest_masktype, temp));
359 	    }
360 	  gimple_seq_add_stmt (seq, stmt);
361 	}
362       return true;
363     }
364   vec_perm_indices indices[2];
365   if (dest_masktype == src_masktype
366       && interleave_supported_p (&indices[0], src_masktype, 0)
367       && interleave_supported_p (&indices[1], src_masktype, 1))
368     {
369       /* The destination requires twice as many mask bits as the source, so
370 	 we can use interleaving permutes to double up the number of bits.  */
371       tree masks[2];
372       for (unsigned int i = 0; i < 2; ++i)
373 	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
375 	{
376 	  tree src = src_rgm->controls[i / 2];
377 	  tree dest = dest_rgm->controls[i];
378 	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 					      src, src, masks[i & 1]);
380 	  gimple_seq_add_stmt (seq, stmt);
381 	}
382       return true;
383     }
384   return false;
385 }
386 
387 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
388    for all the rgroup controls in RGC and return a control that is nonzero
389    when the loop needs to iterate.  Add any new preheader statements to
390    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
391 
392    RGC belongs to loop LOOP.  The loop originally iterated NITERS
393    times and has been vectorized according to LOOP_VINFO.
394 
395    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396    starts with NITERS_SKIP dummy iterations of the scalar loop before
397    the real work starts.  The mask elements for these dummy iterations
398    must be 0, to ensure that the extra iterations do not have an effect.
399 
400    It is known that:
401 
402      NITERS * RGC->max_nscalars_per_iter * RGC->factor
403 
404    does not overflow.  However, MIGHT_WRAP_P says whether an induction
405    variable that starts at 0 and has step:
406 
407      VF * RGC->max_nscalars_per_iter * RGC->factor
408 
409    might overflow before hitting a value above:
410 
411      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
412 
413    This means that we cannot guarantee that such an induction variable
414    would ever hit a value that produces a set of all-false masks or zero
415    lengths for RGC.
416 
417    Note: the cost of the code generated by this function is modeled
418    by vect_estimate_min_profitable_iters, so changes here may need
419    corresponding changes there.  */
420 
421 static tree
vect_set_loop_controls_directly(class loop * loop,loop_vec_info loop_vinfo,gimple_seq * preheader_seq,gimple_stmt_iterator loop_cond_gsi,rgroup_controls * rgc,tree niters,tree niters_skip,bool might_wrap_p)422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 				 gimple_seq *preheader_seq,
424 				 gimple_stmt_iterator loop_cond_gsi,
425 				 rgroup_controls *rgc, tree niters,
426 				 tree niters_skip, bool might_wrap_p)
427 {
428   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
429   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
430   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
431 
432   tree ctrl_type = rgc->type;
433   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
434   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
435   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
436   tree length_limit = NULL_TREE;
437   /* For length, we need length_limit to ensure length in range.  */
438   if (!use_masks_p)
439     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
440 
441   /* Calculate the maximum number of item values that the rgroup
442      handles in total, the number that it handles for each iteration
443      of the vector loop, and the number that it should skip during the
444      first iteration of the vector loop.  */
445   tree nitems_total = niters;
446   tree nitems_step = build_int_cst (iv_type, vf);
447   tree nitems_skip = niters_skip;
448   if (nitems_per_iter != 1)
449     {
450       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
451 	 these multiplications don't overflow.  */
452       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
453       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
454       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
455 				   nitems_total, compare_factor);
456       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
457 				  nitems_step, iv_factor);
458       if (nitems_skip)
459 	nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
460 				    nitems_skip, compare_factor);
461     }
462 
463   /* Create an induction variable that counts the number of items
464      processed.  */
465   tree index_before_incr, index_after_incr;
466   gimple_stmt_iterator incr_gsi;
467   bool insert_after;
468   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
469   create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
470 	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
471 
472   tree zero_index = build_int_cst (compare_type, 0);
473   tree test_index, test_limit, first_limit;
474   gimple_stmt_iterator *test_gsi;
475   if (might_wrap_p)
476     {
477       /* In principle the loop should stop iterating once the incremented
478 	 IV reaches a value greater than or equal to:
479 
480 	   NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
481 
482 	 However, there's no guarantee that this addition doesn't overflow
483 	 the comparison type, or that the IV hits a value above it before
484 	 wrapping around.  We therefore adjust the limit down by one
485 	 IV step:
486 
487 	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
488 	   -[infinite-prec] NITEMS_STEP
489 
490 	 and compare the IV against this limit _before_ incrementing it.
491 	 Since the comparison type is unsigned, we actually want the
492 	 subtraction to saturate at zero:
493 
494 	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
495 	   -[sat] NITEMS_STEP
496 
497 	 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
498 
499 	   NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
500 
501 	 where the rightmost subtraction can be done directly in
502 	 COMPARE_TYPE.  */
503       test_index = index_before_incr;
504       tree adjust = gimple_convert (preheader_seq, compare_type,
505 				    nitems_step);
506       if (nitems_skip)
507 	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
508 			       adjust, nitems_skip);
509       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
510 				 nitems_total, adjust);
511       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
512 				 test_limit, adjust);
513       test_gsi = &incr_gsi;
514 
515       /* Get a safe limit for the first iteration.  */
516       if (nitems_skip)
517 	{
518 	  /* The first vector iteration can handle at most NITEMS_STEP
519 	     items.  NITEMS_STEP <= CONST_LIMIT, and adding
520 	     NITEMS_SKIP to that cannot overflow.  */
521 	  tree const_limit = build_int_cst (compare_type,
522 					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
523 					    * nitems_per_iter);
524 	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
525 				      nitems_total, const_limit);
526 	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
527 				      first_limit, nitems_skip);
528 	}
529       else
530 	/* For the first iteration it doesn't matter whether the IV hits
531 	   a value above NITEMS_TOTAL.  That only matters for the latch
532 	   condition.  */
533 	first_limit = nitems_total;
534     }
535   else
536     {
537       /* Test the incremented IV, which will always hit a value above
538 	 the bound before wrapping.  */
539       test_index = index_after_incr;
540       test_limit = nitems_total;
541       if (nitems_skip)
542 	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
543 				   test_limit, nitems_skip);
544       test_gsi = &loop_cond_gsi;
545 
546       first_limit = test_limit;
547     }
548 
549   /* Convert the IV value to the comparison type (either a no-op or
550      a demotion).  */
551   gimple_seq test_seq = NULL;
552   test_index = gimple_convert (&test_seq, compare_type, test_index);
553   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
554 
555   /* Provide a definition of each control in the group.  */
556   tree next_ctrl = NULL_TREE;
557   tree ctrl;
558   unsigned int i;
559   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
560     {
561       /* Previous controls will cover BIAS items.  This control covers the
562 	 next batch.  */
563       poly_uint64 bias = nitems_per_ctrl * i;
564       tree bias_tree = build_int_cst (compare_type, bias);
565 
566       /* See whether the first iteration of the vector loop is known
567 	 to have a full control.  */
568       poly_uint64 const_limit;
569       bool first_iteration_full
570 	= (poly_int_tree_p (first_limit, &const_limit)
571 	   && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
572 
573       /* Rather than have a new IV that starts at BIAS and goes up to
574 	 TEST_LIMIT, prefer to use the same 0-based IV for each control
575 	 and adjust the bound down by BIAS.  */
576       tree this_test_limit = test_limit;
577       if (i != 0)
578 	{
579 	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
580 					  compare_type, this_test_limit,
581 					  bias_tree);
582 	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
583 					  compare_type, this_test_limit,
584 					  bias_tree);
585 	}
586 
587       /* Create the initial control.  First include all items that
588 	 are within the loop limit.  */
589       tree init_ctrl = NULL_TREE;
590       if (!first_iteration_full)
591 	{
592 	  tree start, end;
593 	  if (first_limit == test_limit)
594 	    {
595 	      /* Use a natural test between zero (the initial IV value)
596 		 and the loop limit.  The "else" block would be valid too,
597 		 but this choice can avoid the need to load BIAS_TREE into
598 		 a register.  */
599 	      start = zero_index;
600 	      end = this_test_limit;
601 	    }
602 	  else
603 	    {
604 	      /* FIRST_LIMIT is the maximum number of items handled by the
605 		 first iteration of the vector loop.  Test the portion
606 		 associated with this control.  */
607 	      start = bias_tree;
608 	      end = first_limit;
609 	    }
610 
611 	  if (use_masks_p)
612 	    {
613 	      init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
614 	      gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end);
615 	      gimple_seq_add_stmt (preheader_seq, tmp_stmt);
616 	    }
617 	  else
618 	    {
619 	      init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
620 	      gimple_seq seq = vect_gen_len (init_ctrl, start,
621 					     end, length_limit);
622 	      gimple_seq_add_seq (preheader_seq, seq);
623 	    }
624 	}
625 
626       /* Now AND out the bits that are within the number of skipped
627 	 items.  */
628       poly_uint64 const_skip;
629       if (nitems_skip
630 	  && !(poly_int_tree_p (nitems_skip, &const_skip)
631 	       && known_le (const_skip, bias)))
632 	{
633 	  gcc_assert (use_masks_p);
634 	  tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
635 						    bias_tree, nitems_skip);
636 	  if (init_ctrl)
637 	    init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
638 				      init_ctrl, unskipped_mask);
639 	  else
640 	    init_ctrl = unskipped_mask;
641 	}
642 
643       if (!init_ctrl)
644 	{
645 	  /* First iteration is full.  */
646 	  if (use_masks_p)
647 	    init_ctrl = build_minus_one_cst (ctrl_type);
648 	  else
649 	    init_ctrl = length_limit;
650 	}
651 
652       /* Get the control value for the next iteration of the loop.  */
653       if (use_masks_p)
654 	{
655 	  next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
656 	  gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
657 	  gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
658 	}
659       else
660 	{
661 	  next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 	  gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 					 length_limit);
664 	  gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
665 	}
666 
667       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
668     }
669   return next_ctrl;
670 }
671 
672 /* Set up the iteration condition and rgroup controls for LOOP, given
673    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
674    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
675    the number of iterations of the original scalar loop that should be
676    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
677    for vect_set_loop_condition.
678 
679    Insert the branch-back condition before LOOP_COND_GSI and return the
680    final gcond.  */
681 
682 static gcond *
vect_set_loop_condition_partial_vectors(class loop * loop,loop_vec_info loop_vinfo,tree niters,tree final_iv,bool niters_maybe_zero,gimple_stmt_iterator loop_cond_gsi)683 vect_set_loop_condition_partial_vectors (class loop *loop,
684 					 loop_vec_info loop_vinfo, tree niters,
685 					 tree final_iv, bool niters_maybe_zero,
686 					 gimple_stmt_iterator loop_cond_gsi)
687 {
688   gimple_seq preheader_seq = NULL;
689   gimple_seq header_seq = NULL;
690 
691   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
692   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
693   unsigned int compare_precision = TYPE_PRECISION (compare_type);
694   tree orig_niters = niters;
695 
696   /* Type of the initial value of NITERS.  */
697   tree ni_actual_type = TREE_TYPE (niters);
698   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
699   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
700 
701   /* Convert NITERS to the same size as the compare.  */
702   if (compare_precision > ni_actual_precision
703       && niters_maybe_zero)
704     {
705       /* We know that there is always at least one iteration, so if the
706 	 count is zero then it must have wrapped.  Cope with this by
707 	 subtracting 1 before the conversion and adding 1 to the result.  */
708       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
709       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
710 			     niters, build_minus_one_cst (ni_actual_type));
711       niters = gimple_convert (&preheader_seq, compare_type, niters);
712       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
713 			     niters, build_one_cst (compare_type));
714     }
715   else
716     niters = gimple_convert (&preheader_seq, compare_type, niters);
717 
718   /* Iterate over all the rgroups and fill in their controls.  We could use
719      the first control from any rgroup for the loop condition; here we
720      arbitrarily pick the last.  */
721   tree test_ctrl = NULL_TREE;
722   rgroup_controls *rgc;
723   unsigned int i;
724   auto_vec<rgroup_controls> *controls = use_masks_p
725 					  ? &LOOP_VINFO_MASKS (loop_vinfo)
726 					  : &LOOP_VINFO_LENS (loop_vinfo);
727   FOR_EACH_VEC_ELT (*controls, i, rgc)
728     if (!rgc->controls.is_empty ())
729       {
730 	/* First try using permutes.  This adds a single vector
731 	   instruction to the loop for each mask, but needs no extra
732 	   loop invariants or IVs.  */
733 	unsigned int nmasks = i + 1;
734 	if (use_masks_p && (nmasks & 1) == 0)
735 	  {
736 	    rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
737 	    if (!half_rgc->controls.is_empty ()
738 		&& vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
739 	      continue;
740 	  }
741 
742 	/* See whether zero-based IV would ever generate all-false masks
743 	   or zero length before wrapping around.  */
744 	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
745 
746 	/* Set up all controls for this group.  */
747 	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
748 						     &preheader_seq,
749 						     loop_cond_gsi, rgc,
750 						     niters, niters_skip,
751 						     might_wrap_p);
752       }
753 
754   /* Emit all accumulated statements.  */
755   add_preheader_seq (loop, preheader_seq);
756   add_header_seq (loop, header_seq);
757 
758   /* Get a boolean result that tells us whether to iterate.  */
759   edge exit_edge = single_exit (loop);
760   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
761   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
762   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
763 					NULL_TREE, NULL_TREE);
764   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
765 
766   /* The loop iterates (NITERS - 1) / VF + 1 times.
767      Subtract one from this to get the latch count.  */
768   tree step = build_int_cst (compare_type,
769 			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
770   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
771 				       build_minus_one_cst (compare_type));
772   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
773 				     niters_minus_one, step);
774 
775   if (final_iv)
776     {
777       gassign *assign = gimple_build_assign (final_iv, orig_niters);
778       gsi_insert_on_edge_immediate (single_exit (loop), assign);
779     }
780 
781   return cond_stmt;
782 }
783 
784 /* Like vect_set_loop_condition, but handle the case in which the vector
785    loop handles exactly VF scalars per iteration.  */
786 
787 static gcond *
vect_set_loop_condition_normal(class loop * loop,tree niters,tree step,tree final_iv,bool niters_maybe_zero,gimple_stmt_iterator loop_cond_gsi)788 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
789 				tree final_iv, bool niters_maybe_zero,
790 				gimple_stmt_iterator loop_cond_gsi)
791 {
792   tree indx_before_incr, indx_after_incr;
793   gcond *cond_stmt;
794   gcond *orig_cond;
795   edge pe = loop_preheader_edge (loop);
796   edge exit_edge = single_exit (loop);
797   gimple_stmt_iterator incr_gsi;
798   bool insert_after;
799   enum tree_code code;
800   tree niters_type = TREE_TYPE (niters);
801 
802   orig_cond = get_loop_exit_condition (loop);
803   gcc_assert (orig_cond);
804   loop_cond_gsi = gsi_for_stmt (orig_cond);
805 
806   tree init, limit;
807   if (!niters_maybe_zero && integer_onep (step))
808     {
809       /* In this case we can use a simple 0-based IV:
810 
811 	 A:
812 	   x = 0;
813 	   do
814 	     {
815 	       ...
816 	       x += 1;
817 	     }
818 	   while (x < NITERS);  */
819       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
820       init = build_zero_cst (niters_type);
821       limit = niters;
822     }
823   else
824     {
825       /* The following works for all values of NITERS except 0:
826 
827 	 B:
828 	   x = 0;
829 	   do
830 	     {
831 	       ...
832 	       x += STEP;
833 	     }
834 	   while (x <= NITERS - STEP);
835 
836 	 so that the loop continues to iterate if x + STEP - 1 < NITERS
837 	 but stops if x + STEP - 1 >= NITERS.
838 
839 	 However, if NITERS is zero, x never hits a value above NITERS - STEP
840 	 before wrapping around.  There are two obvious ways of dealing with
841 	 this:
842 
843 	 - start at STEP - 1 and compare x before incrementing it
844 	 - start at -1 and compare x after incrementing it
845 
846 	 The latter is simpler and is what we use.  The loop in this case
847 	 looks like:
848 
849 	 C:
850 	   x = -1;
851 	   do
852 	     {
853 	       ...
854 	       x += STEP;
855 	     }
856 	   while (x < NITERS - STEP);
857 
858 	 In both cases the loop limit is NITERS - STEP.  */
859       gimple_seq seq = NULL;
860       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
861       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
862       if (seq)
863 	{
864 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
865 	  gcc_assert (!new_bb);
866 	}
867       if (niters_maybe_zero)
868 	{
869 	  /* Case C.  */
870 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
871 	  init = build_all_ones_cst (niters_type);
872 	}
873       else
874 	{
875 	  /* Case B.  */
876 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
877 	  init = build_zero_cst (niters_type);
878 	}
879     }
880 
881   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
882   create_iv (init, step, NULL_TREE, loop,
883              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
884   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
885 					      true, NULL_TREE, true,
886 					      GSI_SAME_STMT);
887   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
888 				     true, GSI_SAME_STMT);
889 
890   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
891 				 NULL_TREE);
892 
893   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
894 
895   /* Record the number of latch iterations.  */
896   if (limit == niters)
897     /* Case A: the loop iterates NITERS times.  Subtract one to get the
898        latch count.  */
899     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
900 				       build_int_cst (niters_type, 1));
901   else
902     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
903        Subtract one from this to get the latch count.  */
904     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
905 				       limit, step);
906 
907   if (final_iv)
908     {
909       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
910 					     indx_after_incr, init);
911       gsi_insert_on_edge_immediate (single_exit (loop), assign);
912     }
913 
914   return cond_stmt;
915 }
916 
917 /* If we're using fully-masked loops, make LOOP iterate:
918 
919       N == (NITERS - 1) / STEP + 1
920 
921    times.  When NITERS is zero, this is equivalent to making the loop
922    execute (1 << M) / STEP times, where M is the precision of NITERS.
923    NITERS_MAYBE_ZERO is true if this last case might occur.
924 
925    If we're not using fully-masked loops, make LOOP iterate:
926 
927       N == (NITERS - STEP) / STEP + 1
928 
929    times, where NITERS is known to be outside the range [1, STEP - 1].
930    This is equivalent to making the loop execute NITERS / STEP times
931    when NITERS is nonzero and (1 << M) / STEP times otherwise.
932    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
933 
934    If FINAL_IV is nonnull, it is an SSA name that should be set to
935    N * STEP on exit from the loop.
936 
937    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
938 
939 void
vect_set_loop_condition(class loop * loop,loop_vec_info loop_vinfo,tree niters,tree step,tree final_iv,bool niters_maybe_zero)940 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
941 			 tree niters, tree step, tree final_iv,
942 			 bool niters_maybe_zero)
943 {
944   gcond *cond_stmt;
945   gcond *orig_cond = get_loop_exit_condition (loop);
946   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
947 
948   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
949     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
950 							 niters, final_iv,
951 							 niters_maybe_zero,
952 							 loop_cond_gsi);
953   else
954     cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
955 						niters_maybe_zero,
956 						loop_cond_gsi);
957 
958   /* Remove old loop exit test.  */
959   stmt_vec_info orig_cond_info;
960   if (loop_vinfo
961       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
962     loop_vinfo->remove_stmt (orig_cond_info);
963   else
964     gsi_remove (&loop_cond_gsi, true);
965 
966   if (dump_enabled_p ())
967     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
968 		     cond_stmt);
969 }
970 
971 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
972    For all PHI arguments in FROM->dest and TO->dest from those
973    edges ensure that TO->dest PHI arguments have current_def
974    to that in from.  */
975 
976 static void
slpeel_duplicate_current_defs_from_edges(edge from,edge to)977 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
978 {
979   gimple_stmt_iterator gsi_from, gsi_to;
980 
981   for (gsi_from = gsi_start_phis (from->dest),
982        gsi_to = gsi_start_phis (to->dest);
983        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
984     {
985       gimple *from_phi = gsi_stmt (gsi_from);
986       gimple *to_phi = gsi_stmt (gsi_to);
987       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
988       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
989       if (virtual_operand_p (from_arg))
990 	{
991 	  gsi_next (&gsi_from);
992 	  continue;
993 	}
994       if (virtual_operand_p (to_arg))
995 	{
996 	  gsi_next (&gsi_to);
997 	  continue;
998 	}
999       if (TREE_CODE (from_arg) != SSA_NAME)
1000 	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1001       else if (TREE_CODE (to_arg) == SSA_NAME
1002 	       && from_arg != to_arg)
1003 	{
1004 	  if (get_current_def (to_arg) == NULL_TREE)
1005 	    {
1006 	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1007 					      TREE_TYPE (get_current_def
1008 							   (from_arg))));
1009 	      set_current_def (to_arg, get_current_def (from_arg));
1010 	    }
1011 	}
1012       gsi_next (&gsi_from);
1013       gsi_next (&gsi_to);
1014     }
1015 
1016   gphi *from_phi = get_virtual_phi (from->dest);
1017   gphi *to_phi = get_virtual_phi (to->dest);
1018   if (from_phi)
1019     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1020 		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1021 }
1022 
1023 
1024 /* Given LOOP this function generates a new copy of it and puts it
1025    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1026    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1027    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1028    entry or exit of LOOP.  */
1029 
1030 class loop *
slpeel_tree_duplicate_loop_to_edge_cfg(class loop * loop,class loop * scalar_loop,edge e)1031 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1032 					class loop *scalar_loop, edge e)
1033 {
1034   class loop *new_loop;
1035   basic_block *new_bbs, *bbs, *pbbs;
1036   bool at_exit;
1037   bool was_imm_dom;
1038   basic_block exit_dest;
1039   edge exit, new_exit;
1040   bool duplicate_outer_loop = false;
1041 
1042   exit = single_exit (loop);
1043   at_exit = (e == exit);
1044   if (!at_exit && e != loop_preheader_edge (loop))
1045     return NULL;
1046 
1047   if (scalar_loop == NULL)
1048     scalar_loop = loop;
1049 
1050   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1051   pbbs = bbs + 1;
1052   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1053   /* Allow duplication of outer loops.  */
1054   if (scalar_loop->inner)
1055     duplicate_outer_loop = true;
1056   /* Check whether duplication is possible.  */
1057   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1058     {
1059       free (bbs);
1060       return NULL;
1061     }
1062 
1063   /* Generate new loop structure.  */
1064   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1065   duplicate_subloops (scalar_loop, new_loop);
1066 
1067   exit_dest = exit->dest;
1068   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1069 					  exit_dest) == loop->header ?
1070 		 true : false);
1071 
1072   /* Also copy the pre-header, this avoids jumping through hoops to
1073      duplicate the loop entry PHI arguments.  Create an empty
1074      pre-header unconditionally for this.  */
1075   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1076   edge entry_e = single_pred_edge (preheader);
1077   bbs[0] = preheader;
1078   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1079 
1080   exit = single_exit (scalar_loop);
1081   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1082 	    &exit, 1, &new_exit, NULL,
1083 	    at_exit ? loop->latch : e->src, true);
1084   exit = single_exit (loop);
1085   basic_block new_preheader = new_bbs[0];
1086 
1087   /* Before installing PHI arguments make sure that the edges
1088      into them match that of the scalar loop we analyzed.  This
1089      makes sure the SLP tree matches up between the main vectorized
1090      loop and the epilogue vectorized copies.  */
1091   if (single_succ_edge (preheader)->dest_idx
1092       != single_succ_edge (new_bbs[0])->dest_idx)
1093     {
1094       basic_block swap_bb = new_bbs[1];
1095       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1096       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1097       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1098       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1099     }
1100   if (duplicate_outer_loop)
1101     {
1102       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1103       if (loop_preheader_edge (scalar_loop)->dest_idx
1104 	  != loop_preheader_edge (new_inner_loop)->dest_idx)
1105 	{
1106 	  basic_block swap_bb = new_inner_loop->header;
1107 	  gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1108 	  std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1109 	  EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1110 	  EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1111 	}
1112     }
1113 
1114   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1115 
1116   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1117   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1118     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1119 
1120   if (scalar_loop != loop)
1121     {
1122       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1123 	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1124 	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1125 	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1126 	 header) to have current_def set, so copy them over.  */
1127       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1128 						exit);
1129       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1130 							   0),
1131 						EDGE_SUCC (loop->latch, 0));
1132     }
1133 
1134   if (at_exit) /* Add the loop copy at exit.  */
1135     {
1136       if (scalar_loop != loop)
1137 	{
1138 	  gphi_iterator gsi;
1139 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1140 
1141 	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1142 	       gsi_next (&gsi))
1143 	    {
1144 	      gphi *phi = gsi.phi ();
1145 	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1146 	      location_t orig_locus
1147 		= gimple_phi_arg_location_from_edge (phi, e);
1148 
1149 	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1150 	    }
1151 	}
1152       redirect_edge_and_branch_force (e, new_preheader);
1153       flush_pending_stmts (e);
1154       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1155       if (was_imm_dom || duplicate_outer_loop)
1156 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1157 
1158       /* And remove the non-necessary forwarder again.  Keep the other
1159          one so we have a proper pre-header for the loop at the exit edge.  */
1160       redirect_edge_pred (single_succ_edge (preheader),
1161 			  single_pred (preheader));
1162       delete_basic_block (preheader);
1163       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1164 			       loop_preheader_edge (scalar_loop)->src);
1165     }
1166   else /* Add the copy at entry.  */
1167     {
1168       if (scalar_loop != loop)
1169 	{
1170 	  /* Remove the non-necessary forwarder of scalar_loop again.  */
1171 	  redirect_edge_pred (single_succ_edge (preheader),
1172 			      single_pred (preheader));
1173 	  delete_basic_block (preheader);
1174 	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1175 				   loop_preheader_edge (scalar_loop)->src);
1176 	  preheader = split_edge (loop_preheader_edge (loop));
1177 	  entry_e = single_pred_edge (preheader);
1178 	}
1179 
1180       redirect_edge_and_branch_force (entry_e, new_preheader);
1181       flush_pending_stmts (entry_e);
1182       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1183 
1184       redirect_edge_and_branch_force (new_exit, preheader);
1185       flush_pending_stmts (new_exit);
1186       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1187 
1188       /* And remove the non-necessary forwarder again.  Keep the other
1189          one so we have a proper pre-header for the loop at the exit edge.  */
1190       redirect_edge_pred (single_succ_edge (new_preheader),
1191 			  single_pred (new_preheader));
1192       delete_basic_block (new_preheader);
1193       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1194 			       loop_preheader_edge (new_loop)->src);
1195     }
1196 
1197   if (scalar_loop != loop)
1198     {
1199       /* Update new_loop->header PHIs, so that on the preheader
1200 	 edge they are the ones from loop rather than scalar_loop.  */
1201       gphi_iterator gsi_orig, gsi_new;
1202       edge orig_e = loop_preheader_edge (loop);
1203       edge new_e = loop_preheader_edge (new_loop);
1204 
1205       for (gsi_orig = gsi_start_phis (loop->header),
1206 	   gsi_new = gsi_start_phis (new_loop->header);
1207 	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1208 	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
1209 	{
1210 	  gphi *orig_phi = gsi_orig.phi ();
1211 	  gphi *new_phi = gsi_new.phi ();
1212 	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1213 	  location_t orig_locus
1214 	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1215 
1216 	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1217 	}
1218     }
1219 
1220   free (new_bbs);
1221   free (bbs);
1222 
1223   checking_verify_dominators (CDI_DOMINATORS);
1224 
1225   return new_loop;
1226 }
1227 
1228 
1229 /* Given the condition expression COND, put it as the last statement of
1230    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1231    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1232    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1233    new edge as irreducible if IRREDUCIBLE_P is true.  */
1234 
1235 static edge
slpeel_add_loop_guard(basic_block guard_bb,tree cond,basic_block guard_to,basic_block dom_bb,profile_probability probability,bool irreducible_p)1236 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1237 		       basic_block guard_to, basic_block dom_bb,
1238 		       profile_probability probability, bool irreducible_p)
1239 {
1240   gimple_stmt_iterator gsi;
1241   edge new_e, enter_e;
1242   gcond *cond_stmt;
1243   gimple_seq gimplify_stmt_list = NULL;
1244 
1245   enter_e = EDGE_SUCC (guard_bb, 0);
1246   enter_e->flags &= ~EDGE_FALLTHRU;
1247   enter_e->flags |= EDGE_FALSE_VALUE;
1248   gsi = gsi_last_bb (guard_bb);
1249 
1250   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1251 				 NULL_TREE);
1252   if (gimplify_stmt_list)
1253     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1254 
1255   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1256   gsi = gsi_last_bb (guard_bb);
1257   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1258 
1259   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1260   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1261 
1262   new_e->probability = probability;
1263   if (irreducible_p)
1264     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1265 
1266   enter_e->probability = probability.invert ();
1267   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1268 
1269   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1270   if (enter_e->dest->loop_father->header == enter_e->dest)
1271     split_edge (enter_e);
1272 
1273   return new_e;
1274 }
1275 
1276 
1277 /* This function verifies that the following restrictions apply to LOOP:
1278    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1279        for innermost loop and 5 basic blocks for outer-loop.
1280    (2) it is single entry, single exit
1281    (3) its exit condition is the last stmt in the header
1282    (4) E is the entry/exit edge of LOOP.
1283  */
1284 
1285 bool
slpeel_can_duplicate_loop_p(const class loop * loop,const_edge e)1286 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1287 {
1288   edge exit_e = single_exit (loop);
1289   edge entry_e = loop_preheader_edge (loop);
1290   gcond *orig_cond = get_loop_exit_condition (loop);
1291   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1292   unsigned int num_bb = loop->inner? 5 : 2;
1293 
1294   /* All loops have an outer scope; the only case loop->outer is NULL is for
1295      the function itself.  */
1296   if (!loop_outer (loop)
1297       || loop->num_nodes != num_bb
1298       || !empty_block_p (loop->latch)
1299       || !single_exit (loop)
1300       /* Verify that new loop exit condition can be trivially modified.  */
1301       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1302       || (e != exit_e && e != entry_e))
1303     return false;
1304 
1305   return true;
1306 }
1307 
1308 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1309    in the exit bb and rename all the uses after the loop.  This simplifies
1310    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1311    (but normally loop closed SSA form doesn't require virtual PHIs to be
1312    in the same form).  Doing this early simplifies the checking what
1313    uses should be renamed.
1314 
1315    If we create a new phi after the loop, return the definition that
1316    applies on entry to the loop, otherwise return null.  */
1317 
1318 static tree
create_lcssa_for_virtual_phi(class loop * loop)1319 create_lcssa_for_virtual_phi (class loop *loop)
1320 {
1321   gphi_iterator gsi;
1322   edge exit_e = single_exit (loop);
1323 
1324   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1325     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1326       {
1327 	gphi *phi = gsi.phi ();
1328 	for (gsi = gsi_start_phis (exit_e->dest);
1329 	     !gsi_end_p (gsi); gsi_next (&gsi))
1330 	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1331 	    break;
1332 	if (gsi_end_p (gsi))
1333 	  {
1334 	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1335 	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1336 	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1337 	    imm_use_iterator imm_iter;
1338 	    gimple *stmt;
1339 	    use_operand_p use_p;
1340 
1341 	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1342 	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1343 	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1344 	    gimple_phi_set_result (new_phi, new_vop);
1345 	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1346 	      if (stmt != new_phi
1347 		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1348 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1349 		  SET_USE (use_p, new_vop);
1350 
1351 	    return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1352 	  }
1353 	break;
1354       }
1355   return NULL_TREE;
1356 }
1357 
1358 /* Function vect_get_loop_location.
1359 
1360    Extract the location of the loop in the source code.
1361    If the loop is not well formed for vectorization, an estimated
1362    location is calculated.
1363    Return the loop location if succeed and NULL if not.  */
1364 
1365 dump_user_location_t
find_loop_location(class loop * loop)1366 find_loop_location (class loop *loop)
1367 {
1368   gimple *stmt = NULL;
1369   basic_block bb;
1370   gimple_stmt_iterator si;
1371 
1372   if (!loop)
1373     return dump_user_location_t ();
1374 
1375   stmt = get_loop_exit_condition (loop);
1376 
1377   if (stmt
1378       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1379     return stmt;
1380 
1381   /* If we got here the loop is probably not "well formed",
1382      try to estimate the loop location */
1383 
1384   if (!loop->header)
1385     return dump_user_location_t ();
1386 
1387   bb = loop->header;
1388 
1389   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1390     {
1391       stmt = gsi_stmt (si);
1392       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1393         return stmt;
1394     }
1395 
1396   return dump_user_location_t ();
1397 }
1398 
1399 /* Return true if the phi described by STMT_INFO defines an IV of the
1400    loop to be vectorized.  */
1401 
1402 static bool
iv_phi_p(stmt_vec_info stmt_info)1403 iv_phi_p (stmt_vec_info stmt_info)
1404 {
1405   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1406   if (virtual_operand_p (PHI_RESULT (phi)))
1407     return false;
1408 
1409   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1410       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1411     return false;
1412 
1413   return true;
1414 }
1415 
1416 /* Function vect_can_advance_ivs_p
1417 
1418    In case the number of iterations that LOOP iterates is unknown at compile
1419    time, an epilog loop will be generated, and the loop induction variables
1420    (IVs) will be "advanced" to the value they are supposed to take just before
1421    the epilog loop.  Here we check that the access function of the loop IVs
1422    and the expression that represents the loop bound are simple enough.
1423    These restrictions will be relaxed in the future.  */
1424 
1425 bool
vect_can_advance_ivs_p(loop_vec_info loop_vinfo)1426 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1427 {
1428   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1429   basic_block bb = loop->header;
1430   gphi_iterator gsi;
1431 
1432   /* Analyze phi functions of the loop header.  */
1433 
1434   if (dump_enabled_p ())
1435     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1436   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1437     {
1438       tree evolution_part;
1439 
1440       gphi *phi = gsi.phi ();
1441       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1442       if (dump_enabled_p ())
1443 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1444 			 phi_info->stmt);
1445 
1446       /* Skip virtual phi's. The data dependences that are associated with
1447 	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1448 
1449 	 Skip reduction phis.  */
1450       if (!iv_phi_p (phi_info))
1451 	{
1452 	  if (dump_enabled_p ())
1453 	    dump_printf_loc (MSG_NOTE, vect_location,
1454 			     "reduc or virtual phi. skip.\n");
1455 	  continue;
1456 	}
1457 
1458       /* Analyze the evolution function.  */
1459 
1460       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1461       if (evolution_part == NULL_TREE)
1462         {
1463 	  if (dump_enabled_p ())
1464 	    dump_printf (MSG_MISSED_OPTIMIZATION,
1465 			 "No access function or evolution.\n");
1466 	  return false;
1467         }
1468 
1469       /* FORNOW: We do not transform initial conditions of IVs
1470 	 which evolution functions are not invariants in the loop.  */
1471 
1472       if (!expr_invariant_in_loop_p (loop, evolution_part))
1473 	{
1474 	  if (dump_enabled_p ())
1475 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1476 			     "evolution not invariant in loop.\n");
1477 	  return false;
1478 	}
1479 
1480       /* FORNOW: We do not transform initial conditions of IVs
1481 	 which evolution functions are a polynomial of degree >= 2.  */
1482 
1483       if (tree_is_chrec (evolution_part))
1484 	{
1485 	  if (dump_enabled_p ())
1486 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487 			     "evolution is chrec.\n");
1488 	  return false;
1489 	}
1490     }
1491 
1492   return true;
1493 }
1494 
1495 
1496 /*   Function vect_update_ivs_after_vectorizer.
1497 
1498      "Advance" the induction variables of LOOP to the value they should take
1499      after the execution of LOOP.  This is currently necessary because the
1500      vectorizer does not handle induction variables that are used after the
1501      loop.  Such a situation occurs when the last iterations of LOOP are
1502      peeled, because:
1503      1. We introduced new uses after LOOP for IVs that were not originally used
1504         after LOOP: the IVs of LOOP are now used by an epilog loop.
1505      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1506         times, whereas the loop IVs should be bumped N times.
1507 
1508      Input:
1509      - LOOP - a loop that is going to be vectorized. The last few iterations
1510               of LOOP were peeled.
1511      - NITERS - the number of iterations that LOOP executes (before it is
1512                 vectorized), i.e., the number of times the ivs should be bumped.
1513      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1514                   coming out from LOOP on which there are uses of the LOOP ivs
1515 		  (this is the path from LOOP->exit to epilog_loop->preheader).
1516 
1517                   The new definitions of the ivs are placed in LOOP->exit.
1518                   The phi args associated with the edge UPDATE_E in the bb
1519                   UPDATE_E->dest are updated accordingly.
1520 
1521      Assumption 1: Like the rest of the vectorizer, this function assumes
1522      a single loop exit that has a single predecessor.
1523 
1524      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1525      organized in the same order.
1526 
1527      Assumption 3: The access function of the ivs is simple enough (see
1528      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1529 
1530      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1531      coming out of LOOP on which the ivs of LOOP are used (this is the path
1532      that leads to the epilog loop; other paths skip the epilog loop).  This
1533      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1534      needs to have its phis updated.
1535  */
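/* A hypothetical example of the update performed here: for an IV with
   initial value init_expr = 0 and step_expr = 4, and NITERS = 100 scalar
   iterations handled before the epilog, the value installed on UPDATE_E is

     ni = init_expr + NITERS * step_expr = 0 + 100 * 4 = 400,

   which is the value the IV would have held after 100 iterations of the
   original scalar loop.  */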
1536 
1537 static void
1538 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1539 				  tree niters, edge update_e)
1540 {
1541   gphi_iterator gsi, gsi1;
1542   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1543   basic_block update_bb = update_e->dest;
1544   basic_block exit_bb = single_exit (loop)->dest;
1545 
1546   /* Make sure there exists a single-predecessor exit bb:  */
1547   gcc_assert (single_pred_p (exit_bb));
1548   gcc_assert (single_succ_edge (exit_bb) == update_e);
1549 
1550   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1551        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1552        gsi_next (&gsi), gsi_next (&gsi1))
1553     {
1554       tree init_expr;
1555       tree step_expr, off;
1556       tree type;
1557       tree var, ni, ni_name;
1558       gimple_stmt_iterator last_gsi;
1559 
1560       gphi *phi = gsi.phi ();
1561       gphi *phi1 = gsi1.phi ();
1562       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1563       if (dump_enabled_p ())
1564 	dump_printf_loc (MSG_NOTE, vect_location,
1565 			 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1566 
1567       /* Skip reduction and virtual phis.  */
1568       if (!iv_phi_p (phi_info))
1569 	{
1570 	  if (dump_enabled_p ())
1571 	    dump_printf_loc (MSG_NOTE, vect_location,
1572 			     "reduc or virtual phi. skip.\n");
1573 	  continue;
1574 	}
1575 
1576       type = TREE_TYPE (gimple_phi_result (phi));
1577       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1578       step_expr = unshare_expr (step_expr);
1579 
1580       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1581          of degree >= 2 or exponential.  */
1582       gcc_assert (!tree_is_chrec (step_expr));
1583 
1584       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1585 
1586       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1587 			 fold_convert (TREE_TYPE (step_expr), niters),
1588 			 step_expr);
1589       if (POINTER_TYPE_P (type))
1590 	ni = fold_build_pointer_plus (init_expr, off);
1591       else
1592 	ni = fold_build2 (PLUS_EXPR, type,
1593 			  init_expr, fold_convert (type, off));
1594 
1595       var = create_tmp_var (type, "tmp");
1596 
1597       last_gsi = gsi_last_bb (exit_bb);
1598       gimple_seq new_stmts = NULL;
1599       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1600       /* Exit_bb shouldn't be empty.  */
1601       if (!gsi_end_p (last_gsi))
1602 	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1603       else
1604 	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1605 
1606       /* Fix phi expressions in the successor bb.  */
1607       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1608     }
1609 }
1610 
1611 /* Return a gimple value containing the misalignment (measured in vector
1612    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1613    it is away from a perfectly aligned address.  Add any new statements
1614    to SEQ.  */
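/* For instance (purely illustrative numbers): with a start address whose low
   bits are 0x28, a target alignment of 32 bytes and 4-byte elements,

     misalign_in_bytes = 0x28 & (32 - 1) = 8
     misalign_in_elems = 8 >> log2 (4) = 2,

   i.e. the first access is two elements past an aligned boundary.  */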
1615 
1616 static tree
1617 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1618 {
1619   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1620   stmt_vec_info stmt_info = dr_info->stmt;
1621   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1622 
1623   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1624   unsigned HOST_WIDE_INT target_align_c;
1625   tree target_align_minus_1;
1626 
1627   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1628 					size_zero_node) < 0;
1629   tree offset = (negative
1630 		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1631 		 : size_zero_node);
1632   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1633 							  stmt_info, seq,
1634 							  offset);
1635   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1636   if (target_align.is_constant (&target_align_c))
1637     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1638   else
1639     {
1640       tree vla = build_int_cst (type, target_align);
1641       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1642 				    fold_build2 (MINUS_EXPR, type,
1643 						 build_int_cst (type, 0), vla));
1644       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1645 					  build_int_cst (type, 1));
1646     }
1647 
1648   HOST_WIDE_INT elem_size
1649     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1650   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1651 
1652   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1653   tree int_start_addr = fold_convert (type, start_addr);
1654   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1655 					target_align_minus_1);
1656 
1657   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1658   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1659 					elem_size_log);
1660 
1661   return misalign_in_elems;
1662 }
1663 
1664 /* Function vect_gen_prolog_loop_niters
1665 
1666    Generate the number of iterations which should be peeled as prolog for the
1667    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1668    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1669    As a result, after the execution of this loop, the data reference DR will
1670    refer to an aligned location.  The following computation is generated:
1671 
1672    If the misalignment of DR is known at compile time:
1673      addr_mis = int mis = DR_MISALIGNMENT (dr);
1674    Else, compute address misalignment in bytes:
1675      addr_mis = addr & (target_align - 1)
1676 
1677    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1678 
1679    (elem_size = element type size; an element is the scalar element whose type
1680    is the inner type of the vectype)
1681 
1682    The computations will be emitted at the end of BB.  We also compute and
1683    store the upper bound (inclusive) of the result in BOUND.
1684 
1685    When the step of the data-ref in the loop is not 1 (as in interleaved data
1686    and SLP), the number of iterations of the prolog must be divided by the step
1687    (which is equal to the size of interleaved group).
1688 
1689    The above formulas assume that VF == number of elements in the vector. This
1690    may not hold when there are multiple-types in the loop.
1691    In this case, for some data-references in the loop the VF does not represent
1692    the number of elements that fit in the vector.  Therefore, instead of VF we
1693    use TYPE_VECTOR_SUBPARTS.  */
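/* A worked example with made-up numbers: for VF = 4, elem_size = 4 bytes,
   step = 1 and a runtime misalignment of 8 bytes,

     addr_mis / elem_size = 2 elements
     prolog_niters = ((4 - 2) & (4 - 1)) / 1 = 2,

   so two scalar iterations are peeled and the vector loop then starts at an
   aligned address.  */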
1694 
1695 static tree
1696 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1697 			     basic_block bb, int *bound)
1698 {
1699   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1700   tree var;
1701   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1702   gimple_seq stmts = NULL, new_stmts = NULL;
1703   tree iters, iters_name;
1704   stmt_vec_info stmt_info = dr_info->stmt;
1705   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1706   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1707 
1708   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1709     {
1710       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1711 
1712       if (dump_enabled_p ())
1713         dump_printf_loc (MSG_NOTE, vect_location,
1714                          "known peeling = %d.\n", npeel);
1715 
1716       iters = build_int_cst (niters_type, npeel);
1717       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1718     }
1719   else
1720     {
1721       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1722       tree type = TREE_TYPE (misalign_in_elems);
1723       HOST_WIDE_INT elem_size
1724 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1725       /* We only do prolog peeling if the target alignment is known at compile
1726          time.  */
1727       poly_uint64 align_in_elems =
1728 	exact_div (target_align, elem_size);
1729       tree align_in_elems_minus_1 =
1730 	build_int_cst (type, align_in_elems - 1);
1731       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1732 
1733       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1734 				 & (align_in_elems - 1)).  */
1735       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1736 					    size_zero_node) < 0;
1737       if (negative)
1738 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1739 			     align_in_elems_tree);
1740       else
1741 	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1742 			     misalign_in_elems);
1743       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1744       iters = fold_convert (niters_type, iters);
1745       unsigned HOST_WIDE_INT align_in_elems_c;
1746       if (align_in_elems.is_constant (&align_in_elems_c))
1747 	*bound = align_in_elems_c - 1;
1748       else
1749 	*bound = -1;
1750     }
1751 
1752   if (dump_enabled_p ())
1753     dump_printf_loc (MSG_NOTE, vect_location,
1754 		     "niters for prolog loop: %T\n", iters);
1755 
1756   var = create_tmp_var (niters_type, "prolog_loop_niters");
1757   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1758 
1759   if (new_stmts)
1760     gimple_seq_add_seq (&stmts, new_stmts);
1761   if (stmts)
1762     {
1763       gcc_assert (single_succ_p (bb));
1764       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1765       if (gsi_end_p (gsi))
1766 	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1767       else
1768 	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1769     }
1770   return iters_name;
1771 }
1772 
1773 
1774 /* Function vect_update_init_of_dr
1775 
1776    If CODE is PLUS, the vector loop starts NITERS iterations after the
1777    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1778    iterations before the scalar one (using masking to skip inactive
1779    elements).  This function updates the information recorded in DR to
1780    account for the difference.  Specifically, it updates the OFFSET
1781    field of DR_INFO accordingly.  */
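/* For example (illustrative only): if NITERS of the prolog is 2 and DR_STEP
   is 8 bytes, a PLUS update adds 2 * 8 = 16 bytes to DR_INFO->offset, so the
   vector loop's first access starts 16 bytes after the scalar loop's original
   starting point.  */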
1782 
1783 static void
1784 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1785 {
1786   struct data_reference *dr = dr_info->dr;
1787   tree offset = dr_info->offset;
1788   if (!offset)
1789     offset = build_zero_cst (sizetype);
1790 
1791   niters = fold_build2 (MULT_EXPR, sizetype,
1792 			fold_convert (sizetype, niters),
1793 			fold_convert (sizetype, DR_STEP (dr)));
1794   offset = fold_build2 (code, sizetype,
1795 			fold_convert (sizetype, offset), niters);
1796   dr_info->offset = offset;
1797 }
1798 
1799 
1800 /* Function vect_update_inits_of_drs
1801 
1802    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1803    CODE and NITERS are as for vect_update_init_of_dr.  */
1804 
1805 void
1806 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1807 			  tree_code code)
1808 {
1809   unsigned int i;
1810   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1811   struct data_reference *dr;
1812 
1813   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1814 
1815   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1816      here, but since we might use these niters to update the epilogue's niters
1817      and data references we can't insert them here as this definition might not
1818      always dominate its uses.  */
1819   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1820     niters = fold_convert (sizetype, niters);
1821 
1822   FOR_EACH_VEC_ELT (datarefs, i, dr)
1823     {
1824       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1825       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1826 	vect_update_init_of_dr (dr_info, niters, code);
1827     }
1828 }
1829 
1830 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1831    by masking.  This involves calculating the number of iterations to
1832    be peeled and then aligning all memory references appropriately.  */
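/* Sketch with made-up numbers: with VF = 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3, the misalignment computed below is
   8 - 3 = 5 elements.  It is recorded in LOOP_VINFO_MASK_SKIP_NITERS and the
   data-reference offsets are shifted back by that many iterations
   (MINUS_EXPR), so the first, partially masked vector iteration starts at an
   aligned address.  */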
1833 
1834 void
1835 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1836 {
1837   tree misalign_in_elems;
1838   tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1839 
1840   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1841 
1842   /* From the information recorded in LOOP_VINFO get the number of iterations
1843      that need to be skipped via masking.  */
1844   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1845     {
1846       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1847 			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1848       misalign_in_elems = build_int_cst (type, misalign);
1849     }
1850   else
1851     {
1852       gimple_seq seq1 = NULL, seq2 = NULL;
1853       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1854       misalign_in_elems = fold_convert (type, misalign_in_elems);
1855       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1856 						&seq2, true, NULL_TREE);
1857       gimple_seq_add_seq (&seq1, seq2);
1858       if (seq1)
1859 	{
1860 	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1861 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1862 	  gcc_assert (!new_bb);
1863 	}
1864     }
1865 
1866   if (dump_enabled_p ())
1867     dump_printf_loc (MSG_NOTE, vect_location,
1868 		     "misalignment for fully-masked loop: %T\n",
1869 		     misalign_in_elems);
1870 
1871   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1872 
1873   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1874 }
1875 
1876 /* This function builds ni_name = number of iterations.  Statements
1877    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1878    it to TRUE if new ssa_var is generated.  */
1879 
1880 tree
1881 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1882 {
1883   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1884   if (TREE_CODE (ni) == INTEGER_CST)
1885     return ni;
1886   else
1887     {
1888       tree ni_name, var;
1889       gimple_seq stmts = NULL;
1890       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1891 
1892       var = create_tmp_var (TREE_TYPE (ni), "niters");
1893       ni_name = force_gimple_operand (ni, &stmts, false, var);
1894       if (stmts)
1895 	{
1896 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1897 	  if (new_var_p != NULL)
1898 	    *new_var_p = true;
1899 	}
1900 
1901       return ni_name;
1902     }
1903 }
1904 
1905 /* Calculate the number of iterations above which the vectorized loop will be
1906    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1907    of prolog loop.  If it's integer const, the integer number is also passed
1908    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1909    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1910    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1911    threshold below which the scalar (rather than vectorized) loop will be
1912    executed.  This function stores the upper bound (inclusive) of the result
1913    in BOUND_SCALAR.  */
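/* A hypothetical example of the profitability clamp below: with a constant
   prolog of 2 iterations, BOUND_EPILOG = 3 and TH = 12, the scalar loop is
   used for up to max (2 + 3, 12 - 1) = 11 iterations, so *BOUND_SCALAR is
   set to 11 and the constant 11 is returned.  */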
1914 
1915 static tree
1916 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1917 			     int bound_prolog, poly_int64 bound_epilog, int th,
1918 			     poly_uint64 *bound_scalar,
1919 			     bool check_profitability)
1920 {
1921   tree type = TREE_TYPE (niters_prolog);
1922   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1923 			     build_int_cst (type, bound_epilog));
1924 
1925   *bound_scalar = bound_prolog + bound_epilog;
1926   if (check_profitability)
1927     {
1928       /* TH indicates the minimum niters of vectorized loop, while we
1929 	 compute the maximum niters of scalar loop.  */
1930       th--;
1931       /* Peeling a constant number of times.  */
1932       if (int_niters_prolog >= 0)
1933 	{
1934 	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1935 	  return build_int_cst (type, *bound_scalar);
1936 	}
1937       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1938 	 and BOUND_EPILOG are inclusive upper bounds.  */
1939       if (known_ge (th, bound_prolog + bound_epilog))
1940 	{
1941 	  *bound_scalar = th;
1942 	  return build_int_cst (type, th);
1943 	}
1944       /* Need to do runtime comparison.  */
1945       else if (maybe_gt (th, bound_epilog))
1946 	{
1947 	  *bound_scalar = upper_bound (*bound_scalar, th);
1948 	  return fold_build2 (MAX_EXPR, type,
1949 			      build_int_cst (type, th), niters);
1950 	}
1951     }
1952   return niters;
1953 }
1954 
1955 /* NITERS is the number of times that the original scalar loop executes
1956    after peeling.  Work out the maximum number of iterations N that can
1957    be handled by the vectorized form of the loop and then either:
1958 
1959    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1960 
1961 	niters_vector = N
1962 
1963    b) set *STEP_VECTOR_PTR to one and generate:
1964 
1965         niters_vector = N / vf
1966 
1967    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1968    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1969    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
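/* Worked example (illustrative): for NITERS = 103, a constant VF = 8 and no
   partial vectors, case b) applies and, when NITERS cannot overflow,

     niters_vector = 103 >> log2 (8) = 12,

   i.e. the vector loop runs 12 times and the remaining 103 - 12 * 8 = 7
   iterations are left for the epilogue.  */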
1970 
1971 void
1972 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1973 			     tree *niters_vector_ptr, tree *step_vector_ptr,
1974 			     bool niters_no_overflow)
1975 {
1976   tree ni_minus_gap, var;
1977   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1978   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1980   tree log_vf = NULL_TREE;
1981 
1982   /* If epilogue loop is required because of data accesses with gaps, we
1983      subtract one iteration from the total number of iterations here for
1984      correct calculation of RATIO.  */
1985   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1986     {
1987       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1988 				  build_one_cst (type));
1989       if (!is_gimple_val (ni_minus_gap))
1990 	{
1991 	  var = create_tmp_var (type, "ni_gap");
1992 	  gimple *stmts = NULL;
1993 	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1994 					       true, var);
1995 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1996 	}
1997     }
1998   else
1999     ni_minus_gap = niters;
2000 
2001   unsigned HOST_WIDE_INT const_vf;
2002   if (vf.is_constant (&const_vf)
2003       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2004     {
2005       /* Create: niters >> log2(vf) */
2006       /* If it's known that niters == number of latch executions + 1 doesn't
2007 	 overflow, we can generate niters >> log2(vf); otherwise we generate
2008 	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2009 	 will be at least one.  */
2010       log_vf = build_int_cst (type, exact_log2 (const_vf));
2011       if (niters_no_overflow)
2012 	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2013       else
2014 	niters_vector
2015 	  = fold_build2 (PLUS_EXPR, type,
2016 			 fold_build2 (RSHIFT_EXPR, type,
2017 				      fold_build2 (MINUS_EXPR, type,
2018 						   ni_minus_gap,
2019 						   build_int_cst (type, vf)),
2020 				      log_vf),
2021 			 build_int_cst (type, 1));
2022       step_vector = build_one_cst (type);
2023     }
2024   else
2025     {
2026       niters_vector = ni_minus_gap;
2027       step_vector = build_int_cst (type, vf);
2028     }
2029 
2030   if (!is_gimple_val (niters_vector))
2031     {
2032       var = create_tmp_var (type, "bnd");
2033       gimple_seq stmts = NULL;
2034       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2035       gsi_insert_seq_on_edge_immediate (pe, stmts);
2036       /* The peeling algorithm guarantees that the vector loop bound is at
2037 	 least ONE, so we set range information to make the niters analyzer's
2038 	 life easier.  Note the number of latch iterations can be TYPE_MAX_VALUE,
2039 	 so the vector niter must be able to hold (TYPE_MAX_VALUE + 1) >> log_vf.  */
2040       if (stmts != NULL && log_vf)
2041 	{
2042 	  if (niters_no_overflow)
2043 	    set_range_info (niters_vector, VR_RANGE,
2044 			    wi::one (TYPE_PRECISION (type)),
2045 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
2046 						       TYPE_SIGN (type)),
2047 					exact_log2 (const_vf),
2048 					TYPE_SIGN (type)));
2049 	  /* For VF == 1 the vector IV might also overflow so we cannot
2050 	     assert a minimum value of 1.  */
2051 	  else if (const_vf > 1)
2052 	    set_range_info (niters_vector, VR_RANGE,
2053 			    wi::one (TYPE_PRECISION (type)),
2054 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
2055 						       TYPE_SIGN (type))
2056 					- (const_vf - 1),
2057 					exact_log2 (const_vf), TYPE_SIGN (type))
2058 			    + 1);
2059 	}
2060     }
2061   *niters_vector_ptr = niters_vector;
2062   *step_vector_ptr = step_vector;
2063 
2064   return;
2065 }
2066 
2067 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2068    loop specified by LOOP_VINFO after vectorization, compute the number
2069    of iterations before vectorization (niters_vector * vf) and store it
2070    to NITERS_VECTOR_MULT_VF_PTR.  */
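/* E.g. (illustrative numbers): with niters_vector = 12 and a constant VF = 8,
   the value computed here is 12 << log2 (8) = 96, the number of scalar
   iterations covered by the vector loop, which is used when entering the
   epilogue.  */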
2071 
2072 static void
2073 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2074 				     tree niters_vector,
2075 				     tree *niters_vector_mult_vf_ptr)
2076 {
2077   /* We should be using a step_vector of VF if VF is variable.  */
2078   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2079   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2080   tree type = TREE_TYPE (niters_vector);
2081   tree log_vf = build_int_cst (type, exact_log2 (vf));
2082   basic_block exit_bb = single_exit (loop)->dest;
2083 
2084   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2085   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2086 					    niters_vector, log_vf);
2087   if (!is_gimple_val (niters_vector_mult_vf))
2088     {
2089       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2090       gimple_seq stmts = NULL;
2091       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2092 						    &stmts, true, var);
2093       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2094       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2095     }
2096   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2097 }
2098 
2099 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2100    this function searches for the corresponding lcssa phi node in exit
2101    bb of LOOP.  If it is found, return the phi result; otherwise return
2102    NULL.  */
2103 
2104 static tree
2105 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2106 		gphi *lcssa_phi)
2107 {
2108   gphi_iterator gsi;
2109   edge e = single_exit (loop);
2110 
2111   gcc_assert (single_pred_p (e->dest));
2112   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2113     {
2114       gphi *phi = gsi.phi ();
2115       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2116 			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2117 	return PHI_RESULT (phi);
2118     }
2119   return NULL_TREE;
2120 }
2121 
2122 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2123    from SECOND/FIRST and puts it at the original loop's preheader/exit
2124    edge, the two loops are arranged as below:
2125 
2126        preheader_a:
2127      first_loop:
2128        header_a:
2129 	 i_1 = PHI<i_0, i_2>;
2130 	 ...
2131 	 i_2 = i_1 + 1;
2132 	 if (cond_a)
2133 	   goto latch_a;
2134 	 else
2135 	   goto between_bb;
2136        latch_a:
2137 	 goto header_a;
2138 
2139        between_bb:
2140 	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2141 
2142      second_loop:
2143        header_b:
2144 	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2145 				 or with i_2 if no LCSSA phi is created
2146 				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2147 	 ...
2148 	 i_4 = i_3 + 1;
2149 	 if (cond_b)
2150 	   goto latch_b;
2151 	 else
2152 	   goto exit_bb;
2153        latch_b:
2154 	 goto header_b;
2155 
2156        exit_bb:
2157 
2158    This function creates loop closed SSA for the first loop and updates the
2159    second loop's PHI nodes by replacing the argument on the incoming edge
2160    with the result of the newly created lcssa PHI nodes.  If
2161    CREATE_LCSSA_FOR_IV_PHIS is false, loop closed SSA phis will only be
2162    created for non-iv phis of the first loop.
2163 
2164    This function assumes exit bb of the first loop is preheader bb of the
2165    second loop, i.e., between_bb in the example code.  With PHIs updated,
2166    the second loop will execute the remaining iterations of the first.  */
2167 
2168 static void
2169 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2170 				   class loop *first, class loop *second,
2171 				   bool create_lcssa_for_iv_phis)
2172 {
2173   gphi_iterator gsi_update, gsi_orig;
2174   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2175 
2176   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2177   edge second_preheader_e = loop_preheader_edge (second);
2178   basic_block between_bb = single_exit (first)->dest;
2179 
2180   gcc_assert (between_bb == second_preheader_e->src);
2181   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2182   /* Either the first loop or the second is the loop to be vectorized.  */
2183   gcc_assert (loop == first || loop == second);
2184 
2185   for (gsi_orig = gsi_start_phis (first->header),
2186        gsi_update = gsi_start_phis (second->header);
2187        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2188        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2189     {
2190       gphi *orig_phi = gsi_orig.phi ();
2191       gphi *update_phi = gsi_update.phi ();
2192 
2193       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2194       /* Generate lcssa PHI node for the first loop.  */
2195       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2196       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2197       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2198 	{
2199 	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2200 	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2201 	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2202 	  arg = new_res;
2203 	}
2204 
2205       /* Update PHI node in the second loop by replacing arg on the loop's
2206 	 incoming edge.  */
2207       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2208     }
2209 
2210   /* For epilogue peeling we have to make sure to copy all LC PHIs
2211      for correct vectorization of live stmts.  */
2212   if (loop == first)
2213     {
2214       basic_block orig_exit = single_exit (second)->dest;
2215       for (gsi_orig = gsi_start_phis (orig_exit);
2216 	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2217 	{
2218 	  gphi *orig_phi = gsi_orig.phi ();
2219 	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2220 	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2221 	    continue;
2222 
2223 	  /* Already created in the above loop.   */
2224 	  if (find_guard_arg (first, second, orig_phi))
2225 	    continue;
2226 
2227 	  tree new_res = copy_ssa_name (orig_arg);
2228 	  gphi *lcphi = create_phi_node (new_res, between_bb);
2229 	  add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2230 	}
2231     }
2232 }
2233 
2234 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2235    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2236    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2237    appear like below:
2238 
2239        guard_bb:
2240 	 if (cond)
2241 	   goto merge_bb;
2242 	 else
2243 	   goto skip_loop;
2244 
2245      skip_loop:
2246        header_a:
2247 	 i_1 = PHI<i_0, i_2>;
2248 	 ...
2249 	 i_2 = i_1 + 1;
2250 	 if (cond_a)
2251 	   goto latch_a;
2252 	 else
2253 	   goto exit_a;
2254        latch_a:
2255 	 goto header_a;
2256 
2257        exit_a:
2258 	 i_5 = PHI<i_2>;
2259 
2260        merge_bb:
2261 	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2262 
2263      update_loop:
2264        header_b:
2265 	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2266 	 ...
2267 	 i_4 = i_3 + 1;
2268 	 if (cond_b)
2269 	   goto latch_b;
2270 	 else
2271 	   goto exit_bb;
2272        latch_b:
2273 	 goto header_b;
2274 
2275        exit_bb:
2276 
2277    This function creates PHI nodes at merge_bb and replaces the use of i_5
2278    in the update_loop's PHI node with the result of the new PHI node.  */
2279 
2280 static void
2281 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2282 				    class loop *update_loop,
2283 				    edge guard_edge, edge merge_edge)
2284 {
2285   location_t merge_loc, guard_loc;
2286   edge orig_e = loop_preheader_edge (skip_loop);
2287   edge update_e = loop_preheader_edge (update_loop);
2288   gphi_iterator gsi_orig, gsi_update;
2289 
2290   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2291 	gsi_update = gsi_start_phis (update_loop->header));
2292        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2293        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2294     {
2295       gphi *orig_phi = gsi_orig.phi ();
2296       gphi *update_phi = gsi_update.phi ();
2297 
2298       /* Generate new phi node at merge bb of the guard.  */
2299       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2300       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2301 
2302       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2303 	 args in NEW_PHI for these edges.  */
2304       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2305       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2306       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2307       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2308       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2309       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2310 
2311       /* Update phi in UPDATE_PHI.  */
2312       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2313     }
2314 }
2315 
2316 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2317    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2318    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2319    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2320    The CFG looks like:
2321 
2322      loop:
2323        header_a:
2324 	 i_1 = PHI<i_0, i_2>;
2325 	 ...
2326 	 i_2 = i_1 + 1;
2327 	 if (cond_a)
2328 	   goto latch_a;
2329 	 else
2330 	   goto exit_a;
2331        latch_a:
2332 	 goto header_a;
2333 
2334        exit_a:
2335 
2336        guard_bb:
2337 	 if (cond)
2338 	   goto merge_bb;
2339 	 else
2340 	   goto epilog_loop;
2341 
2342        ;; fall_through_bb
2343 
2344      epilog_loop:
2345        header_b:
2346 	 i_3 = PHI<i_2, i_4>;
2347 	 ...
2348 	 i_4 = i_3 + 1;
2349 	 if (cond_b)
2350 	   goto latch_b;
2351 	 else
2352 	   goto merge_bb;
2353        latch_b:
2354 	 goto header_b;
2355 
2356        merge_bb:
2357 	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2358 
2359        exit_bb:
2360 	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2361 
2362    For each name used outside EPILOG (i.e. for each name that has a lcssa
2363    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2364    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2365    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2366    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2367    in exit_bb will also be updated.  */
2368 
2369 static void
2370 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2371 				    edge guard_edge, edge merge_edge)
2372 {
2373   gphi_iterator gsi;
2374   basic_block merge_bb = guard_edge->dest;
2375 
2376   gcc_assert (single_succ_p (merge_bb));
2377   edge e = single_succ_edge (merge_bb);
2378   basic_block exit_bb = e->dest;
2379   gcc_assert (single_pred_p (exit_bb));
2380   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2381 
2382   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2383     {
2384       gphi *update_phi = gsi.phi ();
2385       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2386 
2387       tree merge_arg = NULL_TREE;
2388 
2389       /* If the old argument is a SSA_NAME use its current_def.  */
2390       if (TREE_CODE (old_arg) == SSA_NAME)
2391 	merge_arg = get_current_def (old_arg);
2392       /* If it's a constant or doesn't have a current_def, just use the old
2393 	 argument.  */
2394       if (!merge_arg)
2395 	merge_arg = old_arg;
2396 
2397       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2398       /* If the var is live after loop but not a reduction, we simply
2399 	 use the old arg.  */
2400       if (!guard_arg)
2401 	guard_arg = old_arg;
2402 
2403       /* Create new phi node in MERGE_BB:  */
2404       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2405       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2406 
2407       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2408 	 the two PHI args in merge_phi for these edges.  */
2409       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2410       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2411 
2412       /* Update the original phi in exit_bb.  */
2413       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2414     }
2415 }
2416 
2417 /* EPILOG loop is duplicated from the original loop for vectorizing,
2418    the arg of its loop closed ssa PHI needs to be updated.  */
2419 
2420 static void
2421 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2422 {
2423   gphi_iterator gsi;
2424   basic_block exit_bb = single_exit (epilog)->dest;
2425 
2426   gcc_assert (single_pred_p (exit_bb));
2427   edge e = EDGE_PRED (exit_bb, 0);
2428   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2429     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2430 }
2431 
2432 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2433    iterate exactly CONST_NITERS times.  Make a final decision about
2434    whether the epilogue loop should be used, returning true if so.  */
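/* For instance (made-up numbers): if the epilogue would run exactly
   CONST_NITERS = 5 scalar iterations and PEELING_FOR_GAPS is 0, niters 5 and
   nitersm1 4 are installed below; with CONST_NITERS = 1 the epilogue is
   rejected, since a single scalar iteration is not worth a vector
   epilogue.  */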
2435 
2436 static bool
2437 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2438 			     unsigned HOST_WIDE_INT const_niters)
2439 {
2440   /* Avoid wrap-around when computing const_niters - 1.  Also reject
2441      using an epilogue loop for a single scalar iteration, even if
2442      we could in principle implement that using partial vectors.  */
2443   unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2444   if (const_niters <= gap_niters + 1)
2445     return false;
2446 
2447   /* Install the number of iterations.  */
2448   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2449   tree niters_tree = build_int_cst (niters_type, const_niters);
2450   tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2451 
2452   LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2453   LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2454 
2455   /* Decide what to do if the number of epilogue iterations is not
2456      a multiple of the epilogue loop's vectorization factor.  */
2457   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2458 }
2459 
2460 /* Function vect_do_peeling.
2461 
2462    Input:
2463    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2464 
2465        preheader:
2466      LOOP:
2467        header_bb:
2468 	 loop_body
2469 	 if (exit_loop_cond) goto exit_bb
2470 	 else                goto header_bb
2471        exit_bb:
2472 
2473    - NITERS: The number of iterations of the loop.
2474    - NITERSM1: The number of iterations of the loop's latch.
2475    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2476    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2477 			      CHECK_PROFITABILITY is true.
2478    Output:
2479    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2480      iterate after vectorization; see vect_set_loop_condition for details.
2481    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2482      should be set to the number of scalar iterations handled by the
2483      vector loop.  The SSA name is only used on exit from the loop.
2484 
2485    This function peels prolog and epilog from the loop, adds guards skipping
2486    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2487    would look like:
2488 
2489        guard_bb_1:
2490 	 if (prefer_scalar_loop) goto merge_bb_1
2491 	 else                    goto guard_bb_2
2492 
2493        guard_bb_2:
2494          if (skip_prolog) goto merge_bb_2
2495          else             goto prolog_preheader
2496 
2497        prolog_preheader:
2498      PROLOG:
2499        prolog_header_bb:
2500 	 prolog_body
2501 	 if (exit_prolog_cond) goto prolog_exit_bb
2502 	 else                  goto prolog_header_bb
2503        prolog_exit_bb:
2504 
2505        merge_bb_2:
2506 
2507        vector_preheader:
2508      VECTOR LOOP:
2509        vector_header_bb:
2510 	 vector_body
2511 	 if (exit_vector_cond) goto vector_exit_bb
2512 	 else                  goto vector_header_bb
2513        vector_exit_bb:
2514 
2515        guard_bb_3:
2516 	 if (skip_epilog) goto merge_bb_3
2517 	 else             goto epilog_preheader
2518 
2519        merge_bb_1:
2520 
2521        epilog_preheader:
2522      EPILOG:
2523        epilog_header_bb:
2524 	 epilog_body
2525 	 if (exit_epilog_cond) goto merge_bb_3
2526 	 else                  goto epilog_header_bb
2527 
2528        merge_bb_3:
2529 
2530    Note this function peels the prolog and epilog, and adds the guards,
2531    only when necessary.
2532    This function returns the epilogue loop if a decision was made to vectorize
2533    it, otherwise NULL.
2534 
2535    The analysis resulting in this epilogue loop's loop_vec_info was performed
2536    in the same vect_analyze_loop call as the main loop's.  At that time
2537    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2538    vectorization factors than the main loop.  This list is stored in the main
2539    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide
2540    to vectorize the epilogue loop for a lower vectorization factor, the
2541    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2542    updated and linked to the epilogue loop.  This is later used to vectorize
2543    the epilogue.  The reason the loop_vec_info needs updating is that it was
2544    constructed based on the original main loop, and the epilogue loop is a
2545    copy of this loop, so all links pointing to statements in the original loop
2546    need updating.  Furthermore, these loop_vec_infos share the
2547    data_reference's records, which will also need to be updated.
2548 
2549    TODO: Guard for prefer_scalar_loop should be emitted along with
2550    versioning conditions if loop versioning is needed.  */
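/* Putting the pieces together with made-up numbers: for a loop with
   NITERS = 103, VF = 8, a prolog of 2 iterations for alignment and no gaps,
   the prolog handles iterations 0-1, the vector loop handles
   floor ((103 - 2) / 8) * 8 = 96 iterations, and the epilog handles the
   remaining 103 - 2 - 96 = 5 iterations; the guards skip the prolog when
   the runtime misalignment is zero and skip the epilog when no iterations
   remain after the vector loop.  */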
2551 
2552 
2553 class loop *
2554 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2555 		 tree *niters_vector, tree *step_vector,
2556 		 tree *niters_vector_mult_vf_var, int th,
2557 		 bool check_profitability, bool niters_no_overflow,
2558 		 tree *advance)
2559 {
2560   edge e, guard_e;
2561   tree type = TREE_TYPE (niters), guard_cond;
2562   basic_block guard_bb, guard_to;
2563   profile_probability prob_prolog, prob_vector, prob_epilog;
2564   int estimated_vf;
2565   int prolog_peeling = 0;
2566   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2567   bool vect_epilogues_updated_niters = false;
2568   /* We currently do not support prolog peeling if the target alignment is not
2569      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2570      target alignment being constant.  */
2571   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2572   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2573     return NULL;
2574 
2575   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2576     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2577 
2578   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2579   poly_uint64 bound_epilog = 0;
2580   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2581       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2582     bound_epilog += vf - 1;
2583   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2584     bound_epilog += 1;
2585   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2586   poly_uint64 bound_scalar = bound_epilog;
2587 
2588   if (!prolog_peeling && !epilog_peeling)
2589     return NULL;
2590 
2591   /* Before doing any peeling make sure to reset debug binds outside of
2592      the loop referring to defs not in LC SSA.  */
2593   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594   for (unsigned i = 0; i < loop->num_nodes; ++i)
2595     {
2596       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2597       imm_use_iterator ui;
2598       gimple *use_stmt;
2599       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2600 	   gsi_next (&gsi))
2601 	{
2602 	  FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2603 	    if (gimple_debug_bind_p (use_stmt)
2604 		&& loop != gimple_bb (use_stmt)->loop_father
2605 		&& !flow_loop_nested_p (loop,
2606 					gimple_bb (use_stmt)->loop_father))
2607 	      {
2608 		gimple_debug_bind_reset_value (use_stmt);
2609 		update_stmt (use_stmt);
2610 	      }
2611 	}
2612       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2613 	   gsi_next (&gsi))
2614 	{
2615 	  ssa_op_iter op_iter;
2616 	  def_operand_p def_p;
2617 	  FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2618 	    FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2619 	      if (gimple_debug_bind_p (use_stmt)
2620 		  && loop != gimple_bb (use_stmt)->loop_father
2621 		  && !flow_loop_nested_p (loop,
2622 					  gimple_bb (use_stmt)->loop_father))
2623 		{
2624 		  gimple_debug_bind_reset_value (use_stmt);
2625 		  update_stmt (use_stmt);
2626 		}
2627 	}
2628     }
2629 
2630   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2631   estimated_vf = vect_vf_for_cost (loop_vinfo);
2632   if (estimated_vf == 2)
2633     estimated_vf = 3;
2634   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2635 			.apply_scale (estimated_vf - 1, estimated_vf);
2636 
2637   class loop *prolog, *epilog = NULL;
2638   class loop *first_loop = loop;
2639   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2640 
2641   /* We might have a queued need to update virtual SSA form.  As we
2642      delete the update SSA machinery below after doing a regular
2643      incremental SSA update during loop copying make sure we don't
2644      lose that fact.
2645      ???  Needing to update virtual SSA form by renaming is unfortunate
2646      but not all of the vectorizer code inserting new loads / stores
2647      properly assigns virtual operands to those statements.  */
2648   update_ssa (TODO_update_ssa_only_virtuals);
2649 
2650   create_lcssa_for_virtual_phi (loop);
2651 
2652   /* If we're vectorizing an epilogue loop, the update_ssa above will
2653      have ensured that the virtual operand is in SSA form throughout the
2654      vectorized main loop.  Normally it is possible to trace the updated
2655      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2656      back to scalar-stmt vuses, meaning that the effect of the SSA update
2657      remains local to the main loop.  However, there are rare cases in
2658      which the vectorized loop has vdefs even when the original scalar
2659      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2660      introduces clobbers of the temporary vector array, which in turn
2661      needs new vdefs.  If the scalar loop doesn't write to memory, these
2662      new vdefs will be the only ones in the vector loop.
2663 
2664      In that case, update_ssa will have added a new virtual phi to the
2665      main loop, which previously didn't need one.  Ensure that we (locally)
2666      maintain LCSSA form for the virtual operand, just as we would have
2667      done if the virtual phi had existed from the outset.  This makes it
2668      easier to duplicate the scalar epilogue loop below.  */
2669   tree vop_to_rename = NULL_TREE;
2670   if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2671     {
2672       class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2673       vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2674     }
2675 
2676   if (MAY_HAVE_DEBUG_BIND_STMTS)
2677     {
2678       gcc_assert (!adjust_vec.exists ());
2679       adjust_vec.create (32);
2680     }
2681   initialize_original_copy_tables ();
2682 
2683   /* Record the anchor bb at which the guard should be placed if the scalar
2684      loop might be preferred.  */
2685   basic_block anchor = loop_preheader_edge (loop)->src;
2686 
2687   /* Generate the number of iterations for the prolog loop.  We do this here
2688      so that we can also get the upper bound on the number of iterations.  */
2689   tree niters_prolog;
2690   int bound_prolog = 0;
2691   if (prolog_peeling)
2692     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2693 						  &bound_prolog);
2694   else
2695     niters_prolog = build_int_cst (type, 0);
2696 
2697   loop_vec_info epilogue_vinfo = NULL;
2698   if (vect_epilogues)
2699     {
2700       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2701       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2702     }
2703 
2704   tree niters_vector_mult_vf = NULL_TREE;
2705   /* Saving NITERs before the loop, as this may be changed by prologue.  */
2706   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2707   edge update_e = NULL, skip_e = NULL;
2708   unsigned int lowest_vf = constant_lower_bound (vf);
2709   /* If we know the number of scalar iterations for the main loop we should
2710      check whether after the main loop there are enough iterations left over
2711      for the epilogue.  */
2712   if (vect_epilogues
2713       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2714       && prolog_peeling >= 0
2715       && known_eq (vf, lowest_vf))
2716     {
2717       unsigned HOST_WIDE_INT eiters
2718 	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
2719 	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2720 
2721       eiters -= prolog_peeling;
2722       eiters
2723 	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2724 
2725       while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2726 	{
2727 	  delete epilogue_vinfo;
2728 	  epilogue_vinfo = NULL;
2729 	  if (loop_vinfo->epilogue_vinfos.length () == 0)
2730 	    {
2731 	      vect_epilogues = false;
2732 	      break;
2733 	    }
2734 	  epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2735 	  loop_vinfo->epilogue_vinfos.ordered_remove (0);
2736 	}
2737       vect_epilogues_updated_niters = true;
2738     }
2739   /* Prolog loop may be skipped.  */
2740   bool skip_prolog = (prolog_peeling != 0);
2741   /* Skip from before the vector loop straight to the epilog when there
2742      are not enough iterations to enter the vectorized loop.  If true we
2743      should perform runtime checks on NITERS to decide whether to skip the
2744      current vectorized loop.  If we know the number of scalar iterations
2745      we only add such a check when that number may potentially be smaller
2746      than the number of iterations required to enter this loop; for this
2747      we use the upper bounds on the prolog and epilog peeling.  When we
2748      don't know the number of iterations and don't require versioning the
2749      skip is not necessary, because we have asserted that there are enough
2750      scalar iterations to enter the main loop.  When we are versioning
2751      then we only add such a skip if we have chosen to vectorize the
2752      epilogue.  */
2753   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2754 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2755 				  bound_prolog + bound_epilog)
2756 		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2757 			 || vect_epilogues));
2758   /* Epilog loop must be executed if the number of iterations for epilog
2759      loop is known at compile time, otherwise we need to add a check at
2760      the end of vector loop and skip to the end of epilog loop.  */
2761   bool skip_epilog = (prolog_peeling < 0
2762 		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2763 		      || !vf.is_constant ());
2764   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2765   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2766     skip_epilog = false;
2767 
2768   if (skip_vector)
2769     {
2770       split_edge (loop_preheader_edge (loop));
2771 
2772       /* Due to the order in which we peel prolog and epilog, we first
2773 	 propagate probability to the whole loop.  The purpose is to
2774 	 avoid adjusting probabilities of both prolog and vector loops
2775 	 separately.  Note in this case, the probability of epilog loop
2776 	 needs to be scaled back later.  */
2777       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2778       if (prob_vector.initialized_p ())
2779 	{
2780 	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2781 	  scale_loop_profile (loop, prob_vector, 0);
2782 	}
2783     }
2784 
2785   dump_user_location_t loop_loc = find_loop_location (loop);
2786   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2787   if (vect_epilogues)
2788     /* Make sure to set the epilogue loop's scalar loop, such that we can
2789        use the original scalar loop as the remaining epilogue if necessary.  */
2790     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2791       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2792 
2793   if (prolog_peeling)
2794     {
2795       e = loop_preheader_edge (loop);
2796       if (!slpeel_can_duplicate_loop_p (loop, e))
2797 	{
2798 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2799 			   "loop can't be duplicated to preheader edge.\n");
2800 	  gcc_unreachable ();
2801 	}
2802       /* Peel prolog and put it on preheader edge of loop.  */
2803       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2804       if (!prolog)
2805 	{
2806 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2807 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2808 	  gcc_unreachable ();
2809 	}
2810       prolog->force_vectorize = false;
2811       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2812       first_loop = prolog;
2813       reset_original_copy_tables ();
2814 
2815       /* Update the number of iterations for prolog loop.  */
2816       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2817       vect_set_loop_condition (prolog, NULL, niters_prolog,
2818 			       step_prolog, NULL_TREE, false);
2819 
2820       /* Skip the prolog loop.  */
2821       if (skip_prolog)
2822 	{
2823 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2824 				    niters_prolog, build_int_cst (type, 0));
2825 	  guard_bb = loop_preheader_edge (prolog)->src;
2826 	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2827 	  guard_to = split_edge (loop_preheader_edge (loop));
2828 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2829 					   guard_to, guard_bb,
2830 					   prob_prolog.invert (),
2831 					   irred_flag);
2832 	  e = EDGE_PRED (guard_to, 0);
2833 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2834 	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2835 
2836 	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2837 	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
2838 	}
2839 
2840       /* Update init address of DRs.  */
2841       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2842       /* Update niters for vector loop.  */
2843       LOOP_VINFO_NITERS (loop_vinfo)
2844 	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2845       LOOP_VINFO_NITERSM1 (loop_vinfo)
2846 	= fold_build2 (MINUS_EXPR, type,
2847 		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2848       bool new_var_p = false;
2849       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2850       /* It's guaranteed that vector loop bound before vectorization is at
2851 	 least VF, so set range information for newly generated var.  */
2852       if (new_var_p)
2853 	set_range_info (niters, VR_RANGE,
2854 			wi::to_wide (build_int_cst (type, vf)),
2855 			wi::to_wide (TYPE_MAX_VALUE (type)));
2856 
2857       /* Prolog iterates at most bound_prolog times, latch iterates at
2858 	 most bound_prolog - 1 times.  */
2859       record_niter_bound (prolog, bound_prolog - 1, false, true);
2860       delete_update_ssa ();
2861       adjust_vec_debug_stmts ();
2862       scev_reset ();
2863     }
2864 
2865   if (epilog_peeling)
2866     {
2867       e = single_exit (loop);
2868       if (!slpeel_can_duplicate_loop_p (loop, e))
2869 	{
2870 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2871 			   "loop can't be duplicated to exit edge.\n");
2872 	  gcc_unreachable ();
2873 	}
2874       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2875 	 said epilog then we should use a copy of the main loop as a starting
2876 	 point.  This loop may have already had some preliminary transformations
2877 	 to allow for more optimal vectorization, for example if-conversion.
2878 	 If we are not vectorizing the epilog then we should use the scalar loop
2879 	 as the transformations mentioned above make little or no sense when not
2880 	 vectorizing.  */
2881       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2882       if (vop_to_rename)
2883 	{
2884 	  /* Vectorizing the main loop can sometimes introduce a vdef to
2885 	     a loop that previously didn't have one; see the comment above
2886 	     the definition of VOP_TO_RENAME for details.  The definition
2887 	     D that holds on E will then be different from the definition
2888 	     VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2889 	     rename VOP_TO_RENAME to D when copying the loop.
2890 
2891 	     The virtual operand is in LCSSA form for the main loop,
2892 	     and no stmt between the main loop and E needs a vdef,
2893 	     so we know that D is provided by a phi rather than by a
2894 	     vdef on a normal gimple stmt.  */
2895 	  basic_block vdef_bb = e->src;
2896 	  gphi *vphi;
2897 	  while (!(vphi = get_virtual_phi (vdef_bb)))
2898 	    vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2899 	  gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2900 	  set_current_def (vop_to_rename, gimple_phi_result (vphi));
2901 	}
2902       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2903       if (!epilog)
2904 	{
2905 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2906 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2907 	  gcc_unreachable ();
2908 	}
2909       epilog->force_vectorize = false;
2910       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2911 
2912       /* The scalar version of the loop may be preferred.  In this case, add
2913 	 a guard and skip to the epilog.  Note this only happens when the
2914 	 number of iterations of the loop is unknown at compile time,
2915 	 otherwise this won't be vectorized.  */
2916       if (skip_vector)
2917 	{
2918 	  /* An additional epilogue iteration is peeled if a gap exists.  */
2919 	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2920 						bound_prolog, bound_epilog,
2921 						th, &bound_scalar,
2922 						check_profitability);
2923 	  /* Build guard against NITERSM1 since NITERS may overflow.  */
2924 	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2925 	  guard_bb = anchor;
2926 	  guard_to = split_edge (loop_preheader_edge (epilog));
2927 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2928 					   guard_to, guard_bb,
2929 					   prob_vector.invert (),
2930 					   irred_flag);
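	  /* Record the guard edge; it is used below to tell, via a phi node,
	     whether the epilogue was entered from the vector loop or directly
	     from this skip edge.  */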
2931 	  skip_e = guard_e;
2932 	  e = EDGE_PRED (guard_to, 0);
2933 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2934 	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2935 
2936 	  /* Simply propagate profile info from guard_bb to guard_to which is
2937 	     a merge point of control flow.  */
2938 	  guard_to->count = guard_bb->count;
2939 
2940 	  /* Scale probability of epilog loop back.
2941 	     FIXME: We should avoid scaling down and back up.  Profile may
2942 	     get lost if we scale down to 0.  */
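	  /* The epilog blocks inherited counts scaled by PROB_VECTOR when the
	     whole loop was scaled above; dividing each count by
	     count.apply_probability (prob_vector) restores the original
	     count.  */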
2943 	  basic_block *bbs = get_loop_body (epilog);
2944 	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2945 	    bbs[i]->count = bbs[i]->count.apply_scale
2946 				 (bbs[i]->count,
2947 				  bbs[i]->count.apply_probability
2948 				    (prob_vector));
2949 	  free (bbs);
2950 	}
2951 
2952       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2953       /* If the loop is peeled a non-zero constant number of times, niters now
2954 	 refers to orig_niters - prolog_peeling; it won't overflow even if
2955 	 orig_niters overflows.  */
2956       niters_no_overflow |= (prolog_peeling > 0);
2957       vect_gen_vector_loop_niters (loop_vinfo, niters,
2958 				   niters_vector, step_vector,
2959 				   niters_no_overflow);
2960       if (!integer_onep (*step_vector))
2961 	{
2962 	  /* On exit from the loop we will have an easy way of calculating
2963 	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2964 	     until then.  */
2965 	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2966 	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2967 	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
2968 	}
2969       else
2970 	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2971 					     &niters_vector_mult_vf);
2972       /* Update IVs of original loop as if they were advanced by
2973 	 niters_vector_mult_vf steps.  */
2974       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2975       update_e = skip_vector ? e : loop_preheader_edge (epilog);
2976       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2977 					update_e);
2978 
2979       if (skip_epilog)
2980 	{
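	  /* Build a guard that tests whether the vector loop handled all
	     NITERS iterations (NITERS == NITERS_VECTOR_MULT_VF) and, if so,
	     branches from the vector loop's exit around the epilog.  */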
2981 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2982 				    niters, niters_vector_mult_vf);
2983 	  guard_bb = single_exit (loop)->dest;
2984 	  guard_to = split_edge (single_exit (epilog));
2985 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2986 					   skip_vector ? anchor : guard_bb,
2987 					   prob_epilog.invert (),
2988 					   irred_flag);
2989 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2990 					      single_exit (epilog));
2991 	  /* Only need to handle basic block before epilog loop if it's not
2992 	     the guard_bb, which is the case when skip_vector is true.  */
2993 	  if (guard_bb != bb_before_epilog)
2994 	    {
2995 	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2996 
2997 	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2998 	    }
2999 	  scale_loop_profile (epilog, prob_epilog, 0);
3000 	}
3001       else
3002 	slpeel_update_phi_nodes_for_lcssa (epilog);
3003 
3004       unsigned HOST_WIDE_INT bound;
3005       if (bound_scalar.is_constant (&bound))
3006 	{
3007 	  gcc_assert (bound != 0);
3008 	  /* -1 to convert loop iterations to latch iterations.  */
3009 	  record_niter_bound (epilog, bound - 1, false, true);
3010 	}
3011 
3012       delete_update_ssa ();
3013       adjust_vec_debug_stmts ();
3014       scev_reset ();
3015     }
3016 
3017   if (vect_epilogues)
3018     {
3019       epilog->aux = epilogue_vinfo;
3020       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3021 
3022       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3023 
3024       /* We must now calculate NITERS, the iterations performed by the previous
3025 	 loop, and EPILOGUE_NITERS, the iterations to be performed by the epilogue.  */
3026       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3027 				 niters_prolog, niters_vector_mult_vf);
3028 
3029       /* If SKIP_VECTOR is set we may skip the previous loop, so insert a phi
3030 	 node to determine whether we are coming from the previous vectorized
3031 	 loop via the update_e edge or from the skip_vector basic block via
3032 	 the skip_e edge.  */
3033       if (skip_vector)
3034 	{
3035 	  gcc_assert (update_e != NULL
3036 		      && skip_e != NULL
3037 		      && !vect_epilogues_updated_niters);
3038 	  gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3039 					   update_e->dest);
3040 	  tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3041 	  gimple *stmt = gimple_build_assign (new_ssa, niters);
3042 	  gimple_stmt_iterator gsi;
3043 	  if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3044 	      && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3045 	    {
3046 	      gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3047 	      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3048 	    }
3049 	  else
3050 	    {
3051 	      gsi = gsi_last_bb (update_e->src);
3052 	      gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3053 	    }
3054 
3055 	  niters = new_ssa;
3056 	  add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3057 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3058 		       UNKNOWN_LOCATION);
3059 	  niters = PHI_RESULT (new_phi);
3060 	}
3061 
3062       /* Set ADVANCE to the number of iterations performed by the previous
3063 	 loop and its prologue.  */
3064       *advance = niters;
3065 
3066       if (!vect_epilogues_updated_niters)
3067 	{
3068 	  /* Subtract the number of iterations performed by the vectorized loop
3069 	     from the number of total iterations.  */
3070 	  tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3071 					      before_loop_niters,
3072 					      niters);
3073 
3074 	  LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3075 	  LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3076 	    = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3077 			   epilogue_niters,
3078 			   build_one_cst (TREE_TYPE (epilogue_niters)));
3079 
3080 	  /* Decide what to do if the number of epilogue iterations is not
3081 	     a multiple of the epilogue loop's vectorization factor.
3082 	     We should have rejected the loop during the analysis phase
3083 	     if this fails.  */
3084 	  if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3085 							   true))
3086 	    gcc_unreachable ();
3087 	}
3088     }
3089 
3090   adjust_vec.release ();
3091   free_original_copy_tables ();
3092 
3093   return vect_epilogues ? epilog : NULL;
3094 }
3095 
3096 /* Function vect_create_cond_for_niters_checks.
3097 
3098    Create a conditional expression that represents the run-time checks for
3099    loop's niter.  The loop is guaranteed to terminate if the run-time
3100    checks hold.
3101 
3102    Input:
3103    COND_EXPR  - input conditional expression.  New conditions will be chained
3104 		with logical AND operation.  If it is NULL, the niter
3105 		assumptions become the condition on their own.
3106    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3107 		under which the niter information is valid.
3108 
3109    Output:
3110    COND_EXPR - conditional expression.
3111 
3112    The returned COND_EXPR is the conditional expression to be used in the
3113    if statement that controls which version of the loop gets executed at
3114    runtime.  */
3115 
3116 static void
3117 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3118 {
3119   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3120 
3121   if (*cond_expr)
3122     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3123 			      *cond_expr, part_cond_expr);
3124   else
3125     *cond_expr = part_cond_expr;
3126 }
3127 
3128 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3129    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3130 
3131 static void
3132 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3133 {
3134   if (*cond_expr)
3135     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3136 			      *cond_expr, part_cond_expr);
3137   else
3138     *cond_expr = part_cond_expr;
3139 }
3140 
3141 /* Function vect_create_cond_for_align_checks.
3142 
3143    Create a conditional expression that represents the alignment checks for
3144    all of the data references (array element references) whose alignment must be
3145    checked at runtime.
3146 
3147    Input:
3148    COND_EXPR  - input conditional expression.  New conditions will be chained
3149                 with logical AND operation.
3150    LOOP_VINFO - two fields of the loop information are used.
3151                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3152                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3153 
3154    Output:
3155    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3156                          expression.
3157    The returned value is the conditional expression to be used in the if
3158    statement that controls which version of the loop gets executed at runtime.
3159 
3160    The algorithm makes two assumptions:
3161      1) The number of bytes "n" in a vector is a power of 2.
3162      2) An address "a" is aligned if a%n is zero, and this
3163         test can be done as a&(n-1) == 0.  For example, for 16
3164         byte vectors the test is a&0xf == 0.  */
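/* An illustrative sketch only (the temporary names follow those created
   below): for two data references A and B and 16-byte vectors (mask = 0xf)
   the emitted sequence is roughly

     addr2int0 = (signed ptr-sized int) &<first vector of A>;
     addr2int1 = (signed ptr-sized int) &<first vector of B>;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;
     <cond> = (andmask == 0);	<- true iff both addresses are aligned.  */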
3165 
3166 static void
3167 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3168                                    tree *cond_expr,
3169 				   gimple_seq *cond_expr_stmt_list)
3170 {
3171   vec<stmt_vec_info> may_misalign_stmts
3172     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3173   stmt_vec_info stmt_info;
3174   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3175   tree mask_cst;
3176   unsigned int i;
3177   tree int_ptrsize_type;
3178   char tmp_name[20];
3179   tree or_tmp_name = NULL_TREE;
3180   tree and_tmp_name;
3181   gimple *and_stmt;
3182   tree ptrsize_zero;
3183   tree part_cond_expr;
3184 
3185   /* Check that mask is one less than a power of 2, i.e., mask is
3186      all zeros followed by all ones.  */
3187   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3188 
3189   int_ptrsize_type = signed_type_for (ptr_type_node);
3190 
3191   /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3192      of the first vector of the i'th data reference. */
3193 
3194   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3195     {
3196       gimple_seq new_stmt_list = NULL;
3197       tree addr_base;
3198       tree addr_tmp_name;
3199       tree new_or_tmp_name;
3200       gimple *addr_stmt, *or_stmt;
3201       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3202       bool negative = tree_int_cst_compare
3203 	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3204       tree offset = negative
3205 	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
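      /* For a negative step the access runs downwards, so the "first vector"
	 effectively starts TYPE_VECTOR_SUBPARTS - 1 elements before the first
	 accessed element; OFFSET points the base address computation there.  */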
3206 
3207       /* create: addr_tmp = (int)(address_of_first_vector) */
3208       addr_base =
3209 	vect_create_addr_base_for_vector_ref (loop_vinfo,
3210 					      stmt_info, &new_stmt_list,
3211 					      offset);
3212       if (new_stmt_list != NULL)
3213 	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3214 
3215       sprintf (tmp_name, "addr2int%d", i);
3216       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3217       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3218       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3219 
3220       /* The addresses are ORed together.  */
3221 
3222       if (or_tmp_name != NULL_TREE)
3223         {
3224           /* create: or_tmp = or_tmp | addr_tmp */
3225           sprintf (tmp_name, "orptrs%d", i);
3226 	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3227 	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3228 					 or_tmp_name, addr_tmp_name);
3229 	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3230           or_tmp_name = new_or_tmp_name;
3231         }
3232       else
3233         or_tmp_name = addr_tmp_name;
3234 
3235     } /* end for i */
3236 
3237   mask_cst = build_int_cst (int_ptrsize_type, mask);
3238 
3239   /* create: and_tmp = or_tmp & mask  */
3240   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3241 
3242   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3243 				  or_tmp_name, mask_cst);
3244   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3245 
3246   /* Make and_tmp the left operand of the conditional test against zero.
3247      if and_tmp has a nonzero bit then some address is unaligned.  */
3248   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3249   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3250 				and_tmp_name, ptrsize_zero);
3251   chain_cond_expr (cond_expr, part_cond_expr);
3252 }
3253 
3254 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3255    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3256    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3257    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3258 
3259 static void
3260 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3261 {
3262   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3263   unsigned int i;
3264   vec_object_pair *pair;
3265   FOR_EACH_VEC_ELT (pairs, i, pair)
3266     {
3267       tree addr1 = build_fold_addr_expr (pair->first);
3268       tree addr2 = build_fold_addr_expr (pair->second);
3269       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3270 					 addr1, addr2);
3271       chain_cond_expr (cond_expr, part_cond_expr);
3272     }
3273 }
3274 
3275 /* Create an expression that is true when all lower-bound conditions for
3276    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3277 
3278 static void
3279 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3280 {
3281   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3282   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3283     {
3284       tree expr = lower_bounds[i].expr;
3285       tree type = unsigned_type_for (TREE_TYPE (expr));
3286       expr = fold_convert (type, expr);
3287       poly_uint64 bound = lower_bounds[i].min_value;
3288       if (!lower_bounds[i].unsigned_p)
3289 	{
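	  /* For a signed EXPR the test EXPR >= BOUND is done in unsigned
	     arithmetic as EXPR + (BOUND - 1) >= 2 * BOUND - 1.  E.g. with
	     BOUND == 4 the check becomes EXPR + 3 >= 7, which holds for
	     EXPR >= 4 and fails for EXPR in [-3, 3] because negative values
	     wrap below BOUND - 1.  */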
3290 	  expr = fold_build2 (PLUS_EXPR, type, expr,
3291 			      build_int_cstu (type, bound - 1));
3292 	  bound += bound - 1;
3293 	}
3294       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3295 					 build_int_cstu (type, bound));
3296       chain_cond_expr (cond_expr, part_cond_expr);
3297     }
3298 }
3299 
3300 /* Function vect_create_cond_for_alias_checks.
3301 
3302    Create a conditional expression that represents the run-time checks for
3303    overlapping of address ranges represented by a list of data references
3304    relations passed as input.
3305 
3306    Input:
3307    COND_EXPR  - input conditional expression.  New conditions will be chained
3308                 with logical AND operation.  If it is NULL, then the function
3309                 is used to return the number of alias checks.
3310    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3311 	        to be checked.
3312 
3313    Output:
3314    COND_EXPR - conditional expression.
3315 
3316    The returned COND_EXPR is the conditional expression to be used in the if
3317    statement that controls which version of the loop gets executed at runtime.
3318 */
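/* Roughly, for one pair of data references A and B with segment lengths
   LEN_A and LEN_B the emitted check has the form

     &A + LEN_A <= &B  ||  &B + LEN_B <= &A

   i.e. the two accessed segments do not overlap; the exact expression is
   built by create_runtime_alias_checks.  */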
3319 
3320 void
3321 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3322 {
3323   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3324     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3325 
3326   if (comp_alias_ddrs.is_empty ())
3327     return;
3328 
3329   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3330 			       &comp_alias_ddrs, cond_expr);
3331   if (dump_enabled_p ())
3332     dump_printf_loc (MSG_NOTE, vect_location,
3333 		     "created %u versioning for alias checks.\n",
3334 		     comp_alias_ddrs.length ());
3335 }
3336 
3337 
3338 /* Function vect_loop_versioning.
3339 
3340    If the loop has data references that may or may not be aligned and/or
3341    has data reference relations whose independence was not proven then
3342    two versions of the loop need to be generated, one which is vectorized
3343    and one which isn't.  A test is then generated to control which of the
3344    loops is executed.  The test checks for the alignment of all of the
3345    data references that may or may not be aligned.  An additional
3346    sequence of runtime tests is generated for each pair of DDRs whose
3347    independence was not proven.  The vectorized version of the loop is
3348    executed only if both the alias and alignment tests are passed.
3349 
3350    The test generated to check which version of loop is executed
3351    is modified to also check for profitability as indicated by the
3352    cost model threshold TH.
3353 
3354    The versioning precondition(s) are placed in *COND_EXPR and
3355    *COND_EXPR_STMT_LIST.  */
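/* After versioning the control flow is roughly

     if (cond_expr)	<- cost model, niters, alignment, alias, simd-if checks
       loop;		<- the version that will be vectorized
     else
       nloop;		<- the scalar copy left unvectorized

   with COND_EXPR_STMT_LIST emitted just before the condition.  */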
3356 
3357 class loop *
3358 vect_loop_versioning (loop_vec_info loop_vinfo,
3359 		      gimple *loop_vectorized_call)
3360 {
3361   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3362   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3363   basic_block condition_bb;
3364   gphi_iterator gsi;
3365   gimple_stmt_iterator cond_exp_gsi;
3366   basic_block merge_bb;
3367   basic_block new_exit_bb;
3368   edge new_exit_e, e;
3369   gphi *orig_phi, *new_phi;
3370   tree cond_expr = NULL_TREE;
3371   gimple_seq cond_expr_stmt_list = NULL;
3372   tree arg;
3373   profile_probability prob = profile_probability::likely ();
3374   gimple_seq gimplify_stmt_list = NULL;
3375   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3376   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3377   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3378   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3379   poly_uint64 versioning_threshold
3380     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3381   tree version_simd_if_cond
3382     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3383   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3384 
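  /* Only enter the vectorized path if the scalar iteration count reaches
     both the cost model threshold TH and the versioning threshold;
     SCALAR_LOOP_ITERS is NITERSM1, hence the "- 1" in the comparisons.  */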
3385   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3386       && !ordered_p (th, versioning_threshold))
3387     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3388 			     build_int_cst (TREE_TYPE (scalar_loop_iters),
3389 					    th - 1));
3390   if (maybe_ne (versioning_threshold, 0U))
3391     {
3392       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3393 			       build_int_cst (TREE_TYPE (scalar_loop_iters),
3394 					      versioning_threshold - 1));
3395       if (cond_expr)
3396 	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3397 				 expr, cond_expr);
3398       else
3399 	cond_expr = expr;
3400     }
3401 
3402   if (version_niter)
3403     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3404 
3405   if (cond_expr)
3406     cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3407 					&cond_expr_stmt_list,
3408 					is_gimple_condexpr, NULL_TREE);
3409 
3410   if (version_align)
3411     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3412 				       &cond_expr_stmt_list);
3413 
3414   if (version_alias)
3415     {
3416       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3417       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3418       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3419     }
3420 
3421   if (version_simd_if_cond)
3422     {
3423       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3424       if (flag_checking)
3425 	if (basic_block bb
3426 	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3427 	  gcc_assert (bb != loop->header
3428 		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3429 		      && (scalar_loop == NULL
3430 			  || (bb != scalar_loop->header
3431 			      && dominated_by_p (CDI_DOMINATORS,
3432 						 scalar_loop->header, bb))));
3433       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3434       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3435 			    version_simd_if_cond, zero);
3436       if (cond_expr)
3437         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3438 				 c, cond_expr);
3439       else
3440         cond_expr = c;
3441       if (dump_enabled_p ())
3442 	dump_printf_loc (MSG_NOTE, vect_location,
3443 			 "created versioning for simd if condition check.\n");
3444     }
3445 
3446   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3447 				      &gimplify_stmt_list,
3448 				      is_gimple_condexpr, NULL_TREE);
3449   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3450 
3451   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3452      invariant in.  */
3453   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3454   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3455        !gsi_end_p (gsi); gsi_next (&gsi))
3456     {
3457       gimple *stmt = gsi_stmt (gsi);
3458       update_stmt (stmt);
3459       ssa_op_iter iter;
3460       use_operand_p use_p;
3461       basic_block def_bb;
3462       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3463 	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3464 	    && flow_bb_inside_loop_p (outermost, def_bb))
3465 	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3466     }
3467 
3468   /* Search for the outermost loop we can version.  Avoid versioning of
3469      non-perfect nests but allow if-conversion versioned loops inside.  */
3470   class loop *loop_to_version = loop;
3471   if (flow_loop_nested_p (outermost, loop))
3472     {
3473       if (dump_enabled_p ())
3474 	dump_printf_loc (MSG_NOTE, vect_location,
3475 			 "trying to apply versioning to outer loop %d\n",
3476 			 outermost->num);
3477       if (outermost->num == 0)
3478 	outermost = superloop_at_depth (loop, 1);
3479       /* And avoid applying versioning on non-perfect nests.  */
3480       while (loop_to_version != outermost
3481 	     && single_exit (loop_outer (loop_to_version))
3482 	     && (!loop_outer (loop_to_version)->inner->next
3483 		 || vect_loop_vectorized_call (loop_to_version))
3484 	     && (!loop_outer (loop_to_version)->inner->next
3485 		 || !loop_outer (loop_to_version)->inner->next->next))
3486 	loop_to_version = loop_outer (loop_to_version);
3487     }
3488 
3489   /* Apply versioning.  If there is already a scalar version created by
3490      if-conversion, re-use that.  Note we cannot re-use the copy of
3491      an if-converted outer-loop when vectorizing the inner loop only.  */
3492   gcond *cond;
3493   if ((!loop_to_version->inner || loop == loop_to_version)
3494       && loop_vectorized_call)
3495     {
3496       gcc_assert (scalar_loop);
3497       condition_bb = gimple_bb (loop_vectorized_call);
3498       cond = as_a <gcond *> (last_stmt (condition_bb));
3499       gimple_cond_set_condition_from_tree (cond, cond_expr);
3500       update_stmt (cond);
3501 
3502       if (cond_expr_stmt_list)
3503 	{
3504 	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3505 	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3506 				 GSI_SAME_STMT);
3507 	}
3508 
3509       /* if-conversion uses profile_probability::always () for both paths,
3510 	 so reset both paths' probabilities appropriately.  */
3511       edge te, fe;
3512       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3513       te->probability = prob;
3514       fe->probability = prob.invert ();
3515       /* We can scale the loop counts immediately but have to postpone
3516          scaling the scalar loop because we re-use it during peeling.  */
3517       scale_loop_frequencies (loop_to_version, te->probability);
3518       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3519 
3520       nloop = scalar_loop;
3521       if (dump_enabled_p ())
3522 	dump_printf_loc (MSG_NOTE, vect_location,
3523 			 "reusing %sloop version created by if conversion\n",
3524 			 loop_to_version != loop ? "outer " : "");
3525     }
3526   else
3527     {
3528       if (loop_to_version != loop
3529 	  && dump_enabled_p ())
3530 	dump_printf_loc (MSG_NOTE, vect_location,
3531 			 "applying loop versioning to outer loop %d\n",
3532 			 loop_to_version->num);
3533 
3534       initialize_original_copy_tables ();
3535       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3536 			    prob, prob.invert (), prob, prob.invert (), true);
3537       gcc_assert (nloop);
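      /* loop_version duplicated LOOP_TO_VERSION (possibly an outer loop);
	 fetch the copy of LOOP itself, which is what we return.  */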
3538       nloop = get_loop_copy (loop);
3539 
3540       /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3541 	 reap those otherwise, and they also refer to the original
3542 	 loops.  */
3543       class loop *l = loop;
3544       while (gimple *call = vect_loop_vectorized_call (l))
3545 	{
3546 	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3547 	  fold_loop_internal_call (call, boolean_false_node);
3548 	  l = loop_outer (l);
3549 	}
3550       free_original_copy_tables ();
3551 
3552       if (cond_expr_stmt_list)
3553 	{
3554 	  cond_exp_gsi = gsi_last_bb (condition_bb);
3555 	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3556 				 GSI_SAME_STMT);
3557 	}
3558 
3559       /* Loop versioning violates an assumption we try to maintain during
3560 	 vectorization - that the loop exit block has a single predecessor.
3561 	 After versioning, the exit block of both loop versions is the same
3562 	 basic block (i.e. it has two predecessors). Just in order to simplify
3563 	 following transformations in the vectorizer, we fix this situation
3564 	 here by adding a new (empty) block on the exit-edge of the loop,
3565 	 with the proper loop-exit phis to maintain loop-closed-form.
3566 	 If loop versioning wasn't done from loop, but scalar_loop instead,
3567 	 merge_bb will already have just a single predecessor.  */
3568 
3569       merge_bb = single_exit (loop_to_version)->dest;
3570       if (EDGE_COUNT (merge_bb->preds) >= 2)
3571 	{
3572 	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3573 	  new_exit_bb = split_edge (single_exit (loop_to_version));
3574 	  new_exit_e = single_exit (loop_to_version);
3575 	  e = EDGE_SUCC (new_exit_bb, 0);
3576 
3577 	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3578 	       gsi_next (&gsi))
3579 	    {
3580 	      tree new_res;
3581 	      orig_phi = gsi.phi ();
3582 	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3583 	      new_phi = create_phi_node (new_res, new_exit_bb);
3584 	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3585 	      add_phi_arg (new_phi, arg, new_exit_e,
3586 			   gimple_phi_arg_location_from_edge (orig_phi, e));
3587 	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3588 	    }
3589 	}
3590 
3591       update_ssa (TODO_update_ssa);
3592     }
3593 
3594   if (version_niter)
3595     {
3596       /* The versioned loop could be infinite, so we need to clear the
3597 	 existing niter information, which was copied from the original loop.  */
3598       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3599       vect_free_loop_info_assumptions (nloop);
3600       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3601       loop_constraint_set (loop, LOOP_C_INFINITE);
3602     }
3603 
3604   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3605       && dump_enabled_p ())
3606     {
3607       if (version_alias)
3608         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3609 			 vect_location,
3610                          "loop versioned for vectorization because of "
3611 			 "possible aliasing\n");
3612       if (version_align)
3613         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3614 			 vect_location,
3615                          "loop versioned for vectorization to enhance "
3616 			 "alignment\n");
3617 
3618     }
3619 
3620   return nloop;
3621 }
3622