1 /* Loop Vectorization
2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56 
57 /* Loop Vectorization Pass.
58 
59    This pass tries to vectorize loops.
60 
61    For example, the vectorizer transforms the following simple loop:
62 
63         short a[N]; short b[N]; short c[N]; int i;
64 
65         for (i=0; i<N; i++){
66           a[i] = b[i] + c[i];
67         }
68 
69    as if it had been manually vectorized by rewriting the source code into:
70 
71         typedef int __attribute__((mode(V8HI))) v8hi;
72         short a[N];  short b[N]; short c[N];   int i;
73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74         v8hi va, vb, vc;
75 
76         for (i=0; i<N/8; i++){
77           vb = pb[i];
78           vc = pc[i];
79           va = vb + vc;
80           pa[i] = va;
81         }
82 
83         The main entry to this pass is vectorize_loops(), in which
84    the vectorizer applies a set of analyses on a given set of loops,
85    followed by the actual vectorization transformation for the loops that
86    had successfully passed the analysis phase.
87         Throughout this pass we make a distinction between two types of
88    data: scalars (which are represented by SSA_NAMES), and memory references
89    ("data-refs").  These two types of data require different handling both
90    during analysis and transformation. The types of data-refs that the
91    vectorizer currently supports are ARRAY_REFs whose base is an array DECL
92    (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
93    accesses are required to have a simple (consecutive) access pattern.
94 
95    Analysis phase:
96    ===============
97         The driver for the analysis phase is vect_analyze_loop().
98    It applies a set of analyses, some of which rely on the scalar evolution
99    analyzer (scev) developed by Sebastian Pop.
100 
101         During the analysis phase the vectorizer records some information
102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103    loop, as well as general information about the loop as a whole, which is
104    recorded in a "loop_vec_info" struct attached to each loop.
105 
106    Transformation phase:
107    =====================
108         The loop transformation phase scans all the stmts in the loop, and
109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110    the loop that needs to be vectorized.  It inserts the vector code sequence
111    just before the scalar stmt S, and records a pointer to the vector code
112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113    attached to S).  This pointer will be used for the vectorization of following
114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115    otherwise, we rely on dead code elimination for removing it.
116 
117         For example, say stmt S1 was vectorized into stmt VS1:
118 
119    VS1: vb = px[i];
120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121    S2:  a = b;
122 
123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
126    resulting sequence would be:
127 
128    VS1: vb = px[i];
129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130    VS2: va = vb;
131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132 
133         Operands that are not SSA_NAMEs are data-refs that appear in
134    load/store operations (like 'x[i]' in S1), and are handled differently.
135 
136    Target modeling:
137    =================
138         Currently the only target specific information that is used is the
139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140    Targets that can support different vector sizes will, for now, need
141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
142    flexibility will be added in the future.
143 
144         Since we only vectorize operations whose vector form can be
145    expressed using existing tree codes, to verify that an operation is
146    supported, the vectorizer checks the relevant optab at the relevant
147    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
148    the value found is CODE_FOR_nothing, then there's no target support, and
149    we can't vectorize the stmt.
150 
151    For additional information on this project see:
152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
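
/* Illustrative sketch only, not used by the pass: the target-support check
   described above boils down to an optab query of roughly this shape, here
   assuming the V8HImode addition from the example:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   -- no target support, the stmt cannot be vectorized

   optab_handler and CODE_FOR_nothing are the existing optab interfaces; the
   actual checks are carried out by the vectorizable_* analysis routines.  */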
154 
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 					       bool *, bool *);
158 
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161    may already be set for general statements (not just data refs).  */
162 
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 			      bool vectype_maybe_set_p,
166 			      poly_uint64 *vf)
167 {
168   gimple *stmt = stmt_info->stmt;
169 
170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171        && !STMT_VINFO_LIVE_P (stmt_info))
172       || gimple_clobber_p (stmt))
173     {
174       if (dump_enabled_p ())
175 	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176       return opt_result::success ();
177     }
178 
179   tree stmt_vectype, nunits_vectype;
180   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 						   &stmt_vectype,
182 						   &nunits_vectype);
183   if (!res)
184     return res;
185 
186   if (stmt_vectype)
187     {
188       if (STMT_VINFO_VECTYPE (stmt_info))
189 	/* The only cases in which a vectype has already been set are for stmts
190 	   that contain a data ref, or for "pattern-stmts" (stmts generated
191 	   by the vectorizer to represent/replace a certain idiom).  */
192 	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 		     || vectype_maybe_set_p)
194 		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195       else
196 	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197     }
198 
199   if (nunits_vectype)
200     vect_update_max_nunits (vf, nunits_vectype);
201 
202   return opt_result::success ();
203 }
204 
205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
206    types of STMT_INFO and all attached pattern statements and update
207    the vectorization factor VF accordingly.  Return true on success
208    or false if something prevented vectorization.  */
209 
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 			    stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214   if (dump_enabled_p ())
215     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 		     stmt_info->stmt);
217   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218   if (!res)
219     return res;
220 
221   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222       && STMT_VINFO_RELATED_STMT (stmt_info))
223     {
224       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226 
227       /* If a pattern statement has def stmts, analyze them too.  */
228       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 	   !gsi_end_p (si); gsi_next (&si))
230 	{
231 	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 	  if (dump_enabled_p ())
233 	    dump_printf_loc (MSG_NOTE, vect_location,
234 			     "==> examining pattern def stmt: %G",
235 			     def_stmt_info->stmt);
236 	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 	  if (!res)
238 	    return res;
239 	}
240 
241       if (dump_enabled_p ())
242 	dump_printf_loc (MSG_NOTE, vect_location,
243 			 "==> examining pattern statement: %G",
244 			 stmt_info->stmt);
245       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246       if (!res)
247 	return res;
248     }
249 
250   return opt_result::success ();
251 }
252 
253 /* Function vect_determine_vectorization_factor
254 
255    Determine the vectorization factor (VF).  VF is the number of data elements
256    that are operated upon in parallel in a single iteration of the vectorized
257    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
258    on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
259    elements can fit in a single vector register.
260 
261    We currently support vectorization of loops in which all types operated upon
262    are of the same size.  Therefore this function currently sets VF according to
263    the size of the types operated upon, and fails if there are multiple sizes
264    in the loop.
265 
266    VF is also the factor by which the loop iterations are strip-mined, e.g.:
267    original loop:
268         for (i=0; i<N; i++){
269           a[i] = b[i] + c[i];
270         }
271 
272    vectorized loop:
273         for (i=0; i<N; i+=VF){
274           a[i:VF] = b[i:VF] + c[i:VF];
275         }
276 */
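
/* Illustration of how the factor is accumulated below (assumed 16-byte
   vectors; the vectype names are made up): starting from 1, each call to
   vect_update_max_nunits raises VF to a common multiple of the number of
   lanes of the vectypes seen so far:

     poly_uint64 vf = 1;
     vect_update_max_nunits (&vf, v4si_vectype);   -- vf becomes 4
     vect_update_max_nunits (&vf, v8hi_vectype);   -- vf becomes 8  */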
277 
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283   unsigned nbbs = loop->num_nodes;
284   poly_uint64 vectorization_factor = 1;
285   tree scalar_type = NULL_TREE;
286   gphi *phi;
287   tree vectype;
288   stmt_vec_info stmt_info;
289   unsigned i;
290 
291   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292 
293   for (i = 0; i < nbbs; i++)
294     {
295       basic_block bb = bbs[i];
296 
297       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 	   gsi_next (&si))
299 	{
300 	  phi = si.phi ();
301 	  stmt_info = loop_vinfo->lookup_stmt (phi);
302 	  if (dump_enabled_p ())
303 	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 			     phi);
305 
306 	  gcc_assert (stmt_info);
307 
308 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
309 	      || STMT_VINFO_LIVE_P (stmt_info))
310             {
311 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312               scalar_type = TREE_TYPE (PHI_RESULT (phi));
313 
314 	      if (dump_enabled_p ())
315 		dump_printf_loc (MSG_NOTE, vect_location,
316 				 "get vectype for scalar type:  %T\n",
317 				 scalar_type);
318 
319 	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 	      if (!vectype)
321 		return opt_result::failure_at (phi,
322 					       "not vectorized: unsupported "
323 					       "data-type %T\n",
324 					       scalar_type);
325 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
326 
327 	      if (dump_enabled_p ())
328 		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 				 vectype);
330 
331 	      if (dump_enabled_p ())
332 		{
333 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 		  dump_printf (MSG_NOTE, "\n");
336 		}
337 
338 	      vect_update_max_nunits (&vectorization_factor, vectype);
339 	    }
340 	}
341 
342       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 	   gsi_next (&si))
344 	{
345 	  if (is_gimple_debug (gsi_stmt (si)))
346 	    continue;
347 	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 	  opt_result res
349 	    = vect_determine_vf_for_stmt (loop_vinfo,
350 					  stmt_info, &vectorization_factor);
351 	  if (!res)
352 	    return res;
353         }
354     }
355 
356   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
357   if (dump_enabled_p ())
358     {
359       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360       dump_dec (MSG_NOTE, vectorization_factor);
361       dump_printf (MSG_NOTE, "\n");
362     }
363 
364   if (known_le (vectorization_factor, 1U))
365     return opt_result::failure_at (vect_location,
366 				   "not vectorized: unsupported data-type\n");
367   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368   return opt_result::success ();
369 }
370 
371 
372 /* Function vect_is_simple_iv_evolution.
373 
374    FORNOW: A simple evolution of an induction variable in the loop is
375    considered a polynomial evolution.  */
376 
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379                              tree * step)
380 {
381   tree init_expr;
382   tree step_expr;
383   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384   basic_block bb;
385 
386   /* When there is no evolution in this loop, the evolution function
387      is not "simple".  */
388   if (evolution_part == NULL_TREE)
389     return false;
390 
391   /* When the evolution is a polynomial of degree >= 2
392      the evolution function is not "simple".  */
393   if (tree_is_chrec (evolution_part))
394     return false;
395 
396   step_expr = evolution_part;
397   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398 
399   if (dump_enabled_p ())
400     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
401 		     step_expr, init_expr);
402 
403   *init = init_expr;
404   *step = step_expr;
405 
406   if (TREE_CODE (step_expr) != INTEGER_CST
407       && (TREE_CODE (step_expr) != SSA_NAME
408 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 		  || !flag_associative_math)))
413       && (TREE_CODE (step_expr) != REAL_CST
414 	  || !flag_associative_math))
415     {
416       if (dump_enabled_p ())
417         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418                          "step unknown.\n");
419       return false;
420     }
421 
422   return true;
423 }
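
/* For illustration (hypothetical loop number 1): for an induction such as
   "for (i = 0; i < n; i++)" the access function of i is the chrec
   {0, +, 1}_1, so the routine above returns init = 0 and step = 1.  An
   evolution part that is itself a chrec, e.g. for i += j where j is another
   induction, is rejected by the tree_is_chrec check, since the evolution is
   then a polynomial of degree >= 2.  */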
424 
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426    what we are assuming is a double reduction.  For example, given
427    a structure like this:
428 
429       outer1:
430 	x_1 = PHI <x_4(outer2), ...>;
431 	...
432 
433       inner:
434 	x_2 = PHI <x_1(outer1), ...>;
435 	...
436 	x_3 = ...;
437 	...
438 
439       outer2:
440 	x_4 = PHI <x_3(inner)>;
441 	...
442 
443    outer loop analysis would treat x_1 as a double reduction phi and
444    this function would then return true for x_2.  */
445 
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449   use_operand_p use_p;
450   ssa_op_iter op_iter;
451   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 	return true;
455   return false;
456 }
457 
458 /* Function vect_analyze_scalar_cycles_1.
459 
460    Examine the cross iteration def-use cycles of scalar variables
461    in LOOP.  LOOP_VINFO represents the loop that is now being
462    considered for vectorization (can be LOOP, or an outer-loop
463    enclosing LOOP).  */
464 
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468   basic_block bb = loop->header;
469   tree init, step;
470   auto_vec<stmt_vec_info, 64> worklist;
471   gphi_iterator gsi;
472   bool double_reduc, reduc_chain;
473 
474   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475 
476   /* First - identify all inductions.  Reduction detection assumes that all the
477      inductions have been identified, therefore, this order must not be
478      changed.  */
479   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480     {
481       gphi *phi = gsi.phi ();
482       tree access_fn = NULL;
483       tree def = PHI_RESULT (phi);
484       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485 
486       if (dump_enabled_p ())
487 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488 
489       /* Skip virtual phi's.  The data dependences that are associated with
490          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
491       if (virtual_operand_p (def))
492 	continue;
493 
494       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495 
496       /* Analyze the evolution function.  */
497       access_fn = analyze_scalar_evolution (loop, def);
498       if (access_fn)
499 	{
500 	  STRIP_NOPS (access_fn);
501 	  if (dump_enabled_p ())
502 	    dump_printf_loc (MSG_NOTE, vect_location,
503 			     "Access function of PHI: %T\n", access_fn);
504 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 	    = initial_condition_in_loop_num (access_fn, loop->num);
506 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 	    = evolution_part_in_loop_num (access_fn, loop->num);
508 	}
509 
510       if (!access_fn
511 	  || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 	      && TREE_CODE (step) != INTEGER_CST))
515 	{
516 	  worklist.safe_push (stmt_vinfo);
517 	  continue;
518 	}
519 
520       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 		  != NULL_TREE);
522       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523 
524       if (dump_enabled_p ())
525 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527     }
528 
529 
530   /* Second - identify all reductions and nested cycles.  */
531   while (worklist.length () > 0)
532     {
533       stmt_vec_info stmt_vinfo = worklist.pop ();
534       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535       tree def = PHI_RESULT (phi);
536 
537       if (dump_enabled_p ())
538 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539 
540       gcc_assert (!virtual_operand_p (def)
541 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542 
543       stmt_vec_info reduc_stmt_info
544 	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 				    &reduc_chain);
546       if (reduc_stmt_info)
547         {
548 	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 	  if (double_reduc)
551 	    {
552 	      if (dump_enabled_p ())
553 		dump_printf_loc (MSG_NOTE, vect_location,
554 				 "Detected double reduction.\n");
555 
556               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558             }
559           else
560             {
561               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562                 {
563                   if (dump_enabled_p ())
564                     dump_printf_loc (MSG_NOTE, vect_location,
565 				     "Detected vectorizable nested cycle.\n");
566 
567                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568                 }
569               else
570                 {
571                   if (dump_enabled_p ())
572                     dump_printf_loc (MSG_NOTE, vect_location,
573 				     "Detected reduction.\n");
574 
575                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577                   /* Store the reduction cycles for possible vectorization in
578                      loop-aware SLP if it was not detected as reduction
579 		     chain.  */
580 		  if (! reduc_chain)
581 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 		      (reduc_stmt_info);
583                 }
584             }
585         }
586       else
587         if (dump_enabled_p ())
588           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 			   "Unknown def-use cycle pattern.\n");
590     }
591 }
592 
593 
594 /* Function vect_analyze_scalar_cycles.
595 
596    Examine the cross iteration def-use cycles of scalar variables, by
597    analyzing the loop-header PHIs of scalar variables.  Classify each
598    cycle as one of the following: invariant, induction, reduction, unknown.
599    We do that for the loop represented by LOOP_VINFO, and also for its
600    inner-loop, if it exists.
601    Examples for scalar cycles:
602 
603    Example1: reduction:
604 
605               loop1:
606               for (i=0; i<N; i++)
607                  sum += a[i];
608 
609    Example2: induction:
610 
611               loop2:
612               for (i=0; i<N; i++)
613                  a[i] = i;  */
614 
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619 
620   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621 
622   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623      Reductions in such an inner-loop therefore have different properties than
624      the reductions in the nest that gets vectorized:
625      1. When vectorized, they are executed in the same order as in the original
626         scalar loop, so we can't change the order of computation when
627         vectorizing them.
628      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629         current checks are too strict.  */
630 
631   if (loop->inner)
632     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634 
635 /* Transfer group and reduction information from STMT_INFO to its
636    pattern stmt.  */
637 
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642   stmt_vec_info stmtp;
643   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646   do
647     {
648       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 			   == STMT_VINFO_DEF_TYPE (stmt_info));
651       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653       if (stmt_info)
654 	REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 	  = STMT_VINFO_RELATED_STMT (stmt_info);
656     }
657   while (stmt_info);
658 }
659 
660 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
661 
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665   stmt_vec_info first;
666   unsigned i;
667 
668   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669     {
670       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671       while (next)
672 	{
673 	  if ((STMT_VINFO_IN_PATTERN_P (next)
674 	       != STMT_VINFO_IN_PATTERN_P (first))
675 	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 	    break;
677 	  next = REDUC_GROUP_NEXT_ELEMENT (next);
678 	}
679       /* If all reduction chain members are well-formed patterns adjust
680 	 the group to group the pattern stmts instead.  */
681       if (! next
682 	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
683 	{
684 	  if (STMT_VINFO_IN_PATTERN_P (first))
685 	    {
686 	      vect_fixup_reduc_chain (first);
687 	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 		= STMT_VINFO_RELATED_STMT (first);
689 	    }
690 	}
691       /* If not all stmts in the chain are patterns, or if we failed
692 	 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
693 	 it as a regular reduction instead.  */
694       else
695 	{
696 	  stmt_vec_info vinfo = first;
697 	  stmt_vec_info last = NULL;
698 	  while (vinfo)
699 	    {
700 	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 	      last = vinfo;
704 	      vinfo = next;
705 	    }
706 	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 	    = vect_internal_def;
708 	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 	  --i;
711 	}
712     }
713 }
714 
715 /* Function vect_get_loop_niters.
716 
717    Determine how many times the loop is executed and place that count
718    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
719    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
720    niter information holds in ASSUMPTIONS.
721 
722    Return the loop exit condition.  */
723 
724 
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 		      tree *number_of_iterations, tree *number_of_iterationsm1)
728 {
729   edge exit = single_exit (loop);
730   class tree_niter_desc niter_desc;
731   tree niter_assumptions, niter, may_be_zero;
732   gcond *cond = get_loop_exit_condition (loop);
733 
734   *assumptions = boolean_true_node;
735   *number_of_iterationsm1 = chrec_dont_know;
736   *number_of_iterations = chrec_dont_know;
737   DUMP_VECT_SCOPE ("get_loop_niters");
738 
739   if (!exit)
740     return cond;
741 
742   may_be_zero = NULL_TREE;
743   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744       || chrec_contains_undetermined (niter_desc.niter))
745     return cond;
746 
747   niter_assumptions = niter_desc.assumptions;
748   may_be_zero = niter_desc.may_be_zero;
749   niter = niter_desc.niter;
750 
751   if (may_be_zero && integer_zerop (may_be_zero))
752     may_be_zero = NULL_TREE;
753 
754   if (may_be_zero)
755     {
756       if (COMPARISON_CLASS_P (may_be_zero))
757 	{
758 	  /* Try to combine may_be_zero with assumptions, this can simplify
759 	     computation of niter expression.  */
760 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 					     niter_assumptions,
763 					     fold_build1 (TRUTH_NOT_EXPR,
764 							  boolean_type_node,
765 							  may_be_zero));
766 	  else
767 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 				 build_int_cst (TREE_TYPE (niter), 0),
769 				 rewrite_to_non_trapping_overflow (niter));
770 
771 	  may_be_zero = NULL_TREE;
772 	}
773       else if (integer_nonzerop (may_be_zero))
774 	{
775 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 	  return cond;
778 	}
779       else
780 	return cond;
781     }
782 
783   *assumptions = niter_assumptions;
784   *number_of_iterationsm1 = niter;
785 
786   /* We want the number of loop header executions which is the number
787      of latch executions plus one.
788      ???  For UINT_MAX latch executions this number overflows to zero
789      for loops like do { n++; } while (n != 0);  */
790   if (niter && !chrec_contains_undetermined (niter))
791     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 			  build_int_cst (TREE_TYPE (niter), 1));
793   *number_of_iterations = niter;
794 
795   return cond;
796 }
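
/* Worked example (illustrative only): for "for (i = 0; i < n; i++)" with
   n > 0, niter analysis reports n - 1 latch executions, so
   *NUMBER_OF_ITERATIONSM1 = n - 1 and the PLUS_EXPR above produces
   *NUMBER_OF_ITERATIONS = n, the number of header executions.  As the
   comment above notes, that addition wraps to zero if the latch executes
   UINT_MAX times.  */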
797 
798 /* Function bb_in_loop_p
799 
800    Used as a predicate for dfs-order traversal of the loop bbs.  */
801 
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
804 {
805   const class loop *const loop = (const class loop *)data;
806   if (flow_bb_inside_loop_p (loop, bb))
807     return true;
808   return false;
809 }
810 
811 
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813    stmt_vec_info structs for all the stmts in LOOP_IN.  */
814 
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816   : vec_info (vec_info::loop, init_cost (loop_in), shared),
817     loop (loop_in),
818     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819     num_itersm1 (NULL_TREE),
820     num_iters (NULL_TREE),
821     num_iters_unchanged (NULL_TREE),
822     num_iters_assumptions (NULL_TREE),
823     th (0),
824     versioning_threshold (0),
825     vectorization_factor (0),
826     max_vectorization_factor (0),
827     mask_skip_niters (NULL_TREE),
828     rgroup_compare_type (NULL_TREE),
829     simd_if_cond (NULL_TREE),
830     unaligned_dr (NULL),
831     peeling_for_alignment (0),
832     ptr_mask (0),
833     ivexpr_map (NULL),
834     scan_map (NULL),
835     slp_unrolling_factor (1),
836     single_scalar_iteration_cost (0),
837     vec_outside_cost (0),
838     vec_inside_cost (0),
839     vectorizable (false),
840     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841     using_partial_vectors_p (false),
842     epil_using_partial_vectors_p (false),
843     peeling_for_gaps (false),
844     peeling_for_niter (false),
845     no_data_dependencies (false),
846     has_mask_store (false),
847     scalar_loop_scaling (profile_probability::uninitialized ()),
848     scalar_loop (NULL),
849     orig_loop_info (NULL)
850 {
851   /* CHECKME: We want to visit all BBs before their successors (except for
852      latch blocks, for which this assertion wouldn't hold).  In the simple
853      case of the loop forms we allow, a dfs order of the BBs would be the same
854      as reversed postorder traversal, so we are safe.  */
855 
856   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 					  bbs, loop->num_nodes, loop);
858   gcc_assert (nbbs == loop->num_nodes);
859 
860   for (unsigned int i = 0; i < nbbs; i++)
861     {
862       basic_block bb = bbs[i];
863       gimple_stmt_iterator si;
864 
865       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
866 	{
867 	  gimple *phi = gsi_stmt (si);
868 	  gimple_set_uid (phi, 0);
869 	  add_stmt (phi);
870 	}
871 
872       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
873 	{
874 	  gimple *stmt = gsi_stmt (si);
875 	  gimple_set_uid (stmt, 0);
876 	  if (is_gimple_debug (stmt))
877 	    continue;
878 	  add_stmt (stmt);
879 	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
880 	     the third argument is the #pragma omp simd if (x) condition: when 0,
881 	     the loop shouldn't be vectorized; when a non-zero constant, it should
882 	     be vectorized normally; otherwise the loop is versioned, with the
883 	     vectorized copy taken if the condition is non-zero at runtime.  */
884 	  if (loop_in->simduid
885 	      && is_gimple_call (stmt)
886 	      && gimple_call_internal_p (stmt)
887 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 	      && gimple_call_num_args (stmt) >= 3
889 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 	      && (loop_in->simduid
891 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
892 	    {
893 	      tree arg = gimple_call_arg (stmt, 2);
894 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 		simd_if_cond = arg;
896 	      else
897 		gcc_assert (integer_nonzerop (arg));
898 	    }
899 	}
900     }
901 
902   epilogue_vinfos.create (6);
903 }
904 
905 /* Free all levels of rgroup CONTROLS.  */
906 
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
909 {
910   rgroup_controls *rgc;
911   unsigned int i;
912   FOR_EACH_VEC_ELT (*controls, i, rgc)
913     rgc->controls.release ();
914   controls->release ();
915 }
916 
917 /* Free all memory used by the _loop_vec_info, as well as all the
918    stmt_vec_info structs of all the stmts in the loop.  */
919 
920 _loop_vec_info::~_loop_vec_info ()
921 {
922   free (bbs);
923 
924   release_vec_loop_controls (&masks);
925   release_vec_loop_controls (&lens);
926   delete ivexpr_map;
927   delete scan_map;
928   epilogue_vinfos.release ();
929 
930   /* When we release an epilogue vinfo that we do not intend to use,
931      avoid clearing AUX of the main loop, which should continue to
932      point to the main loop vinfo; otherwise we would leak it.  */
933   if (loop->aux == this)
934     loop->aux = NULL;
935 }
936 
937 /* Return an invariant or register for EXPR and emit necessary
938    computations in the LOOP_VINFO loop preheader.  */
939 
940 tree
941 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
942 {
943   if (is_gimple_reg (expr)
944       || is_gimple_min_invariant (expr))
945     return expr;
946 
947   if (! loop_vinfo->ivexpr_map)
948     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
949   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
950   if (! cached)
951     {
952       gimple_seq stmts = NULL;
953       cached = force_gimple_operand (unshare_expr (expr),
954 				     &stmts, true, NULL_TREE);
955       if (stmts)
956 	{
957 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
958 	  gsi_insert_seq_on_edge_immediate (e, stmts);
959 	}
960     }
961   return cached;
962 }
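
/* Usage sketch (hypothetical caller and names): given a non-register,
   non-invariant expression, the first call gimplifies it on the preheader
   edge and returns the resulting SSA name, while a later call with an equal
   expression returns the cached name without emitting new stmts:

     tree niters_adj = cse_and_gimplify_to_preheader (loop_vinfo, expr);  */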
963 
964 /* Return true if we can use CMP_TYPE as the comparison type to produce
965    all masks required to mask LOOP_VINFO.  */
966 
967 static bool
968 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
969 {
970   rgroup_controls *rgm;
971   unsigned int i;
972   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
973     if (rgm->type != NULL_TREE
974 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
975 					    cmp_type, rgm->type,
976 					    OPTIMIZE_FOR_SPEED))
977       return false;
978   return true;
979 }
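
/* Semantics sketch (not target-specific): IFN_WHILE_ULT (i, n, mask) sets
   element k of the mask to (i + k < n), so the loop above asks whether the
   target's while_ult optab can produce such a mask from a CMP_TYPE index for
   every mask type recorded in LOOP_VINFO_MASKS.  */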
980 
981 /* Calculate the maximum number of scalars per iteration for every
982    rgroup in LOOP_VINFO.  */
983 
984 static unsigned int
985 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
986 {
987   unsigned int res = 1;
988   unsigned int i;
989   rgroup_controls *rgm;
990   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
991     res = MAX (res, rgm->max_nscalars_per_iter);
992   return res;
993 }
994 
995 /* Calculate the minimum precision necessary to represent:
996 
997       MAX_NITERS * FACTOR
998 
999    as an unsigned integer, where MAX_NITERS is the maximum number of
1000    loop header iterations for the original scalar form of LOOP_VINFO.  */
1001 
1002 static unsigned
1003 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1004 {
1005   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1006 
1007   /* Get the maximum number of iterations that is representable
1008      in the counter type.  */
1009   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1010   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1011 
1012   /* Get a more refined estimate for the number of iterations.  */
1013   widest_int max_back_edges;
1014   if (max_loop_iterations (loop, &max_back_edges))
1015     max_ni = wi::smin (max_ni, max_back_edges + 1);
1016 
1017   /* Work out how many bits we need to represent the limit.  */
1018   return wi::min_precision (max_ni * factor, UNSIGNED);
1019 }
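
/* Worked example (illustrative numbers): with a niter bound of
   MAX_NITERS = 1000 and FACTOR = 4 the product is 4000, and since
   2^11 <= 4000 < 2^12, wi::min_precision (4000, UNSIGNED) returns 12,
   i.e. 12 bits are enough to represent the limit.  */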
1020 
1021 /* True if the loop needs peeling or partial vectors when vectorized.  */
1022 
1023 static bool
1024 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1025 {
1026   unsigned HOST_WIDE_INT const_vf;
1027   HOST_WIDE_INT max_niter
1028     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1029 
1030   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1031   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1032     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1033 					  (loop_vinfo));
1034 
1035   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1036       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1037     {
1038       /* Work out the (constant) number of iterations that need to be
1039 	 peeled for reasons other than niters.  */
1040       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1041       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1042 	peel_niter += 1;
1043       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1044 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1045 	return true;
1046     }
1047   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1048       /* ??? When peeling for gaps but not alignment, we could
1049 	 try to check whether the (variable) niters is known to be
1050 	 VF * N + 1.  That's something of a niche case though.  */
1051       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1052       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1053       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1054 	   < (unsigned) exact_log2 (const_vf))
1055 	  /* In case of versioning, check if the maximum number of
1056 	     iterations is greater than th.  If they are identical,
1057 	     the epilogue is unnecessary.  */
1058 	  && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1059 	      || ((unsigned HOST_WIDE_INT) max_niter
1060 		  > (th / const_vf) * const_vf))))
1061     return true;
1062 
1063   return false;
1064 }
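
/* Worked example (illustrative numbers): with a known iteration count of 100,
   a constant VF of 4 and no peeling for alignment or gaps, 100 is a multiple
   of 4 and the function returns false.  If peeling for gaps is required,
   peel_niter becomes 1, 99 is not a multiple of 4, and the function returns
   true, i.e. an epilogue or partial vectors will be needed.  */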
1065 
1066 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1067    whether we can actually generate the masks required.  Return true if so,
1068    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1069 
1070 static bool
1071 vect_verify_full_masking (loop_vec_info loop_vinfo)
1072 {
1073   unsigned int min_ni_width;
1074   unsigned int max_nscalars_per_iter
1075     = vect_get_max_nscalars_per_iter (loop_vinfo);
1076 
1077   /* Use a normal loop if there are no statements that need masking.
1078      This only happens in rare degenerate cases: it means that the loop
1079      has no loads, no stores, and no live-out values.  */
1080   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1081     return false;
1082 
1083   /* Work out how many bits we need to represent the limit.  */
1084   min_ni_width
1085     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1086 
1087   /* Find a scalar mode for which WHILE_ULT is supported.  */
1088   opt_scalar_int_mode cmp_mode_iter;
1089   tree cmp_type = NULL_TREE;
1090   tree iv_type = NULL_TREE;
1091   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1092   unsigned int iv_precision = UINT_MAX;
1093 
1094   if (iv_limit != -1)
1095     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1096 				      UNSIGNED);
1097 
1098   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1099     {
1100       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1101       if (cmp_bits >= min_ni_width
1102 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1103 	{
1104 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1105 	  if (this_type
1106 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1107 	    {
1108 	      /* Although we could stop as soon as we find a valid mode,
1109 		 there are at least two reasons why that's not always the
1110 		 best choice:
1111 
1112 		 - An IV that's Pmode or wider is more likely to be reusable
1113 		   in address calculations than an IV that's narrower than
1114 		   Pmode.
1115 
1116 		 - Doing the comparison in IV_PRECISION or wider allows
1117 		   a natural 0-based IV, whereas using a narrower comparison
1118 		   type requires mitigations against wrap-around.
1119 
1120 		 Conversely, if the IV limit is variable, doing the comparison
1121 		 in a wider type than the original type can introduce
1122 		 unnecessary extensions, so picking the widest valid mode
1123 		 is not always a good choice either.
1124 
1125 		 Here we prefer the first IV type that's Pmode or wider,
1126 		 and the first comparison type that's IV_PRECISION or wider.
1127 		 (The comparison type must be no wider than the IV type,
1128 		 to avoid extensions in the vector loop.)
1129 
1130 		 ??? We might want to try continuing beyond Pmode for ILP32
1131 		 targets if CMP_BITS < IV_PRECISION.  */
1132 	      iv_type = this_type;
1133 	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1134 		cmp_type = this_type;
1135 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1136 		break;
1137 	    }
1138 	}
1139     }
1140 
1141   if (!cmp_type)
1142     return false;
1143 
1144   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1145   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1146   return true;
1147 }
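
/* Selection example (hypothetical target): assume Pmode is 64 bits,
   min_ni_width is 12, iv_precision is 20, and WHILE_ULT is supported for
   16-, 32- and 64-bit comparison types.  The scan above then picks a 16-bit
   iv_type and cmp_type first, upgrades cmp_type to 32 bits (since 20 > 16),
   keeps it at 32 bits for the 64-bit candidate (since 20 <= 32), and stops
   once the candidate reaches Pmode width: the result is a 64-bit IV type
   with a 32-bit comparison type.  */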
1148 
1149 /* Check whether we can use length-based partial vector accesses, based on
1150    a precision comparison.  So far, to keep it simple, we only allow the case
1151    in which the precision of the target-supported length is larger than the
1152    precision required by the loop niters.  */
1153 
1154 static bool
1155 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1156 {
1157   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1158     return false;
1159 
1160   unsigned int max_nitems_per_iter = 1;
1161   unsigned int i;
1162   rgroup_controls *rgl;
1163   /* Find the maximum number of items per iteration for every rgroup.  */
1164   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1165     {
1166       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1167       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1168     }
1169 
1170   /* Work out how many bits we need to represent the length limit.  */
1171   unsigned int min_ni_prec
1172     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1173 
1174   /* Now use the maximum of the precisions below for one suitable IV type:
1175      - the IV's natural precision
1176      - the precision needed to hold: the maximum number of scalar
1177        iterations multiplied by the scale factor (min_ni_prec above)
1178      - the Pmode precision
1179 
1180      If min_ni_prec is less than the precision of the current niters,
1181      we prefer to still use the niters type.  Prefer to use Pmode and
1182      wider IV to avoid narrow conversions.  */
1183 
1184   unsigned int ni_prec
1185     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1186   min_ni_prec = MAX (min_ni_prec, ni_prec);
1187   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1188 
1189   tree iv_type = NULL_TREE;
1190   opt_scalar_int_mode tmode_iter;
1191   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1192     {
1193       scalar_mode tmode = tmode_iter.require ();
1194       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1195 
1196       /* ??? Do we really want to construct one IV whose precision exceeds
1197 	 BITS_PER_WORD?  */
1198       if (tbits > BITS_PER_WORD)
1199 	break;
1200 
1201       /* Find the first available standard integral type.  */
1202       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1203 	{
1204 	  iv_type = build_nonstandard_integer_type (tbits, true);
1205 	  break;
1206 	}
1207     }
1208 
1209   if (!iv_type)
1210     {
1211       if (dump_enabled_p ())
1212 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 			 "can't vectorize with length-based partial vectors"
1214 			 " because there is no suitable iv type.\n");
1215       return false;
1216     }
1217 
1218   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1219   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1220 
1221   return true;
1222 }
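
/* For illustration (assumed 64-bit target with BITS_PER_WORD == 64): if the
   niters expression has a 32-bit type, min_ni_prec is raised first to 32 and
   then to GET_MODE_BITSIZE (Pmode) == 64, so the mode scan above selects the
   first supported integer mode of at least 64 bits and both the rgroup
   compare type and the IV type become a 64-bit unsigned type.  */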
1223 
1224 /* Calculate the cost of one scalar iteration of the loop.  */
1225 static void
1226 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1227 {
1228   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1229   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1230   int nbbs = loop->num_nodes, factor;
1231   int innerloop_iters, i;
1232 
1233   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1234 
1235   /* Gather costs for statements in the scalar loop.  */
1236 
1237   /* FORNOW.  */
1238   innerloop_iters = 1;
1239   if (loop->inner)
1240     innerloop_iters = 50; /* FIXME */
1241 
1242   for (i = 0; i < nbbs; i++)
1243     {
1244       gimple_stmt_iterator si;
1245       basic_block bb = bbs[i];
1246 
1247       if (bb->loop_father == loop->inner)
1248         factor = innerloop_iters;
1249       else
1250         factor = 1;
1251 
1252       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1253         {
1254 	  gimple *stmt = gsi_stmt (si);
1255 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1256 
1257           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1258             continue;
1259 
1260           /* Skip stmts that are not vectorized inside the loop.  */
1261 	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1262           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1263               && (!STMT_VINFO_LIVE_P (vstmt_info)
1264                   || !VECTORIZABLE_CYCLE_DEF
1265 			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1266             continue;
1267 
1268 	  vect_cost_for_stmt kind;
1269           if (STMT_VINFO_DATA_REF (stmt_info))
1270             {
1271               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1272                kind = scalar_load;
1273              else
1274                kind = scalar_store;
1275             }
1276 	  else if (vect_nop_conversion_p (stmt_info))
1277 	    continue;
1278 	  else
1279             kind = scalar_stmt;
1280 
1281 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1282 			    factor, kind, stmt_info, 0, vect_prologue);
1283         }
1284     }
1285 
1286   /* Now accumulate cost.  */
1287   void *target_cost_data = init_cost (loop);
1288   stmt_info_for_cost *si;
1289   int j;
1290   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291 		    j, si)
1292     (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1293 			  si->kind, si->stmt_info, si->vectype,
1294 			  si->misalign, vect_body);
1295   unsigned dummy, body_cost = 0;
1296   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1297   destroy_cost_data (target_cost_data);
1298   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1299 }
1300 
1301 
1302 /* Function vect_analyze_loop_form_1.
1303 
1304    Verify that certain CFG restrictions hold, including:
1305    - the loop has a pre-header
1306    - the loop has a single entry and exit
1307    - the loop exit condition is simple enough
1308    - the number of iterations can be analyzed, i.e., a countable loop.  The
1309      niter could be analyzed under some assumptions.  */
1310 
1311 opt_result
1312 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1313 			  tree *assumptions, tree *number_of_iterationsm1,
1314 			  tree *number_of_iterations, gcond **inner_loop_cond)
1315 {
1316   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1317 
1318   /* Different restrictions apply when we are considering an inner-most loop,
1319      vs. an outer (nested) loop.
1320      (FORNOW. May want to relax some of these restrictions in the future).  */
1321 
1322   if (!loop->inner)
1323     {
1324       /* Inner-most loop.  We currently require that the number of BBs is
1325 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1326 	 look like this:
1327 
1328                         (pre-header)
1329                            |
1330                           header <--------+
1331                            | |            |
1332                            | +--> latch --+
1333                            |
1334                         (exit-bb)  */
1335 
1336       if (loop->num_nodes != 2)
1337 	return opt_result::failure_at (vect_location,
1338 				       "not vectorized:"
1339 				       " control flow in loop.\n");
1340 
1341       if (empty_block_p (loop->header))
1342 	return opt_result::failure_at (vect_location,
1343 				       "not vectorized: empty loop.\n");
1344     }
1345   else
1346     {
1347       class loop *innerloop = loop->inner;
1348       edge entryedge;
1349 
1350       /* Nested loop. We currently require that the loop is doubly-nested,
1351 	 contains a single inner loop, and the number of BBs is exactly 5.
1352 	 Vectorizable outer-loops look like this:
1353 
1354 			(pre-header)
1355 			   |
1356 			  header <---+
1357 			   |         |
1358 		          inner-loop |
1359 			   |         |
1360 			  tail ------+
1361 			   |
1362 		        (exit-bb)
1363 
1364 	 The inner-loop has the properties expected of inner-most loops
1365 	 as described above.  */
1366 
1367       if ((loop->inner)->inner || (loop->inner)->next)
1368 	return opt_result::failure_at (vect_location,
1369 				       "not vectorized:"
1370 				       " multiple nested loops.\n");
1371 
1372       if (loop->num_nodes != 5)
1373 	return opt_result::failure_at (vect_location,
1374 				       "not vectorized:"
1375 				       " control flow in loop.\n");
1376 
1377       entryedge = loop_preheader_edge (innerloop);
1378       if (entryedge->src != loop->header
1379 	  || !single_exit (innerloop)
1380 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1381 	return opt_result::failure_at (vect_location,
1382 				       "not vectorized:"
1383 				       " unsupported outerloop form.\n");
1384 
1385       /* Analyze the inner-loop.  */
1386       tree inner_niterm1, inner_niter, inner_assumptions;
1387       opt_result res
1388 	= vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1389 				    &inner_assumptions, &inner_niterm1,
1390 				    &inner_niter, NULL);
1391       if (!res)
1392 	{
1393 	  if (dump_enabled_p ())
1394 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1395 			     "not vectorized: Bad inner loop.\n");
1396 	  return res;
1397 	}
1398 
1399       /* Don't support analyzing niter under assumptions for inner
1400 	 loop.  */
1401       if (!integer_onep (inner_assumptions))
1402 	return opt_result::failure_at (vect_location,
1403 				       "not vectorized: Bad inner loop.\n");
1404 
1405       if (!expr_invariant_in_loop_p (loop, inner_niter))
1406 	return opt_result::failure_at (vect_location,
1407 				       "not vectorized: inner-loop count not"
1408 				       " invariant.\n");
1409 
1410       if (dump_enabled_p ())
1411         dump_printf_loc (MSG_NOTE, vect_location,
1412 			 "Considering outer-loop vectorization.\n");
1413     }
1414 
1415   if (!single_exit (loop))
1416     return opt_result::failure_at (vect_location,
1417 				   "not vectorized: multiple exits.\n");
1418   if (EDGE_COUNT (loop->header->preds) != 2)
1419     return opt_result::failure_at (vect_location,
1420 				   "not vectorized:"
1421 				   " too many incoming edges.\n");
1422 
1423   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1424      that the loop is represented as a do-while (with a proper if-guard
1425      before the loop if needed), where the loop header contains all the
1426      executable statements, and the latch is empty.  */
1427   if (!empty_block_p (loop->latch)
1428       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1429     return opt_result::failure_at (vect_location,
1430 				   "not vectorized: latch block not empty.\n");
1431 
1432   /* Make sure the exit is not abnormal.  */
1433   edge e = single_exit (loop);
1434   if (e->flags & EDGE_ABNORMAL)
1435     return opt_result::failure_at (vect_location,
1436 				   "not vectorized:"
1437 				   " abnormal loop exit edge.\n");
1438 
1439   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1440 				     number_of_iterationsm1);
1441   if (!*loop_cond)
1442     return opt_result::failure_at
1443       (vect_location,
1444        "not vectorized: complicated exit condition.\n");
1445 
1446   if (integer_zerop (*assumptions)
1447       || !*number_of_iterations
1448       || chrec_contains_undetermined (*number_of_iterations))
1449     return opt_result::failure_at
1450       (*loop_cond,
1451        "not vectorized: number of iterations cannot be computed.\n");
1452 
1453   if (integer_zerop (*number_of_iterations))
1454     return opt_result::failure_at
1455       (*loop_cond,
1456        "not vectorized: number of iterations = 0.\n");
1457 
1458   return opt_result::success ();
1459 }
1460 
1461 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1462 
1463 opt_loop_vec_info
1464 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1465 {
1466   tree assumptions, number_of_iterations, number_of_iterationsm1;
1467   gcond *loop_cond, *inner_loop_cond = NULL;
1468 
1469   opt_result res
1470     = vect_analyze_loop_form_1 (loop, &loop_cond,
1471 				&assumptions, &number_of_iterationsm1,
1472 				&number_of_iterations, &inner_loop_cond);
1473   if (!res)
1474     return opt_loop_vec_info::propagate_failure (res);
1475 
1476   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1477   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1478   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1479   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1480   if (!integer_onep (assumptions))
1481     {
1482       /* We consider vectorizing this loop by versioning it under
1483 	 some assumptions.  In order to do this, we need to clear
1484 	 existing information computed by scev and niter analyzer.  */
1485       scev_reset_htab ();
1486       free_numbers_of_iterations_estimates (loop);
1487       /* Also set a flag for this loop so that subsequent scev and niter
1488 	 analyses are done under the assumptions.  */
1489       loop_constraint_set (loop, LOOP_C_FINITE);
1490       /* Also record the assumptions for versioning.  */
1491       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1492     }
1493 
1494   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1495     {
1496       if (dump_enabled_p ())
1497         {
1498           dump_printf_loc (MSG_NOTE, vect_location,
1499 			   "Symbolic number of iterations is ");
1500 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1501           dump_printf (MSG_NOTE, "\n");
1502         }
1503     }
1504 
1505   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1506   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1507   if (inner_loop_cond)
1508     {
1509       stmt_vec_info inner_loop_cond_info
1510 	= loop_vinfo->lookup_stmt (inner_loop_cond);
1511       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1512     }
1513 
1514   gcc_assert (!loop->aux);
1515   loop->aux = loop_vinfo;
1516   return opt_loop_vec_info::success (loop_vinfo);
1517 }
1518 
1519 
1520 
1521 /* Scan the loop stmts and, depending on whether there are any non-SLP
1522    statements, update the vectorization factor.  */
1523 
1524 static void
1525 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1526 {
1527   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1528   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1529   int nbbs = loop->num_nodes;
1530   poly_uint64 vectorization_factor;
1531   int i;
1532 
1533   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1534 
1535   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1536   gcc_assert (known_ne (vectorization_factor, 0U));
1537 
1538   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1539      the vectorization factor of the loop is the unrolling factor required
1540      by the SLP instances.  If that unrolling factor is 1, we say that we
1541      perform pure SLP on the loop - cross-iteration parallelism is not
1542      exploited.  */
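  /* Illustrative example (assumed source, not taken from this pass): in

	 for (i = 0; i < n; i++)
	   {
	     a[2*i]   = b[2*i]   + 1;
	     a[2*i+1] = b[2*i+1] + 2;
	   }

     the two stores form an SLP group of size 2.  With 4-lane vectors the
     SLP instance needs an unrolling factor of 2, so the loop VF computed
     below becomes 2 even though no statement requires loop-based
     vectorization.  */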
1543   bool only_slp_in_loop = true;
1544   for (i = 0; i < nbbs; i++)
1545     {
1546       basic_block bb = bbs[i];
1547       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1548 	   gsi_next (&si))
1549 	{
1550 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1551 	  if (!stmt_info)
1552 	    continue;
1553 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1554 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1555 	      && !PURE_SLP_STMT (stmt_info))
1556 	    /* STMT needs both SLP and loop-based vectorization.  */
1557 	    only_slp_in_loop = false;
1558 	}
1559       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1560 	   gsi_next (&si))
1561 	{
1562 	  if (is_gimple_debug (gsi_stmt (si)))
1563 	    continue;
1564 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1565 	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1566 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1567 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1568 	      && !PURE_SLP_STMT (stmt_info))
1569 	    /* STMT needs both SLP and loop-based vectorization.  */
1570 	    only_slp_in_loop = false;
1571 	}
1572     }
1573 
1574   if (only_slp_in_loop)
1575     {
1576       if (dump_enabled_p ())
1577 	dump_printf_loc (MSG_NOTE, vect_location,
1578 			 "Loop contains only SLP stmts\n");
1579       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1580     }
1581   else
1582     {
1583       if (dump_enabled_p ())
1584 	dump_printf_loc (MSG_NOTE, vect_location,
1585 			 "Loop contains SLP and non-SLP stmts\n");
1586       /* Both the vectorization factor and unroll factor have the form
1587 	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1588 	 so they must have a common multiple.  */
1589       vectorization_factor
1590 	= force_common_multiple (vectorization_factor,
1591 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
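      /* E.g. (illustrative numbers): a loop VF of 4 combined with an SLP
	 unrolling factor of 2 stays at 4, while a VF of 2 combined with an
	 SLP unrolling factor of 3 is raised to 6.  */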
1592     }
1593 
1594   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1595   if (dump_enabled_p ())
1596     {
1597       dump_printf_loc (MSG_NOTE, vect_location,
1598 		       "Updating vectorization factor to ");
1599       dump_dec (MSG_NOTE, vectorization_factor);
1600       dump_printf (MSG_NOTE, ".\n");
1601     }
1602 }
1603 
1604 /* Return true if STMT_INFO describes a double reduction phi and if
1605    the other phi in the reduction is also relevant for vectorization.
1606    This rejects cases such as:
1607 
1608       outer1:
1609 	x_1 = PHI <x_3(outer2), ...>;
1610 	...
1611 
1612       inner:
1613 	x_2 = ...;
1614 	...
1615 
1616       outer2:
1617 	x_3 = PHI <x_2(inner)>;
1618 
1619    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1620 
1621 static bool
1622 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1623 {
1624   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1625     return false;
1626 
1627   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1628 }
1629 
1630 /* Function vect_analyze_loop_operations.
1631 
1632    Scan the loop stmts and make sure they are all vectorizable.  */
1633 
1634 static opt_result
1635 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1636 {
1637   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1638   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1639   int nbbs = loop->num_nodes;
1640   int i;
1641   stmt_vec_info stmt_info;
1642   bool need_to_vectorize = false;
1643   bool ok;
1644 
1645   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1646 
1647   auto_vec<stmt_info_for_cost> cost_vec;
1648 
1649   for (i = 0; i < nbbs; i++)
1650     {
1651       basic_block bb = bbs[i];
1652 
1653       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1654 	   gsi_next (&si))
1655         {
1656           gphi *phi = si.phi ();
1657           ok = true;
1658 
1659 	  stmt_info = loop_vinfo->lookup_stmt (phi);
1660           if (dump_enabled_p ())
1661 	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1662 	  if (virtual_operand_p (gimple_phi_result (phi)))
1663 	    continue;
1664 
1665           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1666              (i.e., a phi in the tail of the outer-loop).  */
1667           if (! is_loop_header_bb_p (bb))
1668             {
1669               /* FORNOW: we currently don't support the case that these phis
1670                  are not used in the outer loop (unless it is a double
1671                  reduction, i.e., this phi is vect_reduction_def), because
1672                  that case would require us to actually do something here.  */
1673               if (STMT_VINFO_LIVE_P (stmt_info)
1674 		  && !vect_active_double_reduction_p (stmt_info))
1675 		return opt_result::failure_at (phi,
1676 					       "Unsupported loop-closed phi"
1677 					       " in outer-loop.\n");
1678 
1679               /* If PHI is used in the outer loop, we check that its operand
1680                  is defined in the inner loop.  */
1681               if (STMT_VINFO_RELEVANT_P (stmt_info))
1682                 {
1683                   tree phi_op;
1684 
1685                   if (gimple_phi_num_args (phi) != 1)
1686                     return opt_result::failure_at (phi, "unsupported phi");
1687 
1688                   phi_op = PHI_ARG_DEF (phi, 0);
1689 		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1690 		  if (!op_def_info)
1691 		    return opt_result::failure_at (phi, "unsupported phi\n");
1692 
1693 		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1694 		      && (STMT_VINFO_RELEVANT (op_def_info)
1695 			  != vect_used_in_outer_by_reduction))
1696 		    return opt_result::failure_at (phi, "unsupported phi\n");
1697 
1698 		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1699 		       || (STMT_VINFO_DEF_TYPE (stmt_info)
1700 			   == vect_double_reduction_def))
1701 		      && !vectorizable_lc_phi (loop_vinfo,
1702 					       stmt_info, NULL, NULL))
1703 		    return opt_result::failure_at (phi, "unsupported phi\n");
1704                 }
1705 
1706               continue;
1707             }
1708 
1709           gcc_assert (stmt_info);
1710 
1711           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1712                || STMT_VINFO_LIVE_P (stmt_info))
1713               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1714 	    /* A scalar-dependence cycle that we don't support.  */
1715 	    return opt_result::failure_at (phi,
1716 					   "not vectorized:"
1717 					   " scalar dependence cycle.\n");
1718 
1719           if (STMT_VINFO_RELEVANT_P (stmt_info))
1720             {
1721               need_to_vectorize = true;
1722               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1723 		  && ! PURE_SLP_STMT (stmt_info))
1724 		ok = vectorizable_induction (loop_vinfo,
1725 					     stmt_info, NULL, NULL,
1726 					     &cost_vec);
1727 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1728 			|| (STMT_VINFO_DEF_TYPE (stmt_info)
1729 			    == vect_double_reduction_def)
1730 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731 		       && ! PURE_SLP_STMT (stmt_info))
1732 		ok = vectorizable_reduction (loop_vinfo,
1733 					     stmt_info, NULL, NULL, &cost_vec);
1734             }
1735 
1736 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1737 	  if (ok
1738 	      && STMT_VINFO_LIVE_P (stmt_info)
1739 	      && !PURE_SLP_STMT (stmt_info))
1740 	    ok = vectorizable_live_operation (loop_vinfo,
1741 					      stmt_info, NULL, NULL, NULL,
1742 					      -1, false, &cost_vec);
1743 
1744           if (!ok)
1745 	    return opt_result::failure_at (phi,
1746 					   "not vectorized: relevant phi not "
1747 					   "supported: %G",
1748 					   static_cast <gimple *> (phi));
1749         }
1750 
1751       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752 	   gsi_next (&si))
1753         {
1754 	  gimple *stmt = gsi_stmt (si);
1755 	  if (!gimple_clobber_p (stmt)
1756 	      && !is_gimple_debug (stmt))
1757 	    {
1758 	      opt_result res
1759 		= vect_analyze_stmt (loop_vinfo,
1760 				     loop_vinfo->lookup_stmt (stmt),
1761 				     &need_to_vectorize,
1762 				     NULL, NULL, &cost_vec);
1763 	      if (!res)
1764 		return res;
1765 	    }
1766         }
1767     } /* bbs */
1768 
1769   add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1770 
1771   /* All operations in the loop are either irrelevant (they deal with loop
1772      control, or are dead), or are only used outside the loop and can be
1773      moved out of it (e.g. invariants, inductions).  The loop can be
1774      optimized away by scalar optimizations.  We're better off not
1775      touching this loop.  */
1776   if (!need_to_vectorize)
1777     {
1778       if (dump_enabled_p ())
1779         dump_printf_loc (MSG_NOTE, vect_location,
1780 			 "All the computation can be taken out of the loop.\n");
1781       return opt_result::failure_at
1782 	(vect_location,
1783 	 "not vectorized: redundant loop. no profit to vectorize.\n");
1784     }
1785 
1786   return opt_result::success ();
1787 }
1788 
1789 /* Return true if we know that the iteration count is smaller than the
1790    vectorization factor.  Return false if it isn't, or if we can't be sure
1791    either way.  */
1792 
1793 static bool
1794 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1795 {
1796   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1797 
1798   HOST_WIDE_INT max_niter;
1799   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1800     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1801   else
1802     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1803 
1804   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1805     return true;
1806 
1807   return false;
1808 }
1809 
1810 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1811    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1812    definitely no, or -1 if it's worth retrying.  */
1813 
1814 static int
1815 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1816 {
1817   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1818   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1819 
1820   /* Only loops that can handle partially-populated vectors can have iteration
1821      counts less than the vectorization factor.  */
1822   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1823     {
1824       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1825 	{
1826 	  if (dump_enabled_p ())
1827 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 			     "not vectorized: iteration count smaller than "
1829 			     "vectorization factor.\n");
1830 	  return 0;
1831 	}
1832     }
1833 
1834   /* If using the "very cheap" model, reject cases in which we'd keep
1835      a copy of the scalar code (even if we might be able to vectorize it).  */
1836   if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1837       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1838 	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1839 	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1840     {
1841       if (dump_enabled_p ())
1842 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 			 "some scalar iterations would need to be peeled\n");
1844       return 0;
1845     }
1846 
1847   int min_profitable_iters, min_profitable_estimate;
1848   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1849 				      &min_profitable_estimate);
1850 
1851   if (min_profitable_iters < 0)
1852     {
1853       if (dump_enabled_p ())
1854 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855 			 "not vectorized: vectorization not profitable.\n");
1856       if (dump_enabled_p ())
1857 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 			 "not vectorized: vector version will never be "
1859 			 "profitable.\n");
1860       return -1;
1861     }
1862 
1863   int min_scalar_loop_bound = (param_min_vect_loop_bound
1864 			       * assumed_vf);
1865 
1866   /* Use the cost model only if it is more conservative than the
1867      user-specified threshold.  */
1868   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1869 				    min_profitable_iters);
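  /* For instance (illustrative numbers only): with
     param_min_vect_loop_bound == 2 and assumed_vf == 4,
     min_scalar_loop_bound is 8; if the cost model computed
     min_profitable_iters == 12, the threshold used below is 12.  */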
1870 
1871   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1872 
1873   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1874       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1875     {
1876       if (dump_enabled_p ())
1877 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878 			 "not vectorized: vectorization not profitable.\n");
1879       if (dump_enabled_p ())
1880 	dump_printf_loc (MSG_NOTE, vect_location,
1881 			 "not vectorized: iteration count smaller than user "
1882 			 "specified loop bound parameter or minimum profitable "
1883 			 "iterations (whichever is more conservative).\n");
1884       return 0;
1885     }
1886 
1887   /* The static profitability threshold min_profitable_estimate includes
1888      the cost of having to check at runtime whether the scalar loop
1889      should be used instead.  If it turns out that we don't need or want
1890      such a check, the threshold we should use for the static estimate
1891      is simply the point at which the vector loop becomes more profitable
1892      than the scalar loop.  */
1893   if (min_profitable_estimate > min_profitable_iters
1894       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1896       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1897       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1898     {
1899       if (dump_enabled_p ())
1900 	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1901 			 " choice between the scalar and vector loops\n");
1902       min_profitable_estimate = min_profitable_iters;
1903     }
1904 
1905   /* If the vector loop needs multiple iterations to be beneficial then
1906      things are probably too close to call, and the conservative thing
1907      would be to stick with the scalar code.  */
1908   if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1909       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1910     {
1911       if (dump_enabled_p ())
1912 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1913 			 "one iteration of the vector loop would be"
1914 			 " more expensive than the equivalent number of"
1915 			 " iterations of the scalar loop\n");
1916       return 0;
1917     }
1918 
1919   HOST_WIDE_INT estimated_niter;
1920 
1921   /* If we are vectorizing an epilogue then we know the maximum number of
1922      scalar iterations it will cover is at least one lower than the
1923      vectorization factor of the main loop.  */
1924   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1925     estimated_niter
1926       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1927   else
1928     {
1929       estimated_niter = estimated_stmt_executions_int (loop);
1930       if (estimated_niter == -1)
1931 	estimated_niter = likely_max_stmt_executions_int (loop);
1932     }
1933   if (estimated_niter != -1
1934       && ((unsigned HOST_WIDE_INT) estimated_niter
1935 	  < MAX (th, (unsigned) min_profitable_estimate)))
1936     {
1937       if (dump_enabled_p ())
1938 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 			 "not vectorized: estimated iteration count too "
1940 			 "small.\n");
1941       if (dump_enabled_p ())
1942 	dump_printf_loc (MSG_NOTE, vect_location,
1943 			 "not vectorized: estimated iteration count smaller "
1944 			 "than specified loop bound parameter or minimum "
1945 			 "profitable iterations (whichever is more "
1946 			 "conservative).\n");
1947       return -1;
1948     }
1949 
1950   return 1;
1951 }
1952 
1953 static opt_result
1954 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1955 			   vec<data_reference_p> *datarefs,
1956 			   unsigned int *n_stmts)
1957 {
1958   *n_stmts = 0;
1959   for (unsigned i = 0; i < loop->num_nodes; i++)
1960     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1961 	 !gsi_end_p (gsi); gsi_next (&gsi))
1962       {
1963 	gimple *stmt = gsi_stmt (gsi);
1964 	if (is_gimple_debug (stmt))
1965 	  continue;
1966 	++(*n_stmts);
1967 	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1968 							NULL, 0);
1969 	if (!res)
1970 	  {
1971 	    if (is_gimple_call (stmt) && loop->safelen)
1972 	      {
1973 		tree fndecl = gimple_call_fndecl (stmt), op;
1974 		if (fndecl != NULL_TREE)
1975 		  {
1976 		    cgraph_node *node = cgraph_node::get (fndecl);
1977 		    if (node != NULL && node->simd_clones != NULL)
1978 		      {
1979 			unsigned int j, n = gimple_call_num_args (stmt);
1980 			for (j = 0; j < n; j++)
1981 			  {
1982 			    op = gimple_call_arg (stmt, j);
1983 			    if (DECL_P (op)
1984 				|| (REFERENCE_CLASS_P (op)
1985 				    && get_base_address (op)))
1986 			      break;
1987 			  }
1988 			op = gimple_call_lhs (stmt);
1989 			/* Ignore #pragma omp declare simd functions
1990 			   if they don't have data references in the
1991 			   call stmt itself.  */
1992 			if (j == n
1993 			    && !(op
1994 				 && (DECL_P (op)
1995 				     || (REFERENCE_CLASS_P (op)
1996 					 && get_base_address (op)))))
1997 			  continue;
1998 		      }
1999 		  }
2000 	      }
2001 	    return res;
2002 	  }
2003 	/* If dependence analysis would give up due to the limit on the
2004 	   number of datarefs, stop here and fail fatally.  */
2005 	if (datarefs->length ()
2006 	    > (unsigned)param_loop_max_datarefs_for_datadeps)
2007 	  return opt_result::failure_at (stmt, "exceeded param "
2008 					 "loop-max-datarefs-for-datadeps\n");
2009       }
2010   return opt_result::success ();
2011 }
2012 
2013 /* Look for SLP-only access groups and turn each individual access into its own
2014    group.  */
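/* For example (illustrative, not taken from this file): a load group
   { a[4*i], a[4*i+1], a[4*i+2], a[4*i+3] } of size 4 that is only usable
   from SLP is split into four single-element groups; each non-strided
   element keeps a gap of 3 (group_size - 1) so its stride to the next
   scalar iteration still covers the whole original group.  */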
2015 static void
2016 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2017 {
2018   unsigned int i;
2019   struct data_reference *dr;
2020 
2021   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2022 
2023   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2024   FOR_EACH_VEC_ELT (datarefs, i, dr)
2025     {
2026       gcc_assert (DR_REF (dr));
2027       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2028 
2029       /* Check if the access is part of an interleaving chain.  */
2030       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 	{
2032 	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2033 	  unsigned int group_size = DR_GROUP_SIZE (first_element);
2034 
2035 	  /* Check whether this is an SLP-only group.  */
2036 	  if (!STMT_SLP_TYPE (stmt_info)
2037 	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
2038 	    {
2039 	      /* Dissolve the group.  */
2040 	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2041 
2042 	      stmt_vec_info vinfo = first_element;
2043 	      while (vinfo)
2044 		{
2045 		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2046 		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2047 		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2048 		  DR_GROUP_SIZE (vinfo) = 1;
2049 		  if (STMT_VINFO_STRIDED_P (first_element))
2050 		    DR_GROUP_GAP (vinfo) = 0;
2051 		  else
2052 		    DR_GROUP_GAP (vinfo) = group_size - 1;
2053 		  vinfo = next;
2054 		}
2055 	    }
2056 	}
2057     }
2058 }
2059 
2060 /* Determine if operating on full vectors for LOOP_VINFO might leave
2061    some scalar iterations still to do.  If so, decide how we should
2062    handle those scalar iterations.  The possibilities are:
2063 
2064    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2065        In this case:
2066 
2067 	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2068 	 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2069 	 LOOP_VINFO_PEELING_FOR_NITER == false
2070 
2071    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2072        to handle the remaining scalar iterations.  In this case:
2073 
2074 	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2075 	 LOOP_VINFO_PEELING_FOR_NITER == true
2076 
2077        There are two choices:
2078 
2079        (2a) Consider vectorizing the epilogue loop at the same VF as the
2080 	    main loop, but using partial vectors instead of full vectors.
2081 	    In this case:
2082 
2083 	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2084 
2085        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2086 	    In this case:
2087 
2088 	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2089 
2090    When FOR_EPILOGUE_P is true, make this determination based on the
2091    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2092    based on the assumption that LOOP_VINFO is the main loop.  The caller
2093    has made sure that the number of iterations is set appropriately for
2094    this value of FOR_EPILOGUE_P.  */
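/* Illustrative example (assumed numbers, for exposition only): with a
   vectorization factor of 8 and 1003 scalar iterations,

     (1) runs 125 full-vector iterations plus one final iteration on a
	 partial vector of 3 elements (via masks or lengths), whereas

     (2) runs 125 full-vector iterations and leaves 3 scalar iterations
	 for the epilogue, which may itself be vectorized as in (2a)
	 or (2b).  */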
2095 
2096 opt_result
2097 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2098 					    bool for_epilogue_p)
2099 {
2100   /* Determine whether there would be any scalar iterations left over.  */
2101   bool need_peeling_or_partial_vectors_p
2102     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2103 
2104   /* Decide whether to vectorize the loop with partial vectors.  */
2105   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2106   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2107   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2108       && need_peeling_or_partial_vectors_p)
2109     {
2110       /* For partial-vector-usage=1, try to push the handling of partial
2111 	 vectors to the epilogue, with the main loop continuing to operate
2112 	 on full vectors.
2113 
2114 	 ??? We could then end up failing to use partial vectors if we
2115 	 decide to peel iterations into a prologue, and if the main loop
2116 	 then ends up processing fewer than VF iterations.  */
2117       if (param_vect_partial_vector_usage == 1
2118 	  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2119 	  && !vect_known_niters_smaller_than_vf (loop_vinfo))
2120 	LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2121       else
2122 	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2123     }
2124 
2125   if (dump_enabled_p ())
2126     {
2127       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2128 	dump_printf_loc (MSG_NOTE, vect_location,
2129 			 "operating on partial vectors%s.\n",
2130 			 for_epilogue_p ? " for epilogue loop" : "");
2131       else
2132 	dump_printf_loc (MSG_NOTE, vect_location,
2133 			 "operating only on full vectors%s.\n",
2134 			 for_epilogue_p ? " for epilogue loop" : "");
2135     }
2136 
2137   if (for_epilogue_p)
2138     {
2139       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2140       gcc_assert (orig_loop_vinfo);
2141       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2142 	gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2143 			      LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2144     }
2145 
2146   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2147       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2148     {
2149       /* Check that the loop processes at least one full vector.  */
2150       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2151       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2152       if (known_lt (wi::to_widest (scalar_niters), vf))
2153 	return opt_result::failure_at (vect_location,
2154 				       "loop does not have enough iterations"
2155 				       " to support vectorization.\n");
2156 
2157       /* If we need to peel an extra epilogue iteration to handle data
2158 	 accesses with gaps, check that there are enough scalar iterations
2159 	 available.
2160 
2161 	 The check above is redundant with this one when peeling for gaps,
2162 	 but the distinction is useful for diagnostics.  */
2163       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2164       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 	  && known_lt (wi::to_widest (scalar_nitersm1), vf))
2166 	return opt_result::failure_at (vect_location,
2167 				       "loop does not have enough iterations"
2168 				       " to support peeling for gaps.\n");
2169     }
2170 
2171   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2172     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2173        && need_peeling_or_partial_vectors_p);
2174 
2175   return opt_result::success ();
2176 }
2177 
2178 /* Function vect_analyze_loop_2.
2179 
2180    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2181    for it.  The different analyses will record information in the
2182    loop_vec_info struct.  */
2183 static opt_result
2184 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2185 {
2186   opt_result ok = opt_result::success ();
2187   int res;
2188   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2189   poly_uint64 min_vf = 2;
2190   loop_vec_info orig_loop_vinfo = NULL;
2191 
2192   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2193      loop_vec_info of the first vectorized loop.  */
2194   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2195     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2196   else
2197     orig_loop_vinfo = loop_vinfo;
2198   gcc_assert (orig_loop_vinfo);
2199 
2200   /* The first group of checks is independent of the vector size.  */
2201   fatal = true;
2202 
2203   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2204       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2205     return opt_result::failure_at (vect_location,
2206 				   "not vectorized: simd if(0)\n");
2207 
2208   /* Find all data references in the loop (which correspond to vdefs/vuses)
2209      and analyze their evolution in the loop.  */
2210 
2211   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2212 
2213   /* Gather the data references and count stmts in the loop.  */
2214   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2215     {
2216       opt_result res
2217 	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2218 				     &LOOP_VINFO_DATAREFS (loop_vinfo),
2219 				     n_stmts);
2220       if (!res)
2221 	{
2222 	  if (dump_enabled_p ())
2223 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 			     "not vectorized: loop contains function "
2225 			     "calls or data references that cannot "
2226 			     "be analyzed\n");
2227 	  return res;
2228 	}
2229       loop_vinfo->shared->save_datarefs ();
2230     }
2231   else
2232     loop_vinfo->shared->check_datarefs ();
2233 
2234   /* Analyze the data references and also adjust the minimal
2235      vectorization factor according to the loads and stores.  */
2236 
2237   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2238   if (!ok)
2239     {
2240       if (dump_enabled_p ())
2241 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 			 "bad data references.\n");
2243       return ok;
2244     }
2245 
2246   /* Classify all cross-iteration scalar data-flow cycles.
2247      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2248   vect_analyze_scalar_cycles (loop_vinfo);
2249 
2250   vect_pattern_recog (loop_vinfo);
2251 
2252   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2253 
2254   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2255      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2256 
2257   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2258   if (!ok)
2259     {
2260       if (dump_enabled_p ())
2261 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 			 "bad data access.\n");
2263       return ok;
2264     }
2265 
2266   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2267 
2268   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2269   if (!ok)
2270     {
2271       if (dump_enabled_p ())
2272 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 			 "unexpected pattern.\n");
2274       return ok;
2275     }
2276 
2277   /* Whereas the rest of the analysis below depends on the vector size
2278      in some way.  */
2278   fatal = false;
2279 
2280   /* Analyze data dependences between the data-refs in the loop
2281      and adjust the maximum vectorization factor according to
2282      the dependences.
2283      FORNOW: fail at the first data dependence that we encounter.  */
2284 
2285   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2286   if (!ok)
2287     {
2288       if (dump_enabled_p ())
2289 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2290 			 "bad data dependence.\n");
2291       return ok;
2292     }
2293   if (max_vf != MAX_VECTORIZATION_FACTOR
2294       && maybe_lt (max_vf, min_vf))
2295     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2296   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2297 
2298   ok = vect_determine_vectorization_factor (loop_vinfo);
2299   if (!ok)
2300     {
2301       if (dump_enabled_p ())
2302 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 			 "can't determine vectorization factor.\n");
2304       return ok;
2305     }
2306   if (max_vf != MAX_VECTORIZATION_FACTOR
2307       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2308     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2309 
2310   /* Compute the scalar iteration cost.  */
2311   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2312 
2313   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2314 
2315   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2316   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2317   if (!ok)
2318     return ok;
2319 
2320   /* If there are any SLP instances mark them as pure_slp.  */
2321   bool slp = vect_make_slp_decision (loop_vinfo);
2322   if (slp)
2323     {
2324       /* Find stmts that need to be both vectorized and SLPed.  */
2325       vect_detect_hybrid_slp (loop_vinfo);
2326 
2327       /* Update the vectorization factor based on the SLP decision.  */
2328       vect_update_vf_for_slp (loop_vinfo);
2329 
2330       /* Optimize the SLP graph with the vectorization factor fixed.  */
2331       vect_optimize_slp (loop_vinfo);
2332 
2333       /* Gather the loads reachable from the SLP graph entries.  */
2334       vect_gather_slp_loads (loop_vinfo);
2335     }
2336 
2337   bool saved_can_use_partial_vectors_p
2338     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2339 
2340   /* We don't expect to have to roll back to anything other than an empty
2341      set of rgroups.  */
2342   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2343 
2344   /* This is the point where we can re-start analysis with SLP forced off.  */
2345 start_over:
2346 
2347   /* Now the vectorization factor is final.  */
2348   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2349   gcc_assert (known_ne (vectorization_factor, 0U));
2350 
2351   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2352     {
2353       dump_printf_loc (MSG_NOTE, vect_location,
2354 		       "vectorization_factor = ");
2355       dump_dec (MSG_NOTE, vectorization_factor);
2356       dump_printf (MSG_NOTE, ", niters = %wd\n",
2357 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2358     }
2359 
2360   /* Analyze the alignment of the data-refs in the loop.
2361      Fail if a data reference is found that cannot be vectorized.  */
2362 
2363   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2364   if (!ok)
2365     {
2366       if (dump_enabled_p ())
2367 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 			 "bad data alignment.\n");
2369       return ok;
2370     }
2371 
2372   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2373      It is important to call pruning after vect_analyze_data_ref_accesses,
2374      since we use grouping information gathered by interleaving analysis.  */
2375   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2376   if (!ok)
2377     return ok;
2378 
2379   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2380      vectorization, since we do not want to add extra peeling or
2381      add versioning for alignment.  */
2382   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2383     /* This pass will decide on using loop versioning and/or loop peeling in
2384        order to enhance the alignment of data references in the loop.  */
2385     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2386   if (!ok)
2387     return ok;
2388 
2389   if (slp)
2390     {
2391       /* Analyze operations in the SLP instances.  Note this may
2392 	 remove unsupported SLP instances which makes the above
2393 	 SLP kind detection invalid.  */
2394       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2395       vect_slp_analyze_operations (loop_vinfo);
2396       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2397 	{
2398 	  ok = opt_result::failure_at (vect_location,
2399 				       "unsupported SLP instances\n");
2400 	  goto again;
2401 	}
2402 
2403       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2404       slp_tree load_node, slp_root;
2405       unsigned i, x;
2406       slp_instance instance;
2407       bool can_use_lanes = true;
2408       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2409 	{
2410 	  slp_root = SLP_INSTANCE_TREE (instance);
2411 	  int group_size = SLP_TREE_LANES (slp_root);
2412 	  tree vectype = SLP_TREE_VECTYPE (slp_root);
2413 	  bool loads_permuted = false;
2414 	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2415 	    {
2416 	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2417 		continue;
2418 	      unsigned j;
2419 	      stmt_vec_info load_info;
2420 	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2421 		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2422 		  {
2423 		    loads_permuted = true;
2424 		    break;
2425 		  }
2426 	    }
2427 
2428 	  /* If the loads and stores can be handled with load/store-lane
2429 	     instructions record it and move on to the next instance.  */
2430 	  if (loads_permuted
2431 	      && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2432 	      && vect_store_lanes_supported (vectype, group_size, false))
2433 	    {
2434 	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2435 		{
2436 		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2437 		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2438 		  /* Use SLP for strided accesses (or if we can't
2439 		     use load-lanes).  */
2440 		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2441 		      || ! vect_load_lanes_supported
2442 			    (STMT_VINFO_VECTYPE (stmt_vinfo),
2443 			     DR_GROUP_SIZE (stmt_vinfo), false))
2444 		    break;
2445 		}
2446 
2447 	      can_use_lanes
2448 		= can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2449 
2450 	      if (can_use_lanes && dump_enabled_p ())
2451 		dump_printf_loc (MSG_NOTE, vect_location,
2452 				 "SLP instance %p can use load/store-lanes\n",
2453 				 instance);
2454 	    }
2455 	  else
2456 	    {
2457 	      can_use_lanes = false;
2458 	      break;
2459 	    }
2460 	}
2461 
2462       /* If all SLP instances can use load/store-lanes abort SLP and try again
2463 	 with SLP disabled.  */
2464       if (can_use_lanes)
2465 	{
2466 	  ok = opt_result::failure_at (vect_location,
2467 				       "Built SLP cancelled: can use "
2468 				       "load/store-lanes\n");
2469 	  if (dump_enabled_p ())
2470 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 			     "Built SLP cancelled: all SLP instances support "
2472 			     "load/store-lanes\n");
2473 	  goto again;
2474 	}
2475     }
2476 
2477   /* Dissolve SLP-only groups.  */
2478   vect_dissolve_slp_only_groups (loop_vinfo);
2479 
2480   /* Scan all the remaining operations in the loop that are not subject
2481      to SLP and make sure they are vectorizable.  */
2482   ok = vect_analyze_loop_operations (loop_vinfo);
2483   if (!ok)
2484     {
2485       if (dump_enabled_p ())
2486 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2487 			 "bad operation or unsupported loop bound.\n");
2488       return ok;
2489     }
2490 
2491   /* For now, we don't expect to mix both masking and length approaches for
2492      one loop; disable partial vectors if both are recorded.  */
2493   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2494       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2495       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2496     {
2497       if (dump_enabled_p ())
2498 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499 			 "can't vectorize a loop with partial vectors"
2500 			 " because we don't expect to mix different"
2501 			 " approaches with partial vectors for the"
2502 			 " same loop.\n");
2503       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2504     }
2505 
2506   /* If we still have the option of using partial vectors,
2507      check whether we can generate the necessary loop controls.  */
2508   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2509       && !vect_verify_full_masking (loop_vinfo)
2510       && !vect_verify_loop_lens (loop_vinfo))
2511     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2512 
2513   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2514      to be able to handle fewer than VF scalars, or needs to have a lower VF
2515      than the main loop.  */
2516   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2517       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2518       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2519 		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2520     return opt_result::failure_at (vect_location,
2521 				   "Vectorization factor too high for"
2522 				   " epilogue loop.\n");
2523 
2524   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2525      assuming that the loop will be used as a main loop.  We will redo
2526      this analysis later if we instead decide to use the loop as an
2527      epilogue loop.  */
2528   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2529   if (!ok)
2530     return ok;
2531 
2532   /* Check that the costings of the loop make vectorizing worthwhile.  */
2533   res = vect_analyze_loop_costing (loop_vinfo);
2534   if (res < 0)
2535     {
2536       ok = opt_result::failure_at (vect_location,
2537 				   "Loop costings may not be worthwhile.\n");
2538       goto again;
2539     }
2540   if (!res)
2541     return opt_result::failure_at (vect_location,
2542 				   "Loop costings not worthwhile.\n");
2543 
2544   /* If an epilogue loop is required make sure we can create one.  */
2545   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2546       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2547     {
2548       if (dump_enabled_p ())
2549         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2550       if (!vect_can_advance_ivs_p (loop_vinfo)
2551 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2552 					   single_exit (LOOP_VINFO_LOOP
2553 							 (loop_vinfo))))
2554         {
2555 	  ok = opt_result::failure_at (vect_location,
2556 				       "not vectorized: can't create required "
2557 				       "epilog loop\n");
2558           goto again;
2559         }
2560     }
2561 
2562   /* During peeling, we need to check whether the number of loop iterations
2563      is enough for both the peeled prolog loop and the vector loop.  This
2564      check can be merged with the threshold check of loop versioning, so
2565      increase the threshold for this case if necessary.
2566 
2567      If we are analyzing an epilogue we still want to check what its
2568      versioning threshold would be.  If we decide to vectorize the epilogues we
2569      will want to use the lowest versioning threshold of all epilogues and main
2570      loop.  This will enable us to enter a vectorized epilogue even when
2571      versioning the loop.  We can't simply check whether the epilogue requires
2572      versioning though since we may have skipped some versioning checks when
2573      analyzing the epilogue.  For instance, checks for alias versioning will be
2574      skipped when dealing with epilogues as we assume we already checked them
2575      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2576   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2577     {
2578       poly_uint64 niters_th = 0;
2579       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2580 
2581       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2582 	{
2583 	  /* Niters for peeled prolog loop.  */
2584 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2585 	    {
2586 	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2587 	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2588 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2589 	    }
2590 	  else
2591 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2592 	}
2593 
2594       /* Niters for at least one iteration of vectorized loop.  */
2595       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2596 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2597       /* One additional iteration because of peeling for gaps.  */
2598       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2599 	niters_th += 1;
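      /* Illustrative example (assumed numbers): with a VF of 8, a prolog
	 that peels 3 iterations for alignment, and peeling for gaps,
	 niters_th becomes 3 + 8 + 1 = 12 before being combined with the
	 cost-model threshold below.  */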
2600 
2601       /*  Use the same condition as vect_transform_loop to decide when to use
2602 	  the cost to determine a versioning threshold.  */
2603       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2604 	  && ordered_p (th, niters_th))
2605 	niters_th = ordered_max (poly_uint64 (th), niters_th);
2606 
2607       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2608     }
2609 
2610   gcc_assert (known_eq (vectorization_factor,
2611 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2612 
2613   /* Ok to vectorize!  */
2614   return opt_result::success ();
2615 
2616 again:
2617   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2618   gcc_assert (!ok);
2619 
2620   /* Try again with SLP forced off, but if we didn't do any SLP there is
2621      no point in re-trying.  */
2622   if (!slp)
2623     return ok;
2624 
2625   /* If there are reduction chains re-trying will fail anyway.  */
2626   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2627     return ok;
2628 
2629   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2630      via interleaving or lane instructions.  */
2631   slp_instance instance;
2632   slp_tree node;
2633   unsigned i, j;
2634   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2635     {
2636       stmt_vec_info vinfo;
2637       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2638       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2639 	continue;
2640       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2641       unsigned int size = DR_GROUP_SIZE (vinfo);
2642       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2643       if (! vect_store_lanes_supported (vectype, size, false)
2644 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2645 	 && ! vect_grouped_store_supported (vectype, size))
2646 	return opt_result::failure_at (vinfo->stmt,
2647 				       "unsupported grouped store\n");
2648       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2649 	{
2650 	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2651 	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2652 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2653 	  size = DR_GROUP_SIZE (vinfo);
2654 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2655 	  if (! vect_load_lanes_supported (vectype, size, false)
2656 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2657 						size))
2658 	    return opt_result::failure_at (vinfo->stmt,
2659 					   "unsupported grouped load\n");
2660 	}
2661     }
2662 
2663   if (dump_enabled_p ())
2664     dump_printf_loc (MSG_NOTE, vect_location,
2665 		     "re-trying with SLP disabled\n");
2666 
2667   /* Roll back state appropriately.  No SLP this time.  */
2668   slp = false;
2669   /* Restore the vectorization factor as it was without SLP.  */
2670   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2671   /* Free the SLP instances.  */
2672   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2673     vect_free_slp_instance (instance);
2674   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2675   /* Reset SLP type to loop_vect on all stmts.  */
2676   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2677     {
2678       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2679       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2680 	   !gsi_end_p (si); gsi_next (&si))
2681 	{
2682 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2683 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2684 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2685 	      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2686 	    {
2687 	      /* vectorizable_reduction adjusts reduction stmt def-types,
2688 		 restore them to that of the PHI.  */
2689 	      STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2690 		= STMT_VINFO_DEF_TYPE (stmt_info);
2691 	      STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2692 					(STMT_VINFO_REDUC_DEF (stmt_info)))
2693 		= STMT_VINFO_DEF_TYPE (stmt_info);
2694 	    }
2695 	}
2696       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2697 	   !gsi_end_p (si); gsi_next (&si))
2698 	{
2699 	  if (is_gimple_debug (gsi_stmt (si)))
2700 	    continue;
2701 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2702 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2703 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2704 	    {
2705 	      stmt_vec_info pattern_stmt_info
2706 		= STMT_VINFO_RELATED_STMT (stmt_info);
2707 	      if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2708 		STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2709 
2710 	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2711 	      STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2712 	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2713 		   !gsi_end_p (pi); gsi_next (&pi))
2714 		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2715 		  = loop_vect;
2716 	    }
2717 	}
2718     }
2719   /* Free optimized alias test DDRS.  */
2720   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2721   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2722   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2723   /* Reset target cost data.  */
2724   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2725   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2726     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2727   /* Reset accumulated rgroup information.  */
2728   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2729   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2730   /* Reset assorted flags.  */
2731   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2732   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2733   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2734   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2735   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2736     = saved_can_use_partial_vectors_p;
2737 
2738   goto start_over;
2739 }
2740 
2741 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2742    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2743    OLD_LOOP_VINFO is better unless something specifically indicates
2744    otherwise.
2745 
2746    Note that this deliberately isn't a partial order.  */
2747 
2748 static bool
2749 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2750 			  loop_vec_info old_loop_vinfo)
2751 {
2752   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2753   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2754 
2755   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2756   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2757 
2758   /* Always prefer a VF of loop->simdlen over any other VF.  */
2759   if (loop->simdlen)
2760     {
2761       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2762       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2763       if (new_simdlen_p != old_simdlen_p)
2764 	return new_simdlen_p;
2765     }
2766 
2767   /* Limit the VFs to what is likely to be the maximum number of iterations,
2768      to handle cases in which at least one loop_vinfo is fully-masked.  */
2769   HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2770   if (estimated_max_niter != -1)
2771     {
2772       if (known_le (estimated_max_niter, new_vf))
2773 	new_vf = estimated_max_niter;
2774       if (known_le (estimated_max_niter, old_vf))
2775 	old_vf = estimated_max_niter;
2776     }
2777 
2778   /* Check whether the (fractional) cost per scalar iteration is lower
2779      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
2780   poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2781   poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
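  /* Worked example with made-up costs: new_inside_cost == 20 with
     new_vf == 8 against old_inside_cost == 12 with old_vf == 4 gives
     rel_new == 20 * 4 == 80 and rel_old == 12 * 8 == 96, i.e. the new
     loop body is cheaper per scalar iteration (2.5 vs. 3 units).  */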
2782 
2783   HOST_WIDE_INT est_rel_new_min
2784     = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2785   HOST_WIDE_INT est_rel_new_max
2786     = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2787 
2788   HOST_WIDE_INT est_rel_old_min
2789     = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2790   HOST_WIDE_INT est_rel_old_max
2791     = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2792 
2793   /* Check first if we can make out an unambiguous total order from the
2794      minimum and maximum estimates.  */
2795   if (est_rel_new_min < est_rel_old_min
2796       && est_rel_new_max < est_rel_old_max)
2797     return true;
2798   else if (est_rel_old_min < est_rel_new_min
2799 	   && est_rel_old_max < est_rel_new_max)
2800     return false;
2801   /* When old_loop_vinfo uses a variable vectorization factor,
2802      we know that it has a lower cost for at least one runtime VF.
2803      However, we don't know how likely that VF is.
2804 
2805      One option would be to compare the costs for the estimated VFs.
2806      The problem is that doing so can put too much pressure on the cost
2807      model.  E.g. if the estimated VF is also the lowest possible VF,
2808      and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2809      for the estimated VF, we'd then choose new_loop_vinfo even
2810      though (a) new_loop_vinfo might not actually be better than
2811      old_loop_vinfo for that VF and (b) it would be significantly
2812      worse at larger VFs.
2813 
2814      Here we go for a hacky compromise: pick new_loop_vinfo if it is
2815      no more expensive than old_loop_vinfo even after doubling the
2816      estimated old_loop_vinfo VF.  For all but trivial loops, this
2817      ensures that we only pick new_loop_vinfo if it is significantly
2818      better than old_loop_vinfo at the estimated VF.  */
2819 
2820   if (est_rel_old_min != est_rel_new_min
2821       || est_rel_old_max != est_rel_new_max)
2822     {
2823       HOST_WIDE_INT est_rel_new_likely
2824 	= estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2825       HOST_WIDE_INT est_rel_old_likely
2826 	= estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2827 
2828       return est_rel_new_likely * 2 <= est_rel_old_likely;
2829     }
2830 
2831   /* If there's nothing to choose between the loop bodies, see whether
2832      there's a difference in the prologue and epilogue costs.  */
2833   if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2834     return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2835 
2836   return false;
2837 }
2838 
2839 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2840    true if we should.  */
2841 
2842 static bool
2843 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2844 			loop_vec_info old_loop_vinfo)
2845 {
2846   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2847     return false;
2848 
2849   if (dump_enabled_p ())
2850     dump_printf_loc (MSG_NOTE, vect_location,
2851 		     "***** Preferring vector mode %s to vector mode %s\n",
2852 		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
2853 		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
2854   return true;
2855 }
2856 
2857 /* If LOOP_VINFO is already a main loop, return it unmodified.  Otherwise
2858    try to reanalyze it as a main loop.  Return the loop_vinfo on success
2859    and null on failure.  */
2860 
2861 static loop_vec_info
2862 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2863 {
2864   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2865     return loop_vinfo;
2866 
2867   if (dump_enabled_p ())
2868     dump_printf_loc (MSG_NOTE, vect_location,
2869 		     "***** Reanalyzing as a main loop with vector mode %s\n",
2870 		     GET_MODE_NAME (loop_vinfo->vector_mode));
2871 
2872   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2873   vec_info_shared *shared = loop_vinfo->shared;
2874   opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2875   gcc_assert (main_loop_vinfo);
2876 
2877   main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2878 
2879   bool fatal = false;
2880   bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2881   loop->aux = NULL;
2882   if (!res)
2883     {
2884       if (dump_enabled_p ())
2885 	dump_printf_loc (MSG_NOTE, vect_location,
2886 			 "***** Failed to analyze main loop with vector"
2887 			 " mode %s\n",
2888 			 GET_MODE_NAME (loop_vinfo->vector_mode));
2889       delete main_loop_vinfo;
2890       return NULL;
2891     }
2892   LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2893   return main_loop_vinfo;
2894 }
2895 
2896 /* Function vect_analyze_loop.
2897 
2898    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2899    for it.  The different analyses will record information in the
2900    loop_vec_info struct.  */
2901 opt_loop_vec_info
2902 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2903 {
2904   auto_vector_modes vector_modes;
2905 
2906   /* Autodetect first vector size we try.  */
2907   unsigned int autovec_flags
2908     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2909 						    loop->simdlen != 0);
2910   unsigned int mode_i = 0;
2911 
2912   DUMP_VECT_SCOPE ("analyze_loop_nest");
2913 
2914   if (loop_outer (loop)
2915       && loop_vec_info_for_loop (loop_outer (loop))
2916       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2917     return opt_loop_vec_info::failure_at (vect_location,
2918 					  "outer-loop already vectorized.\n");
2919 
2920   if (!find_loop_nest (loop, &shared->loop_nest))
2921     return opt_loop_vec_info::failure_at
2922       (vect_location,
2923        "not vectorized: loop nest containing two or more consecutive inner"
2924        " loops cannot be vectorized\n");
2925 
2926   unsigned n_stmts = 0;
2927   machine_mode autodetected_vector_mode = VOIDmode;
2928   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2929   machine_mode next_vector_mode = VOIDmode;
2930   poly_uint64 lowest_th = 0;
2931   unsigned vectorized_loops = 0;
2932   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2933 			     && !unlimited_cost_model (loop));
2934 
2935   bool vect_epilogues = false;
2936   opt_result res = opt_result::success ();
2937   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2938   while (1)
2939     {
2940       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2941       opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2942       if (!loop_vinfo)
2943 	{
2944 	  if (dump_enabled_p ())
2945 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2946 			     "bad loop form.\n");
2947 	  gcc_checking_assert (first_loop_vinfo == NULL);
2948 	  return loop_vinfo;
2949 	}
2950       loop_vinfo->vector_mode = next_vector_mode;
2951 
2952       bool fatal = false;
2953 
2954       /* When pick_lowest_cost_p is true, we should in principle iterate
2955 	 over all the loop_vec_infos that LOOP_VINFO could replace and
2956 	 try to vectorize LOOP_VINFO under the same conditions.
2957 	 E.g. when trying to replace an epilogue loop, we should vectorize
2958 	 LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
2959 	 to replace the main loop, we should vectorize LOOP_VINFO as a main
2960 	 loop too.
2961 
2962 	 However, autovectorize_vector_modes is usually sorted as follows:
2963 
2964 	 - Modes that naturally produce lower VFs usually follow modes that
2965 	   naturally produce higher VFs.
2966 
2967 	 - When modes naturally produce the same VF, maskable modes
2968 	   usually follow unmaskable ones, so that the maskable mode
2969 	   can be used to vectorize the epilogue of the unmaskable mode.
2970 
2971 	 This order is preferred because it leads to the maximum
2972 	 epilogue vectorization opportunities.  Targets should only use
2973 	 a different order if they want to make wide modes available while
2974 	 disparaging them relative to earlier, smaller modes.  The assumption
2975 	 in that case is that the wider modes are more expensive in some
2976 	 way that isn't reflected directly in the costs.
2977 
2978 	 There should therefore be few interesting cases in which
2979 	 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2980 	 treated as a standalone loop, and ends up being genuinely cheaper
2981 	 than FIRST_LOOP_VINFO.  */
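      /* Purely illustrative example (no real target implied): a target
	 with 256-bit vectors, 128-bit vectors and a maskable 128-bit
	 variant would typically return { 256-bit, 128-bit,
	 maskable 128-bit }, so that the maskable mode can be tried for
	 the epilogues of the wider modes.  */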
2982       if (vect_epilogues)
2983 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2984 
2985       res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2986       if (mode_i == 0)
2987 	autodetected_vector_mode = loop_vinfo->vector_mode;
2988       if (dump_enabled_p ())
2989 	{
2990 	  if (res)
2991 	    dump_printf_loc (MSG_NOTE, vect_location,
2992 			     "***** Analysis succeeded with vector mode %s\n",
2993 			     GET_MODE_NAME (loop_vinfo->vector_mode));
2994 	  else
2995 	    dump_printf_loc (MSG_NOTE, vect_location,
2996 			     "***** Analysis failed with vector mode %s\n",
2997 			     GET_MODE_NAME (loop_vinfo->vector_mode));
2998 	}
2999 
3000       loop->aux = NULL;
3001 
3002       if (!fatal)
3003 	while (mode_i < vector_modes.length ()
3004 	       && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3005 	  {
3006 	    if (dump_enabled_p ())
3007 	      dump_printf_loc (MSG_NOTE, vect_location,
3008 			       "***** The result for vector mode %s would"
3009 			       " be the same\n",
3010 			       GET_MODE_NAME (vector_modes[mode_i]));
3011 	    mode_i += 1;
3012 	  }
3013 
3014       if (res)
3015 	{
3016 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3017 	  vectorized_loops++;
3018 
3019 	  /* Once we hit the desired simdlen for the first time,
3020 	     discard any previous attempts.  */
3021 	  if (simdlen
3022 	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3023 	    {
3024 	      delete first_loop_vinfo;
3025 	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
3026 	      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3027 	      simdlen = 0;
3028 	    }
3029 	  else if (pick_lowest_cost_p && first_loop_vinfo)
3030 	    {
3031 	      /* Keep trying to roll back vectorization attempts while the
3032 		 loop_vec_infos they produced were worse than this one.  */
3033 	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3034 	      while (!vinfos.is_empty ()
3035 		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3036 		{
3037 		  gcc_assert (vect_epilogues);
3038 		  delete vinfos.pop ();
3039 		}
3040 	      if (vinfos.is_empty ()
3041 		  && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3042 		{
3043 		  loop_vec_info main_loop_vinfo
3044 		    = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3045 		  if (main_loop_vinfo == loop_vinfo)
3046 		    {
3047 		      delete first_loop_vinfo;
3048 		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
3049 		    }
3050 		  else if (main_loop_vinfo
3051 			   && vect_joust_loop_vinfos (main_loop_vinfo,
3052 						      first_loop_vinfo))
3053 		    {
3054 		      delete first_loop_vinfo;
3055 		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
3056 		      delete loop_vinfo;
3057 		      loop_vinfo
3058 			= opt_loop_vec_info::success (main_loop_vinfo);
3059 		    }
3060 		  else
3061 		    delete main_loop_vinfo;
3062 		}
3063 	    }
3064 
3065 	  if (first_loop_vinfo == NULL)
3066 	    {
3067 	      first_loop_vinfo = loop_vinfo;
3068 	      lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3069 	    }
3070 	  else if (vect_epilogues
3071 		   /* For now only allow one epilogue loop.  */
3072 		   && first_loop_vinfo->epilogue_vinfos.is_empty ())
3073 	    {
3074 	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3075 	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3076 	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3077 			  || maybe_ne (lowest_th, 0U));
3078 	      /* Keep track of the known smallest versioning
3079 		 threshold.  */
3080 	      if (ordered_p (lowest_th, th))
3081 		lowest_th = ordered_min (lowest_th, th);
3082 	    }
3083 	  else
3084 	    {
3085 	      delete loop_vinfo;
3086 	      loop_vinfo = opt_loop_vec_info::success (NULL);
3087 	    }
3088 
3089 	  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3090 	     enabled, SIMDUID is not set, it is the innermost loop and we have
3091 	     either already found the loop's SIMDLEN or there was no SIMDLEN to
3092 	     begin with.
3093 	     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3094 	  vect_epilogues = (!simdlen
3095 			    && loop->inner == NULL
3096 			    && param_vect_epilogues_nomask
3097 			    && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3098 			    && !loop->simduid
3099 			    /* For now only allow one epilogue loop, but allow
3100 			       pick_lowest_cost_p to replace it.  */
3101 			    && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3102 				|| pick_lowest_cost_p));
3103 
3104 	  /* Commit to first_loop_vinfo if we have no reason to try
3105 	     alternatives.  */
3106 	  if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3107 	    break;
3108 	}
3109       else
3110 	{
3111 	  delete loop_vinfo;
3112 	  loop_vinfo = opt_loop_vec_info::success (NULL);
3113 	  if (fatal)
3114 	    {
3115 	      gcc_checking_assert (first_loop_vinfo == NULL);
3116 	      break;
3117 	    }
3118 	}
3119 
3120       /* Handle the case in which the original loop can use partial
3121 	 vectorization but we only want to adopt it for the epilogue.
3122 	 The retry should use the same vector mode as the original.  */
3123       if (vect_epilogues
3124 	  && loop_vinfo
3125 	  && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3126 	{
3127 	  gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3128 		      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3129 	  if (dump_enabled_p ())
3130 	    dump_printf_loc (MSG_NOTE, vect_location,
3131 			     "***** Re-trying analysis with same vector mode"
3132 			     " %s for epilogue with partial vectors.\n",
3133 			     GET_MODE_NAME (loop_vinfo->vector_mode));
3134 	  continue;
3135 	}
3136 
3137       if (mode_i < vector_modes.length ()
3138 	  && VECTOR_MODE_P (autodetected_vector_mode)
3139 	  && (related_vector_mode (vector_modes[mode_i],
3140 				   GET_MODE_INNER (autodetected_vector_mode))
3141 	      == autodetected_vector_mode)
3142 	  && (related_vector_mode (autodetected_vector_mode,
3143 				   GET_MODE_INNER (vector_modes[mode_i]))
3144 	      == vector_modes[mode_i]))
3145 	{
3146 	  if (dump_enabled_p ())
3147 	    dump_printf_loc (MSG_NOTE, vect_location,
3148 			     "***** Skipping vector mode %s, which would"
3149 			     " repeat the analysis for %s\n",
3150 			     GET_MODE_NAME (vector_modes[mode_i]),
3151 			     GET_MODE_NAME (autodetected_vector_mode));
3152 	  mode_i += 1;
3153 	}
3154 
3155       if (mode_i == vector_modes.length ()
3156 	  || autodetected_vector_mode == VOIDmode)
3157 	break;
3158 
3159       /* Try the next biggest vector size.  */
3160       next_vector_mode = vector_modes[mode_i++];
3161       if (dump_enabled_p ())
3162 	dump_printf_loc (MSG_NOTE, vect_location,
3163 			 "***** Re-trying analysis with vector mode %s\n",
3164 			 GET_MODE_NAME (next_vector_mode));
3165     }
3166 
3167   if (first_loop_vinfo)
3168     {
3169       loop->aux = (loop_vec_info) first_loop_vinfo;
3170       if (dump_enabled_p ())
3171 	dump_printf_loc (MSG_NOTE, vect_location,
3172 			 "***** Choosing vector mode %s\n",
3173 			 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3174       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3175       return first_loop_vinfo;
3176     }
3177 
3178   return opt_loop_vec_info::propagate_failure (res);
3179 }
3180 
3181 /* Return true if there is an in-order reduction function for CODE, storing
3182    it in *REDUC_FN if so.  */
3183 
3184 static bool
3185 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3186 {
3187   switch (code)
3188     {
3189     case PLUS_EXPR:
3190       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3191       return true;
3192 
3193     default:
3194       return false;
3195     }
3196 }
3197 
3198 /* Function reduction_fn_for_scalar_code
3199 
3200    Input:
3201    CODE - tree_code of the reduction operation.
3202 
3203    Output:
3204    REDUC_FN - the corresponding internal function to be used to reduce the
3205       vector of partial results into a single scalar result, or IFN_LAST
3206       if the operation is a supported reduction operation, but does not have
3207       such an internal function.
3208 
3209    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
3210 
3211 static bool
3212 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3213 {
3214   switch (code)
3215     {
3216       case MAX_EXPR:
3217         *reduc_fn = IFN_REDUC_MAX;
3218         return true;
3219 
3220       case MIN_EXPR:
3221         *reduc_fn = IFN_REDUC_MIN;
3222         return true;
3223 
3224       case PLUS_EXPR:
3225         *reduc_fn = IFN_REDUC_PLUS;
3226         return true;
3227 
3228       case BIT_AND_EXPR:
3229 	*reduc_fn = IFN_REDUC_AND;
3230 	return true;
3231 
3232       case BIT_IOR_EXPR:
3233 	*reduc_fn = IFN_REDUC_IOR;
3234 	return true;
3235 
3236       case BIT_XOR_EXPR:
3237 	*reduc_fn = IFN_REDUC_XOR;
3238 	return true;
3239 
3240       case MULT_EXPR:
3241       case MINUS_EXPR:
3242         *reduc_fn = IFN_LAST;
3243         return true;
3244 
3245       default:
3246        return false;
3247     }
3248 }
3249 
3250 /* If there is a neutral value X such that SLP reduction NODE would not
3251    be affected by the introduction of additional X elements, return that X,
3252    otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
3253    is the vector type that would hold element X.  REDUC_CHAIN is true if
3254    the SLP statements perform a single reduction, false if each statement
3255    performs an independent reduction.  */
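/* For example, 0 is neutral for PLUS_EXPR (x + 0 == x), 1 is neutral for
   MULT_EXPR and an all-ones value is neutral for BIT_AND_EXPR, so extra
   vector lanes filled with the neutral value do not change the final
   reduction result.  */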
3256 
3257 static tree
3258 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3259 			      tree_code code, bool reduc_chain)
3260 {
3261   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3262   stmt_vec_info stmt_vinfo = stmts[0];
3263   tree scalar_type = TREE_TYPE (vector_type);
3264   class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3265   gcc_assert (loop);
3266 
3267   switch (code)
3268     {
3269     case WIDEN_SUM_EXPR:
3270     case DOT_PROD_EXPR:
3271     case SAD_EXPR:
3272     case PLUS_EXPR:
3273     case MINUS_EXPR:
3274     case BIT_IOR_EXPR:
3275     case BIT_XOR_EXPR:
3276       return build_zero_cst (scalar_type);
3277 
3278     case MULT_EXPR:
3279       return build_one_cst (scalar_type);
3280 
3281     case BIT_AND_EXPR:
3282       return build_all_ones_cst (scalar_type);
3283 
3284     case MAX_EXPR:
3285     case MIN_EXPR:
3286       /* For MIN/MAX the initial values are neutral.  A reduction chain
3287 	 has only a single initial value, so that value is neutral for
3288 	 all statements.  */
3289       if (reduc_chain)
3290 	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3291 				      loop_preheader_edge (loop));
3292       return NULL_TREE;
3293 
3294     default:
3295       return NULL_TREE;
3296     }
3297 }
3298 
3299 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3300    STMT is printed with a message MSG. */
3301 
3302 static void
3303 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3304 {
3305   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3306 }
3307 
3308 /* Return true if we need an in-order reduction for operation CODE
3309    on type TYPE, i.e. if the reduction must preserve the original
3310    (left-to-right) evaluation order.  */
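/* For example (illustrative only), a float accumulation such as

     float s = 0.0f;
     for (i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must be vectorized as a fold-left
   (in-order) reduction, whereas with -fassociative-math a tree reduction
   via IFN_REDUC_PLUS is allowed.  */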
3311 
3312 bool
3313 needs_fold_left_reduction_p (tree type, tree_code code)
3314 {
3315   /* CHECKME: check for !flag_finite_math_only too?  */
3316   if (SCALAR_FLOAT_TYPE_P (type))
3317     switch (code)
3318       {
3319       case MIN_EXPR:
3320       case MAX_EXPR:
3321 	return false;
3322 
3323       default:
3324 	return !flag_associative_math;
3325       }
3326 
3327   if (INTEGRAL_TYPE_P (type))
3328     {
3329       if (!operation_no_trapping_overflow (type, code))
3330 	return true;
3331       return false;
3332     }
3333 
3334   if (SAT_FIXED_POINT_TYPE_P (type))
3335     return true;
3336 
3337   return false;
3338 }
3339 
3340 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3341    has a handled computation expression.  Store the main reduction
3342    operation in *CODE.  */
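/* Illustrative example: for the loop body

     sum_2 = sum_1 + a[i];
     sum_3 = sum_2 + b[i];

   with "sum_1 = PHI <0, sum_3>" and LOOP_ARG == sum_3, the walk below
   follows the SSA chain from sum_3 back to the PHI result sum_1,
   recording the uses feeding through the two PLUS_EXPR statements in
   PATH, and sets *CODE to PLUS_EXPR.  */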
3343 
3344 static bool
3345 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3346 		      tree loop_arg, enum tree_code *code,
3347 		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3348 {
3349   auto_bitmap visited;
3350   tree lookfor = PHI_RESULT (phi);
3351   ssa_op_iter curri;
3352   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3353   while (USE_FROM_PTR (curr) != loop_arg)
3354     curr = op_iter_next_use (&curri);
3355   curri.i = curri.numops;
3356   do
3357     {
3358       path.safe_push (std::make_pair (curri, curr));
3359       tree use = USE_FROM_PTR (curr);
3360       if (use == lookfor)
3361 	break;
3362       gimple *def = SSA_NAME_DEF_STMT (use);
3363       if (gimple_nop_p (def)
3364 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3365 	{
3366 pop:
3367 	  do
3368 	    {
3369 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3370 	      curri = x.first;
3371 	      curr = x.second;
3372 	      do
3373 		curr = op_iter_next_use (&curri);
3374 	      /* Skip already visited or non-SSA operands (from iterating
3375 	         over PHI args).  */
3376 	      while (curr != NULL_USE_OPERAND_P
3377 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3378 			 || ! bitmap_set_bit (visited,
3379 					      SSA_NAME_VERSION
3380 					        (USE_FROM_PTR (curr)))));
3381 	    }
3382 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3383 	  if (curr == NULL_USE_OPERAND_P)
3384 	    break;
3385 	}
3386       else
3387 	{
3388 	  if (gimple_code (def) == GIMPLE_PHI)
3389 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3390 	  else
3391 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3392 	  while (curr != NULL_USE_OPERAND_P
3393 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3394 		     || ! bitmap_set_bit (visited,
3395 					  SSA_NAME_VERSION
3396 					    (USE_FROM_PTR (curr)))))
3397 	    curr = op_iter_next_use (&curri);
3398 	  if (curr == NULL_USE_OPERAND_P)
3399 	    goto pop;
3400 	}
3401     }
3402   while (1);
3403   if (dump_file && (dump_flags & TDF_DETAILS))
3404     {
3405       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3406       unsigned i;
3407       std::pair<ssa_op_iter, use_operand_p> *x;
3408       FOR_EACH_VEC_ELT (path, i, x)
3409 	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3410       dump_printf (MSG_NOTE, "\n");
3411     }
3412 
3413   /* Check whether the reduction path detected is valid.  */
3414   bool fail = path.length () == 0;
3415   bool neg = false;
3416   int sign = -1;
3417   *code = ERROR_MARK;
3418   for (unsigned i = 1; i < path.length (); ++i)
3419     {
3420       gimple *use_stmt = USE_STMT (path[i].second);
3421       tree op = USE_FROM_PTR (path[i].second);
3422       if (! is_gimple_assign (use_stmt)
3423 	  /* The following makes sure we can compute the operand index
3424 	     easily; it also mostly disallows chaining via COND_EXPR condition
3425 	     operands.  */
3426 	  || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3427 	      && (gimple_num_ops (use_stmt) <= 2
3428 		  || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3429 	      && (gimple_num_ops (use_stmt) <= 3
3430 		  || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3431 	{
3432 	  fail = true;
3433 	  break;
3434 	}
3435       tree_code use_code = gimple_assign_rhs_code (use_stmt);
3436       if (use_code == MINUS_EXPR)
3437 	{
3438 	  use_code = PLUS_EXPR;
3439 	  /* Track whether we negate the reduction value each iteration.  */
3440 	  if (gimple_assign_rhs2 (use_stmt) == op)
3441 	    neg = ! neg;
3442 	}
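      /* E.g. (illustrative) x = x - a[i] is treated like a PLUS_EXPR here,
	 since the subtraction only affects the non-reduction operand,
	 whereas x = a[i] - x flips the sign of the accumulator every
	 iteration, which toggles NEG; a path that ends up negating the
	 accumulator is rejected below.  */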
3443       if (CONVERT_EXPR_CODE_P (use_code)
3444 	  && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3445 				    TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3446 	;
3447       else if (*code == ERROR_MARK)
3448 	{
3449 	  *code = use_code;
3450 	  sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3451 	}
3452       else if (use_code != *code)
3453 	{
3454 	  fail = true;
3455 	  break;
3456 	}
3457       else if ((use_code == MIN_EXPR
3458 		|| use_code == MAX_EXPR)
3459 	       && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3460 	{
3461 	  fail = true;
3462 	  break;
3463 	}
3464       /* Check that there's only a single stmt the op is used in.  For the
3465 	 non-value-changing tail and the last stmt, allow out-of-loop uses.
3466 	 ???  We could relax this and handle arbitrary live stmts by
3467 	 forcing a scalar epilogue for example.  */
3468       imm_use_iterator imm_iter;
3469       gimple *op_use_stmt;
3470       unsigned cnt = 0;
3471       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3472 	if (!is_gimple_debug (op_use_stmt)
3473 	    && (*code != ERROR_MARK
3474 		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3475 	  {
3476 	    /* We want to allow x + x but not x < 1 ? x : 2.  */
3477 	    if (is_gimple_assign (op_use_stmt)
3478 		&& gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3479 	      {
3480 		use_operand_p use_p;
3481 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3482 		  cnt++;
3483 	      }
3484 	    else
3485 	      cnt++;
3486 	  }
3487       if (cnt != 1)
3488 	{
3489 	  fail = true;
3490 	  break;
3491 	}
3492     }
3493   return ! fail && ! neg && *code != ERROR_MARK;
3494 }
3495 
3496 bool
3497 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3498 		      tree loop_arg, enum tree_code code)
3499 {
3500   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3501   enum tree_code code_;
3502   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3503 	  && code_ == code);
3504 }
3505 
3506 
3507 
3508 /* Function vect_is_simple_reduction
3509 
3510    (1) Detect a cross-iteration def-use cycle that represents a simple
3511    reduction computation.  We look for the following pattern:
3512 
3513    loop_header:
3514      a1 = phi < a0, a2 >
3515      a3 = ...
3516      a2 = operation (a3, a1)
3517 
3518    or
3519 
3520    a3 = ...
3521    loop_header:
3522      a1 = phi < a0, a2 >
3523      a2 = operation (a3, a1)
3524 
3525    such that:
3526    1. operation is commutative and associative and it is safe to
3527       change the order of the computation
3528    2. no uses for a2 in the loop (a2 is used out of the loop)
3529    3. no uses of a1 in the loop besides the reduction operation
3530    4. no uses of a1 outside the loop.
3531 
3532    Conditions 1,4 are tested here.
3533    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3534 
3535    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3536    nested cycles.
3537 
3538    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3539    reductions:
3540 
3541      a1 = phi < a0, a2 >
3542      inner loop (def of a3)
3543      a2 = phi < a3 >
3544 
3545    (4) Detect condition expressions, i.e.:
3546      for (int i = 0; i < N; i++)
3547        if (a[i] < val)
3548 	ret_val = a[i];
3549 
3550 */
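/* For example (illustrative), pattern (1) above corresponds to the C loop

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   where a1 is the loop PHI of sum (with a0 == 0), a3 is the loaded a[i]
   and a2 = a1 + a3 is the reduction statement.  */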
3551 
3552 static stmt_vec_info
3553 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3554 			  bool *double_reduc, bool *reduc_chain_p)
3555 {
3556   gphi *phi = as_a <gphi *> (phi_info->stmt);
3557   gimple *phi_use_stmt = NULL;
3558   imm_use_iterator imm_iter;
3559   use_operand_p use_p;
3560 
3561   *double_reduc = false;
3562   *reduc_chain_p = false;
3563   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3564 
3565   tree phi_name = PHI_RESULT (phi);
3566   /* ???  If there are no uses of the PHI result the inner loop reduction
3567      won't be detected as possibly double-reduction by vectorizable_reduction
3568      because that tries to walk the PHI arg from the preheader edge which
3569      can be constant.  See PR60382.  */
3570   if (has_zero_uses (phi_name))
3571     return NULL;
3572   class loop *loop = (gimple_bb (phi))->loop_father;
3573   unsigned nphi_def_loop_uses = 0;
3574   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3575     {
3576       gimple *use_stmt = USE_STMT (use_p);
3577       if (is_gimple_debug (use_stmt))
3578 	continue;
3579 
3580       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3581         {
3582           if (dump_enabled_p ())
3583 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 			     "intermediate value used outside loop.\n");
3585 
3586           return NULL;
3587         }
3588 
3589       nphi_def_loop_uses++;
3590       phi_use_stmt = use_stmt;
3591     }
3592 
3593   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3594   if (TREE_CODE (latch_def) != SSA_NAME)
3595     {
3596       if (dump_enabled_p ())
3597 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 			 "reduction: not ssa_name: %T\n", latch_def);
3599       return NULL;
3600     }
3601 
3602   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3603   if (!def_stmt_info
3604       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3605     return NULL;
3606 
3607   bool nested_in_vect_loop
3608     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3609   unsigned nlatch_def_loop_uses = 0;
3610   auto_vec<gphi *, 3> lcphis;
3611   bool inner_loop_of_double_reduc = false;
3612   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3613     {
3614       gimple *use_stmt = USE_STMT (use_p);
3615       if (is_gimple_debug (use_stmt))
3616 	continue;
3617       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3618 	nlatch_def_loop_uses++;
3619       else
3620 	{
3621 	  /* We can have more than one loop-closed PHI.  */
3622 	  lcphis.safe_push (as_a <gphi *> (use_stmt));
3623 	  if (nested_in_vect_loop
3624 	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3625 		  == vect_double_reduction_def))
3626 	    inner_loop_of_double_reduc = true;
3627 	}
3628     }
3629 
3630   /* If we are vectorizing an inner reduction, we execute it in the
3631      original order only when we are not dealing with a
3632      double reduction.  */
3633   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3634     {
3635       if (dump_enabled_p ())
3636 	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3637 			"detected nested cycle: ");
3638       return def_stmt_info;
3639     }
3640 
3641   /* If this isn't a nested cycle or if the nested cycle reduction value
3642      is used outside of the inner loop, we cannot handle uses of the reduction
3643      value.  */
3644   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3645     {
3646       if (dump_enabled_p ())
3647 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3648 			 "reduction used in loop.\n");
3649       return NULL;
3650     }
3651 
3652   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3653      defined in the inner loop.  */
3654   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3655     {
3656       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3657       if (gimple_phi_num_args (def_stmt) != 1
3658           || TREE_CODE (op1) != SSA_NAME)
3659         {
3660           if (dump_enabled_p ())
3661 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3662 			     "unsupported phi node definition.\n");
3663 
3664           return NULL;
3665         }
3666 
3667       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3668       if (gimple_bb (def1)
3669 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3670           && loop->inner
3671           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3672           && is_gimple_assign (def1)
3673 	  && is_a <gphi *> (phi_use_stmt)
3674 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3675         {
3676           if (dump_enabled_p ())
3677             report_vect_op (MSG_NOTE, def_stmt,
3678 			    "detected double reduction: ");
3679 
3680           *double_reduc = true;
3681 	  return def_stmt_info;
3682         }
3683 
3684       return NULL;
3685     }
3686 
3687   /* Look for the expression computing latch_def from the loop PHI result.  */
3688   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3689   enum tree_code code;
3690   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3691 			    path))
3692     {
3693       STMT_VINFO_REDUC_CODE (phi_info) = code;
3694       if (code == COND_EXPR && !nested_in_vect_loop)
3695 	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3696 
3697       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3698 	 reduction chain for which the additional restriction is that
3699 	 all operations in the chain are the same.  */
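      /* For example (illustrative), the manually unrolled body

	   sum += a[4*i + 0];
	   sum += a[4*i + 1];
	   sum += a[4*i + 2];
	   sum += a[4*i + 3];

	 forms a reduction chain of four PLUS_EXPR statements feeding the
	 same accumulator.  */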
3700       auto_vec<stmt_vec_info, 8> reduc_chain;
3701       unsigned i;
3702       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3703       for (i = path.length () - 1; i >= 1; --i)
3704 	{
3705 	  gimple *stmt = USE_STMT (path[i].second);
3706 	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3707 	  STMT_VINFO_REDUC_IDX (stmt_info)
3708 	    = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3709 	  enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3710 	  bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3711 				     && (i == 1 || i == path.length () - 1));
3712 	  if ((stmt_code != code && !leading_conversion)
3713 	      /* We can only handle the final value in epilogue
3714 		 generation for reduction chains.  */
3715 	      || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3716 	    is_slp_reduc = false;
3717 	  /* For reduction chains we support trailing/leading
3718 	     conversions.  We do not store those in the actual chain.  */
3719 	  if (leading_conversion)
3720 	    continue;
3721 	  reduc_chain.safe_push (stmt_info);
3722 	}
3723       if (is_slp_reduc && reduc_chain.length () > 1)
3724 	{
3725 	  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3726 	    {
3727 	      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3728 	      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3729 	    }
3730 	  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3731 	  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3732 
3733 	  /* Save the chain for further analysis in SLP detection.  */
3734 	  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3735 	  REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3736 
3737 	  *reduc_chain_p = true;
3738 	  if (dump_enabled_p ())
3739 	    dump_printf_loc (MSG_NOTE, vect_location,
3740 			    "reduction: detected reduction chain\n");
3741 	}
3742       else if (dump_enabled_p ())
3743 	dump_printf_loc (MSG_NOTE, vect_location,
3744 			 "reduction: detected reduction\n");
3745 
3746       return def_stmt_info;
3747     }
3748 
3749   if (dump_enabled_p ())
3750     dump_printf_loc (MSG_NOTE, vect_location,
3751 		     "reduction: unknown pattern\n");
3752 
3753   return NULL;
3754 }
3755 
3756 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3757    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3758    or -1 if not known.  */
3759 
3760 static int
3761 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3762 {
3763   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3764   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3765     {
3766       if (dump_enabled_p ())
3767 	dump_printf_loc (MSG_NOTE, vect_location,
3768 			 "cost model: epilogue peel iters set to vf/2 "
3769 			 "because loop iterations are unknown.\n");
3770       return assumed_vf / 2;
3771     }
3772   else
3773     {
3774       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3775       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3776       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
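      /* E.g. (illustrative) niters == 100, assumed_vf == 8 and
	 peel_iters_prologue == 3 give (100 - 3) % 8 == 1 epilogue
	 iteration.  */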
3777       /* If we need to peel for gaps but the epilogue peel count computed
3778 	 above is zero, we still have to peel VF iterations.  */
3779       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3780 	peel_iters_epilogue = assumed_vf;
3781       return peel_iters_epilogue;
3782     }
3783 }
3784 
3785 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3786 int
3787 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3788 			     int *peel_iters_epilogue,
3789 			     stmt_vector_for_cost *scalar_cost_vec,
3790 			     stmt_vector_for_cost *prologue_cost_vec,
3791 			     stmt_vector_for_cost *epilogue_cost_vec)
3792 {
3793   int retval = 0;
3794 
3795   *peel_iters_epilogue
3796     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3797 
3798   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3799     {
3800       /* If peeled iterations are known but the number of scalar loop
3801 	 iterations is unknown, count a taken branch per peeled loop.  */
3802       if (peel_iters_prologue > 0)
3803 	retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3804 				   NULL, NULL_TREE, 0, vect_prologue);
3805       if (*peel_iters_epilogue > 0)
3806 	retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3807 				    NULL, NULL_TREE, 0, vect_epilogue);
3808     }
3809 
3810   stmt_info_for_cost *si;
3811   int j;
3812   if (peel_iters_prologue)
3813     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3814       retval += record_stmt_cost (prologue_cost_vec,
3815 				  si->count * peel_iters_prologue,
3816 				  si->kind, si->stmt_info, si->misalign,
3817 				  vect_prologue);
3818   if (*peel_iters_epilogue)
3819     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3820       retval += record_stmt_cost (epilogue_cost_vec,
3821 				  si->count * *peel_iters_epilogue,
3822 				  si->kind, si->stmt_info, si->misalign,
3823 				  vect_epilogue);
3824 
3825   return retval;
3826 }
3827 
3828 /* Function vect_estimate_min_profitable_iters
3829 
3830    Return the number of iterations required for the vector version of the
3831    loop to be profitable relative to the cost of the scalar version of the
3832    loop.
3833 
3834    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3835    of iterations for vectorization.  A value of -1 means loop vectorization
3836    is not profitable.  This returned value may be used for a dynamic
3837    profitability check.
3838 
3839    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3840    for static check against estimated number of iterations.  */
3841 
3842 static void
3843 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3844 				    int *ret_min_profitable_niters,
3845 				    int *ret_min_profitable_estimate)
3846 {
3847   int min_profitable_iters;
3848   int min_profitable_estimate;
3849   int peel_iters_prologue;
3850   int peel_iters_epilogue;
3851   unsigned vec_inside_cost = 0;
3852   int vec_outside_cost = 0;
3853   unsigned vec_prologue_cost = 0;
3854   unsigned vec_epilogue_cost = 0;
3855   int scalar_single_iter_cost = 0;
3856   int scalar_outside_cost = 0;
3857   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3858   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3859   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3860 
3861   /* Cost model disabled.  */
3862   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3863     {
3864       if (dump_enabled_p ())
3865 	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3866       *ret_min_profitable_niters = 0;
3867       *ret_min_profitable_estimate = 0;
3868       return;
3869     }
3870 
3871   /* Requires loop versioning tests to handle misalignment.  */
3872   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3873     {
3874       /*  FIXME: Make cost depend on complexity of individual check.  */
3875       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3876       (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3877 			    NULL, NULL_TREE, 0, vect_prologue);
3878       if (dump_enabled_p ())
3879 	dump_printf (MSG_NOTE,
3880 		     "cost model: Adding cost of checks for loop "
3881 		     "versioning to treat misalignment.\n");
3882     }
3883 
3884   /* Requires loop versioning with alias checks.  */
3885   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3886     {
3887       /*  FIXME: Make cost depend on complexity of individual check.  */
3888       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3889       (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3890 			    NULL, NULL_TREE, 0, vect_prologue);
3891       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3892       if (len)
3893 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3894 	(void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3895 			      scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3896       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3897       if (len)
3898 	{
3899 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3900 	  unsigned int nstmts = len * 2 - 1;
3901 	  /* +1 for each bias that needs adding.  */
3902 	  for (unsigned int i = 0; i < len; ++i)
3903 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3904 	      nstmts += 1;
3905 	  (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3906 				scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3907 	}
3908       if (dump_enabled_p ())
3909 	dump_printf (MSG_NOTE,
3910 		     "cost model: Adding cost of checks for loop "
3911 		     "versioning aliasing.\n");
3912     }
3913 
3914   /* Requires loop versioning with niter checks.  */
3915   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3916     {
3917       /*  FIXME: Make cost depend on complexity of individual check.  */
3918       (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3919 			    NULL, NULL_TREE, 0, vect_prologue);
3920       if (dump_enabled_p ())
3921 	dump_printf (MSG_NOTE,
3922 		     "cost model: Adding cost of checks for loop "
3923 		     "versioning niters.\n");
3924     }
3925 
3926   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3927     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3928 			  NULL, NULL_TREE, 0, vect_prologue);
3929 
3930   /* Count statements in the scalar loop.  Use this as the scalar cost of a
3931      single iteration for now.
3932 
3933      TODO: Add outer loop support.
3934 
3935      TODO: Consider assigning different costs to different scalar
3936      statements.  */
3937 
3938   scalar_single_iter_cost
3939     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3940 
3941   /* Add additional cost for the peeled instructions in prologue and epilogue
3942      loop.  (For fully-masked loops there will be no peeling.)
3943 
3944      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3945      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3946 
3947      TODO: Build an expression that represents peel_iters for prologue and
3948      epilogue to be used in a run-time test.  */
3949 
3950   bool prologue_need_br_taken_cost = false;
3951   bool prologue_need_br_not_taken_cost = false;
3952 
3953   /* Calculate peel_iters_prologue.  */
3954   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3955     peel_iters_prologue = 0;
3956   else if (npeel < 0)
3957     {
3958       peel_iters_prologue = assumed_vf / 2;
3959       if (dump_enabled_p ())
3960 	dump_printf (MSG_NOTE, "cost model: "
3961 		     "prologue peel iters set to vf/2.\n");
3962 
3963       /* If peeled iterations are unknown, count a taken branch and a not taken
3964 	 branch per peeled loop.  Even if scalar loop iterations are known,
3965 	 vector iterations are not known since peeled prologue iterations are
3966 	 not known.  Hence guards remain the same.  */
3967       prologue_need_br_taken_cost = true;
3968       prologue_need_br_not_taken_cost = true;
3969     }
3970   else
3971     {
3972       peel_iters_prologue = npeel;
3973       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3974 	/* If peeled iterations are known but the number of scalar loop
3975 	   iterations is unknown, count a taken branch per peeled loop.  */
3976 	prologue_need_br_taken_cost = true;
3977     }
3978 
3979   bool epilogue_need_br_taken_cost = false;
3980   bool epilogue_need_br_not_taken_cost = false;
3981 
3982   /* Calculate peel_iters_epilogue.  */
3983   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3984     /* We need to peel exactly one iteration for gaps.  */
3985     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3986   else if (npeel < 0)
3987     {
3988       /* If peeling for alignment is unknown, the loop bound of the main
3989 	 loop becomes unknown.  */
3990       peel_iters_epilogue = assumed_vf / 2;
3991       if (dump_enabled_p ())
3992 	dump_printf (MSG_NOTE, "cost model: "
3993 		     "epilogue peel iters set to vf/2 because "
3994 		     "peeling for alignment is unknown.\n");
3995 
3996       /* See the same reason above in peel_iters_prologue calculation.  */
3997       epilogue_need_br_taken_cost = true;
3998       epilogue_need_br_not_taken_cost = true;
3999     }
4000   else
4001     {
4002       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4003       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4004 	/* If peeled iterations are known but the number of scalar loop
4005 	   iterations is unknown, count a taken branch per peeled loop.  */
4006 	epilogue_need_br_taken_cost = true;
4007     }
4008 
4009   stmt_info_for_cost *si;
4010   int j;
4011   /* Add costs associated with peel_iters_prologue.  */
4012   if (peel_iters_prologue)
4013     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4014       {
4015 	(void) add_stmt_cost (loop_vinfo, target_cost_data,
4016 			      si->count * peel_iters_prologue, si->kind,
4017 			      si->stmt_info, si->vectype, si->misalign,
4018 			      vect_prologue);
4019       }
4020 
4021   /* Add costs associated with peel_iters_epilogue.  */
4022   if (peel_iters_epilogue)
4023     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4024       {
4025 	(void) add_stmt_cost (loop_vinfo, target_cost_data,
4026 			      si->count * peel_iters_epilogue, si->kind,
4027 			      si->stmt_info, si->vectype, si->misalign,
4028 			      vect_epilogue);
4029       }
4030 
4031   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4032 
4033   if (prologue_need_br_taken_cost)
4034     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4035 			  NULL, NULL_TREE, 0, vect_prologue);
4036 
4037   if (prologue_need_br_not_taken_cost)
4038     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4039 			  cond_branch_not_taken, NULL, NULL_TREE, 0,
4040 			  vect_prologue);
4041 
4042   if (epilogue_need_br_taken_cost)
4043     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4044 			  NULL, NULL_TREE, 0, vect_epilogue);
4045 
4046   if (epilogue_need_br_not_taken_cost)
4047     (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4048 			  cond_branch_not_taken, NULL, NULL_TREE, 0,
4049 			  vect_epilogue);
4050 
4051   /* Take care of special costs for rgroup controls of partial vectors.  */
4052   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4053     {
4054       /* Calculate how many masks we need to generate.  */
4055       unsigned int num_masks = 0;
4056       rgroup_controls *rgm;
4057       unsigned int num_vectors_m1;
4058       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4059 	if (rgm->type)
4060 	  num_masks += num_vectors_m1 + 1;
4061       gcc_assert (num_masks > 0);
4062 
4063       /* In the worst case, we need to generate each mask in the prologue
4064 	 and in the loop body.  One of the loop body mask instructions
4065 	 replaces the comparison in the scalar loop, and since we don't
4066 	 count the scalar comparison against the scalar body, we shouldn't
4067 	 count that vector instruction against the vector body either.
4068 
4069 	 Sometimes we can use unpacks instead of generating prologue
4070 	 masks and sometimes the prologue mask will fold to a constant,
4071 	 so the actual prologue cost might be smaller.  However, it's
4072 	 simpler and safer to use the worst-case cost; if this ends up
4073 	 being the tie-breaker between vectorizing or not, then it's
4074 	 probably better not to vectorize.  */
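      /* E.g. (illustrative) two mask rgroups with one and two vectors
	 respectively give num_masks == 3, costed below as 3 prologue
	 statements and 3 - 1 == 2 statements in the loop body.  */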
4075       (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4076 			    vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4077       (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4078 			    vector_stmt, NULL, NULL_TREE, 0, vect_body);
4079     }
4080   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4081     {
4082       /* Referring to the functions vect_set_loop_condition_partial_vectors
4083 	 and vect_set_loop_controls_directly, we need to generate each
4084 	 length in the prologue and in the loop body if required. Although
4085 	 there are some possible optimizations, we consider the worst case
4086 	 here.  */
4087 
4088       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4089       bool need_iterate_p
4090 	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4091 	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
4092 
4093       /* Calculate how many statements to be added.  */
4094       unsigned int prologue_stmts = 0;
4095       unsigned int body_stmts = 0;
4096 
4097       rgroup_controls *rgc;
4098       unsigned int num_vectors_m1;
4099       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4100 	if (rgc->type)
4101 	  {
4102 	    /* May need one SHIFT for nitems_total computation.  */
4103 	    unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4104 	    if (nitems != 1 && !niters_known_p)
4105 	      prologue_stmts += 1;
4106 
4107 	    /* May need one MAX and one MINUS for wrap around.  */
4108 	    if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4109 	      prologue_stmts += 2;
4110 
4111 	    /* Need one MAX and one MINUS for each batch limit except for
4112 	       the first one.  */
4113 	    prologue_stmts += num_vectors_m1 * 2;
4114 
4115 	    unsigned int num_vectors = num_vectors_m1 + 1;
4116 
4117 	    /* Need to set up lengths in prologue, only one MIN required
4118 	       for each since start index is zero.  */
4119 	    prologue_stmts += num_vectors;
4120 
4121 	    /* Each may need two MINs and one MINUS to update lengths in body
4122 	       for next iteration.  */
4123 	    if (need_iterate_p)
4124 	      body_stmts += 3 * num_vectors;
4125 	  }
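      /* Illustrative count: a single rgroup with two vectors
	 (num_vectors_m1 == 1), nitems != 1, unknown niters, no risk of
	 wrap-around and need_iterate_p set gives
	 prologue_stmts = 1 + 2 + 2 = 5 and body_stmts = 3 * 2 = 6.  */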
4126 
4127       (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4128 			    scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4129       (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4130 			    scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4131     }
4132 
4133   /* FORNOW: The scalar outside cost is incremented in one of the
4134      following ways:
4135 
4136      1. The vectorizer checks for alignment and aliasing and generates
4137      a condition that allows dynamic vectorization.  A cost model
4138      check is ANDED with the versioning condition.  Hence scalar code
4139      path now has the added cost of the versioning check.
4140 
4141        if (cost > th & versioning_check)
4142          jmp to vector code
4143 
4144      Hence run-time scalar is incremented by not-taken branch cost.
4145 
4146      2. The vectorizer then checks if a prologue is required.  If the
4147      cost model check was not done before during versioning, it has to
4148      be done before the prologue check.
4149 
4150        if (cost <= th)
4151          prologue = scalar_iters
4152        if (prologue == 0)
4153          jmp to vector code
4154        else
4155          execute prologue
4156        if (prologue == num_iters)
4157 	 go to exit
4158 
4159      Hence the run-time scalar cost is incremented by a taken branch,
4160      plus a not-taken branch, plus a taken branch cost.
4161 
4162      3. The vectorizer then checks if an epilogue is required.  If the
4163      cost model check was not done before during prologue check, it
4164      has to be done with the epilogue check.
4165 
4166        if (prologue == 0)
4167          jmp to vector code
4168        else
4169          execute prologue
4170        if (prologue == num_iters)
4171 	 go to exit
4172        vector code:
4173          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4174            jmp to epilogue
4175 
4176      Hence the run-time scalar cost should be incremented by 2 taken
4177      branches.
4178 
4179      TODO: The back end may reorder the BBS's differently and reverse
4180      conditions/branch directions.  Change the estimates below to
4181      something more reasonable.  */
4182 
4183   /* If the number of iterations is known and we do not do versioning, we can
4184      decide whether to vectorize at compile time.  Hence the scalar version
4185      does not carry cost model guard costs.  */
4186   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4187       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4188     {
4189       /* Cost model check occurs at versioning.  */
4190       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4191 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4192       else
4193 	{
4194 	  /* Cost model check occurs at prologue generation.  */
4195 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4196 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4197 	      + vect_get_stmt_cost (cond_branch_not_taken);
4198 	  /* Cost model check occurs at epilogue generation.  */
4199 	  else
4200 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4201 	}
4202     }
4203 
4204   /* Complete the target-specific cost calculations.  */
4205   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4206 	       &vec_inside_cost, &vec_epilogue_cost);
4207 
4208   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4209 
4210   /* Stash the costs so that we can compare two loop_vec_infos.  */
4211   loop_vinfo->vec_inside_cost = vec_inside_cost;
4212   loop_vinfo->vec_outside_cost = vec_outside_cost;
4213 
4214   if (dump_enabled_p ())
4215     {
4216       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4217       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4218                    vec_inside_cost);
4219       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4220                    vec_prologue_cost);
4221       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4222                    vec_epilogue_cost);
4223       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4224                    scalar_single_iter_cost);
4225       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4226                    scalar_outside_cost);
4227       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4228                    vec_outside_cost);
4229       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4230                    peel_iters_prologue);
4231       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4232                    peel_iters_epilogue);
4233     }
4234 
4235   /* Calculate number of iterations required to make the vector version
4236      profitable, relative to the loop bodies only.  The following condition
4237      must hold true:
4238      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4239      where
4240      SIC = scalar iteration cost, VIC = vector iteration cost,
4241      VOC = vector outside cost, VF = vectorization factor,
4242      NPEEL = prologue iterations + epilogue iterations,
4243      SOC = scalar outside cost for run time cost model check.  */
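  /* Worked example with made-up numbers, for a loop not using partial
     vectors: SIC = 4, VIC = 8, VF = 4, NPEEL = 0, VOC = 20, SOC = 6.
     The condition 4 * niters + 6 > 8 * (niters / 4) + 20 reduces to
     roughly 2 * niters > 14, so the vector version starts to win for
     niters >= 8, which is what the code below computes for
     min_profitable_iters.  */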
4244 
4245   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4246 			  - vec_inside_cost);
4247   if (saving_per_viter <= 0)
4248     {
4249       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4250 	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4251 		    "vectorization did not happen for a simd loop");
4252 
4253       if (dump_enabled_p ())
4254         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4255 			 "cost model: the vector iteration cost = %d "
4256 			 "divided by the scalar iteration cost = %d "
4257 			 "is greater or equal to the vectorization factor = %d"
4258                          ".\n",
4259 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4260       *ret_min_profitable_niters = -1;
4261       *ret_min_profitable_estimate = -1;
4262       return;
4263     }
4264 
4265   /* ??? The "if" arm is written to handle all cases; see below for what
4266      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4267   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4268     {
4269       /* Rewriting the condition above in terms of the number of
4270 	 vector iterations (vniters) rather than the number of
4271 	 scalar iterations (niters) gives:
4272 
4273 	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4274 
4275 	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4276 
4277 	 For integer N, X and Y when X > 0:
4278 
4279 	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4280       int outside_overhead = (vec_outside_cost
4281 			      - scalar_single_iter_cost * peel_iters_prologue
4282 			      - scalar_single_iter_cost * peel_iters_epilogue
4283 			      - scalar_outside_cost);
4284       /* We're only interested in cases that require at least one
4285 	 vector iteration.  */
4286       int min_vec_niters = 1;
4287       if (outside_overhead > 0)
4288 	min_vec_niters = outside_overhead / saving_per_viter + 1;
4289 
4290       if (dump_enabled_p ())
4291 	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4292 		     min_vec_niters);
4293 
4294       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4295 	{
4296 	  /* Now that we know the minimum number of vector iterations,
4297 	     find the minimum niters for which the scalar cost is larger:
4298 
4299 	     SIC * niters > VIC * vniters + VOC - SOC
4300 
4301 	     We know that the minimum niters is no more than
4302 	     vniters * VF + NPEEL, but it might be (and often is) less
4303 	     than that if a partial vector iteration is cheaper than the
4304 	     equivalent scalar code.  */
4305 	  int threshold = (vec_inside_cost * min_vec_niters
4306 			   + vec_outside_cost
4307 			   - scalar_outside_cost);
4308 	  if (threshold <= 0)
4309 	    min_profitable_iters = 1;
4310 	  else
4311 	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4312 	}
4313       else
4314 	/* Convert the number of vector iterations into a number of
4315 	   scalar iterations.  */
4316 	min_profitable_iters = (min_vec_niters * assumed_vf
4317 				+ peel_iters_prologue
4318 				+ peel_iters_epilogue);
4319     }
4320   else
4321     {
4322       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4323 			      * assumed_vf
4324 			      - vec_inside_cost * peel_iters_prologue
4325 			      - vec_inside_cost * peel_iters_epilogue);
4326       if (min_profitable_iters <= 0)
4327         min_profitable_iters = 0;
4328       else
4329 	{
4330 	  min_profitable_iters /= saving_per_viter;
4331 
4332 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4333 	      <= (((int) vec_inside_cost * min_profitable_iters)
4334 		  + (((int) vec_outside_cost - scalar_outside_cost)
4335 		     * assumed_vf)))
4336 	    min_profitable_iters++;
4337 	}
4338     }
4339 
4340   if (dump_enabled_p ())
4341     dump_printf (MSG_NOTE,
4342 		 "  Calculated minimum iters for profitability: %d\n",
4343 		 min_profitable_iters);
4344 
4345   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4346       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4347     /* We want the vectorized loop to execute at least once.  */
4348     min_profitable_iters = assumed_vf + peel_iters_prologue;
4349   else if (min_profitable_iters < peel_iters_prologue)
4350     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4351        vectorized loop executes at least once.  */
4352     min_profitable_iters = peel_iters_prologue;
4353 
4354   if (dump_enabled_p ())
4355     dump_printf_loc (MSG_NOTE, vect_location,
4356                      "  Runtime profitability threshold = %d\n",
4357                      min_profitable_iters);
4358 
4359   *ret_min_profitable_niters = min_profitable_iters;
4360 
4361   /* Calculate number of iterations required to make the vector version
4362      profitable, relative to the loop bodies only.
4363 
4364      The non-vectorized variant costs SIC * niters and must beat the vector
4365      variant at the expected loop trip count, i.e. the following must hold:
4366      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
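  /* Continuing the illustrative values above (SIC = 4, VIC = 6, VF = 4,
     NPEEL = 0), this reads 4 * niters > 1.5 * niters + VOC + SOC, so the
     break-even trip count grows linearly with the outside costs.  */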
4367 
4368   if (vec_outside_cost <= 0)
4369     min_profitable_estimate = 0;
4370   /* ??? This "else if" arm is written to handle all cases; see below for
4371      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4372   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4373     {
4374       /* This is a repeat of the code above, but with + SOC rather
4375 	 than - SOC.  */
4376       int outside_overhead = (vec_outside_cost
4377 			      - scalar_single_iter_cost * peel_iters_prologue
4378 			      - scalar_single_iter_cost * peel_iters_epilogue
4379 			      + scalar_outside_cost);
4380       int min_vec_niters = 1;
4381       if (outside_overhead > 0)
4382 	min_vec_niters = outside_overhead / saving_per_viter + 1;
4383 
4384       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4385 	{
4386 	  int threshold = (vec_inside_cost * min_vec_niters
4387 			   + vec_outside_cost
4388 			   + scalar_outside_cost);
4389 	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4390 	}
4391       else
4392 	min_profitable_estimate = (min_vec_niters * assumed_vf
4393 				   + peel_iters_prologue
4394 				   + peel_iters_epilogue);
4395     }
4396   else
4397     {
4398       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4399 				 * assumed_vf
4400 				 - vec_inside_cost * peel_iters_prologue
4401 				 - vec_inside_cost * peel_iters_epilogue)
4402 				 / ((scalar_single_iter_cost * assumed_vf)
4403 				   - vec_inside_cost);
4404     }
4405   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4406   if (dump_enabled_p ())
4407     dump_printf_loc (MSG_NOTE, vect_location,
4408 		     "  Static estimate profitability threshold = %d\n",
4409 		     min_profitable_estimate);
4410 
4411   *ret_min_profitable_estimate = min_profitable_estimate;
4412 }
4413 
4414 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4415    vector elements (not bits) for a vector with NELT elements.  */
4416 static void
4417 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4418 			      vec_perm_builder *sel)
4419 {
4420   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4421      by vec_perm_indices.  */
4422   sel->new_vector (nelt, 1, 3);
4423   for (unsigned int i = 0; i < 3; i++)
4424     sel->quick_push (i + offset);
4425 }
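
/* For example, calc_vec_perm_mask_for_shift with OFFSET = 2 and NELT = 4
   encodes the stepped index series {2, 3, 4, 5}; in a two-input
   permutation, indices of NELT or more select elements from the second
   input vector, which is what gives the whole-vector shift effect.  */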
4426 
4427 /* Checks whether the target supports whole-vector shifts for vectors of mode
4428    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4429    it supports vec_perm_const with masks for all necessary shift amounts.  */
4430 static bool
4431 have_whole_vector_shift (machine_mode mode)
4432 {
4433   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4434     return true;
4435 
4436   /* Variable-length vectors should be handled via the optab.  */
4437   unsigned int nelt;
4438   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4439     return false;
4440 
4441   vec_perm_builder sel;
4442   vec_perm_indices indices;
4443   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4444     {
4445       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4446       indices.new_vector (sel, 2, nelt);
4447       if (!can_vec_perm_const_p (mode, indices, false))
4448 	return false;
4449     }
4450   return true;
4451 }
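
/* For example, for an 8-element vector have_whole_vector_shift checks
   shifts by 4, 2 and 1 elements, i.e. the permutation masks produced by
   calc_vec_perm_mask_for_shift for OFFSET = 4, 2 and 1.  */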
4452 
4453 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4454    functions. Design better to avoid maintenance issues.  */
4455 
4456 /* Function vect_model_reduction_cost.
4457 
4458    Models cost for a reduction operation, including the vector ops
4459    generated within the strip-mine loop in some cases, the initial
4460    definition before the loop, and the epilogue code that must be generated.  */
4461 
4462 static void
4463 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4464 			   stmt_vec_info stmt_info, internal_fn reduc_fn,
4465 			   vect_reduction_type reduction_type,
4466 			   int ncopies, stmt_vector_for_cost *cost_vec)
4467 {
4468   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4469   enum tree_code code;
4470   optab optab;
4471   tree vectype;
4472   machine_mode mode;
4473   class loop *loop = NULL;
4474 
4475   if (loop_vinfo)
4476     loop = LOOP_VINFO_LOOP (loop_vinfo);
4477 
4478   /* Condition reductions generate two reductions in the loop.  */
4479   if (reduction_type == COND_REDUCTION)
4480     ncopies *= 2;
4481 
4482   vectype = STMT_VINFO_VECTYPE (stmt_info);
4483   mode = TYPE_MODE (vectype);
4484   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4485 
4486   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4487 
4488   if (reduction_type == EXTRACT_LAST_REDUCTION)
4489     /* No extra instructions are needed in the prologue.  The loop body
4490        operations are costed in vectorizable_condition.  */
4491     inside_cost = 0;
4492   else if (reduction_type == FOLD_LEFT_REDUCTION)
4493     {
4494       /* No extra instructions needed in the prologue.  */
4495       prologue_cost = 0;
4496 
4497       if (reduc_fn != IFN_LAST)
4498 	/* Count one reduction-like operation per vector.  */
4499 	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4500 					stmt_info, 0, vect_body);
4501       else
4502 	{
4503 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4504 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4505 	  inside_cost = record_stmt_cost (cost_vec, nelements,
4506 					  vec_to_scalar, stmt_info, 0,
4507 					  vect_body);
4508 	  inside_cost += record_stmt_cost (cost_vec, nelements,
4509 					   scalar_stmt, stmt_info, 0,
4510 					   vect_body);
4511 	}
4512     }
4513   else
4514     {
4515       /* Add in cost for initial definition.
4516 	 For cond reduction we have four vectors: initial index, step,
4517 	 initial result of the data reduction, initial value of the index
4518 	 reduction.  */
4519       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4520       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4521 					 scalar_to_vec, stmt_info, 0,
4522 					 vect_prologue);
4523     }
4524 
4525   /* Determine cost of epilogue code.
4526 
4527      We have a reduction operator that will reduce the vector in one statement.
4528      Also requires scalar extract.  */
4529 
4530   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4531     {
4532       if (reduc_fn != IFN_LAST)
4533 	{
4534 	  if (reduction_type == COND_REDUCTION)
4535 	    {
4536 	      /* An EQ stmt and a COND_EXPR stmt.  */
4537 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4538 						 vector_stmt, stmt_info, 0,
4539 						 vect_epilogue);
4540 	      /* Reduction of the max index and a reduction of the found
4541 		 values.  */
4542 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4543 						 vec_to_scalar, stmt_info, 0,
4544 						 vect_epilogue);
4545 	      /* A broadcast of the max value.  */
4546 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4547 						 scalar_to_vec, stmt_info, 0,
4548 						 vect_epilogue);
4549 	    }
4550 	  else
4551 	    {
4552 	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4553 						 stmt_info, 0, vect_epilogue);
4554 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4555 						 vec_to_scalar, stmt_info, 0,
4556 						 vect_epilogue);
4557 	    }
4558 	}
4559       else if (reduction_type == COND_REDUCTION)
4560 	{
4561 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4562 	  /* Extraction of scalar elements.  */
4563 	  epilogue_cost += record_stmt_cost (cost_vec,
4564 					     2 * estimated_nunits,
4565 					     vec_to_scalar, stmt_info, 0,
4566 					     vect_epilogue);
4567 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4568 	  epilogue_cost += record_stmt_cost (cost_vec,
4569 					     2 * estimated_nunits - 3,
4570 					     scalar_stmt, stmt_info, 0,
4571 					     vect_epilogue);
4572 	}
4573       else if (reduction_type == EXTRACT_LAST_REDUCTION
4574 	       || reduction_type == FOLD_LEFT_REDUCTION)
4575 	/* No extra instructions are needed in the epilogue.  */
4576 	;
4577       else
4578 	{
4579 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4580 	  tree bitsize =
4581 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4582 	  int element_bitsize = tree_to_uhwi (bitsize);
4583 	  int nelements = vec_size_in_bits / element_bitsize;
4584 
4585 	  if (code == COND_EXPR)
4586 	    code = MAX_EXPR;
4587 
4588 	  optab = optab_for_tree_code (code, vectype, optab_default);
4589 
4590 	  /* We have a whole vector shift available.  */
4591 	  if (optab != unknown_optab
4592 	      && VECTOR_MODE_P (mode)
4593 	      && optab_handler (optab, mode) != CODE_FOR_nothing
4594 	      && have_whole_vector_shift (mode))
4595 	    {
4596 	      /* Final reduction via vector shifts and the reduction operator.
4597 		 Also requires scalar extract.  */
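	      /* For example, reducing 8 elements takes
		 exact_log2 (8) = 3 shift-and-operate steps, i.e. six
		 vector statements, followed by one scalar extract.  */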
4598 	      epilogue_cost += record_stmt_cost (cost_vec,
4599 						 exact_log2 (nelements) * 2,
4600 						 vector_stmt, stmt_info, 0,
4601 						 vect_epilogue);
4602 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4603 						 vec_to_scalar, stmt_info, 0,
4604 						 vect_epilogue);
4605 	    }
4606 	  else
4607 	    /* Use extracts and reduction op for final reduction.  For N
4608 	       elements, we have N extracts and N-1 reduction ops.  */
4609 	    epilogue_cost += record_stmt_cost (cost_vec,
4610 					       nelements + nelements - 1,
4611 					       vector_stmt, stmt_info, 0,
4612 					       vect_epilogue);
4613 	}
4614     }
4615 
4616   if (dump_enabled_p ())
4617     dump_printf (MSG_NOTE,
4618                  "vect_model_reduction_cost: inside_cost = %d, "
4619                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4620                  prologue_cost, epilogue_cost);
4621 }
4622 
4623 
4624 
4625 /* Function get_initial_def_for_reduction
4626 
4627    Input:
4628    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4629    INIT_VAL - the initial value of the reduction variable
4630 
4631    Output:
4632    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4633         of the reduction (used for adjusting the epilog - see below).
4634    Return a vector variable, initialized according to the operation that
4635 	STMT_VINFO performs. This vector will be used as the initial value
4636 	of the vector of partial results.
4637 
4638    Option1 (adjust in epilog): Initialize the vector as follows:
4639      add/bit or/xor:    [0,0,...,0,0]
4640      mult/bit and:      [1,1,...,1,1]
4641      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4642    and when necessary (e.g. add/mult case) let the caller know
4643    that it needs to adjust the result by init_val.
4644 
4645    Option2: Initialize the vector as follows:
4646      add/bit or/xor:    [init_val,0,0,...,0]
4647      mult/bit and:      [init_val,1,1,...,1]
4648      min/max/cond_expr: [init_val,init_val,...,init_val]
4649    and no adjustments are needed.
4650 
4651    For example, for the following code:
4652 
4653    s = init_val;
4654    for (i=0;i<n;i++)
4655      s = s + a[i];
4656 
4657    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4658    For a vector of 4 units, we want to return either [0,0,0,init_val],
4659    or [0,0,0,0] and let the caller know that it needs to adjust
4660    the result at the end by 'init_val'.
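
   Similarly, for a product reduction with init_val = 5 and a vector of
   4 units, Option1 would return [1,1,1,1] and report an adjustment by
   'init_val', whereas Option2 would return [5,1,1,1] with no adjustment
   needed.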
4661 
4662    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when
4663    ADJUSTMENT_DEF is not NULL, because its initialization vector is
4664    simpler (same element in all entries), and Option2 otherwise.
4665 
4666    A cost model should help decide between these two schemes.  */
4667 
4668 static tree
4669 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4670 			       stmt_vec_info stmt_vinfo,
4671 			       enum tree_code code, tree init_val,
4672                                tree *adjustment_def)
4673 {
4674   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4675   tree scalar_type = TREE_TYPE (init_val);
4676   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4677   tree def_for_init;
4678   tree init_def;
4679   REAL_VALUE_TYPE real_init_val = dconst0;
4680   int int_init_val = 0;
4681   gimple_seq stmts = NULL;
4682 
4683   gcc_assert (vectype);
4684 
4685   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4686 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4687 
4688   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4689 	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4690 
4691   /* ADJUSTMENT_DEF is NULL when called from
4692      vect_create_epilog_for_reduction to vectorize double reduction.  */
4693   if (adjustment_def)
4694     *adjustment_def = NULL;
4695 
4696   switch (code)
4697     {
4698     case WIDEN_SUM_EXPR:
4699     case DOT_PROD_EXPR:
4700     case SAD_EXPR:
4701     case PLUS_EXPR:
4702     case MINUS_EXPR:
4703     case BIT_IOR_EXPR:
4704     case BIT_XOR_EXPR:
4705     case MULT_EXPR:
4706     case BIT_AND_EXPR:
4707       {
4708         if (code == MULT_EXPR)
4709           {
4710             real_init_val = dconst1;
4711             int_init_val = 1;
4712           }
4713 
4714         if (code == BIT_AND_EXPR)
4715           int_init_val = -1;
4716 
4717         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4718           def_for_init = build_real (scalar_type, real_init_val);
4719         else
4720           def_for_init = build_int_cst (scalar_type, int_init_val);
4721 
4722 	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4723 	  {
4724 	    /* Option1: the first element is '0' or '1' as well.  */
4725 	    if (!operand_equal_p (def_for_init, init_val, 0))
4726 	      *adjustment_def = init_val;
4727 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4728 						     def_for_init);
4729 	  }
4730 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4731 	  {
4732 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4733 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4734 						     def_for_init);
4735 	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4736 				     vectype, init_def, init_val);
4737 	  }
4738 	else
4739 	  {
4740 	    /* Option2: the first element is INIT_VAL.  */
4741 	    tree_vector_builder elts (vectype, 1, 2);
4742 	    elts.quick_push (init_val);
4743 	    elts.quick_push (def_for_init);
4744 	    init_def = gimple_build_vector (&stmts, &elts);
4745 	  }
4746       }
4747       break;
4748 
4749     case MIN_EXPR:
4750     case MAX_EXPR:
4751     case COND_EXPR:
4752       {
4753 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4754 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4755       }
4756       break;
4757 
4758     default:
4759       gcc_unreachable ();
4760     }
4761 
4762   if (stmts)
4763     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4764   return init_def;
4765 }
4766 
4767 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4768    NUMBER_OF_VECTORS is the number of vector defs to create.
4769    If NEUTRAL_OP is nonnull, introducing extra elements of that
4770    value will not change the result.  */
4771 
4772 static void
4773 get_initial_defs_for_reduction (vec_info *vinfo,
4774 				slp_tree slp_node,
4775 				vec<tree> *vec_oprnds,
4776 				unsigned int number_of_vectors,
4777 				bool reduc_chain, tree neutral_op)
4778 {
4779   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4780   stmt_vec_info stmt_vinfo = stmts[0];
4781   unsigned HOST_WIDE_INT nunits;
4782   unsigned j, number_of_places_left_in_vector;
4783   tree vector_type;
4784   unsigned int group_size = stmts.length ();
4785   unsigned int i;
4786   class loop *loop;
4787 
4788   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4789 
4790   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4791 
4792   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4793   gcc_assert (loop);
4794   edge pe = loop_preheader_edge (loop);
4795 
4796   gcc_assert (!reduc_chain || neutral_op);
4797 
4798   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4799      created vectors. It is greater than 1 if unrolling is performed.
4800 
4801      For example, we have two scalar operands, s1 and s2 (e.g., group of
4802      strided accesses of size two), while NUNITS is four (i.e., four scalars
4803      of this type can be packed in a vector).  The output vector will contain
4804      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4805      will be 2).
4806 
4807      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4808      vectors containing the operands.
4809 
4810      For example, NUNITS is four as before, and the group size is 8
4811      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4812      {s5, s6, s7, s8}.  */
4813 
4814   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4815     nunits = group_size;
4816 
4817   number_of_places_left_in_vector = nunits;
4818   bool constant_p = true;
4819   tree_vector_builder elts (vector_type, nunits, 1);
4820   elts.quick_grow (nunits);
4821   gimple_seq ctor_seq = NULL;
4822   for (j = 0; j < nunits * number_of_vectors; ++j)
4823     {
4824       tree op;
4825       i = j % group_size;
4826       stmt_vinfo = stmts[i];
4827 
4828       /* Get the def before the loop.  In reduction chain we have only
4829 	 one initial value.  Else we have as many as PHIs in the group.  */
4830       if (reduc_chain)
4831 	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4832       else if (((vec_oprnds->length () + 1) * nunits
4833 		- number_of_places_left_in_vector >= group_size)
4834 	       && neutral_op)
4835 	op = neutral_op;
4836       else
4837 	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4838 
4839       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4840       number_of_places_left_in_vector--;
4841       elts[nunits - number_of_places_left_in_vector - 1] = op;
4842       if (!CONSTANT_CLASS_P (op))
4843 	constant_p = false;
4844 
4845       if (number_of_places_left_in_vector == 0)
4846 	{
4847 	  tree init;
4848 	  if (constant_p && !neutral_op
4849 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4850 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4851 	    /* Build the vector directly from ELTS.  */
4852 	    init = gimple_build_vector (&ctor_seq, &elts);
4853 	  else if (neutral_op)
4854 	    {
4855 	      /* Build a vector of the neutral value and shift the
4856 		 other elements into place.  */
4857 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4858 						   neutral_op);
4859 	      int k = nunits;
4860 	      while (k > 0 && elts[k - 1] == neutral_op)
4861 		k -= 1;
4862 	      while (k > 0)
4863 		{
4864 		  k -= 1;
4865 		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4866 				       vector_type, init, elts[k]);
4867 		}
4868 	    }
4869 	  else
4870 	    {
4871 	      /* First time round, duplicate ELTS to fill the
4872 		 required number of vectors.  */
4873 	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4874 					number_of_vectors, *vec_oprnds);
4875 	      break;
4876 	    }
4877 	  vec_oprnds->quick_push (init);
4878 
4879 	  number_of_places_left_in_vector = nunits;
4880 	  elts.new_vector (vector_type, nunits, 1);
4881 	  elts.quick_grow (nunits);
4882 	  constant_p = true;
4883 	}
4884     }
4885   if (ctor_seq != NULL)
4886     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4887 }
4888 
4889 /* For a statement STMT_INFO taking part in a reduction operation return
4890    the stmt_vec_info that the meta information is stored on.  */
4891 
4892 stmt_vec_info
4893 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4894 {
4895   stmt_info = vect_orig_stmt (stmt_info);
4896   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4897   if (!is_a <gphi *> (stmt_info->stmt)
4898       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4899     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4900   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4901   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4902     {
4903       if (gimple_phi_num_args (phi) == 1)
4904 	stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4905     }
4906   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4907     {
4908       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4909       stmt_vec_info info
4910 	  = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4911       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4912 	stmt_info = info;
4913     }
4914   return stmt_info;
4915 }
4916 
4917 /* Function vect_create_epilog_for_reduction
4918 
4919    Create code at the loop-epilog to finalize the result of a reduction
4920    computation.
4921 
4922    STMT_INFO is the scalar reduction stmt that is being vectorized.
4923    SLP_NODE is an SLP node containing a group of reduction statements. The
4924      first one in this group is STMT_INFO.
4925    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4926    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4927      (counting from 0)
4928 
4929    This function:
4930    1. Completes the reduction def-use cycles.
4931    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4932       by calling the function specified by REDUC_FN if available, or by
4933       other means (whole-vector shifts or a scalar loop).
4934       The function also creates a new phi node at the loop exit to preserve
4935       loop-closed form, as illustrated below.
4936 
4937      The flow at the entry to this function:
4938 
4939         loop:
4940           vec_def = phi <vec_init, null>        # REDUCTION_PHI
4941           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4942           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4943         loop_exit:
4944           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4945           use <s_out0>
4946           use <s_out0>
4947 
4948      The above is transformed by this function into:
4949 
4950         loop:
4951           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4952           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4953           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4954         loop_exit:
4955           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4956           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4957           v_out2 = reduce <v_out1>
4958           s_out3 = extract_field <v_out2, 0>
4959           s_out4 = adjust_result <s_out3>
4960           use <s_out4>
4961           use <s_out4>
4962 */
4963 
4964 static void
4965 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4966 				  stmt_vec_info stmt_info,
4967 				  slp_tree slp_node,
4968 				  slp_instance slp_node_instance)
4969 {
4970   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4971   gcc_assert (reduc_info->is_reduc_info);
4972   /* For double reductions we need to get at the inner loop reduction
4973      stmt which has the meta info attached.  Our stmt_info is that of the
4974      loop-closed PHI of the inner loop which we remember as
4975      def for the reduction PHI generation.  */
4976   bool double_reduc = false;
4977   stmt_vec_info rdef_info = stmt_info;
4978   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4979     {
4980       gcc_assert (!slp_node);
4981       double_reduc = true;
4982       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4983 					    (stmt_info->stmt, 0));
4984       stmt_info = vect_stmt_to_vectorize (stmt_info);
4985     }
4986   gphi *reduc_def_stmt
4987     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4988   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4989   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4990   tree vectype;
4991   machine_mode mode;
4992   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4993   basic_block exit_bb;
4994   tree scalar_dest;
4995   tree scalar_type;
4996   gimple *new_phi = NULL, *phi;
4997   gimple_stmt_iterator exit_gsi;
4998   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4999   gimple *epilog_stmt = NULL;
5000   gimple *exit_phi;
5001   tree bitsize;
5002   tree def;
5003   tree orig_name, scalar_result;
5004   imm_use_iterator imm_iter, phi_imm_iter;
5005   use_operand_p use_p, phi_use_p;
5006   gimple *use_stmt;
5007   bool nested_in_vect_loop = false;
5008   auto_vec<gimple *> new_phis;
5009   int j, i;
5010   auto_vec<tree> scalar_results;
5011   unsigned int group_size = 1, k;
5012   auto_vec<gimple *> phis;
5013   bool slp_reduc = false;
5014   bool direct_slp_reduc;
5015   tree new_phi_result;
5016   tree induction_index = NULL_TREE;
5017 
5018   if (slp_node)
5019     group_size = SLP_TREE_LANES (slp_node);
5020 
5021   if (nested_in_vect_loop_p (loop, stmt_info))
5022     {
5023       outer_loop = loop;
5024       loop = loop->inner;
5025       nested_in_vect_loop = true;
5026       gcc_assert (!slp_node);
5027     }
5028   gcc_assert (!nested_in_vect_loop || double_reduc);
5029 
5030   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5031   gcc_assert (vectype);
5032   mode = TYPE_MODE (vectype);
5033 
5034   tree initial_def = NULL;
5035   tree induc_val = NULL_TREE;
5036   tree adjustment_def = NULL;
5037   if (slp_node)
5038     ;
5039   else
5040     {
5041       /* Get at the scalar def before the loop, that defines the initial value
5042 	 of the reduction variable.  */
5043       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5044 					   loop_preheader_edge (loop));
5045       /* Optimize: for induction condition reduction, if we can't use zero
5046          for induc_val, use initial_def.  */
5047       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5048 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5049       else if (double_reduc)
5050 	;
5051       else if (nested_in_vect_loop)
5052 	;
5053       else
5054 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5055     }
5056 
5057   unsigned vec_num;
5058   int ncopies;
5059   if (slp_node)
5060     {
5061       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5062       ncopies = 1;
5063     }
5064   else
5065     {
5066       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5067       vec_num = 1;
5068       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5069     }
5070 
5071   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5072      which is updated with the current index of the loop for every match of
5073      the original loop's cond_expr (VEC_STMT).  This results in a vector
5074      containing the last time the condition passed for that vector lane.
5075      The first match will be a 1 to allow 0 to be used for non-matching
5076      indexes.  If there are no matches at all then the vector will be all
5077      zeroes.
5078 
5079      PR92772: This algorithm is broken for architectures that support
5080      masked vectors, but do not provide fold_extract_last.  */
5081   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5082     {
5083       auto_vec<std::pair<tree, bool>, 2> ccompares;
5084       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5085       cond_info = vect_stmt_to_vectorize (cond_info);
5086       while (cond_info != reduc_info)
5087 	{
5088 	  if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5089 	    {
5090 	      gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5091 	      gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5092 	      ccompares.safe_push
5093 		(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5094 				 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5095 	    }
5096 	  cond_info
5097 	    = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5098 						 1 + STMT_VINFO_REDUC_IDX
5099 							(cond_info)));
5100 	  cond_info = vect_stmt_to_vectorize (cond_info);
5101 	}
5102       gcc_assert (ccompares.length () != 0);
5103 
5104       tree indx_before_incr, indx_after_incr;
5105       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5106       int scalar_precision
5107 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5108       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5109       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5110 	(TYPE_MODE (vectype), cr_index_scalar_type,
5111 	 TYPE_VECTOR_SUBPARTS (vectype));
5112 
5113       /* First we create a simple vector induction variable which starts
5114 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5115 	 vector size (STEP).  */
5116 
5117       /* Create a {1,2,3,...} vector.  */
5118       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5119 
5120       /* Create a vector of the step value.  */
5121       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5122       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5123 
5124       /* Create an induction variable.  */
5125       gimple_stmt_iterator incr_gsi;
5126       bool insert_after;
5127       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5128       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5129 		 insert_after, &indx_before_incr, &indx_after_incr);
5130 
5131       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5132 	 filled with zeros (VEC_ZERO).  */
5133 
5134       /* Create a vector of 0s.  */
5135       tree zero = build_zero_cst (cr_index_scalar_type);
5136       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5137 
5138       /* Create a vector phi node.  */
5139       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5140       new_phi = create_phi_node (new_phi_tree, loop->header);
5141       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5142 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
5143 
5144       /* Now take the condition from the loop's original cond_exprs
5145 	 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5146 	 every match uses values from the induction variable
5147 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5148 	 (NEW_PHI_TREE).
5149 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5150 	 the new cond_expr (INDEX_COND_EXPR).  */
5151       gimple_seq stmts = NULL;
5152       for (int i = ccompares.length () - 1; i != -1; --i)
5153 	{
5154 	  tree ccompare = ccompares[i].first;
5155 	  if (ccompares[i].second)
5156 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5157 					 cr_index_vector_type,
5158 					 ccompare,
5159 					 indx_before_incr, new_phi_tree);
5160 	  else
5161 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5162 					 cr_index_vector_type,
5163 					 ccompare,
5164 					 new_phi_tree, indx_before_incr);
5165 	}
5166       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5167 
5168       /* Update the phi with the vec cond.  */
5169       induction_index = new_phi_tree;
5170       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5171 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
5172     }
5173 
5174   /* 2. Create epilog code.
5175         The reduction epilog code operates across the elements of the vector
5176         of partial results computed by the vectorized loop.
5177         The reduction epilog code consists of:
5178 
5179         step 1: compute the scalar result in a vector (v_out2)
5180         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5181         step 3: adjust the scalar result (s_out3) if needed.
5182 
5183         Step 1 can be accomplished using one of the following three schemes:
5184           (scheme 1) using reduc_fn, if available.
5185           (scheme 2) using whole-vector shifts, if available.
5186           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5187                      combined.
5188 
5189           The overall epilog code looks like this:
5190 
5191           s_out0 = phi <s_loop>         # original EXIT_PHI
5192           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5193           v_out2 = reduce <v_out1>              # step 1
5194           s_out3 = extract_field <v_out2, 0>    # step 2
5195           s_out4 = adjust_result <s_out3>       # step 3
5196 
5197           (step 3 is optional, and steps 1 and 2 may be combined).
5198           Lastly, the uses of s_out0 are replaced by s_out4.  */
5199 
5200 
5201   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5202          v_out1 = phi <VECT_DEF>
5203          Store them in NEW_PHIS.  */
5204   if (double_reduc)
5205     loop = outer_loop;
5206   exit_bb = single_exit (loop)->dest;
5207   new_phis.create (slp_node ? vec_num : ncopies);
5208   for (unsigned i = 0; i < vec_num; i++)
5209     {
5210       if (slp_node)
5211 	def = vect_get_slp_vect_def (slp_node, i);
5212       else
5213 	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5214       for (j = 0; j < ncopies; j++)
5215         {
5216 	  tree new_def = copy_ssa_name (def);
5217           phi = create_phi_node (new_def, exit_bb);
5218           if (j == 0)
5219             new_phis.quick_push (phi);
5220           else
5221 	    {
5222 	      def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5223 	      new_phis.quick_push (phi);
5224 	    }
5225 
5226           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5227         }
5228     }
5229 
5230   exit_gsi = gsi_after_labels (exit_bb);
5231 
5232   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5233          (i.e. when reduc_fn is not available) and in the final adjustment
5234 	 code (if needed).  Also get the original scalar reduction variable as
5235          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5236          represents a reduction pattern), the tree-code and scalar-def are
5237          taken from the original stmt that the pattern-stmt (STMT) replaces.
5238          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5239          are taken from STMT.  */
5240 
5241   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5242   if (orig_stmt_info != stmt_info)
5243     {
5244       /* Reduction pattern  */
5245       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5246       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5247     }
5248 
5249   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5250   scalar_type = TREE_TYPE (scalar_dest);
5251   scalar_results.create (group_size);
5252   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5253   bitsize = TYPE_SIZE (scalar_type);
5254 
5255   /* SLP reduction without reduction chain, e.g.,
5256      # a1 = phi <a2, a0>
5257      # b1 = phi <b2, b0>
5258      a2 = operation (a1)
5259      b2 = operation (b1)  */
5260   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5261 
5262   /* True if we should implement SLP_REDUC using native reduction operations
5263      instead of scalar operations.  */
5264   direct_slp_reduc = (reduc_fn != IFN_LAST
5265 		      && slp_reduc
5266 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5267 
5268   /* In case of reduction chain, e.g.,
5269      # a1 = phi <a3, a0>
5270      a2 = operation (a1)
5271      a3 = operation (a2),
5272 
5273      we may end up with more than one vector result.  Here we reduce them to
5274      one vector.  */
5275   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5276     {
5277       gimple_seq stmts = NULL;
5278       tree first_vect = PHI_RESULT (new_phis[0]);
5279       first_vect = gimple_convert (&stmts, vectype, first_vect);
5280       for (k = 1; k < new_phis.length (); k++)
5281         {
5282 	  gimple *next_phi = new_phis[k];
5283           tree second_vect = PHI_RESULT (next_phi);
5284 	  second_vect = gimple_convert (&stmts, vectype, second_vect);
5285           first_vect = gimple_build (&stmts, code, vectype,
5286 				     first_vect, second_vect);
5287         }
5288       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5289 
5290       new_phi_result = first_vect;
5291       new_phis.truncate (0);
5292       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5293     }
5294   /* Likewise if we couldn't use a single defuse cycle.  */
5295   else if (ncopies > 1)
5296     {
5297       gimple_seq stmts = NULL;
5298       tree first_vect = PHI_RESULT (new_phis[0]);
5299       first_vect = gimple_convert (&stmts, vectype, first_vect);
5300       for (int k = 1; k < ncopies; ++k)
5301 	{
5302 	  tree second_vect = PHI_RESULT (new_phis[k]);
5303 	  second_vect = gimple_convert (&stmts, vectype, second_vect);
5304 	  first_vect = gimple_build (&stmts, code, vectype,
5305 				     first_vect, second_vect);
5306 	}
5307       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5308       new_phi_result = first_vect;
5309       new_phis.truncate (0);
5310       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5311     }
5312   else
5313     new_phi_result = PHI_RESULT (new_phis[0]);
5314 
5315   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5316       && reduc_fn != IFN_LAST)
5317     {
5318       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5319 	 various data values where the condition matched and another vector
5320 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
5321 	 need to extract the last matching index (which will be the index with
5322 	 highest value) and use this to index into the data vector.
5323 	 For the case where there were no matches, the data vector will contain
5324 	 all default values and the index vector will be all zeros.  */
5325 
5326       /* Get various versions of the type of the vector of indexes.  */
5327       tree index_vec_type = TREE_TYPE (induction_index);
5328       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5329       tree index_scalar_type = TREE_TYPE (index_vec_type);
5330       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5331 
5332       /* Get an unsigned integer version of the type of the data vector.  */
5333       int scalar_precision
5334 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5335       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5336       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5337 						vectype);
5338 
5339       /* First we need to create a vector (ZERO_VEC) of zeros and another
5340 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5341 	 can create using a MAX reduction and then expanding.
5342 	 In the case where the loop never made any matches, the max index will
5343 	 be zero.  */
5344 
5345       /* Vector of {0, 0, 0,...}.  */
5346       tree zero_vec = build_zero_cst (vectype);
5347 
5348       gimple_seq stmts = NULL;
5349       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5350       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5351 
5352       /* Find maximum value from the vector of found indexes.  */
5353       tree max_index = make_ssa_name (index_scalar_type);
5354       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5355 							  1, induction_index);
5356       gimple_call_set_lhs (max_index_stmt, max_index);
5357       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5358 
5359       /* Vector of {max_index, max_index, max_index,...}.  */
5360       tree max_index_vec = make_ssa_name (index_vec_type);
5361       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5362 						      max_index);
5363       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5364 							max_index_vec_rhs);
5365       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5366 
5367       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5368 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5369 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5370 	 otherwise.  Only one value should match, resulting in a vector
5371 	 (VEC_COND) with one data value and the rest zeros.
5372 	 In the case where the loop never made any matches, every index will
5373 	 match, resulting in a vector with all data values (which will all be
5374 	 the default value).  */
5375 
5376       /* Compare the max index vector to the vector of found indexes to find
5377 	 the position of the max value.  */
5378       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5379       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5380 						      induction_index,
5381 						      max_index_vec);
5382       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5383 
5384       /* Use the compare to choose either values from the data vector or
5385 	 zero.  */
5386       tree vec_cond = make_ssa_name (vectype);
5387       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5388 						   vec_compare, new_phi_result,
5389 						   zero_vec);
5390       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5391 
5392       /* Finally we need to extract the data value from the vector (VEC_COND)
5393 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5394 	 reduction, but because this doesn't exist, we can use a MAX reduction
5395 	 instead.  The data value might be signed or a float so we need to cast
5396 	 it first.
5397 	 In the case where the loop never made any matches, the data values are
5398 	 all identical, and so will reduce down correctly.  */
5399 
5400       /* Make the matched data values unsigned.  */
5401       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5402       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5403 				       vec_cond);
5404       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5405 							VIEW_CONVERT_EXPR,
5406 							vec_cond_cast_rhs);
5407       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5408 
5409       /* Reduce down to a scalar value.  */
5410       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5411       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5412 							   1, vec_cond_cast);
5413       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5414       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5415 
5416       /* Convert the reduced value back to the result type and set as the
5417 	 result.  */
5418       stmts = NULL;
5419       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5420 			       data_reduc);
5421       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5422       scalar_results.safe_push (new_temp);
5423     }
5424   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5425 	   && reduc_fn == IFN_LAST)
5426     {
5427       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5428 	 idx = 0;
5429          idx_val = induction_index[0];
5430 	 val = data_reduc[0];
5431          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5432 	   if (induction_index[i] > idx_val)
5433 	     val = data_reduc[i], idx_val = induction_index[i];
5434 	 return val;  */
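
      /* The loop below is fully unrolled at compile time: for each
	 element it emits a BIT_FIELD_REF extract of the index and of the
	 data value, plus MAX_EXPR / COND_EXPR statements that keep the
	 value associated with the largest index seen so far.  */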
5435 
5436       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5437       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5438       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5439       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5440       /* Enforced by vectorizable_reduction, which ensures we have target
5441 	 support before allowing a conditional reduction on variable-length
5442 	 vectors.  */
5443       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5444       tree idx_val = NULL_TREE, val = NULL_TREE;
5445       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5446 	{
5447 	  tree old_idx_val = idx_val;
5448 	  tree old_val = val;
5449 	  idx_val = make_ssa_name (idx_eltype);
5450 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5451 					     build3 (BIT_FIELD_REF, idx_eltype,
5452 						     induction_index,
5453 						     bitsize_int (el_size),
5454 						     bitsize_int (off)));
5455 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5456 	  val = make_ssa_name (data_eltype);
5457 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5458 					     build3 (BIT_FIELD_REF,
5459 						     data_eltype,
5460 						     new_phi_result,
5461 						     bitsize_int (el_size),
5462 						     bitsize_int (off)));
5463 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464 	  if (off != 0)
5465 	    {
5466 	      tree new_idx_val = idx_val;
5467 	      if (off != v_size - el_size)
5468 		{
5469 		  new_idx_val = make_ssa_name (idx_eltype);
5470 		  epilog_stmt = gimple_build_assign (new_idx_val,
5471 						     MAX_EXPR, idx_val,
5472 						     old_idx_val);
5473 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5474 		}
5475 	      tree new_val = make_ssa_name (data_eltype);
5476 	      epilog_stmt = gimple_build_assign (new_val,
5477 						 COND_EXPR,
5478 						 build2 (GT_EXPR,
5479 							 boolean_type_node,
5480 							 idx_val,
5481 							 old_idx_val),
5482 						 val, old_val);
5483 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5484 	      idx_val = new_idx_val;
5485 	      val = new_val;
5486 	    }
5487 	}
5488       /* Convert the reduced value back to the result type and set as the
5489 	 result.  */
5490       gimple_seq stmts = NULL;
5491       val = gimple_convert (&stmts, scalar_type, val);
5492       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5493       scalar_results.safe_push (val);
5494     }
5495 
5496   /* 2.3 Create the reduction code, using one of the three schemes described
5497          above. In SLP we simply need to extract all the elements from the
5498          vector (without reducing them), so we use scalar shifts.  */
5499   else if (reduc_fn != IFN_LAST && !slp_reduc)
5500     {
5501       tree tmp;
5502       tree vec_elem_type;
5503 
5504       /* Case 1:  Create:
5505          v_out2 = reduc_expr <v_out1>  */
5506 
5507       if (dump_enabled_p ())
5508         dump_printf_loc (MSG_NOTE, vect_location,
5509 			 "Reduce using direct vector reduction.\n");
5510 
5511       gimple_seq stmts = NULL;
5512       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5513       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5514       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5515 			       vec_elem_type, new_phi_result);
5516       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5517       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5518 
5519       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5520 	  && induc_val)
5521 	{
5522 	  /* Earlier we set the initial value to be a vector of induc_val
5523 	     values.  Check the result and if it is induc_val then replace
5524 	     with the original initial value, unless induc_val is
5525 	     the same as initial_def already.  */
5526 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5527 				  induc_val);
5528 
5529 	  tmp = make_ssa_name (new_scalar_dest);
5530 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5531 					     initial_def, new_temp);
5532 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5533 	  new_temp = tmp;
5534 	}
5535 
5536       scalar_results.safe_push (new_temp);
5537     }
5538   else if (direct_slp_reduc)
5539     {
5540       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5541 	 with the elements for other SLP statements replaced with the
5542 	 neutral value.  We can then do a normal reduction on each vector.  */
5543 
5544       /* Enforced by vectorizable_reduction.  */
5545       gcc_assert (new_phis.length () == 1);
5546       gcc_assert (pow2p_hwi (group_size));
5547 
5548       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5549       vec<stmt_vec_info> orig_phis
5550 	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5551       gimple_seq seq = NULL;
5552 
5553       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5554 	 and the same element size as VECTYPE.  */
5555       tree index = build_index_vector (vectype, 0, 1);
5556       tree index_type = TREE_TYPE (index);
5557       tree index_elt_type = TREE_TYPE (index_type);
5558       tree mask_type = truth_type_for (index_type);
5559 
5560       /* Create a vector that, for each element, identifies which of
5561 	 the REDUC_GROUP_SIZE results should use it.  */
5562       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5563       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5564 			    build_vector_from_val (index_type, index_mask));
5565 
5566       /* Get a neutral vector value.  This is simply a splat of the neutral
5567 	 scalar value if we have one, otherwise the initial scalar value
5568 	 is itself a neutral value.  */
5569       tree vector_identity = NULL_TREE;
5570       tree neutral_op = NULL_TREE;
5571       if (slp_node)
5572 	{
5573 	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5574 	  neutral_op
5575 	    = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5576 					    vectype, code, first != NULL);
5577 	}
5578       if (neutral_op)
5579 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5580 							neutral_op);
5581       for (unsigned int i = 0; i < group_size; ++i)
5582 	{
5583 	  /* If there's no universal neutral value, we can use the
5584 	     initial scalar value from the original PHI.  This is used
5585 	     for MIN and MAX reduction, for example.  */
5586 	  if (!neutral_op)
5587 	    {
5588 	      tree scalar_value
5589 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5590 					 loop_preheader_edge (loop));
5591 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5592 					     scalar_value);
5593 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5594 							      scalar_value);
5595 	    }
5596 
5597 	  /* Calculate the equivalent of:
5598 
5599 	     sel[j] = (index[j] == i);
5600 
5601 	     which selects the elements of NEW_PHI_RESULT that should
5602 	     be included in the result.  */
5603 	  tree compare_val = build_int_cst (index_elt_type, i);
5604 	  compare_val = build_vector_from_val (index_type, compare_val);
5605 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5606 				   index, compare_val);
5607 
5608 	  /* Calculate the equivalent of:
5609 
5610 	     vec = sel ? new_phi_result : vector_identity;
5611 
5612 	     VEC is now suitable for a full vector reduction.  */
5613 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5614 				   sel, new_phi_result, vector_identity);
5615 
5616 	  /* Do the reduction and convert it to the appropriate type.  */
5617 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5618 				      TREE_TYPE (vectype), vec);
5619 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5620 	  scalar_results.safe_push (scalar);
5621 	}
5622       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5623     }
5624   else
5625     {
5626       bool reduce_with_shift;
5627       tree vec_temp;
5628 
5629       gcc_assert (slp_reduc || new_phis.length () == 1);
5630 
5631       /* See if the target wants to do the final (shift) reduction
5632 	 in a vector mode of smaller size and first reduce upper/lower
5633 	 halves against each other.  */
5634       enum machine_mode mode1 = mode;
5635       tree stype = TREE_TYPE (vectype);
5636       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5637       unsigned nunits1 = nunits;
5638       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5639 	  && new_phis.length () == 1)
5640 	{
5641 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5642 	  /* For SLP reductions we have to make sure lanes match up, but
5643 	     since we're doing individual element final reduction, reducing
5644 	     vector width here is even more important.
5645 	     ???  We can also separate lanes with permutes; for the common
5646 	     case of power-of-two group-size, odd/even extracts would work.  */
5647 	  if (slp_reduc && nunits != nunits1)
5648 	    {
5649 	      nunits1 = least_common_multiple (nunits1, group_size);
5650 	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5651 	    }
5652 	}
5653       if (!slp_reduc
5654 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5655 	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5656 
5657       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5658 							   stype, nunits1);
5659       reduce_with_shift = have_whole_vector_shift (mode1);
5660       if (!VECTOR_MODE_P (mode1))
5661 	reduce_with_shift = false;
5662       else
5663 	{
5664 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5665 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5666 	    reduce_with_shift = false;
5667 	}
5668 
5669       /* First reduce the vector to the desired vector size on which we
5670 	 should do the shift reduction, by combining upper and lower halves.  */
5671       new_temp = new_phi_result;
5672       while (nunits > nunits1)
5673 	{
5674 	  nunits /= 2;
5675 	  vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5676 							  stype, nunits);
5677 	  unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5678 
5679 	  /* The target has to support lowpart/highpart extraction, either
5680 	     via a direct vector extract or through punning to an
5681 	     integer-mode vector.  */
5682 	  tree dst1, dst2;
5683 	  if (convert_optab_handler (vec_extract_optab,
5684 				     TYPE_MODE (TREE_TYPE (new_temp)),
5685 				     TYPE_MODE (vectype1))
5686 	      != CODE_FOR_nothing)
5687 	    {
5688 	      /* Extract sub-vectors directly once vec_extract becomes
5689 		 a conversion optab.  */
5690 	      dst1 = make_ssa_name (vectype1);
5691 	      epilog_stmt
5692 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5693 					 build3 (BIT_FIELD_REF, vectype1,
5694 						 new_temp, TYPE_SIZE (vectype1),
5695 						 bitsize_int (0)));
5696 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5697 	      dst2 = make_ssa_name (vectype1);
5698 	      epilog_stmt
5699 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5700 					 build3 (BIT_FIELD_REF, vectype1,
5701 						 new_temp, TYPE_SIZE (vectype1),
5702 						 bitsize_int (bitsize)));
5703 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5704 	    }
5705 	  else
5706 	    {
5707 	      /* Extract via punning to appropriately sized integer mode
5708 		 vector.  */
5709 	      tree eltype = build_nonstandard_integer_type (bitsize, 1);
5710 	      tree etype = build_vector_type (eltype, 2);
5711 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5712 						 TYPE_MODE (etype),
5713 						 TYPE_MODE (eltype))
5714 			  != CODE_FOR_nothing);
5715 	      tree tem = make_ssa_name (etype);
5716 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5717 						 build1 (VIEW_CONVERT_EXPR,
5718 							 etype, new_temp));
5719 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5720 	      new_temp = tem;
5721 	      tem = make_ssa_name (eltype);
5722 	      epilog_stmt
5723 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5724 					 build3 (BIT_FIELD_REF, eltype,
5725 						 new_temp, TYPE_SIZE (eltype),
5726 						 bitsize_int (0)));
5727 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5728 	      dst1 = make_ssa_name (vectype1);
5729 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5730 						 build1 (VIEW_CONVERT_EXPR,
5731 							 vectype1, tem));
5732 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5733 	      tem = make_ssa_name (eltype);
5734 	      epilog_stmt
5735 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5736 					 build3 (BIT_FIELD_REF, eltype,
5737 						 new_temp, TYPE_SIZE (eltype),
5738 						 bitsize_int (bitsize)));
5739 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5740 	      dst2 = make_ssa_name (vectype1);
5741 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5742 						 build1 (VIEW_CONVERT_EXPR,
5743 							 vectype1, tem));
5744 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5745 	    }
5746 
5747 	  new_temp = make_ssa_name (vectype1);
5748 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5749 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750 	  new_phis[0] = epilog_stmt;
5751 	}
5752 
5753       if (reduce_with_shift && !slp_reduc)
5754 	{
5755 	  int element_bitsize = tree_to_uhwi (bitsize);
5756 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5757 	     for variable-length vectors and also requires direct target support
5758 	     for loop reductions.  */
5759 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5760 	  int nelements = vec_size_in_bits / element_bitsize;
5761 	  vec_perm_builder sel;
5762 	  vec_perm_indices indices;
5763 
5764           int elt_offset;
5765 
5766           tree zero_vec = build_zero_cst (vectype1);
5767           /* Case 2: Create:
5768              for (offset = nelements/2; offset >= 1; offset/=2)
5769                 {
5770                   Create:  va' = vec_shift <va, offset>
5771                   Create:  va = vop <va, va'>
5772                 }  */
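          /* Illustrative sketch only, assuming PLUS and nelements == 4,
             with va = {a0, a1, a2, a3}:
               offset 2:  va' = {a2, a3, 0, 0};      va = {a0+a2, a1+a3, x, x}
               offset 1:  va' = {a1+a3, x, 0, 0};    va = {a0+a1+a2+a3, x, x, x}
             so the full sum ends up in element 0, from which it is
             extracted below (the other lanes hold don't-care partials).  */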
5773 
5774           tree rhs;
5775 
5776           if (dump_enabled_p ())
5777             dump_printf_loc (MSG_NOTE, vect_location,
5778 			     "Reduce using vector shifts\n");
5779 
5780 	  gimple_seq stmts = NULL;
5781 	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
5782           for (elt_offset = nelements / 2;
5783                elt_offset >= 1;
5784                elt_offset /= 2)
5785             {
5786 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5787 	      indices.new_vector (sel, 2, nelements);
5788 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5789 	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5790 				       new_temp, zero_vec, mask);
5791 	      new_temp = gimple_build (&stmts, code,
5792 				       vectype1, new_name, new_temp);
5793             }
5794 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5795 
5796 	  /* 2.4  Extract the final scalar result.  Create:
5797 	     s_out3 = extract_field <v_out2, bitpos>  */
5798 
5799 	  if (dump_enabled_p ())
5800 	    dump_printf_loc (MSG_NOTE, vect_location,
5801 			     "extract scalar result\n");
5802 
5803 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5804 			bitsize, bitsize_zero_node);
5805 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5806 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5807 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5808 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5809 	  scalar_results.safe_push (new_temp);
5810         }
5811       else
5812         {
5813           /* Case 3: Create:
5814              s = extract_field <v_out2, 0>
5815              for (offset = element_size;
5816                   offset < vector_size;
5817                   offset += element_size;)
5818                {
5819                  Create:  s' = extract_field <v_out2, offset>
5820                  Create:  s = op <s, s'>  // For non-SLP cases
5821                }  */
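          /* Illustrative sketch only, assuming PLUS and a 4-element vector
             v_out2 = {a0, a1, a2, a3}:
               s = a0;  s = s + a1;  s = s + a2;  s = s + a3;
             For SLP we instead push each extracted element to
             SCALAR_RESULTS without combining them.  */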
5822 
5823           if (dump_enabled_p ())
5824             dump_printf_loc (MSG_NOTE, vect_location,
5825 			     "Reduce using scalar code.\n");
5826 
5827 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5828 	  int element_bitsize = tree_to_uhwi (bitsize);
5829 	  tree compute_type = TREE_TYPE (vectype);
5830 	  gimple_seq stmts = NULL;
5831           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5832             {
5833               int bit_offset;
5834               if (gimple_code (new_phi) == GIMPLE_PHI)
5835                 vec_temp = PHI_RESULT (new_phi);
5836               else
5837                 vec_temp = gimple_assign_lhs (new_phi);
5838 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5839 				       vec_temp, bitsize, bitsize_zero_node);
5840 
5841               /* In SLP we don't need to apply reduction operation, so we just
5842                  collect s' values in SCALAR_RESULTS.  */
5843               if (slp_reduc)
5844                 scalar_results.safe_push (new_temp);
5845 
5846               for (bit_offset = element_bitsize;
5847                    bit_offset < vec_size_in_bits;
5848                    bit_offset += element_bitsize)
5849                 {
5850                   tree bitpos = bitsize_int (bit_offset);
5851 		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
5852 					   compute_type, vec_temp,
5853 					   bitsize, bitpos);
5854                   if (slp_reduc)
5855                     {
5856                       /* In SLP we don't need to apply reduction operation, so
5857                          we just collect s' values in SCALAR_RESULTS.  */
5858                       new_temp = new_name;
5859                       scalar_results.safe_push (new_name);
5860                     }
5861                   else
5862 		    new_temp = gimple_build (&stmts, code, compute_type,
5863 					     new_name, new_temp);
5864                 }
5865             }
5866 
5867           /* The only case where we need to reduce scalar results in SLP is
5868              unrolling.  If the size of SCALAR_RESULTS is greater than
5869              REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5870              REDUC_GROUP_SIZE.  */
5871           if (slp_reduc)
5872             {
5873               tree res, first_res, new_res;
5874 
5875               /* Reduce multiple scalar results in case of SLP unrolling.  */
5876               for (j = group_size; scalar_results.iterate (j, &res);
5877                    j++)
5878                 {
5879                   first_res = scalar_results[j % group_size];
5880 		  new_res = gimple_build (&stmts, code, compute_type,
5881 					  first_res, res);
5882                   scalar_results[j % group_size] = new_res;
5883                 }
5884 	      for (k = 0; k < group_size; k++)
5885 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
5886 						    scalar_results[k]);
5887             }
5888           else
5889 	    {
5890 	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5891 	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5892 	      scalar_results.safe_push (new_temp);
5893 	    }
5894 
5895 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5896         }
5897 
5898       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5899 	  && induc_val)
5900 	{
5901 	  /* Earlier we set the initial value to be a vector of induc_val
5902 	     values.  Check the result, and if it is induc_val then replace
5903 	     it with the original initial value, unless induc_val is
5904 	     the same as initial_def already.  */
5905 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5906 				  induc_val);
5907 
5908 	  tree tmp = make_ssa_name (new_scalar_dest);
5909 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5910 					     initial_def, new_temp);
5911 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5912 	  scalar_results[0] = tmp;
5913 	}
5914     }
5915 
5916   /* 2.5 Adjust the final result by the initial value of the reduction
5917 	 variable. (When such adjustment is not needed, then
5918 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
5919 	 new_temp = loop_exit_def + adjustment_def  */
5920 
5921   if (adjustment_def)
5922     {
5923       gcc_assert (!slp_reduc);
5924       gimple_seq stmts = NULL;
5925       if (nested_in_vect_loop)
5926 	{
5927           new_phi = new_phis[0];
5928 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5929 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5930 	  new_temp = gimple_build (&stmts, code, vectype,
5931 				   PHI_RESULT (new_phi), adjustment_def);
5932 	}
5933       else
5934 	{
5935           new_temp = scalar_results[0];
5936 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5937 	  adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5938 	  new_temp = gimple_build (&stmts, code, scalar_type,
5939 				   new_temp, adjustment_def);
5940 	}
5941 
5942       epilog_stmt = gimple_seq_last_stmt (stmts);
5943       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5944       if (nested_in_vect_loop)
5945         {
5946           if (!double_reduc)
5947             scalar_results.quick_push (new_temp);
5948           else
5949             scalar_results[0] = new_temp;
5950         }
5951       else
5952         scalar_results[0] = new_temp;
5953 
5954       new_phis[0] = epilog_stmt;
5955     }
5956 
5957   if (double_reduc)
5958     loop = loop->inner;
5959 
5960   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5961           phis with new adjusted scalar results, i.e., replace use <s_out0>
5962           with use <s_out4>.
5963 
5964      Transform:
5965         loop_exit:
5966           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5967           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5968           v_out2 = reduce <v_out1>
5969           s_out3 = extract_field <v_out2, 0>
5970           s_out4 = adjust_result <s_out3>
5971           use <s_out0>
5972           use <s_out0>
5973 
5974      into:
5975 
5976         loop_exit:
5977           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5978           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5979           v_out2 = reduce <v_out1>
5980           s_out3 = extract_field <v_out2, 0>
5981           s_out4 = adjust_result <s_out3>
5982           use <s_out4>
5983           use <s_out4> */
5984 
5985 
5986   /* In an SLP reduction chain we reduce vector results into one vector if
5987      necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5988      LHS of the last stmt in the reduction chain, since we are looking for
5989      the loop exit phi node.  */
5990   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5991     {
5992       stmt_vec_info dest_stmt_info
5993 	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5994       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5995       group_size = 1;
5996     }
5997 
5998   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5999      case REDUC_GROUP_SIZE is greater than the vectorization factor).
6000      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
6001      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
6002      correspond to the first vector stmt, etc.
6003      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
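  /* Illustrative sketch only: with REDUC_GROUP_SIZE == 4 and two new vector
     stmts, RATIO is 2, so scalar results 0 and 1 belong to the first vector
     stmt and results 2 and 3 to the second.  */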
6004   if (group_size > new_phis.length ())
6005     gcc_assert (!(group_size % new_phis.length ()));
6006 
6007   for (k = 0; k < group_size; k++)
6008     {
6009       if (slp_reduc)
6010         {
6011 	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6012 
6013 	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
6014 	  /* SLP statements can't participate in patterns.  */
6015 	  gcc_assert (!orig_stmt_info);
6016 	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6017         }
6018 
6019       if (nested_in_vect_loop)
6020         {
6021           if (double_reduc)
6022             loop = outer_loop;
6023           else
6024 	    gcc_unreachable ();
6025         }
6026 
6027       phis.create (3);
6028       /* Find the loop-closed-use at the loop exit of the original scalar
6029          result.  (The reduction result is expected to have two immediate uses,
6030          one at the latch block, and one at the loop exit).  For double
6031          reductions we are looking for exit phis of the outer loop.  */
6032       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6033         {
6034           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6035 	    {
6036 	      if (!is_gimple_debug (USE_STMT (use_p)))
6037 		phis.safe_push (USE_STMT (use_p));
6038 	    }
6039           else
6040             {
6041               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6042                 {
6043                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6044 
6045                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6046                     {
6047                       if (!flow_bb_inside_loop_p (loop,
6048                                              gimple_bb (USE_STMT (phi_use_p)))
6049 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
6050                         phis.safe_push (USE_STMT (phi_use_p));
6051                     }
6052                 }
6053             }
6054         }
6055 
6056       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6057         {
6058           /* Replace the uses:  */
6059           orig_name = PHI_RESULT (exit_phi);
6060           scalar_result = scalar_results[k];
6061           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6062 	    {
6063 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6064 		SET_USE (use_p, scalar_result);
6065 	      update_stmt (use_stmt);
6066 	    }
6067         }
6068 
6069       phis.release ();
6070     }
6071 }
6072 
6073 /* Return a vector of type VECTYPE that is equal to the vector select
6074    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6075    before GSI.  */
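/* Illustrative sketch only: with MASK = {1, 0, 1, 0}, VEC = {a, b, c, d}
   and IDENTITY = {0, 0, 0, 0}, the returned SSA name holds {a, 0, c, 0},
   so masked-off lanes contribute the identity to a later reduction.  */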
6076 
6077 static tree
6078 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6079 		     tree vec, tree identity)
6080 {
6081   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6082   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6083 					  mask, vec, identity);
6084   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6085   return cond;
6086 }
6087 
6088 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6089    order, starting with LHS.  Insert the extraction statements before GSI and
6090    associate the new scalar SSA names with variable SCALAR_DEST.
6091    Return the SSA name for the result.  */
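/* Illustrative sketch only: for VECTOR_RHS = {x0, x1, x2, x3} this emits

     s0 = LHS code x0;  s1 = s0 code x1;  s2 = s1 code x2;  s3 = s2 code x3;

   and returns s3, preserving strict left-to-right (in-order) evaluation.  */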
6092 
6093 static tree
6094 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6095 		       tree_code code, tree lhs, tree vector_rhs)
6096 {
6097   tree vectype = TREE_TYPE (vector_rhs);
6098   tree scalar_type = TREE_TYPE (vectype);
6099   tree bitsize = TYPE_SIZE (scalar_type);
6100   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6101   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6102 
6103   for (unsigned HOST_WIDE_INT bit_offset = 0;
6104        bit_offset < vec_size_in_bits;
6105        bit_offset += element_bitsize)
6106     {
6107       tree bitpos = bitsize_int (bit_offset);
6108       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6109 			 bitsize, bitpos);
6110 
6111       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6112       rhs = make_ssa_name (scalar_dest, stmt);
6113       gimple_assign_set_lhs (stmt, rhs);
6114       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6115 
6116       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6117       tree new_name = make_ssa_name (scalar_dest, stmt);
6118       gimple_assign_set_lhs (stmt, new_name);
6119       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6120       lhs = new_name;
6121     }
6122   return lhs;
6123 }
6124 
6125 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6126    type of the vector input.  */
6127 
6128 static internal_fn
6129 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6130 {
6131   internal_fn mask_reduc_fn;
6132 
6133   switch (reduc_fn)
6134     {
6135     case IFN_FOLD_LEFT_PLUS:
6136       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6137       break;
6138 
6139     default:
6140       return IFN_LAST;
6141     }
6142 
6143   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6144 				      OPTIMIZE_FOR_SPEED))
6145     return mask_reduc_fn;
6146   return IFN_LAST;
6147 }
6148 
6149 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6150    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6151    statement.  CODE is the operation performed by STMT_INFO and OPS are
6152    its scalar operands.  REDUC_INDEX is the index of the operand in
6153    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6154    implements in-order reduction, or IFN_LAST if we should open-code it.
6155    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6156    that should be used to control the operation in a fully-masked loop.  */
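/* Illustrative sketch only: under strict FP semantics the loop

       double s = init;
       for (int i = 0; i < n; ++i)
	 s += a[i];

   must not be reassociated, so each vector of A is folded into S with
   IFN_FOLD_LEFT_PLUS (or IFN_MASK_FOLD_LEFT_PLUS in a fully-masked loop),
   or expanded element by element via vect_expand_fold_left when no such
   internal function is available.  */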
6157 
6158 static bool
6159 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6160 			       stmt_vec_info stmt_info,
6161 			       gimple_stmt_iterator *gsi,
6162 			       gimple **vec_stmt, slp_tree slp_node,
6163 			       gimple *reduc_def_stmt,
6164 			       tree_code code, internal_fn reduc_fn,
6165 			       tree ops[3], tree vectype_in,
6166 			       int reduc_index, vec_loop_masks *masks)
6167 {
6168   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6169   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6170   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6171 
6172   int ncopies;
6173   if (slp_node)
6174     ncopies = 1;
6175   else
6176     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6177 
6178   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6179   gcc_assert (ncopies == 1);
6180   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6181 
6182   if (slp_node)
6183     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6184 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
6185 
6186   tree op0 = ops[1 - reduc_index];
6187 
6188   int group_size = 1;
6189   stmt_vec_info scalar_dest_def_info;
6190   auto_vec<tree> vec_oprnds0;
6191   if (slp_node)
6192     {
6193       auto_vec<vec<tree> > vec_defs (2);
6194       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6195       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6196       vec_defs[0].release ();
6197       vec_defs[1].release ();
6198       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6199       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6200     }
6201   else
6202     {
6203       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6204 				     op0, &vec_oprnds0);
6205       scalar_dest_def_info = stmt_info;
6206     }
6207 
6208   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6209   tree scalar_type = TREE_TYPE (scalar_dest);
6210   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6211 
6212   int vec_num = vec_oprnds0.length ();
6213   gcc_assert (vec_num == 1 || slp_node);
6214   tree vec_elem_type = TREE_TYPE (vectype_out);
6215   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6216 
6217   tree vector_identity = NULL_TREE;
6218   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6219     vector_identity = build_zero_cst (vectype_out);
6220 
6221   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6222   int i;
6223   tree def0;
6224   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6225     {
6226       gimple *new_stmt;
6227       tree mask = NULL_TREE;
6228       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6229 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6230 
6231       /* Handle MINUS by adding the negative.  */
6232       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6233 	{
6234 	  tree negated = make_ssa_name (vectype_out);
6235 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6236 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6237 	  def0 = negated;
6238 	}
6239 
6240       if (mask && mask_reduc_fn == IFN_LAST)
6241 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6242 				    vector_identity);
6243 
6244       /* On the first iteration the input is simply the scalar phi
6245 	 result, and for subsequent iterations it is the output of
6246 	 the preceding operation.  */
6247       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6248 	{
6249 	  if (mask && mask_reduc_fn != IFN_LAST)
6250 	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6251 						   def0, mask);
6252 	  else
6253 	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6254 						   def0);
6255 	  /* For chained SLP reductions the output of the previous reduction
6256 	     operation serves as the input of the next. For the final statement
6257 	     the output cannot be a temporary - we reuse the original
6258 	     scalar destination of the last statement.  */
6259 	  if (i != vec_num - 1)
6260 	    {
6261 	      gimple_set_lhs (new_stmt, scalar_dest_var);
6262 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6263 	      gimple_set_lhs (new_stmt, reduc_var);
6264 	    }
6265 	}
6266       else
6267 	{
6268 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6269 					     reduc_var, def0);
6270 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6271 	  /* Remove the statement, so that we can use the same code paths
6272 	     as for statements that we've just created.  */
6273 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6274 	  gsi_remove (&tmp_gsi, true);
6275 	}
6276 
6277       if (i == vec_num - 1)
6278 	{
6279 	  gimple_set_lhs (new_stmt, scalar_dest);
6280 	  vect_finish_replace_stmt (loop_vinfo,
6281 				    scalar_dest_def_info,
6282 				    new_stmt);
6283 	}
6284       else
6285 	vect_finish_stmt_generation (loop_vinfo,
6286 				     scalar_dest_def_info,
6287 				     new_stmt, gsi);
6288 
6289       if (slp_node)
6290 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6291       else
6292 	{
6293 	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6294 	  *vec_stmt = new_stmt;
6295 	}
6296     }
6297 
6298   return true;
6299 }
6300 
6301 /* Function is_nonwrapping_integer_induction.
6302 
6303    Check if STMT_VINFO (which is part of loop LOOP) describes an induction
6304    that both increments and does not cause overflow.  */
6305 
6306 static bool
6307 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6308 {
6309   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6310   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6311   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6312   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6313   widest_int ni, max_loop_value, lhs_max;
6314   wi::overflow_type overflow = wi::OVF_NONE;
6315 
6316   /* Make sure the loop is integer based.  */
6317   if (TREE_CODE (base) != INTEGER_CST
6318       || TREE_CODE (step) != INTEGER_CST)
6319     return false;
6320 
6321   /* Check that the max size of the loop will not wrap.  */
6322 
6323   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6324     return true;
6325 
6326   if (! max_stmt_executions (loop, &ni))
6327     return false;
6328 
6329   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6330 			    &overflow);
6331   if (overflow)
6332     return false;
6333 
6334   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6335 			    TYPE_SIGN (lhs_type), &overflow);
6336   if (overflow)
6337     return false;
6338 
6339   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6340 	  <= TYPE_PRECISION (lhs_type));
6341 }
6342 
6343 /* Check if masking can be supported by inserting a conditional expression.
6344    CODE is the code for the operation.  COND_FN is the conditional internal
6345    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6346 static bool
6347 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6348 			 tree vectype_in)
6349 {
6350   if (cond_fn != IFN_LAST
6351       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6352 					 OPTIMIZE_FOR_SPEED))
6353     return false;
6354 
6355   switch (code)
6356     {
6357     case DOT_PROD_EXPR:
6358     case SAD_EXPR:
6359       return true;
6360 
6361     default:
6362       return false;
6363     }
6364 }
6365 
6366 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6367    code for the operation.  VOP is the array of operands.  MASK is the loop
6368    mask.  GSI is a statement iterator used to place the new conditional
6369    expression.  */
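/* Illustrative sketch only: for DOT_PROD_EXPR the select built below is

     masked_op1 = MASK ? op1 : {0, ..., 0};

   so inactive lanes contribute 0 to the dot product; for SAD_EXPR op1 is
   replaced by op0 in inactive lanes, making their absolute difference 0.  */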
6370 static void
6371 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6372 		      gimple_stmt_iterator *gsi)
6373 {
6374   switch (code)
6375     {
6376     case DOT_PROD_EXPR:
6377       {
6378 	tree vectype = TREE_TYPE (vop[1]);
6379 	tree zero = build_zero_cst (vectype);
6380 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6381 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6382 					       mask, vop[1], zero);
6383 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
6384 	vop[1] = masked_op1;
6385 	break;
6386       }
6387 
6388     case SAD_EXPR:
6389       {
6390 	tree vectype = TREE_TYPE (vop[1]);
6391 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6392 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6393 					       mask, vop[1], vop[0]);
6394 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
6395 	vop[1] = masked_op1;
6396 	break;
6397       }
6398 
6399     default:
6400       gcc_unreachable ();
6401     }
6402 }
6403 
6404 /* Function vectorizable_reduction.
6405 
6406    Check if STMT_INFO performs a reduction operation that can be vectorized.
6407    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6408    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6409    Return true if STMT_INFO is vectorizable in this way.
6410 
6411    This function also handles reduction idioms (patterns) that have been
6412    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6413    may be of this form:
6414      X = pattern_expr (arg0, arg1, ..., X)
6415    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6416    sequence that had been detected and replaced by the pattern-stmt
6417    (STMT_INFO).
6418 
6419    This function also handles reduction of condition expressions, for example:
6420      for (int i = 0; i < N; i++)
6421        if (a[i] < value)
6422 	 last = a[i];
6423    This is handled by vectorizing the loop and creating an additional vector
6424    containing the loop indexes for which "a[i] < value" was true.  In the
6425    function epilogue this is reduced to a single max value and then used to
6426    index into the vector of results.
6427 
6428    In some cases of reduction patterns, the type of the reduction variable X is
6429    different than the type of the other arguments of STMT_INFO.
6430    In such cases, the vectype that is used when transforming STMT_INFO into
6431    a vector stmt is different than the vectype that is used to determine the
6432    vectorization factor, because it consists of a different number of elements
6433    than the actual number of elements that are being operated upon in parallel.
6434 
6435    For example, consider an accumulation of shorts into an int accumulator.
6436    On some targets it's possible to vectorize this pattern operating on 8
6437    shorts at a time (hence, the vectype for purposes of determining the
6438    vectorization factor should be V8HI); on the other hand, the vectype that
6439    is used to create the vector form is actually V4SI (the type of the result).
6440 
6441    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6442    indicates what is the actual level of parallelism (V8HI in the example), so
6443    that the right vectorization factor would be derived.  This vectype
6444    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6445    be used to create the vectorized stmt.  The right vectype for the vectorized
6446    stmt is obtained from the type of the result X:
6447       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6448 
6449    This means that, contrary to "regular" reductions (or "regular" stmts in
6450    general), the following equation:
6451       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6452    does *NOT* necessarily hold for reduction patterns.  */
6453 
6454 bool
6455 vectorizable_reduction (loop_vec_info loop_vinfo,
6456 			stmt_vec_info stmt_info, slp_tree slp_node,
6457 			slp_instance slp_node_instance,
6458 			stmt_vector_for_cost *cost_vec)
6459 {
6460   tree scalar_dest;
6461   tree vectype_in = NULL_TREE;
6462   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6463   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6464   stmt_vec_info cond_stmt_vinfo = NULL;
6465   tree scalar_type;
6466   int i;
6467   int ncopies;
6468   bool single_defuse_cycle = false;
6469   bool nested_cycle = false;
6470   bool double_reduc = false;
6471   int vec_num;
6472   tree tem;
6473   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6474   tree cond_reduc_val = NULL_TREE;
6475 
6476   /* Make sure it was already recognized as a reduction computation.  */
6477   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6478       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6479       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6480     return false;
6481 
6482   /* The stmt we store reduction analysis meta on.  */
6483   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6484   reduc_info->is_reduc_info = true;
6485 
6486   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6487     {
6488       if (is_a <gphi *> (stmt_info->stmt))
6489 	{
6490 	  if (slp_node)
6491 	    {
6492 	      /* We eventually need to set a vector type on invariant
6493 		 arguments.  */
6494 	      unsigned j;
6495 	      slp_tree child;
6496 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6497 		if (!vect_maybe_update_slp_op_vectype
6498 		       (child, SLP_TREE_VECTYPE (slp_node)))
6499 		  {
6500 		    if (dump_enabled_p ())
6501 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502 				       "incompatible vector types for "
6503 				       "invariants\n");
6504 		    return false;
6505 		  }
6506 	    }
6507 	  /* Analysis for double-reduction is done on the outer
6508 	     loop PHI, nested cycles have no further restrictions.  */
6509 	  STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6510 	}
6511       else
6512 	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6513       return true;
6514     }
6515 
6516   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6517   stmt_vec_info phi_info = stmt_info;
6518   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6519       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6520     {
6521       if (!is_a <gphi *> (stmt_info->stmt))
6522 	{
6523 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6524 	  return true;
6525 	}
6526       if (slp_node)
6527 	{
6528 	  slp_node_instance->reduc_phis = slp_node;
6529 	  /* ???  We're leaving slp_node to point to the PHIs, we only
6530 	     need it to get at the number of vector stmts which wasn't
6531 	     yet initialized for the instance root.  */
6532 	}
6533       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6534 	stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6535       else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6536 	{
6537 	  use_operand_p use_p;
6538 	  gimple *use_stmt;
6539 	  bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6540 				     &use_p, &use_stmt);
6541 	  gcc_assert (res);
6542 	  phi_info = loop_vinfo->lookup_stmt (use_stmt);
6543 	  stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6544 	}
6545     }
6546 
6547   /* PHIs should not participate in patterns.  */
6548   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6549   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6550 
6551   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6552      and compute the reduction chain length.  Discover the real
6553      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6554   tree reduc_def
6555     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6556 			     loop_latch_edge
6557 			       (gimple_bb (reduc_def_phi)->loop_father));
6558   unsigned reduc_chain_length = 0;
6559   bool only_slp_reduc_chain = true;
6560   stmt_info = NULL;
6561   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6562   while (reduc_def != PHI_RESULT (reduc_def_phi))
6563     {
6564       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6565       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6566       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6567 	{
6568 	  if (dump_enabled_p ())
6569 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6570 			     "reduction chain broken by patterns.\n");
6571 	  return false;
6572 	}
6573       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6574 	only_slp_reduc_chain = false;
6575       /* ???  For epilogue generation live members of the chain need
6576          to point back to the PHI via their original stmt for
6577 	 info_for_reduction to work.  */
6578       if (STMT_VINFO_LIVE_P (vdef))
6579 	STMT_VINFO_REDUC_DEF (def) = phi_info;
6580       gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6581       if (!assign)
6582 	{
6583 	  if (dump_enabled_p ())
6584 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6585 			     "reduction chain includes calls.\n");
6586 	  return false;
6587 	}
6588       if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6589 	{
6590 	  if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6591 				      TREE_TYPE (gimple_assign_rhs1 (assign))))
6592 	    {
6593 	      if (dump_enabled_p ())
6594 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595 				 "conversion in the reduction chain.\n");
6596 	      return false;
6597 	    }
6598 	}
6599       else if (!stmt_info)
6600 	/* First non-conversion stmt.  */
6601 	stmt_info = vdef;
6602       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6603       reduc_chain_length++;
6604       if (!stmt_info && slp_node)
6605 	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6606     }
6607   /* PHIs should not participate in patterns.  */
6608   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6609 
6610   if (nested_in_vect_loop_p (loop, stmt_info))
6611     {
6612       loop = loop->inner;
6613       nested_cycle = true;
6614     }
6615 
6616   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6617      element.  */
6618   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6619     {
6620       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6621       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6622     }
6623   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6624     gcc_assert (slp_node
6625 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6626 
6627   /* 1. Is vectorizable reduction?  */
6628   /* Not supportable if the reduction variable is used in the loop, unless
6629      it's a reduction chain.  */
6630   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6631       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6632     return false;
6633 
6634   /* Reductions that are not used even in an enclosing outer-loop,
6635      are expected to be "live" (used out of the loop).  */
6636   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6637       && !STMT_VINFO_LIVE_P (stmt_info))
6638     return false;
6639 
6640   /* 2. Has this been recognized as a reduction pattern?
6641 
6642      Check if STMT represents a pattern that has been recognized
6643      in earlier analysis stages.  For stmts that represent a pattern,
6644      the STMT_VINFO_RELATED_STMT field records the last stmt in
6645      the original sequence that constitutes the pattern.  */
6646 
6647   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6648   if (orig_stmt_info)
6649     {
6650       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6651       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6652     }
6653 
6654   /* 3. Check the operands of the operation.  The first operands are defined
6655         inside the loop body. The last operand is the reduction variable,
6656         which is defined by the loop-header-phi.  */
6657 
6658   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6659   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6660   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6661   enum tree_code code = gimple_assign_rhs_code (stmt);
6662   bool lane_reduc_code_p
6663     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6664   int op_type = TREE_CODE_LENGTH (code);
6665 
6666   scalar_dest = gimple_assign_lhs (stmt);
6667   scalar_type = TREE_TYPE (scalar_dest);
6668   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6669       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6670     return false;
6671 
6672   /* Do not try to vectorize bit-precision reductions.  */
6673   if (!type_has_mode_precision_p (scalar_type))
6674     return false;
6675 
6676   /* For lane-reducing ops we're reducing the number of reduction PHIs,
6677      which means their only use may be in the lane-reducing operation.  */
6678   if (lane_reduc_code_p
6679       && reduc_chain_length != 1
6680       && !only_slp_reduc_chain)
6681     {
6682       if (dump_enabled_p ())
6683 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 			 "lane-reducing reduction with extra stmts.\n");
6685       return false;
6686     }
6687 
6688   /* All uses but the last are expected to be defined in the loop.
6689      The last use is the reduction variable.  In case of nested cycle this
6690      assumption is not true: we use reduc_index to record the index of the
6691      reduction variable.  */
6692   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6693   /* We need to skip an extra operand for COND_EXPRs with embedded
6694      comparison.  */
6695   unsigned opno_adjust = 0;
6696   if (code == COND_EXPR
6697       && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6698     opno_adjust = 1;
6699   for (i = 0; i < op_type; i++)
6700     {
6701       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6702       if (i == 0 && code == COND_EXPR)
6703         continue;
6704 
6705       stmt_vec_info def_stmt_info;
6706       enum vect_def_type dt;
6707       tree op;
6708       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6709 			       i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6710 			       &def_stmt_info))
6711 	{
6712 	  if (dump_enabled_p ())
6713 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714 			     "use not simple.\n");
6715 	  return false;
6716 	}
6717       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6718 	continue;
6719 
6720       /* There should be only one cycle def in the stmt, the one
6721          leading to reduc_def.  */
6722       if (VECTORIZABLE_CYCLE_DEF (dt))
6723 	return false;
6724 
6725       /* To properly compute ncopies we are interested in the widest
6726 	 non-reduction input type in case we're looking at a widening
6727 	 accumulation that we later handle in vect_transform_reduction.  */
6728       if (lane_reduc_code_p
6729 	  && tem
6730 	  && (!vectype_in
6731 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6732 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6733 	vectype_in = tem;
6734 
6735       if (code == COND_EXPR)
6736 	{
6737 	  /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6738 	  if (dt == vect_constant_def)
6739 	    {
6740 	      cond_reduc_dt = dt;
6741 	      cond_reduc_val = op;
6742 	    }
6743 	  if (dt == vect_induction_def
6744 	      && def_stmt_info
6745 	      && is_nonwrapping_integer_induction (def_stmt_info, loop))
6746 	    {
6747 	      cond_reduc_dt = dt;
6748 	      cond_stmt_vinfo = def_stmt_info;
6749 	    }
6750 	}
6751     }
6752   if (!vectype_in)
6753     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6754   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6755 
6756   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6757   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6758   /* If we have a condition reduction, see if we can simplify it further.  */
6759   if (v_reduc_type == COND_REDUCTION)
6760     {
6761       if (slp_node)
6762 	return false;
6763 
6764       /* Fail when the condition itself uses the reduction value.  */
6765       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6766 	{
6767 	  if (dump_enabled_p ())
6768 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 			     "condition depends on previous iteration\n");
6770 	  return false;
6771 	}
6772 
6773       if (reduc_chain_length == 1
6774 	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6775 					     vectype_in, OPTIMIZE_FOR_SPEED))
6776 	{
6777 	  if (dump_enabled_p ())
6778 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 			     "optimizing condition reduction with"
6780 			     " FOLD_EXTRACT_LAST.\n");
6781 	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6782 	}
6783       else if (cond_reduc_dt == vect_induction_def)
6784 	{
6785 	  tree base
6786 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6787 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6788 
6789 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6790 		      && TREE_CODE (step) == INTEGER_CST);
6791 	  cond_reduc_val = NULL_TREE;
6792 	  enum tree_code cond_reduc_op_code = ERROR_MARK;
6793 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6794 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6795 	    ;
6796 	  /* Find a suitable value: below base for MAX_EXPR, above base for
6797 	     MIN_EXPR; for now punt if base is the minimum value of the type
6798 	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6799 	  else if (tree_int_cst_sgn (step) == -1)
6800 	    {
6801 	      cond_reduc_op_code = MIN_EXPR;
6802 	      if (tree_int_cst_sgn (base) == -1)
6803 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6804 	      else if (tree_int_cst_lt (base,
6805 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6806 		cond_reduc_val
6807 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6808 	    }
6809 	  else
6810 	    {
6811 	      cond_reduc_op_code = MAX_EXPR;
6812 	      if (tree_int_cst_sgn (base) == 1)
6813 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6814 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6815 					base))
6816 		cond_reduc_val
6817 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6818 	    }
6819 	  if (cond_reduc_val)
6820 	    {
6821 	      if (dump_enabled_p ())
6822 		dump_printf_loc (MSG_NOTE, vect_location,
6823 				 "condition expression based on "
6824 				 "integer induction.\n");
6825 	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6826 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6827 		= cond_reduc_val;
6828 	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6829 	    }
6830 	}
6831       else if (cond_reduc_dt == vect_constant_def)
6832 	{
6833 	  enum vect_def_type cond_initial_dt;
6834 	  tree cond_initial_val
6835 	    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6836 
6837 	  gcc_assert (cond_reduc_val != NULL_TREE);
6838 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6839 	  if (cond_initial_dt == vect_constant_def
6840 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6841 				     TREE_TYPE (cond_reduc_val)))
6842 	    {
6843 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6844 				    cond_initial_val, cond_reduc_val);
6845 	      if (e && (integer_onep (e) || integer_zerop (e)))
6846 		{
6847 		  if (dump_enabled_p ())
6848 		    dump_printf_loc (MSG_NOTE, vect_location,
6849 				     "condition expression based on "
6850 				     "compile time constant.\n");
6851 		  /* Record reduction code at analysis stage.  */
6852 		  STMT_VINFO_REDUC_CODE (reduc_info)
6853 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6854 		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6855 		}
6856 	    }
6857 	}
6858     }
6859 
6860   if (STMT_VINFO_LIVE_P (phi_info))
6861     return false;
6862 
6863   if (slp_node)
6864     ncopies = 1;
6865   else
6866     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6867 
6868   gcc_assert (ncopies >= 1);
6869 
6870   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6871 
6872   if (nested_cycle)
6873     {
6874       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6875 		  == vect_double_reduction_def);
6876       double_reduc = true;
6877     }
6878 
6879   /* 4.2. Check support for the epilog operation.
6880 
6881           If STMT represents a reduction pattern, then the type of the
6882           reduction variable may be different than the type of the rest
6883           of the arguments.  For example, consider the case of accumulation
6884           of shorts into an int accumulator; the original code:
6885                         S1: int_a = (int) short_a;
6886           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6887 
6888           was replaced with:
6889                         STMT: int_acc = widen_sum <short_a, int_acc>
6890 
6891           This means that:
6892           1. The tree-code that is used to create the vector operation in the
6893              epilog code (that reduces the partial results) is not the
6894              tree-code of STMT, but is rather the tree-code of the original
6895              stmt from the pattern that STMT is replacing.  I.e, in the example
6896              above we want to use 'widen_sum' in the loop, but 'plus' in the
6897              epilog.
6898           2. The type (mode) we use to check available target support
6899              for the vector operation to be created in the *epilog*, is
6900              determined by the type of the reduction variable (in the example
6901              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6902              However the type (mode) we use to check available target support
6903              for the vector operation to be created *inside the loop*, is
6904              determined by the type of the other arguments to STMT (in the
6905              example we'd check this: optab_handler (widen_sum_optab,
6906 	     vect_short_mode)).
6907 
6908           This is contrary to "regular" reductions, in which the types of all
6909           the arguments are the same as the type of the reduction variable.
6910           For "regular" reductions we can therefore use the same vector type
6911           (and also the same tree-code) when generating the epilog code and
6912           when generating the code inside the loop.  */
6913 
6914   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6915   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6916 
6917   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6918   if (reduction_type == TREE_CODE_REDUCTION)
6919     {
6920       /* Check whether it's ok to change the order of the computation.
6921 	 Generally, when vectorizing a reduction we change the order of the
6922 	 computation.  This may change the behavior of the program in some
6923 	 cases, so we need to check that this is ok.  One exception is when
6924 	 vectorizing an outer-loop: the inner-loop is executed sequentially,
6925 	 and therefore vectorizing reductions in the inner-loop during
6926 	 outer-loop vectorization is safe.  Likewise when we are vectorizing
6927 	 a series of reductions using SLP and the VF is one the reductions
6928 	 are performed in scalar order.  */
6929       if (slp_node
6930 	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6931 	  && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6932 	;
6933       else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6934 	{
6935 	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
6936 	     is not directly used in stmt.  */
6937 	  if (!only_slp_reduc_chain
6938 	      && reduc_chain_length != 1)
6939 	    {
6940 	      if (dump_enabled_p ())
6941 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6942 				 "in-order reduction chain without SLP.\n");
6943 	      return false;
6944 	    }
6945 	  STMT_VINFO_REDUC_TYPE (reduc_info)
6946 	    = reduction_type = FOLD_LEFT_REDUCTION;
6947 	}
6948       else if (!commutative_tree_code (orig_code)
6949 	       || !associative_tree_code (orig_code))
6950 	{
6951 	  if (dump_enabled_p ())
6952 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6953 			    "reduction: not commutative/associative.\n");
6954 	  return false;
6955 	}
6956     }
6957 
6958   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6959       && ncopies > 1)
6960     {
6961       if (dump_enabled_p ())
6962 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6963 			 "multiple types in double reduction or condition "
6964 			 "reduction or fold-left reduction.\n");
6965       return false;
6966     }
6967 
6968   internal_fn reduc_fn = IFN_LAST;
6969   if (reduction_type == TREE_CODE_REDUCTION
6970       || reduction_type == FOLD_LEFT_REDUCTION
6971       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6972       || reduction_type == CONST_COND_REDUCTION)
6973     {
6974       if (reduction_type == FOLD_LEFT_REDUCTION
6975 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
6976 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6977 	{
6978 	  if (reduc_fn != IFN_LAST
6979 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6980 						  OPTIMIZE_FOR_SPEED))
6981 	    {
6982 	      if (dump_enabled_p ())
6983 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6984 				 "reduc op not supported by target.\n");
6985 
6986 	      reduc_fn = IFN_LAST;
6987 	    }
6988 	}
6989       else
6990 	{
6991 	  if (!nested_cycle || double_reduc)
6992 	    {
6993 	      if (dump_enabled_p ())
6994 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6995 				 "no reduc code for scalar code.\n");
6996 
6997 	      return false;
6998 	    }
6999 	}
7000     }
7001   else if (reduction_type == COND_REDUCTION)
7002     {
7003       int scalar_precision
7004 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7005       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7006       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7007 						vectype_out);
7008 
7009       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7010 					  OPTIMIZE_FOR_SPEED))
7011 	reduc_fn = IFN_REDUC_MAX;
7012     }
7013   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7014 
7015   if (reduction_type != EXTRACT_LAST_REDUCTION
7016       && (!nested_cycle || double_reduc)
7017       && reduc_fn == IFN_LAST
7018       && !nunits_out.is_constant ())
7019     {
7020       if (dump_enabled_p ())
7021 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7022 			 "missing target support for reduction on"
7023 			 " variable-length vectors.\n");
7024       return false;
7025     }
7026 
7027   /* For SLP reductions, see if there is a neutral value we can use.  */
7028   tree neutral_op = NULL_TREE;
7029   if (slp_node)
7030     neutral_op = neutral_op_for_slp_reduction
7031       (slp_node_instance->reduc_phis, vectype_out, orig_code,
7032        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7033 
7034   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7035     {
7036       /* We can't support in-order reductions of code such as this:
7037 
7038 	   for (int i = 0; i < n1; ++i)
7039 	     for (int j = 0; j < n2; ++j)
7040 	       l += a[j];
7041 
7042 	 since GCC effectively transforms the loop when vectorizing:
7043 
7044 	   for (int i = 0; i < n1 / VF; ++i)
7045 	     for (int j = 0; j < n2; ++j)
7046 	       for (int k = 0; k < VF; ++k)
7047 		 l += a[j];
7048 
7049 	 which is a reassociation of the original operation.  */
7050       if (dump_enabled_p ())
7051 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 			 "in-order double reduction not supported.\n");
7053 
7054       return false;
7055     }
7056 
7057   if (reduction_type == FOLD_LEFT_REDUCTION
7058       && slp_node
7059       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7060     {
7061       /* We cannot use in-order reductions in this case because there is
7062 	 an implicit reassociation of the operations involved.  */
7063       if (dump_enabled_p ())
7064 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065 			 "in-order unchained SLP reductions not supported.\n");
7066       return false;
7067     }
7068 
7069   /* For double reductions, and for SLP reductions with a neutral value,
7070      we construct a variable-length initial vector by loading a vector
7071      full of the neutral value and then shift-and-inserting the start
7072      values into the low-numbered elements.  */
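  /* Illustrative sketch: with neutral value N and start value S the target
     first builds { N, N, ..., N }; a .VEC_SHL_INSERT then shifts the
     elements one lane up and yields { S, N, ..., N }.  */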
7073   if ((double_reduc || neutral_op)
7074       && !nunits_out.is_constant ()
7075       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7076 					  vectype_out, OPTIMIZE_FOR_SPEED))
7077     {
7078       if (dump_enabled_p ())
7079 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 			 "reduction on variable-length vectors requires"
7081 			 " target support for a vector-shift-and-insert"
7082 			 " operation.\n");
7083       return false;
7084     }
7085 
7086   /* Check extra constraints for variable-length unchained SLP reductions.  */
7087   if (STMT_SLP_TYPE (stmt_info)
7088       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7089       && !nunits_out.is_constant ())
7090     {
7091       /* We checked above that we could build the initial vector when
7092 	 there's a neutral element value.  Check here for the case in
7093 	 which each SLP statement has its own initial value and in which
7094 	 that value needs to be repeated for every instance of the
7095 	 statement within the initial vector.  */
7096       unsigned int group_size = SLP_TREE_LANES (slp_node);
7097       if (!neutral_op
7098 	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7099 					      TREE_TYPE (vectype_out)))
7100 	{
7101 	  if (dump_enabled_p ())
7102 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 			     "unsupported form of SLP reduction for"
7104 			     " variable-length vectors: cannot build"
7105 			     " initial vector.\n");
7106 	  return false;
7107 	}
7108       /* The epilogue code relies on the number of elements being a multiple
7109 	 of the group size.  The duplicate-and-interleave approach to setting
7110 	 up the initial vector does too.  */
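      /* For instance, an 8-element vector can hold a group of 2 or 4
	 reductions, but not a group of 3.  */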
7111       if (!multiple_p (nunits_out, group_size))
7112 	{
7113 	  if (dump_enabled_p ())
7114 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7115 			     "unsupported form of SLP reduction for"
7116 			     " variable-length vectors: the vector size"
7117 			     " is not a multiple of the number of results.\n");
7118 	  return false;
7119 	}
7120     }
7121 
7122   if (reduction_type == COND_REDUCTION)
7123     {
7124       widest_int ni;
7125 
7126       if (! max_loop_iterations (loop, &ni))
7127 	{
7128 	  if (dump_enabled_p ())
7129 	    dump_printf_loc (MSG_NOTE, vect_location,
7130 			     "loop count not known, cannot create cond "
7131 			     "reduction.\n");
7132 	  return false;
7133 	}
7134       /* Convert backedges to iterations.  */
7135       ni += 1;
7136 
7137       /* The additional index will be the same type as the condition.  Check
7138 	 that the loop can fit into this less one (because we'll use up the
7139 	 zero slot for when there are no matches).  */
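      /* For example (illustrative), an 8-bit index type has a maximum
	 value of 255, so the loop may run at most 254 iterations: matches
	 are numbered from 1 and index 0 is reserved for "no match".  */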
7140       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7141       if (wi::geu_p (ni, wi::to_widest (max_index)))
7142 	{
7143 	  if (dump_enabled_p ())
7144 	    dump_printf_loc (MSG_NOTE, vect_location,
7145 			     "loop size is greater than data size.\n");
7146 	  return false;
7147 	}
7148     }
7149 
7150   /* In case the vectorization factor (VF) is bigger than the number
7151      of elements that we can fit in a vectype (nunits), we have to generate
7152      more than one vector stmt, i.e. we need to "unroll" the
7153      vector stmt by a factor VF/nunits.  For more details see documentation
7154      in vectorizable_operation.  */
7155 
7156   /* If the reduction is used in an outer loop we need to generate
7157      VF intermediate results, like so (e.g. for ncopies=2):
7158 	r0 = phi (init, r0)
7159 	r1 = phi (init, r1)
7160 	r0 = x0 + r0;
7161         r1 = x1 + r1;
7162     (i.e. we generate VF results in 2 registers).
7163     In this case we have a separate def-use cycle for each copy, and therefore
7164     for each copy we get the vector def for the reduction variable from the
7165     respective phi node created for this copy.
7166 
7167     Otherwise (the reduction is unused in the loop nest), we can combine
7168     together intermediate results, like so (e.g. for ncopies=2):
7169 	r = phi (init, r)
7170 	r = x0 + r;
7171 	r = x1 + r;
7172    (i.e. we generate VF/2 results in a single register).
7173    In this case for each copy we get the vector def for the reduction variable
7174    from the vectorized reduction operation generated in the previous iteration.
7175 
7176    This only works when we see both the reduction PHI and its only consumer
7177    in vectorizable_reduction and there are no intermediate stmts
7178    participating.  */
7179   if (ncopies > 1
7180       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7181       && reduc_chain_length == 1)
7182     single_defuse_cycle = true;
7183 
7184   if (single_defuse_cycle || lane_reduc_code_p)
7185     {
7186       gcc_assert (code != COND_EXPR);
7187 
7188       /* 4. Supportable by target?  */
7189       bool ok = true;
7190 
7191       /* 4.1. check support for the operation in the loop  */
7192       optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7193       if (!optab)
7194 	{
7195 	  if (dump_enabled_p ())
7196 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7197 			     "no optab.\n");
7198 	  ok = false;
7199         }
7200 
7201       machine_mode vec_mode = TYPE_MODE (vectype_in);
7202       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7203         {
7204           if (dump_enabled_p ())
7205             dump_printf (MSG_NOTE, "op not supported by target.\n");
7206 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7207 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7208 	    ok = false;
7209 	  else
7210 	    if (dump_enabled_p ())
7211 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7212         }
7213 
7214       /* Worthwhile without SIMD support?  */
7215       if (ok
7216 	  && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7217 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7218         {
7219           if (dump_enabled_p ())
7220 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7221 			     "not worthwhile without SIMD support.\n");
7222 	  ok = false;
7223         }
7224 
7225       /* lane-reducing operations have to go through vect_transform_reduction.
7226          For the other cases try without the single cycle optimization.  */
7227       if (!ok)
7228 	{
7229 	  if (lane_reduc_code_p)
7230 	    return false;
7231 	  else
7232 	    single_defuse_cycle = false;
7233 	}
7234     }
7235   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7236 
7237   /* If the reduction stmt is one of the patterns that have lane
7238      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
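  /* DOT_PROD_EXPR, WIDEN_SUM_EXPR and SAD_EXPR are such patterns: each
     accumulates several narrow input lanes into one wider output lane.  */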
7239   if ((ncopies > 1 && ! single_defuse_cycle)
7240       && lane_reduc_code_p)
7241     {
7242       if (dump_enabled_p ())
7243 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7244 			 "multi def-use cycle not possible for lane-reducing "
7245 			 "reduction operation\n");
7246       return false;
7247     }
7248 
7249   if (slp_node
7250       && !(!single_defuse_cycle
7251 	   && code != DOT_PROD_EXPR
7252 	   && code != WIDEN_SUM_EXPR
7253 	   && code != SAD_EXPR
7254 	   && reduction_type != FOLD_LEFT_REDUCTION))
7255     for (i = 0; i < op_type; i++)
7256       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7257 	{
7258 	  if (dump_enabled_p ())
7259 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7260 			     "incompatible vector types for invariants\n");
7261 	  return false;
7262 	}
7263 
7264   if (slp_node)
7265     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7266   else
7267     vec_num = 1;
7268 
7269   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7270 			     reduction_type, ncopies, cost_vec);
7271   /* Cost the reduction op inside the loop if transformed via
7272      vect_transform_reduction.  Otherwise this is costed by the
7273      separate vectorizable_* routines.  */
7274   if (single_defuse_cycle
7275       || code == DOT_PROD_EXPR
7276       || code == WIDEN_SUM_EXPR
7277       || code == SAD_EXPR)
7278     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7279 
7280   if (dump_enabled_p ()
7281       && reduction_type == FOLD_LEFT_REDUCTION)
7282     dump_printf_loc (MSG_NOTE, vect_location,
7283 		     "using an in-order (fold-left) reduction.\n");
7284   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7285   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7286      reductions go through their own vectorizable_* routines.  */
7287   if (!single_defuse_cycle
7288       && code != DOT_PROD_EXPR
7289       && code != WIDEN_SUM_EXPR
7290       && code != SAD_EXPR
7291       && reduction_type != FOLD_LEFT_REDUCTION)
7292     {
7293       stmt_vec_info tem
7294 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7295       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7296 	{
7297 	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7298 	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7299 	}
7300       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7301       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7302     }
7303   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7304     {
7305       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7306       internal_fn cond_fn = get_conditional_internal_fn (code);
7307 
7308       if (reduction_type != FOLD_LEFT_REDUCTION
7309 	  && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7310 	  && (cond_fn == IFN_LAST
7311 	      || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7312 						  OPTIMIZE_FOR_SPEED)))
7313 	{
7314 	  if (dump_enabled_p ())
7315 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316 			     "can't operate on partial vectors because"
7317 			     " no conditional operation is available.\n");
7318 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7319 	}
7320       else if (reduction_type == FOLD_LEFT_REDUCTION
7321 	       && reduc_fn == IFN_LAST
7322 	       && !expand_vec_cond_expr_p (vectype_in,
7323 					   truth_type_for (vectype_in),
7324 					   SSA_NAME))
7325 	{
7326 	  if (dump_enabled_p ())
7327 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7328 			     "can't operate on partial vectors because"
7329 			     " no conditional operation is available.\n");
7330 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7331 	}
7332       else
7333 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7334 			       vectype_in, NULL);
7335     }
7336   return true;
7337 }
7338 
7339 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7340    value.  */
7341 
7342 bool
7343 vect_transform_reduction (loop_vec_info loop_vinfo,
7344 			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7345 			  gimple **vec_stmt, slp_tree slp_node)
7346 {
7347   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7348   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7349   int i;
7350   int ncopies;
7351   int vec_num;
7352 
7353   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7354   gcc_assert (reduc_info->is_reduc_info);
7355 
7356   if (nested_in_vect_loop_p (loop, stmt_info))
7357     {
7358       loop = loop->inner;
7359       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7360     }
7361 
7362   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7363   enum tree_code code = gimple_assign_rhs_code (stmt);
7364   int op_type = TREE_CODE_LENGTH (code);
7365 
7366   /* Flatten RHS.  */
7367   tree ops[3];
7368   switch (get_gimple_rhs_class (code))
7369     {
7370     case GIMPLE_TERNARY_RHS:
7371       ops[2] = gimple_assign_rhs3 (stmt);
7372       /* Fall thru.  */
7373     case GIMPLE_BINARY_RHS:
7374       ops[0] = gimple_assign_rhs1 (stmt);
7375       ops[1] = gimple_assign_rhs2 (stmt);
7376       break;
7377     default:
7378       gcc_unreachable ();
7379     }
7380 
7381   /* All uses but the last are expected to be defined in the loop.
7382      The last use is the reduction variable.  In case of nested cycle this
7383      assumption is not true: we use reduc_index to record the index of the
7384      reduction variable.  */
7385   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7386   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7387   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7388   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7389 
7390   if (slp_node)
7391     {
7392       ncopies = 1;
7393       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7394     }
7395   else
7396     {
7397       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7398       vec_num = 1;
7399     }
7400 
7401   internal_fn cond_fn = get_conditional_internal_fn (code);
7402   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7403   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7404 
7405   /* Transform.  */
7406   tree new_temp = NULL_TREE;
7407   auto_vec<tree> vec_oprnds0;
7408   auto_vec<tree> vec_oprnds1;
7409   auto_vec<tree> vec_oprnds2;
7410   tree def0;
7411 
7412   if (dump_enabled_p ())
7413     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7414 
7415   /* FORNOW: Multiple types are not supported for condition.  */
7416   if (code == COND_EXPR)
7417     gcc_assert (ncopies == 1);
7418 
7419   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7420 
7421   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7422   if (reduction_type == FOLD_LEFT_REDUCTION)
7423     {
7424       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7425       return vectorize_fold_left_reduction
7426 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7427 	   reduc_fn, ops, vectype_in, reduc_index, masks);
7428     }
7429 
7430   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7431   gcc_assert (single_defuse_cycle
7432 	      || code == DOT_PROD_EXPR
7433 	      || code == WIDEN_SUM_EXPR
7434 	      || code == SAD_EXPR);
7435 
7436   /* Create the destination vector  */
7437   tree scalar_dest = gimple_assign_lhs (stmt);
7438   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7439 
7440   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7441 		     single_defuse_cycle && reduc_index == 0
7442 		     ? NULL_TREE : ops[0], &vec_oprnds0,
7443 		     single_defuse_cycle && reduc_index == 1
7444 		     ? NULL_TREE : ops[1], &vec_oprnds1,
7445 		     op_type == ternary_op
7446 		     && !(single_defuse_cycle && reduc_index == 2)
7447 		     ? ops[2] : NULL_TREE, &vec_oprnds2);
7448   if (single_defuse_cycle)
7449     {
7450       gcc_assert (!slp_node);
7451       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7452 				     ops[reduc_index],
7453 				     reduc_index == 0 ? &vec_oprnds0
7454 				     : (reduc_index == 1 ? &vec_oprnds1
7455 					: &vec_oprnds2));
7456     }
7457 
7458   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7459     {
7460       gimple *new_stmt;
7461       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7462       if (masked_loop_p && !mask_by_cond_expr)
7463 	{
7464 	  /* Make sure that the reduction accumulator is vop[0].  */
7465 	  if (reduc_index == 1)
7466 	    {
7467 	      gcc_assert (commutative_tree_code (code));
7468 	      std::swap (vop[0], vop[1]);
7469 	    }
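	  /* Build MASK ? VOP[0] OP VOP[1] : VOP[0] via the conditional
	     internal function; passing the accumulator as the trailing
	     "else" operand keeps inactive lanes unchanged.  */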
7470 	  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7471 					  vectype_in, i);
7472 	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7473 						    vop[0], vop[1], vop[0]);
7474 	  new_temp = make_ssa_name (vec_dest, call);
7475 	  gimple_call_set_lhs (call, new_temp);
7476 	  gimple_call_set_nothrow (call, true);
7477 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7478 	  new_stmt = call;
7479 	}
7480       else
7481 	{
7482 	  if (op_type == ternary_op)
7483 	    vop[2] = vec_oprnds2[i];
7484 
7485 	  if (masked_loop_p && mask_by_cond_expr)
7486 	    {
7487 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7488 					      vectype_in, i);
7489 	      build_vect_cond_expr (code, vop, mask, gsi);
7490 	    }
7491 
7492 	  new_stmt = gimple_build_assign (vec_dest, code,
7493 					  vop[0], vop[1], vop[2]);
7494 	  new_temp = make_ssa_name (vec_dest, new_stmt);
7495 	  gimple_assign_set_lhs (new_stmt, new_temp);
7496 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7497 	}
7498 
7499       if (slp_node)
7500 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7501       else if (single_defuse_cycle
7502 	       && i < ncopies - 1)
7503 	{
7504 	  if (reduc_index == 0)
7505 	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7506 	  else if (reduc_index == 1)
7507 	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7508 	  else if (reduc_index == 2)
7509 	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7510 	}
7511       else
7512 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7513     }
7514 
7515   if (!slp_node)
7516     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7517 
7518   return true;
7519 }
7520 
7521 /* Transform phase of a cycle PHI.  */
7522 
7523 bool
7524 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7525 			  stmt_vec_info stmt_info, gimple **vec_stmt,
7526 			  slp_tree slp_node, slp_instance slp_node_instance)
7527 {
7528   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7529   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7530   int i;
7531   int ncopies;
7532   int j;
7533   bool nested_cycle = false;
7534   int vec_num;
7535 
7536   if (nested_in_vect_loop_p (loop, stmt_info))
7537     {
7538       loop = loop->inner;
7539       nested_cycle = true;
7540     }
7541 
7542   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7543   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7544   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7545   gcc_assert (reduc_info->is_reduc_info);
7546 
7547   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7548       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7549     /* Leave the scalar phi in place.  */
7550     return true;
7551 
7552   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7553   /* For a nested cycle we do not fill the above.  */
7554   if (!vectype_in)
7555     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7556   gcc_assert (vectype_in);
7557 
7558   if (slp_node)
7559     {
7560       /* The size vect_schedule_slp_instance computes is off for us.  */
7561       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7562 				      * SLP_TREE_LANES (slp_node), vectype_in);
7563       ncopies = 1;
7564     }
7565   else
7566     {
7567       vec_num = 1;
7568       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7569     }
7570 
7571   /* Check whether we should use a single PHI node and accumulate
7572      vectors to one before the backedge.  */
7573   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7574     ncopies = 1;
7575 
7576   /* Create the destination vector  */
7577   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7578   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7579 					       vectype_out);
7580 
7581   /* Get the loop-entry arguments.  */
7582   tree vec_initial_def;
7583   auto_vec<tree> vec_initial_defs;
7584   if (slp_node)
7585     {
7586       vec_initial_defs.reserve (vec_num);
7587       if (nested_cycle)
7588 	{
7589 	  unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7590 	  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7591 			     &vec_initial_defs);
7592 	}
7593       else
7594 	{
7595 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
7596 	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7597 	  tree neutral_op
7598 	      = neutral_op_for_slp_reduction (slp_node, vectype_out,
7599 					      STMT_VINFO_REDUC_CODE (reduc_info),
7600 					      first != NULL);
7601 	  get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7602 					  &vec_initial_defs, vec_num,
7603 					  first != NULL, neutral_op);
7604 	}
7605     }
7606   else
7607     {
7608       /* Get at the scalar def before the loop, that defines the initial
7609 	 value of the reduction variable.  */
7610       tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7611 						loop_preheader_edge (loop));
7612       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7613 	 and we can't use zero for induc_val, use initial_def.  Similarly
7614 	 for REDUC_MIN and initial_def larger than the base.  */
7615       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7616 	{
7617 	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7618 	  if (TREE_CODE (initial_def) == INTEGER_CST
7619 	      && !integer_zerop (induc_val)
7620 	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7621 		   && tree_int_cst_lt (initial_def, induc_val))
7622 		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7623 		      && tree_int_cst_lt (induc_val, initial_def))))
7624 	    {
7625 	      induc_val = initial_def;
7626 	      /* Communicate we used the initial_def to epilogue
7627 		 generation.  */
7628 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7629 	    }
7630 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7631 	  vec_initial_defs.create (ncopies);
7632 	  for (i = 0; i < ncopies; ++i)
7633 	    vec_initial_defs.quick_push (vec_initial_def);
7634 	}
7635       else if (nested_cycle)
7636 	{
7637 	  /* Do not use an adjustment def as that case is not supported
7638 	     correctly if ncopies is not one.  */
7639 	  vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7640 					 ncopies, initial_def,
7641 					 &vec_initial_defs);
7642 	}
7643       else
7644 	{
7645 	  tree adjustment_def = NULL_TREE;
7646 	  tree *adjustment_defp = &adjustment_def;
7647 	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7648 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7649 	    adjustment_defp = NULL;
7650 	  vec_initial_def
7651 	    = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7652 					     initial_def, adjustment_defp);
7653 	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7654 	  vec_initial_defs.create (ncopies);
7655 	  for (i = 0; i < ncopies; ++i)
7656 	    vec_initial_defs.quick_push (vec_initial_def);
7657 	}
7658     }
7659 
7660   /* Generate the reduction PHIs upfront.  */
7661   for (i = 0; i < vec_num; i++)
7662     {
7663       tree vec_init_def = vec_initial_defs[i];
7664       for (j = 0; j < ncopies; j++)
7665 	{
7666 	  /* Create the reduction-phi that defines the reduction
7667 	     operand.  */
7668 	  gphi *new_phi = create_phi_node (vec_dest, loop->header);
7669 
7670 	  /* Set the loop-entry arg of the reduction-phi.  */
7671 	  if (j != 0 && nested_cycle)
7672 	    vec_init_def = vec_initial_defs[j];
7673 	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7674 		       UNKNOWN_LOCATION);
7675 
7676 	  /* The loop-latch arg is set in epilogue processing.  */
7677 
7678 	  if (slp_node)
7679 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7680 	  else
7681 	    {
7682 	      if (j == 0)
7683 		*vec_stmt = new_phi;
7684 	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7685 	    }
7686 	}
7687     }
7688 
7689   return true;
7690 }
7691 
7692 /* Vectorizes LC PHIs.  */
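/* An LC (loop-closed) PHI has a single argument and forwards a value defined
   inside a loop to uses outside of it, e.g. x_3 = PHI <x_2(exit_edge)>.
   Vectorizing it amounts to creating one single-argument vector PHI per
   vector definition of that argument.  */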
7693 
7694 bool
7695 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7696 		     stmt_vec_info stmt_info, gimple **vec_stmt,
7697 		     slp_tree slp_node)
7698 {
7699   if (!loop_vinfo
7700       || !is_a <gphi *> (stmt_info->stmt)
7701       || gimple_phi_num_args (stmt_info->stmt) != 1)
7702     return false;
7703 
7704   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7705       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7706     return false;
7707 
7708   if (!vec_stmt) /* transformation not required.  */
7709     {
7710       /* Deal with copies from externs or constants that disguise as
7711 	 loop-closed PHI nodes (PR97886).  */
7712       if (slp_node
7713 	  && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7714 						SLP_TREE_VECTYPE (slp_node)))
7715 	{
7716 	  if (dump_enabled_p ())
7717 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7718 			     "incompatible vector types for invariants\n");
7719 	  return false;
7720 	}
7721       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7722       return true;
7723     }
7724 
7725   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7726   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7727   basic_block bb = gimple_bb (stmt_info->stmt);
7728   edge e = single_pred_edge (bb);
7729   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7730   auto_vec<tree> vec_oprnds;
7731   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7732 		     !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7733 		     gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7734   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7735     {
7736       /* Create the vectorized LC PHI node.  */
7737       gphi *new_phi = create_phi_node (vec_dest, bb);
7738       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7739       if (slp_node)
7740 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7741       else
7742 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7743     }
7744   if (!slp_node)
7745     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7746 
7747   return true;
7748 }
7749 
7750 /* Vectorizes PHIs.  */
7751 
7752 bool
7753 vectorizable_phi (vec_info *,
7754 		  stmt_vec_info stmt_info, gimple **vec_stmt,
7755 		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7756 {
7757   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7758     return false;
7759 
7760   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7761     return false;
7762 
7763   tree vectype = SLP_TREE_VECTYPE (slp_node);
7764 
7765   if (!vec_stmt) /* transformation not required.  */
7766     {
7767       slp_tree child;
7768       unsigned i;
7769       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7770 	if (!child)
7771 	  {
7772 	    if (dump_enabled_p ())
7773 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 			       "PHI node with unvectorized backedge def\n");
7775 	    return false;
7776 	  }
7777 	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7778 	  {
7779 	    if (dump_enabled_p ())
7780 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7781 			       "incompatible vector types for invariants\n");
7782 	    return false;
7783 	  }
7784       /* For single-argument PHIs assume coalescing which means zero cost
7785 	 for the scalar and the vector PHIs.  This avoids artificially
7786 	 favoring the vector path (but may pessimize it in some cases).  */
7787       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7788 	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7789 			  vector_stmt, stmt_info, vectype, 0, vect_body);
7790       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7791       return true;
7792     }
7793 
7794   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7795   basic_block bb = gimple_bb (stmt_info->stmt);
7796   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7797   auto_vec<gphi *> new_phis;
7798   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7799     {
7800       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7801 
7802       /* Skip not yet vectorized defs.  */
7803       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7804 	  && SLP_TREE_VEC_STMTS (child).is_empty ())
7805 	continue;
7806 
7807       auto_vec<tree> vec_oprnds;
7808       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7809       if (!new_phis.exists ())
7810 	{
7811 	  new_phis.create (vec_oprnds.length ());
7812 	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
7813 	    {
7814 	      /* Create the vectorized LC PHI node.  */
7815 	      new_phis.quick_push (create_phi_node (vec_dest, bb));
7816 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7817 	    }
7818 	}
7819       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7820       for (unsigned j = 0; j < vec_oprnds.length (); j++)
7821 	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7822     }
7823   /* We should have at least one already vectorized child.  */
7824   gcc_assert (new_phis.exists ());
7825 
7826   return true;
7827 }
7828 
7829 
7830 /* Function vect_min_worthwhile_factor.
7831 
7832    For a loop where we could vectorize the operation indicated by CODE,
7833    return the minimum vectorization factor that makes it worthwhile
7834    to use generic vectors.  */
7835 static unsigned int
7836 vect_min_worthwhile_factor (enum tree_code code)
7837 {
7838   switch (code)
7839     {
7840     case PLUS_EXPR:
7841     case MINUS_EXPR:
7842     case NEGATE_EXPR:
7843       return 4;
7844 
7845     case BIT_AND_EXPR:
7846     case BIT_IOR_EXPR:
7847     case BIT_XOR_EXPR:
7848     case BIT_NOT_EXPR:
7849       return 2;
7850 
7851     default:
7852       return INT_MAX;
7853     }
7854 }
7855 
7856 /* Return true if VINFO indicates we are doing loop vectorization and if
7857    it is worth decomposing CODE operations into scalar operations for
7858    that loop's vectorization factor.  */
7859 
7860 bool
7861 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7862 {
7863   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7864   unsigned HOST_WIDE_INT value;
7865   return (loop_vinfo
7866 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7867 	  && value >= vect_min_worthwhile_factor (code));
7868 }
7869 
7870 /* Function vectorizable_induction
7871 
7872    Check if STMT_INFO performs an induction computation that can be vectorized.
7873    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7874    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7875    Return true if STMT_INFO is vectorizable in this way.  */
7876 
7877 bool
7878 vectorizable_induction (loop_vec_info loop_vinfo,
7879 			stmt_vec_info stmt_info,
7880 			gimple **vec_stmt, slp_tree slp_node,
7881 			stmt_vector_for_cost *cost_vec)
7882 {
7883   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7884   unsigned ncopies;
7885   bool nested_in_vect_loop = false;
7886   class loop *iv_loop;
7887   tree vec_def;
7888   edge pe = loop_preheader_edge (loop);
7889   basic_block new_bb;
7890   tree new_vec, vec_init, vec_step, t;
7891   tree new_name;
7892   gimple *new_stmt;
7893   gphi *induction_phi;
7894   tree induc_def, vec_dest;
7895   tree init_expr, step_expr;
7896   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7897   unsigned i;
7898   tree expr;
7899   gimple_stmt_iterator si;
7900 
7901   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7902   if (!phi)
7903     return false;
7904 
7905   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7906     return false;
7907 
7908   /* Make sure it was recognized as induction computation.  */
7909   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7910     return false;
7911 
7912   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7913   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7914 
7915   if (slp_node)
7916     ncopies = 1;
7917   else
7918     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7919   gcc_assert (ncopies >= 1);
7920 
7921   /* FORNOW. These restrictions should be relaxed.  */
7922   if (nested_in_vect_loop_p (loop, stmt_info))
7923     {
7924       imm_use_iterator imm_iter;
7925       use_operand_p use_p;
7926       gimple *exit_phi;
7927       edge latch_e;
7928       tree loop_arg;
7929 
7930       if (ncopies > 1)
7931 	{
7932 	  if (dump_enabled_p ())
7933 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 			     "multiple types in nested loop.\n");
7935 	  return false;
7936 	}
7937 
7938       exit_phi = NULL;
7939       latch_e = loop_latch_edge (loop->inner);
7940       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7941       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7942 	{
7943 	  gimple *use_stmt = USE_STMT (use_p);
7944 	  if (is_gimple_debug (use_stmt))
7945 	    continue;
7946 
7947 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7948 	    {
7949 	      exit_phi = use_stmt;
7950 	      break;
7951 	    }
7952 	}
7953       if (exit_phi)
7954 	{
7955 	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7956 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7957 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7958 	    {
7959 	      if (dump_enabled_p ())
7960 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7961 				 "inner-loop induction only used outside "
7962 				 "of the outer vectorized loop.\n");
7963 	      return false;
7964 	    }
7965 	}
7966 
7967       nested_in_vect_loop = true;
7968       iv_loop = loop->inner;
7969     }
7970   else
7971     iv_loop = loop;
7972   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7973 
7974   if (slp_node && !nunits.is_constant ())
7975     {
7976       /* The current SLP code creates the step value element-by-element.  */
7977       if (dump_enabled_p ())
7978 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 			 "SLP induction not supported for variable-length"
7980 			 " vectors.\n");
7981       return false;
7982     }
7983 
7984   if (!vec_stmt) /* transformation not required.  */
7985     {
7986       unsigned inside_cost = 0, prologue_cost = 0;
7987       if (slp_node)
7988 	{
7989 	  /* We eventually need to set a vector type on invariant
7990 	     arguments.  */
7991 	  unsigned j;
7992 	  slp_tree child;
7993 	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7994 	    if (!vect_maybe_update_slp_op_vectype
7995 		(child, SLP_TREE_VECTYPE (slp_node)))
7996 	      {
7997 		if (dump_enabled_p ())
7998 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999 				   "incompatible vector types for "
8000 				   "invariants\n");
8001 		return false;
8002 	      }
8003 	  /* loop cost for vec_loop.  */
8004 	  inside_cost
8005 	    = record_stmt_cost (cost_vec,
8006 				SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8007 				vector_stmt, stmt_info, 0, vect_body);
8008 	  /* prologue cost for vec_init (if not nested) and step.  */
8009 	  prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8010 					    scalar_to_vec,
8011 					    stmt_info, 0, vect_prologue);
8012 	}
8013       else /* if (!slp_node) */
8014 	{
8015 	  /* loop cost for vec_loop.  */
8016 	  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8017 					  stmt_info, 0, vect_body);
8018 	  /* prologue cost for vec_init and vec_step.  */
8019 	  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8020 					    stmt_info, 0, vect_prologue);
8021 	}
8022       if (dump_enabled_p ())
8023 	dump_printf_loc (MSG_NOTE, vect_location,
8024 			 "vect_model_induction_cost: inside_cost = %d, "
8025 			 "prologue_cost = %d .\n", inside_cost,
8026 			 prologue_cost);
8027 
8028       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8029       DUMP_VECT_SCOPE ("vectorizable_induction");
8030       return true;
8031     }
8032 
8033   /* Transform.  */
8034 
8035   /* Compute a vector variable, initialized with the first VF values of
8036      the induction variable.  E.g., for an iv with IV_PHI='X' and
8037      evolution S, for a vector of 4 units, we want to compute:
8038      [X, X + S, X + 2*S, X + 3*S].  */
8039 
8040   if (dump_enabled_p ())
8041     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8042 
8043   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8044   gcc_assert (step_expr != NULL_TREE);
8045   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8046 
8047   pe = loop_preheader_edge (iv_loop);
8048   /* Find the first insertion point in the BB.  */
8049   basic_block bb = gimple_bb (phi);
8050   si = gsi_after_labels (bb);
8051 
8052   /* For SLP induction we have to generate several IVs as for example
8053      with group size 3 we need
8054        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8055        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8056   if (slp_node)
8057     {
8058       /* Enforced above.  */
8059       unsigned int const_nunits = nunits.to_constant ();
8060 
8061       /* The initial values are vectorized, but any lanes > group_size
8062 	 need adjustment.  */
8063       slp_tree init_node
8064 	= SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8065 
8066       /* Gather steps.  Since we do not vectorize inductions as
8067 	 cycles we have to reconstruct the step from SCEV data.  */
8068       unsigned group_size = SLP_TREE_LANES (slp_node);
8069       tree *steps = XALLOCAVEC (tree, group_size);
8070       tree *inits = XALLOCAVEC (tree, group_size);
8071       stmt_vec_info phi_info;
8072       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8073 	{
8074 	  steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8075 	  if (!init_node)
8076 	    inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8077 					   pe->dest_idx);
8078 	}
8079 
8080       /* Now generate the IVs.  */
8081       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8082       gcc_assert ((const_nunits * nvects) % group_size == 0);
8083       unsigned nivs;
8084       if (nested_in_vect_loop)
8085 	nivs = nvects;
8086       else
8087 	{
8088 	  /* Compute the number of distinct IVs we need.  First reduce
8089 	     group_size if it is a multiple of const_nunits so we get
8090 	     one IV for a group_size of 4 but const_nunits 2.  */
8091 	  unsigned group_sizep = group_size;
8092 	  if (group_sizep % const_nunits == 0)
8093 	    group_sizep = group_sizep / const_nunits;
8094 	  nivs = least_common_multiple (group_sizep,
8095 					const_nunits) / const_nunits;
8096 	}
8097       tree stept = TREE_TYPE (step_vectype);
8098       tree lupdate_mul = NULL_TREE;
8099       if (!nested_in_vect_loop)
8100 	{
8101 	  /* The number of iterations covered in one vector iteration.  */
8102 	  unsigned lup_mul = (nvects * const_nunits) / group_size;
8103 	  lupdate_mul
8104 	    = build_vector_from_val (step_vectype,
8105 				     SCALAR_FLOAT_TYPE_P (stept)
8106 				     ? build_real_from_wide (stept, lup_mul,
8107 							     UNSIGNED)
8108 				     : build_int_cstu (stept, lup_mul));
8109 	}
8110       tree peel_mul = NULL_TREE;
8111       gimple_seq init_stmts = NULL;
8112       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8113 	{
8114 	  if (SCALAR_FLOAT_TYPE_P (stept))
8115 	    peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8116 				     LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8117 	  else
8118 	    peel_mul = gimple_convert (&init_stmts, stept,
8119 				       LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8120 	  peel_mul = gimple_build_vector_from_val (&init_stmts,
8121 						   step_vectype, peel_mul);
8122 	}
8123       unsigned ivn;
8124       auto_vec<tree> vec_steps;
8125       for (ivn = 0; ivn < nivs; ++ivn)
8126 	{
8127 	  tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8128 	  tree_vector_builder init_elts (vectype, const_nunits, 1);
8129 	  tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8130 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8131 	    {
8132 	      /* The scalar steps of the IVs.  */
8133 	      tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8134 	      elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8135 	      step_elts.quick_push (elt);
8136 	      if (!init_node)
8137 		{
8138 		  /* The scalar inits of the IVs if not vectorized.  */
8139 		  elt = inits[(ivn*const_nunits + eltn) % group_size];
8140 		  if (!useless_type_conversion_p (TREE_TYPE (vectype),
8141 						  TREE_TYPE (elt)))
8142 		    elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8143 					TREE_TYPE (vectype), elt);
8144 		  init_elts.quick_push (elt);
8145 		}
8146 	      /* The number of steps to add to the initial values.  */
8147 	      unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8148 	      mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8149 				   ? build_real_from_wide (stept,
8150 							   mul_elt, UNSIGNED)
8151 				   : build_int_cstu (stept, mul_elt));
8152 	    }
8153 	  vec_step = gimple_build_vector (&init_stmts, &step_elts);
8154 	  vec_steps.safe_push (vec_step);
8155 	  tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8156 	  if (peel_mul)
8157 	    step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8158 				     step_mul, peel_mul);
8159 	  if (!init_node)
8160 	    vec_init = gimple_build_vector (&init_stmts, &init_elts);
8161 
8162 	  /* Create the induction-phi that defines the induction-operand.  */
8163 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8164 					    "vec_iv_");
8165 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
8166 	  induc_def = PHI_RESULT (induction_phi);
8167 
8168 	  /* Create the iv update inside the loop  */
8169 	  tree up = vec_step;
8170 	  if (lupdate_mul)
8171 	    up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8172 			       vec_step, lupdate_mul);
8173 	  gimple_seq stmts = NULL;
8174 	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8175 	  vec_def = gimple_build (&stmts,
8176 				  PLUS_EXPR, step_vectype, vec_def, up);
8177 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
8178 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8179 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8180 		       UNKNOWN_LOCATION);
8181 
8182 	  if (init_node)
8183 	    vec_init = vect_get_slp_vect_def (init_node, ivn);
8184 	  if (!nested_in_vect_loop
8185 	      && !integer_zerop (step_mul))
8186 	    {
8187 	      vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8188 	      up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8189 				 vec_step, step_mul);
8190 	      vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8191 				      vec_def, up);
8192 	      vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8193 	    }
8194 
8195 	  /* Set the arguments of the phi node:  */
8196 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8197 
8198 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8199 	}
8200       if (!nested_in_vect_loop)
8201 	{
8202 	  /* Fill up to the number of vectors we need for the whole group.  */
8203 	  nivs = least_common_multiple (group_size,
8204 					const_nunits) / const_nunits;
8205 	  vec_steps.reserve (nivs-ivn);
8206 	  for (; ivn < nivs; ++ivn)
8207 	    {
8208 	      SLP_TREE_VEC_STMTS (slp_node)
8209 		.quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8210 	      vec_steps.quick_push (vec_steps[0]);
8211 	    }
8212 	}
8213 
8214       /* Re-use IVs when we can.  We are generating further vector
8215 	 stmts by adding VF' * stride to the IVs generated above.  */
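      /* Rough sketch: with a single IV generated above and three vectors
	 required, vector 1 is IV0 + VF'*step and vector 2 is vector 1 plus
	 another VF'*step.  */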
8216       if (ivn < nvects)
8217 	{
8218 	  unsigned vfp
8219 	    = least_common_multiple (group_size, const_nunits) / group_size;
8220 	  tree lupdate_mul
8221 	    = build_vector_from_val (step_vectype,
8222 				     SCALAR_FLOAT_TYPE_P (stept)
8223 				     ? build_real_from_wide (stept,
8224 							     vfp, UNSIGNED)
8225 				     : build_int_cstu (stept, vfp));
8226 	  for (; ivn < nvects; ++ivn)
8227 	    {
8228 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8229 	      tree def = gimple_get_lhs (iv);
8230 	      if (ivn < 2*nivs)
8231 		vec_steps[ivn - nivs]
8232 		  = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8233 				  vec_steps[ivn - nivs], lupdate_mul);
8234 	      gimple_seq stmts = NULL;
8235 	      def = gimple_convert (&stmts, step_vectype, def);
8236 	      def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8237 				  def, vec_steps[ivn % nivs]);
8238 	      def = gimple_convert (&stmts, vectype, def);
8239 	      if (gimple_code (iv) == GIMPLE_PHI)
8240 		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8241 	      else
8242 		{
8243 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8244 		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8245 		}
8246 	      SLP_TREE_VEC_STMTS (slp_node)
8247 		.quick_push (SSA_NAME_DEF_STMT (def));
8248 	    }
8249 	}
8250 
8251       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8252       gcc_assert (!new_bb);
8253 
8254       return true;
8255     }
8256 
8257   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8258 				     loop_preheader_edge (iv_loop));
8259 
8260   gimple_seq stmts = NULL;
8261   if (!nested_in_vect_loop)
8262     {
8263       /* Convert the initial value to the IV update type.  */
8264       tree new_type = TREE_TYPE (step_expr);
8265       init_expr = gimple_convert (&stmts, new_type, init_expr);
8266 
8267       /* If we are using the loop mask to "peel" for alignment then we need
8268 	 to adjust the start value here.  */
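      /* Illustrative example: with step S and three masked-off initial
	 iterations the start value becomes INIT - 3*S, so lane 3 of the
	 first vector iteration still computes INIT.  */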
8269       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8270       if (skip_niters != NULL_TREE)
8271 	{
8272 	  if (FLOAT_TYPE_P (vectype))
8273 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8274 					skip_niters);
8275 	  else
8276 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8277 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8278 					 skip_niters, step_expr);
8279 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8280 				    init_expr, skip_step);
8281 	}
8282     }
8283 
8284   if (stmts)
8285     {
8286       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8287       gcc_assert (!new_bb);
8288     }
8289 
8290   /* Create the vector that holds the initial_value of the induction.  */
8291   if (nested_in_vect_loop)
8292     {
8293       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8294 	 been created during vectorization of previous stmts.  We obtain it
8295 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8296       auto_vec<tree> vec_inits;
8297       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8298 				     init_expr, &vec_inits);
8299       vec_init = vec_inits[0];
8300       /* If the initial value is not of proper type, convert it.  */
8301       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8302 	{
8303 	  new_stmt
8304 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
8305 							  vect_simple_var,
8306 							  "vec_iv_"),
8307 				   VIEW_CONVERT_EXPR,
8308 				   build1 (VIEW_CONVERT_EXPR, vectype,
8309 					   vec_init));
8310 	  vec_init = gimple_assign_lhs (new_stmt);
8311 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8312 						 new_stmt);
8313 	  gcc_assert (!new_bb);
8314 	}
8315     }
8316   else
8317     {
8318       /* iv_loop is the loop to be vectorized. Create:
8319 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8320       stmts = NULL;
8321       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8322 
8323       unsigned HOST_WIDE_INT const_nunits;
8324       if (nunits.is_constant (&const_nunits))
8325 	{
8326 	  tree_vector_builder elts (step_vectype, const_nunits, 1);
8327 	  elts.quick_push (new_name);
8328 	  for (i = 1; i < const_nunits; i++)
8329 	    {
8330 	      /* Create: new_name_i = new_name + step_expr  */
8331 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8332 				       new_name, step_expr);
8333 	      elts.quick_push (new_name);
8334 	    }
8335 	  /* Create a vector from [new_name_0, new_name_1, ...,
8336 	     new_name_nunits-1]  */
8337 	  vec_init = gimple_build_vector (&stmts, &elts);
8338 	}
8339       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8340 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
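	/* VEC_SERIES_EXPR <X, S> denotes the vector { X, X+S, X+2*S, ... },
	   which also works for variable-length vector types.  */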
8341 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8342 				 new_name, step_expr);
8343       else
8344 	{
8345 	  /* Build:
8346 	        [base, base, base, ...]
8347 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8348 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8349 	  gcc_assert (flag_associative_math);
8350 	  tree index = build_index_vector (step_vectype, 0, 1);
8351 	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8352 							new_name);
8353 	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8354 							step_expr);
8355 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8356 	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8357 				   vec_init, step_vec);
8358 	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8359 				   vec_init, base_vec);
8360 	}
8361       vec_init = gimple_convert (&stmts, vectype, vec_init);
8362 
8363       if (stmts)
8364 	{
8365 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8366 	  gcc_assert (!new_bb);
8367 	}
8368     }
8369 
8370 
8371   /* Create the vector that holds the step of the induction.  */
8372   if (nested_in_vect_loop)
8373     /* iv_loop is nested in the loop to be vectorized. Generate:
8374        vec_step = [S, S, S, S]  */
8375     new_name = step_expr;
8376   else
8377     {
8378       /* iv_loop is the loop to be vectorized. Generate:
8379 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8380       gimple_seq seq = NULL;
8381       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8382 	{
8383 	  expr = build_int_cst (integer_type_node, vf);
8384 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8385 	}
8386       else
8387 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
8388       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8389 			       expr, step_expr);
8390       if (seq)
8391 	{
8392 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8393 	  gcc_assert (!new_bb);
8394 	}
8395     }
8396 
8397   t = unshare_expr (new_name);
8398   gcc_assert (CONSTANT_CLASS_P (new_name)
8399 	      || TREE_CODE (new_name) == SSA_NAME);
8400   new_vec = build_vector_from_val (step_vectype, t);
8401   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8402 			       new_vec, step_vectype, NULL);
8403 
8404 
8405   /* Create the following def-use cycle:
8406      loop prolog:
8407          vec_init = ...
8408 	 vec_step = ...
8409      loop:
8410          vec_iv = PHI <vec_init, vec_loop>
8411          ...
8412          STMT
8413          ...
8414          vec_loop = vec_iv + vec_step;  */
8415 
8416   /* Create the induction-phi that defines the induction-operand.  */
8417   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8418   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8419   induc_def = PHI_RESULT (induction_phi);
8420 
8421   /* Create the iv update inside the loop  */
8422   stmts = NULL;
8423   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8424   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8425   vec_def = gimple_convert (&stmts, vectype, vec_def);
8426   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8427   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8428 
8429   /* Set the arguments of the phi node:  */
8430   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8431   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8432 	       UNKNOWN_LOCATION);
8433 
8434   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8435   *vec_stmt = induction_phi;
8436 
8437   /* In case the vectorization factor (VF) is bigger than the number
8438      of elements that we can fit in a vectype (nunits), we have to generate
8439      more than one vector stmt, i.e. we need to "unroll" the
8440      vector stmt by a factor VF/nunits.  For more details see documentation
8441      in vectorizable_operation.  */
8442 
8443   if (ncopies > 1)
8444     {
8445       gimple_seq seq = NULL;
8446       /* FORNOW. This restriction should be relaxed.  */
8447       gcc_assert (!nested_in_vect_loop);
8448 
8449       /* Create the vector that holds the step of the induction.  */
8450       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8451 	{
8452 	  expr = build_int_cst (integer_type_node, nunits);
8453 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8454 	}
8455       else
8456 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8457       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8458 			       expr, step_expr);
8459       if (seq)
8460 	{
8461 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8462 	  gcc_assert (!new_bb);
8463 	}
8464 
8465       t = unshare_expr (new_name);
8466       gcc_assert (CONSTANT_CLASS_P (new_name)
8467 		  || TREE_CODE (new_name) == SSA_NAME);
8468       new_vec = build_vector_from_val (step_vectype, t);
8469       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8470 				   new_vec, step_vectype, NULL);
8471 
8472       vec_def = induc_def;
8473       for (i = 1; i < ncopies; i++)
8474 	{
8475 	  /* vec_i = vec_prev + vec_step  */
8476 	  gimple_seq stmts = NULL;
8477 	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8478 	  vec_def = gimple_build (&stmts,
8479 				  PLUS_EXPR, step_vectype, vec_def, vec_step);
8480 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
8481 
8482 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8483 	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
8484 	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8485 	}
8486     }
8487 
8488   if (dump_enabled_p ())
8489     dump_printf_loc (MSG_NOTE, vect_location,
8490 		     "transform induction: created def-use cycle: %G%G",
8491 		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
8492 
8493   return true;
8494 }
8495 
8496 /* Function vectorizable_live_operation.
8497 
8498    STMT_INFO computes a value that is used outside the loop.  Check if
8499    it can be supported.  */
8500 
8501 bool
8502 vectorizable_live_operation (vec_info *vinfo,
8503 			     stmt_vec_info stmt_info,
8504 			     gimple_stmt_iterator *gsi,
8505 			     slp_tree slp_node, slp_instance slp_node_instance,
8506 			     int slp_index, bool vec_stmt_p,
8507 			     stmt_vector_for_cost *cost_vec)
8508 {
8509   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8510   imm_use_iterator imm_iter;
8511   tree lhs, lhs_type, bitsize;
8512   tree vectype = (slp_node
8513 		  ? SLP_TREE_VECTYPE (slp_node)
8514 		  : STMT_VINFO_VECTYPE (stmt_info));
8515   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8516   int ncopies;
8517   gimple *use_stmt;
8518   auto_vec<tree> vec_oprnds;
8519   int vec_entry = 0;
8520   poly_uint64 vec_index = 0;
8521 
8522   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8523 
8524   /* If a stmt of a reduction is live, vectorize it via
8525      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8526      validity so just trigger the transform here.  */
8527   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8528     {
8529       if (!vec_stmt_p)
8530 	return true;
8531       if (slp_node)
8532 	{
8533 	  /* For reduction chains the meta-info is attached to
8534 	     the group leader.  */
8535 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8536 	    stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8537 	  /* For SLP reductions we vectorize the epilogue for
8538 	     all involved stmts together.  */
8539 	  else if (slp_index != 0)
8540 	    return true;
8541 	  else
8542 	    /* For SLP reductions the meta-info is attached to
8543 	       the representative.  */
8544 	    stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8545 	}
8546       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8547       gcc_assert (reduc_info->is_reduc_info);
8548       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8549 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8550 	return true;
8551       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8552 					slp_node_instance);
8553       return true;
8554     }
8555 
8556   /* If STMT is not relevant and it is a simple assignment and its inputs are
8557      invariant then it can remain in place, unvectorized.  The original last
8558      scalar value that it computes will be used.  */
8559   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8560     {
8561       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8562       if (dump_enabled_p ())
8563 	dump_printf_loc (MSG_NOTE, vect_location,
8564 			 "statement is simple and uses invariant.  Leaving in "
8565 			 "place.\n");
8566       return true;
8567     }
8568 
8569   if (slp_node)
8570     ncopies = 1;
8571   else
8572     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8573 
8574   if (slp_node)
8575     {
8576       gcc_assert (slp_index >= 0);
8577 
8578       /* Get the last occurrence of the scalar index from the concatenation of
8579 	 all the slp vectors. Calculate which slp vector it is and the index
8580 	 within.  */
8581       int num_scalar = SLP_TREE_LANES (slp_node);
8582       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8583       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
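      /* Illustrative example (numbers are a sketch only): with num_vec == 2
	 vectors of nunits == 4 lanes holding num_scalar == 2 scalars per
	 iteration, slp_index == 1 gives pos == 2 * 4 - 2 + 1 == 7, so the
	 division below yields vec_entry == 1 and vec_index == 3, i.e. the
	 last lane of the second vector.  */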
8584 
8585       /* Calculate which vector contains the result, and which lane of
8586 	 that vector we need.  */
8587       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8588 	{
8589 	  if (dump_enabled_p ())
8590 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8591 			     "Cannot determine which vector holds the"
8592 			     " final result.\n");
8593 	  return false;
8594 	}
8595     }
8596 
8597   if (!vec_stmt_p)
8598     {
8599       /* No transformation required.  */
8600       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8601 	{
8602 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8603 					       OPTIMIZE_FOR_SPEED))
8604 	    {
8605 	      if (dump_enabled_p ())
8606 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8607 				 "can't operate on partial vectors "
8608 				 "because the target doesn't support extract "
8609 				 "last reduction.\n");
8610 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8611 	    }
8612 	  else if (slp_node)
8613 	    {
8614 	      if (dump_enabled_p ())
8615 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8616 				 "can't operate on partial vectors "
8617 				 "because an SLP statement is live after "
8618 				 "the loop.\n");
8619 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8620 	    }
8621 	  else if (ncopies > 1)
8622 	    {
8623 	      if (dump_enabled_p ())
8624 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8625 				 "can't operate on partial vectors "
8626 				 "because ncopies is greater than 1.\n");
8627 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8628 	    }
8629 	  else
8630 	    {
8631 	      gcc_assert (ncopies == 1 && !slp_node);
8632 	      vect_record_loop_mask (loop_vinfo,
8633 				     &LOOP_VINFO_MASKS (loop_vinfo),
8634 				     1, vectype, NULL);
8635 	    }
8636 	}
8637       /* ???  Enable for loop costing as well.  */
8638       if (!loop_vinfo)
8639 	record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8640 			  0, vect_epilogue);
8641       return true;
8642     }
8643 
8644   /* Use the lhs of the original scalar statement.  */
8645   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8646   if (dump_enabled_p ())
8647     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8648 		     "stmt %G", stmt);
8649 
8650   lhs = gimple_get_lhs (stmt);
8651   lhs_type = TREE_TYPE (lhs);
8652 
8653   bitsize = vector_element_bits_tree (vectype);
8654 
8655   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8656   tree vec_lhs, bitstart;
8657   gimple *vec_stmt;
8658   if (slp_node)
8659     {
8660       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8661 
8662       /* Get the correct slp vectorized stmt.  */
8663       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8664       vec_lhs = gimple_get_lhs (vec_stmt);
8665 
8666       /* Get entry to use.  */
8667       bitstart = bitsize_int (vec_index);
8668       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8669     }
8670   else
8671     {
8672       /* For multiple copies, get the last copy.  */
8673       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8674       vec_lhs = gimple_get_lhs (vec_stmt);
8675 
8676       /* Get the last lane in the vector.  */
8677       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8678     }
8679 
8680   if (loop_vinfo)
8681     {
8682       /* To ensure that the VEC_LHS of the lane-extraction stmts satisfies the
8683 	 loop-closed PHI requirement, insert a PHI node for it.  It looks like:
8684 	   loop;
8685 	 BB:
8686 	   # lhs' = PHI <lhs>
8687 	 ==>
8688 	   loop;
8689 	 BB:
8690 	   # vec_lhs' = PHI <vec_lhs>
8691 	   new_tree = lane_extract <vec_lhs', ...>;
8692 	   lhs' = new_tree;  */
8693 
8694       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8695       basic_block exit_bb = single_exit (loop)->dest;
8696       gcc_assert (single_pred_p (exit_bb));
8697 
8698       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8699       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8700       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8701 
8702       gimple_seq stmts = NULL;
8703       tree new_tree;
8704       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8705 	{
8706 	  /* Emit:
8707 
8708 	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8709 
8710 	     where VEC_LHS is the vectorized live-out result and MASK is
8711 	     the loop mask for the final iteration.  */
8712 	  gcc_assert (ncopies == 1 && !slp_node);
8713 	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8714 	  tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8715 					  1, vectype, 0);
8716 	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8717 					  mask, vec_lhs_phi);
8718 
8719 	  /* Convert the extracted vector element to the scalar type.  */
8720 	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8721 	}
8722       else
8723 	{
8724 	  tree bftype = TREE_TYPE (vectype);
8725 	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
8726 	    bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8727 	  new_tree = build3 (BIT_FIELD_REF, bftype,
8728 			     vec_lhs_phi, bitsize, bitstart);
8729 	  new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8730 					   &stmts, true, NULL_TREE);
8731 	}
8732 
8733       if (stmts)
8734 	{
8735 	  gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8736 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8737 
8738 	  /* Remove existing phi from lhs and create one copy from new_tree.  */
8739 	  tree lhs_phi = NULL_TREE;
8740 	  gimple_stmt_iterator gsi;
8741 	  for (gsi = gsi_start_phis (exit_bb);
8742 	       !gsi_end_p (gsi); gsi_next (&gsi))
8743 	    {
8744 	      gimple *phi = gsi_stmt (gsi);
8745 	      if (gimple_phi_arg_def (phi, 0) == lhs)
8746 		{
8747 		  remove_phi_node (&gsi, false);
8748 		  lhs_phi = gimple_phi_result (phi);
8749 		  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8750 		  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8751 		  break;
8752 		}
8753 	    }
8754 	}
8755 
8756       /* Replace uses of LHS with the newly computed result.  If the use stmt is
8757 	 a single-argument PHI, just replace all uses of the PHI result.  This is
8758 	 necessary because the LC SSA PHI defining LHS may precede the new stmt.  */
8759       use_operand_p use_p;
8760       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8761 	if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8762 	    && !is_gimple_debug (use_stmt))
8763 	  {
8764 	    if (gimple_code (use_stmt) == GIMPLE_PHI
8765 		&& gimple_phi_num_args (use_stmt) == 1)
8766 	      {
8767 		replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8768 	      }
8769 	    else
8770 	      {
8771 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8772 		    SET_USE (use_p, new_tree);
8773 	      }
8774 	    update_stmt (use_stmt);
8775 	  }
8776     }
8777   else
8778     {
8779       /* For basic-block vectorization simply insert the lane-extraction.  */
8780       tree bftype = TREE_TYPE (vectype);
8781       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8782 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8783       tree new_tree = build3 (BIT_FIELD_REF, bftype,
8784 			      vec_lhs, bitsize, bitstart);
8785       gimple_seq stmts = NULL;
8786       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8787 				       &stmts, true, NULL_TREE);
8788       if (TREE_CODE (new_tree) == SSA_NAME
8789 	  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8790 	SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8791       if (is_a <gphi *> (vec_stmt))
8792 	{
8793 	  gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8794 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8795 	}
8796       else
8797 	{
8798 	  gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8799 	  gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8800 	}
8801 
8802       /* Replace uses of LHS with the newly computed result.  If the use stmt is
8803 	 a single-argument PHI, just replace all uses of the PHI result.  This is
8804 	 necessary because the LC SSA PHI defining LHS may precede the new stmt.  */
8805       use_operand_p use_p;
8806       stmt_vec_info use_stmt_info;
8807       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8808 	if (!is_gimple_debug (use_stmt)
8809 	    && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8810 		|| !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8811 	  {
8812 	    /* ???  This can happen when the live lane ends up being
8813 	       used in a vector construction code-generated by an
8814 	       external SLP node (and code-generation for that already
8815 	       happened).  See gcc.dg/vect/bb-slp-47.c.
8816 	       Doing this is what would happen if that vector CTOR
8817 	       were not code-generated yet so it is not too bad.
8818 	       ???  In fact we'd likely want to avoid this situation
8819 	       in the first place.  */
8820 	    if (TREE_CODE (new_tree) == SSA_NAME
8821 		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8822 		&& gimple_code (use_stmt) != GIMPLE_PHI
8823 		&& !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8824 						use_stmt))
8825 	      {
8826 		enum tree_code code = gimple_assign_rhs_code (use_stmt);
8827 		gcc_assert (code == CONSTRUCTOR
8828 			    || code == VIEW_CONVERT_EXPR
8829 			    || CONVERT_EXPR_CODE_P (code));
8830 		if (dump_enabled_p ())
8831 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8832 				   "Using original scalar computation for "
8833 				   "live lane because use precedes vector "
8834 				   "def\n");
8835 		continue;
8836 	      }
8837 	    /* ???  It can also happen that we end up pulling a def into
8838 	       a loop where replacing out-of-loop uses would require
8839 	       a new LC SSA PHI node.  Retain the original scalar in
8840 	       those cases as well.  PR98064.  */
8841 	    if (TREE_CODE (new_tree) == SSA_NAME
8842 		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8843 		&& (gimple_bb (use_stmt)->loop_father
8844 		    != gimple_bb (vec_stmt)->loop_father)
8845 		&& !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8846 					gimple_bb (use_stmt)->loop_father))
8847 	      {
8848 		if (dump_enabled_p ())
8849 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8850 				   "Using original scalar computation for "
8851 				   "live lane because there is an out-of-loop "
8852 				   "definition for it\n");
8853 		continue;
8854 	      }
8855 	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8856 	      SET_USE (use_p, new_tree);
8857 	    update_stmt (use_stmt);
8858 	  }
8859     }
8860 
8861   return true;
8862 }
8863 
8864 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8865 
8866 static void
8867 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8868 {
8869   ssa_op_iter op_iter;
8870   imm_use_iterator imm_iter;
8871   def_operand_p def_p;
8872   gimple *ustmt;
8873 
8874   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8875     {
8876       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8877 	{
8878 	  basic_block bb;
8879 
8880 	  if (!is_gimple_debug (ustmt))
8881 	    continue;
8882 
8883 	  bb = gimple_bb (ustmt);
8884 
8885 	  if (!flow_bb_inside_loop_p (loop, bb))
8886 	    {
8887 	      if (gimple_debug_bind_p (ustmt))
8888 		{
8889 		  if (dump_enabled_p ())
8890 		    dump_printf_loc (MSG_NOTE, vect_location,
8891                                      "killing debug use\n");
8892 
8893 		  gimple_debug_bind_reset_value (ustmt);
8894 		  update_stmt (ustmt);
8895 		}
8896 	      else
8897 		gcc_unreachable ();
8898 	    }
8899 	}
8900     }
8901 }
8902 
8903 /* Given loop represented by LOOP_VINFO, return true if computation of
8904    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8905    otherwise.  */
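/* For example (purely illustrative): if the IV type is 32-bit unsigned and
   NITERSM1 is 0xffffffff, then NITERS == NITERSM1 + 1 wraps around to 0, and
   the widest-int comparison below detects the overflow because NITERSM1 is
   then not smaller than NITERS.  */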
8906 
8907 static bool
8908 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8909 {
8910   /* Constant case.  */
8911   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8912     {
8913       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8914       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8915 
8916       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8917       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8918       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8919 	return true;
8920     }
8921 
8922   widest_int max;
8923   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8924   /* Check the upper bound of loop niters.  */
8925   if (get_max_loop_iterations (loop, &max))
8926     {
8927       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8928       signop sgn = TYPE_SIGN (type);
8929       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8930       if (max < type_max)
8931 	return true;
8932     }
8933   return false;
8934 }
8935 
8936 /* Return a mask type with half the number of elements as OLD_TYPE,
8937    given that it should have mode NEW_MODE.  */
8938 
8939 tree
8940 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8941 {
8942   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8943   return build_truth_vector_type_for_mode (nunits, new_mode);
8944 }
8945 
8946 /* Return a mask type with twice as many elements as OLD_TYPE,
8947    given that it should have mode NEW_MODE.  */
8948 
8949 tree
8950 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8951 {
8952   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8953   return build_truth_vector_type_for_mode (nunits, new_mode);
8954 }
8955 
8956 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8957    contain a sequence of NVECTORS masks that each control a vector of type
8958    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
8959    these vector masks with the vector version of SCALAR_MASK.  */
8960 
8961 void
8962 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8963 		       unsigned int nvectors, tree vectype, tree scalar_mask)
8964 {
8965   gcc_assert (nvectors != 0);
8966   if (masks->length () < nvectors)
8967     masks->safe_grow_cleared (nvectors, true);
8968   rgroup_controls *rgm = &(*masks)[nvectors - 1];
8969   /* The number of scalars per iteration and the number of vectors are
8970      both compile-time constants.  */
8971   unsigned int nscalars_per_iter
8972     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8973 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
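  /* Illustrative example (the numbers are a sketch, not from any specific
     target): with VF == 8 and NVECTORS == 4 masks each controlling a vector
     of 4 elements, nscalars_per_iter == exact_div (4 * 4, 8) == 2, i.e. this
     rgroup controls two scalars per original loop iteration.  */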
8974 
8975   if (scalar_mask)
8976     {
8977       scalar_cond_masked_key cond (scalar_mask, nvectors);
8978       loop_vinfo->scalar_cond_masked_set.add (cond);
8979     }
8980 
8981   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8982     {
8983       rgm->max_nscalars_per_iter = nscalars_per_iter;
8984       rgm->type = truth_type_for (vectype);
8985       rgm->factor = 1;
8986     }
8987 }
8988 
8989 /* Given a complete set of masks MASKS, extract mask number INDEX
8990    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8991    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8992 
8993    See the comment above vec_loop_masks for more details about the mask
8994    arrangement.  */
8995 
8996 tree
8997 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8998 		    unsigned int nvectors, tree vectype, unsigned int index)
8999 {
9000   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9001   tree mask_type = rgm->type;
9002 
9003   /* Populate the rgroup's mask array, if this is the first time we've
9004      used it.  */
9005   if (rgm->controls.is_empty ())
9006     {
9007       rgm->controls.safe_grow_cleared (nvectors, true);
9008       for (unsigned int i = 0; i < nvectors; ++i)
9009 	{
9010 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9011 	  /* Provide a dummy definition until the real one is available.  */
9012 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9013 	  rgm->controls[i] = mask;
9014 	}
9015     }
9016 
9017   tree mask = rgm->controls[index];
9018   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9019 		TYPE_VECTOR_SUBPARTS (vectype)))
9020     {
9021       /* A loop mask for data type X can be reused for data type Y
9022 	 if X has N times more elements than Y and if Y's elements
9023 	 are N times bigger than X's.  In this case each sequence
9024 	 of N elements in the loop mask will be all-zero or all-one.
9025 	 We can then view-convert the mask so that each sequence of
9026 	 N elements is replaced by a single element.  */
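      /* Illustrative sketch (the element counts are examples only): a mask
	 computed for eight HI elements can control a vector of four SI
	 elements by view-converting, say, <1,1,1,1,0,0,0,0> to <1,1,0,0>,
	 because each adjacent pair of mask elements is known to be
	 identical.  */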
9027       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9028 			      TYPE_VECTOR_SUBPARTS (vectype)));
9029       gimple_seq seq = NULL;
9030       mask_type = truth_type_for (vectype);
9031       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9032       if (seq)
9033 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9034     }
9035   return mask;
9036 }
9037 
9038 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9039    lengths for controlling an operation on VECTYPE.  The operation splits
9040    each element of VECTYPE into FACTOR separate subelements, measuring the
9041    length as a number of these subelements.  */
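/* For instance (a sketch based on the VnQI fallback mentioned in the body
   below): if a V4SI access is length-controlled as if it were V16QI, FACTOR
   is 4 and the recorded length counts QI subelements rather than SI
   elements.  */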
9042 
9043 void
9044 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9045 		      unsigned int nvectors, tree vectype, unsigned int factor)
9046 {
9047   gcc_assert (nvectors != 0);
9048   if (lens->length () < nvectors)
9049     lens->safe_grow_cleared (nvectors, true);
9050   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9051 
9052   /* The number of scalars per iteration, the number of bytes each scalar
9053      occupies and the number of vectors are all compile-time constants.  */
9054   unsigned int nscalars_per_iter
9055     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9056 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9057 
9058   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9059     {
9060       /* For now, we only support cases in which all loads and stores fall back
9061 	 to VnQI or none do.  */
9062       gcc_assert (!rgl->max_nscalars_per_iter
9063 		  || (rgl->factor == 1 && factor == 1)
9064 		  || (rgl->max_nscalars_per_iter * rgl->factor
9065 		      == nscalars_per_iter * factor));
9066       rgl->max_nscalars_per_iter = nscalars_per_iter;
9067       rgl->type = vectype;
9068       rgl->factor = factor;
9069     }
9070 }
9071 
9072 /* Given a complete set of length LENS, extract length number INDEX for an
9073    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9074 
9075 tree
9076 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9077 		   unsigned int nvectors, unsigned int index)
9078 {
9079   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9080 
9081   /* Populate the rgroup's len array, if this is the first time we've
9082      used it.  */
9083   if (rgl->controls.is_empty ())
9084     {
9085       rgl->controls.safe_grow_cleared (nvectors, true);
9086       for (unsigned int i = 0; i < nvectors; ++i)
9087 	{
9088 	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9089 	  gcc_assert (len_type != NULL_TREE);
9090 	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9091 
9092 	  /* Provide a dummy definition until the real one is available.  */
9093 	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9094 	  rgl->controls[i] = len;
9095 	}
9096     }
9097 
9098   return rgl->controls[index];
9099 }
9100 
9101 /* Scale profiling counters by estimation for LOOP which is vectorized
9102    by factor VF.  */
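/* For example (illustrative numbers only): if niter_for_unrolled_loop
   estimates that the vector loop iterates 24 times, the body frequencies are
   scaled so that the header count becomes 25 times the preheader count and
   the exit edge gets probability 1/25.  */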
9103 
9104 static void
9105 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9106 {
9107   edge preheader = loop_preheader_edge (loop);
9108   /* Reduce loop iterations by the vectorization factor.  */
9109   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9110   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9111 
9112   if (freq_h.nonzero_p ())
9113     {
9114       profile_probability p;
9115 
9116       /* Avoid dropping loop body profile counter to 0 because of zero count
9117 	 in loop's preheader.  */
9118       if (!(freq_e == profile_count::zero ()))
9119         freq_e = freq_e.force_nonzero ();
9120       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9121       scale_loop_frequencies (loop, p);
9122     }
9123 
9124   edge exit_e = single_exit (loop);
9125   exit_e->probability = profile_probability::always ()
9126 				 .apply_scale (1, new_est_niter + 1);
9127 
9128   edge exit_l = single_pred_edge (loop->latch);
9129   profile_probability prob = exit_l->probability;
9130   exit_l->probability = exit_e->probability.invert ();
9131   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9132     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9133 }
9134 
9135 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9136    latch edge values originally defined by it.  */
9137 
9138 static void
9139 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9140 				     stmt_vec_info def_stmt_info)
9141 {
9142   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9143   if (!def || TREE_CODE (def) != SSA_NAME)
9144     return;
9145   stmt_vec_info phi_info;
9146   imm_use_iterator iter;
9147   use_operand_p use_p;
9148   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9149     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9150       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9151 	  && (phi_info = loop_vinfo->lookup_stmt (phi))
9152 	  && STMT_VINFO_RELEVANT_P (phi_info)
9153 	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9154 	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9155 	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9156 	{
9157 	  loop_p loop = gimple_bb (phi)->loop_father;
9158 	  edge e = loop_latch_edge (loop);
9159 	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9160 	    {
9161 	      vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9162 	      vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9163 	      gcc_assert (phi_defs.length () == latch_defs.length ());
9164 	      for (unsigned i = 0; i < phi_defs.length (); ++i)
9165 		add_phi_arg (as_a <gphi *> (phi_defs[i]),
9166 			     gimple_get_lhs (latch_defs[i]), e,
9167 			     gimple_phi_arg_location (phi, e->dest_idx));
9168 	    }
9169 	}
9170 }
9171 
9172 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9173    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9174    stmt_vec_info.  */
9175 
9176 static bool
9177 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9178 			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9179 {
9180   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9181   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9182 
9183   if (dump_enabled_p ())
9184     dump_printf_loc (MSG_NOTE, vect_location,
9185 		     "------>vectorizing statement: %G", stmt_info->stmt);
9186 
9187   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9188     vect_loop_kill_debug_uses (loop, stmt_info);
9189 
9190   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9191       && !STMT_VINFO_LIVE_P (stmt_info))
9192     return false;
9193 
9194   if (STMT_VINFO_VECTYPE (stmt_info))
9195     {
9196       poly_uint64 nunits
9197 	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9198       if (!STMT_SLP_TYPE (stmt_info)
9199 	  && maybe_ne (nunits, vf)
9200 	  && dump_enabled_p ())
9201 	/* For SLP VF is set according to unrolling factor, and not
9202 	   to vector size, hence for SLP this print is not valid.  */
9203 	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9204     }
9205 
9206   /* Pure SLP statements have already been vectorized.  We still need
9207      to apply loop vectorization to hybrid SLP statements.  */
9208   if (PURE_SLP_STMT (stmt_info))
9209     return false;
9210 
9211   if (dump_enabled_p ())
9212     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9213 
9214   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9215     *seen_store = stmt_info;
9216 
9217   return true;
9218 }
9219 
9220 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9221    in the hash_map with its corresponding values.  */
9222 
9223 static tree
9224 find_in_mapping (tree t, void *context)
9225 {
9226   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9227 
9228   tree *value = mapping->get (t);
9229   return value ? *value : t;
9230 }
9231 
9232 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9233    original loop that has now been vectorized.
9234 
9235    The inits of the data_references need to be advanced with the number of
9236    iterations of the main loop.  This has been computed in vect_do_peeling and
9237    is stored in parameter ADVANCE.  We first restore the data_references
9238    initial offset with the values recorded in ORIG_DRS_INIT.
9239 
9240    Since the loop_vec_info of this EPILOGUE was constructed for the original
9241    loop, its stmt_vec_infos all point to the original statements.  These need
9242    to be updated to point to their corresponding copies as well as the SSA_NAMES
9243    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9244 
9245    The data_reference's connections also need to be updated.  Their
9246    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9247    stmt_vec_infos, their statements need to point to their corresponding copy,
9248    if they are gather loads or scatter stores then their reference needs to be
9249    updated to point to its corresponding copy and finally we set
9250    'base_misaligned' to false as we have already peeled for alignment in the
9251    prologue of the main loop.  */
9252 
9253 static void
9254 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9255 {
9256   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9257   auto_vec<gimple *> stmt_worklist;
9258   hash_map<tree,tree> mapping;
9259   gimple *orig_stmt, *new_stmt;
9260   gimple_stmt_iterator epilogue_gsi;
9261   gphi_iterator epilogue_phi_gsi;
9262   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9263   basic_block *epilogue_bbs = get_loop_body (epilogue);
9264   unsigned i;
9265 
9266   free (LOOP_VINFO_BBS (epilogue_vinfo));
9267   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9268 
9269   /* Advance the data_references by the number of iterations of the previous
9270      loop and its prologue.  */
9271   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9272 
9273 
9274   /* The EPILOGUE loop is a copy of the original loop so they share the same
9275      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9276      point to the copied statements.  We also create a mapping of all LHSs in
9277      the original loop and all the LHSs in the EPILOGUE and create worklists to
9278      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9279   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9280     {
9281       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9282 	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9283 	{
9284 	  new_stmt = epilogue_phi_gsi.phi ();
9285 
9286 	  gcc_assert (gimple_uid (new_stmt) > 0);
9287 	  stmt_vinfo
9288 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9289 
9290 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9291 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9292 
9293 	  mapping.put (gimple_phi_result (orig_stmt),
9294 		       gimple_phi_result (new_stmt));
9295 	  /* PHI nodes can not have patterns or related statements.  */
9296 	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9297 		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9298 	}
9299 
9300       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9301 	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9302 	{
9303 	  new_stmt = gsi_stmt (epilogue_gsi);
9304 	  if (is_gimple_debug (new_stmt))
9305 	    continue;
9306 
9307 	  gcc_assert (gimple_uid (new_stmt) > 0);
9308 	  stmt_vinfo
9309 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9310 
9311 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9312 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9313 
9314 	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
9315 	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9316 
9317 	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9318 	    {
9319 	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9320 	      for (gimple_stmt_iterator gsi = gsi_start (seq);
9321 		   !gsi_end_p (gsi); gsi_next (&gsi))
9322 		stmt_worklist.safe_push (gsi_stmt (gsi));
9323 	    }
9324 
9325 	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9326 	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9327 	    {
9328 	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9329 	      stmt_worklist.safe_push (stmt);
9330 	      /* Set BB such that the assert in
9331 		'get_initial_def_for_reduction' is able to determine that
9332 		the BB of the related stmt is inside this loop.  */
9333 	      gimple_set_bb (stmt,
9334 			     gimple_bb (new_stmt));
9335 	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9336 	      gcc_assert (related_vinfo == NULL
9337 			  || related_vinfo == stmt_vinfo);
9338 	    }
9339 	}
9340     }
9341 
9342   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9343      using the original main loop and thus need to be updated to refer to the
9344      cloned variables used in the epilogue.  */
9345   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9346     {
9347       gimple *stmt = stmt_worklist[i];
9348       tree *new_op;
9349 
9350       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9351 	{
9352 	  tree op = gimple_op (stmt, j);
9353 	  if ((new_op = mapping.get(op)))
9354 	    gimple_set_op (stmt, j, *new_op);
9355 	  else
9356 	    {
9357 	      /* PR92429: The last argument of simplify_replace_tree disables
9358 		 folding when replacing arguments.  This is required as
9359 		 otherwise you might end up with different statements than the
9360 		 ones analyzed in vect_loop_analyze, leading to different
9361 		 vectorization.  */
9362 	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9363 					  &find_in_mapping, &mapping, false);
9364 	      gimple_set_op (stmt, j, op);
9365 	    }
9366 	}
9367     }
9368 
9369   struct data_reference *dr;
9370   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9371   FOR_EACH_VEC_ELT (datarefs, i, dr)
9372     {
9373       orig_stmt = DR_STMT (dr);
9374       gcc_assert (gimple_uid (orig_stmt) > 0);
9375       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9376       /* Data references for gather loads and scatter stores do not use the
9377 	 updated offset we set using ADVANCE.  Instead we have to make sure each
9378 	 reference in the data references points to the corresponding copy of
9379 	 the original in the epilogue.  */
9380       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9381 	  == VMAT_GATHER_SCATTER)
9382 	{
9383 	  DR_REF (dr)
9384 	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9385 				     &find_in_mapping, &mapping);
9386 	  DR_BASE_ADDRESS (dr)
9387 	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9388 				     &find_in_mapping, &mapping);
9389 	}
9390       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9391       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9392       /* The vector size of the epilogue is smaller than that of the main loop,
9393 	 so the alignment requirement is either the same or lower.  This means
9394 	 the DR is by definition aligned.  */
9395       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9396     }
9397 
9398   epilogue_vinfo->shared->datarefs_copy.release ();
9399   epilogue_vinfo->shared->save_datarefs ();
9400 }
9401 
9402 /* Function vect_transform_loop.
9403 
9404    The analysis phase has determined that the loop is vectorizable.
9405    Vectorize the loop - create vectorized stmts to replace the scalar
9406    stmts in the loop, and update the loop exit condition.
9407    Returns the scalar epilogue loop, if any.  */
9408 
9409 class loop *
9410 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9411 {
9412   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9413   class loop *epilogue = NULL;
9414   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9415   int nbbs = loop->num_nodes;
9416   int i;
9417   tree niters_vector = NULL_TREE;
9418   tree step_vector = NULL_TREE;
9419   tree niters_vector_mult_vf = NULL_TREE;
9420   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9421   unsigned int lowest_vf = constant_lower_bound (vf);
9422   gimple *stmt;
9423   bool check_profitability = false;
9424   unsigned int th;
9425 
9426   DUMP_VECT_SCOPE ("vec_transform_loop");
9427 
9428   loop_vinfo->shared->check_datarefs ();
9429 
9430   /* Use the more conservative vectorization threshold.  If the number
9431      of iterations is constant assume the cost check has been performed
9432      by our caller.  If the threshold makes all loops profitable that
9433      run at least the (estimated) vectorization factor number of times
9434      checking is pointless, too.  */
9435   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9436   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9437     {
9438       if (dump_enabled_p ())
9439 	dump_printf_loc (MSG_NOTE, vect_location,
9440 			 "Profitability threshold is %d loop iterations.\n",
9441 			 th);
9442       check_profitability = true;
9443     }
9444 
9445   /* Make sure there exists a single-predecessor exit bb.  Do this before
9446      versioning.   */
9447   edge e = single_exit (loop);
9448   if (! single_pred_p (e->dest))
9449     {
9450       split_loop_exit_edge (e, true);
9451       if (dump_enabled_p ())
9452 	dump_printf (MSG_NOTE, "split exit edge\n");
9453     }
9454 
9455   /* Version the loop first, if required, so the profitability check
9456      comes first.  */
9457 
9458   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9459     {
9460       class loop *sloop
9461 	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9462       sloop->force_vectorize = false;
9463       check_profitability = false;
9464     }
9465 
9466   /* Make sure there exists a single-predecessor exit bb also on the
9467      scalar loop copy.  Do this after versioning but before peeling
9468      so CFG structure is fine for both scalar and if-converted loop
9469      to make slpeel_duplicate_current_defs_from_edges face matched
9470      loop closed PHI nodes on the exit.  */
9471   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9472     {
9473       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9474       if (! single_pred_p (e->dest))
9475 	{
9476 	  split_loop_exit_edge (e, true);
9477 	  if (dump_enabled_p ())
9478 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9479 	}
9480     }
9481 
9482   tree niters = vect_build_loop_niters (loop_vinfo);
9483   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9484   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9485   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9486   tree advance;
9487   drs_init_vec orig_drs_init;
9488 
9489   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9490 			      &step_vector, &niters_vector_mult_vf, th,
9491 			      check_profitability, niters_no_overflow,
9492 			      &advance);
9493 
9494   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9495       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9496     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9497 			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9498 
9499   if (niters_vector == NULL_TREE)
9500     {
9501       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9502 	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9503 	  && known_eq (lowest_vf, vf))
9504 	{
9505 	  niters_vector
9506 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9507 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9508 	  step_vector = build_one_cst (TREE_TYPE (niters));
9509 	}
9510       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9511 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9512 				     &step_vector, niters_no_overflow);
9513       else
9514 	/* vect_do_peeling subtracted the number of peeled prologue
9515 	   iterations from LOOP_VINFO_NITERS.  */
9516 	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9517 				     &niters_vector, &step_vector,
9518 				     niters_no_overflow);
9519     }
9520 
9521   /* 1) Make sure the loop header has exactly two entries
9522      2) Make sure we have a preheader basic block.  */
9523 
9524   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9525 
9526   split_edge (loop_preheader_edge (loop));
9527 
9528   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9529     /* This will deal with any possible peeling.  */
9530     vect_prepare_for_masked_peels (loop_vinfo);
9531 
9532   /* Schedule the SLP instances first, then handle loop vectorization
9533      below.  */
9534   if (!loop_vinfo->slp_instances.is_empty ())
9535     {
9536       DUMP_VECT_SCOPE ("scheduling SLP instances");
9537       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9538     }
9539 
9540   /* FORNOW: the vectorizer supports only loops whose body consists
9541      of one basic block (header + empty latch).  When the vectorizer
9542      supports more involved loop forms, the order in which the BBs are
9543      traversed will need to be reconsidered.  */
9544 
9545   for (i = 0; i < nbbs; i++)
9546     {
9547       basic_block bb = bbs[i];
9548       stmt_vec_info stmt_info;
9549 
9550       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9551 	   gsi_next (&si))
9552 	{
9553 	  gphi *phi = si.phi ();
9554 	  if (dump_enabled_p ())
9555 	    dump_printf_loc (MSG_NOTE, vect_location,
9556 			     "------>vectorizing phi: %G", phi);
9557 	  stmt_info = loop_vinfo->lookup_stmt (phi);
9558 	  if (!stmt_info)
9559 	    continue;
9560 
9561 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9562 	    vect_loop_kill_debug_uses (loop, stmt_info);
9563 
9564 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
9565 	      && !STMT_VINFO_LIVE_P (stmt_info))
9566 	    continue;
9567 
9568 	  if (STMT_VINFO_VECTYPE (stmt_info)
9569 	      && (maybe_ne
9570 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9571 	      && dump_enabled_p ())
9572 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9573 
9574 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9575 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9576 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9577 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9578 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9579 	      && ! PURE_SLP_STMT (stmt_info))
9580 	    {
9581 	      if (dump_enabled_p ())
9582 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9583 	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9584 	    }
9585 	}
9586 
9587       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9588 	   gsi_next (&si))
9589 	{
9590 	  gphi *phi = si.phi ();
9591 	  stmt_info = loop_vinfo->lookup_stmt (phi);
9592 	  if (!stmt_info)
9593 	    continue;
9594 
9595 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
9596 	      && !STMT_VINFO_LIVE_P (stmt_info))
9597 	    continue;
9598 
9599 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9600 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9601 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9602 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9603 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9604 	      && ! PURE_SLP_STMT (stmt_info))
9605 	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9606 	}
9607 
9608       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9609 	   !gsi_end_p (si);)
9610 	{
9611 	  stmt = gsi_stmt (si);
9612 	  /* During vectorization remove existing clobber stmts.  */
9613 	  if (gimple_clobber_p (stmt))
9614 	    {
9615 	      unlink_stmt_vdef (stmt);
9616 	      gsi_remove (&si, true);
9617 	      release_defs (stmt);
9618 	    }
9619 	  else
9620 	    {
9621 	      /* Ignore vector stmts created in the outer loop.  */
9622 	      stmt_info = loop_vinfo->lookup_stmt (stmt);
9623 
9624 	      /* vector stmts created in the outer-loop during vectorization of
9625 		 stmts in an inner-loop may not have a stmt_info, and do not
9626 		 need to be vectorized.  */
9627 	      stmt_vec_info seen_store = NULL;
9628 	      if (stmt_info)
9629 		{
9630 		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9631 		    {
9632 		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9633 		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9634 			   !gsi_end_p (subsi); gsi_next (&subsi))
9635 			{
9636 			  stmt_vec_info pat_stmt_info
9637 			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9638 			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9639 						    &si, &seen_store);
9640 			}
9641 		      stmt_vec_info pat_stmt_info
9642 			= STMT_VINFO_RELATED_STMT (stmt_info);
9643 		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9644 						    &si, &seen_store))
9645 			maybe_set_vectorized_backedge_value (loop_vinfo,
9646 							     pat_stmt_info);
9647 		    }
9648 		  else
9649 		    {
9650 		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9651 						    &seen_store))
9652 			maybe_set_vectorized_backedge_value (loop_vinfo,
9653 							     stmt_info);
9654 		    }
9655 		}
9656 	      gsi_next (&si);
9657 	      if (seen_store)
9658 		{
9659 		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9660 		    /* Interleaving.  The vectorization of the
9661 		       interleaving chain was completed - free all
9662 		       the stores in the chain.  */
9663 		    vect_remove_stores (loop_vinfo,
9664 					DR_GROUP_FIRST_ELEMENT (seen_store));
9665 		  else
9666 		    /* Free the attached stmt_vec_info and remove the stmt.  */
9667 		    loop_vinfo->remove_stmt (stmt_info);
9668 		}
9669 	    }
9670 	}
9671 
9672       /* Stub out scalar statements that must not survive vectorization.
9673 	 Doing this here helps with grouped statements, or statements that
9674 	 are involved in patterns.  */
9675       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9676 	   !gsi_end_p (gsi); gsi_next (&gsi))
9677 	{
9678 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9679 	  if (!call || !gimple_call_internal_p (call))
9680 	    continue;
9681 	  internal_fn ifn = gimple_call_internal_fn (call);
9682 	  if (ifn == IFN_MASK_LOAD)
9683 	    {
9684 	      tree lhs = gimple_get_lhs (call);
9685 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9686 		{
9687 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
9688 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
9689 		  gsi_replace (&gsi, new_stmt, true);
9690 		}
9691 	    }
9692 	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9693 	    {
9694 	      tree lhs = gimple_get_lhs (call);
9695 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9696 		{
9697 		  tree else_arg
9698 		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9699 		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9700 		  gsi_replace (&gsi, new_stmt, true);
9701 		}
9702 	    }
9703 	}
9704     }				/* BBs in loop */
9705 
9706   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9707      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9708   if (integer_onep (step_vector))
9709     niters_no_overflow = true;
9710   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9711 			   niters_vector_mult_vf, !niters_no_overflow);
9712 
9713   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9714   scale_profile_for_vect_loop (loop, assumed_vf);
9715 
9716   /* True if the final iteration might not handle a full vector's
9717      worth of scalar iterations.  */
9718   bool final_iter_may_be_partial
9719     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9720   /* The minimum number of iterations performed by the epilogue.  This
9721      is 1 when peeling for gaps because we always need a final scalar
9722      iteration.  */
9723   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9724   /* +1 to convert latch counts to loop iteration counts,
9725      -min_epilogue_iters to remove iterations that cannot be performed
9726        by the vector code.  */
9727   int bias_for_lowest = 1 - min_epilogue_iters;
9728   int bias_for_assumed = bias_for_lowest;
9729   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9730   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9731     {
9732       /* When the amount of peeling is known at compile time, the first
9733 	 iteration will have exactly alignment_npeels active elements.
9734 	 In the worst case it will have at least one.  */
9735       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9736       bias_for_lowest += lowest_vf - min_first_active;
9737       bias_for_assumed += assumed_vf - min_first_active;
9738     }
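  /* Worked example (numbers purely illustrative): with VF == 4, no epilogue
     peeling (so bias_for_lowest == 1) and a latch-count upper bound of 17,
     i.e. 18 scalar iterations, the vector loop runs
     floor ((17 + 1) / 4) == 4 times, so the new latch-count bound computed
     below is 4 - 1 == 3.  */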
9739   /* In these calculations the "- 1" converts loop iteration counts
9740      back to latch counts.  */
9741   if (loop->any_upper_bound)
9742     loop->nb_iterations_upper_bound
9743       = (final_iter_may_be_partial
9744 	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9745 			  lowest_vf) - 1
9746 	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9747 			   lowest_vf) - 1);
9748   if (loop->any_likely_upper_bound)
9749     loop->nb_iterations_likely_upper_bound
9750       = (final_iter_may_be_partial
9751 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9752 			  + bias_for_lowest, lowest_vf) - 1
9753 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9754 			   + bias_for_lowest, lowest_vf) - 1);
9755   if (loop->any_estimate)
9756     loop->nb_iterations_estimate
9757       = (final_iter_may_be_partial
9758 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9759 			  assumed_vf) - 1
9760 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9761 			   assumed_vf) - 1);
9762 
9763   if (dump_enabled_p ())
9764     {
9765       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9766 	{
9767 	  dump_printf_loc (MSG_NOTE, vect_location,
9768 			   "LOOP VECTORIZED\n");
9769 	  if (loop->inner)
9770 	    dump_printf_loc (MSG_NOTE, vect_location,
9771 			     "OUTER LOOP VECTORIZED\n");
9772 	  dump_printf (MSG_NOTE, "\n");
9773 	}
9774       else
9775 	dump_printf_loc (MSG_NOTE, vect_location,
9776 			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9777 			 GET_MODE_NAME (loop_vinfo->vector_mode));
9778     }
9779 
9780   /* Loops vectorized with a variable factor won't benefit from
9781      unrolling/peeling.  */
9782   if (!vf.is_constant ())
9783     {
9784       loop->unroll = 1;
9785       if (dump_enabled_p ())
9786 	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9787 			 " variable-length vectorization factor\n");
9788     }
9789   /* Free SLP instances here because otherwise stmt reference counting
9790      won't work.  */
9791   slp_instance instance;
9792   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9793     vect_free_slp_instance (instance);
9794   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9795   /* Clear the safelen field since its value is invalid after vectorization:
9796      the vectorized loop can have loop-carried dependences.  */
9797   loop->safelen = 0;
9798 
9799   if (epilogue)
9800     {
9801       update_epilogue_loop_vinfo (epilogue, advance);
9802 
9803       epilogue->simduid = loop->simduid;
9804       epilogue->force_vectorize = loop->force_vectorize;
9805       epilogue->dont_vectorize = false;
9806     }
9807 
9808   return epilogue;
9809 }
9810 
9811 /* The code below performs a simple optimization - it reverts if-conversion
9812    for masked stores: if the mask of a store is zero, the store is not
9813    performed, and, where possible, neither are the producers of the stored
9814    values.  For example,
9815      for (i=0; i<n; i++)
9816        if (c[i])
9817 	{
9818 	  p1[i] += 1;
9819 	  p2[i] = p3[i] +2;
9820 	}
9821    this transformation will produce the following semi-hammock:
9822 
9823    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9824      {
9825        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9826        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9827        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9828        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9829        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9830        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9831      }
9832 */
9833 
9834 void
9835 optimize_mask_stores (class loop *loop)
9836 {
9837   basic_block *bbs = get_loop_body (loop);
9838   unsigned nbbs = loop->num_nodes;
9839   unsigned i;
9840   basic_block bb;
9841   class loop *bb_loop;
9842   gimple_stmt_iterator gsi;
9843   gimple *stmt;
9844   auto_vec<gimple *> worklist;
9845   auto_purge_vect_location sentinel;
9846 
9847   vect_location = find_loop_location (loop);
9848   /* Pick up all masked stores in loop if any.  */
9849   for (i = 0; i < nbbs; i++)
9850     {
9851       bb = bbs[i];
9852       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9853 	   gsi_next (&gsi))
9854 	{
9855 	  stmt = gsi_stmt (gsi);
9856 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9857 	    worklist.safe_push (stmt);
9858 	}
9859     }
9860 
9861   free (bbs);
9862   if (worklist.is_empty ())
9863     return;
9864 
9865   /* Loop has masked stores.  */
9866   while (!worklist.is_empty ())
9867     {
9868       gimple *last, *last_store;
9869       edge e, efalse;
9870       tree mask;
9871       basic_block store_bb, join_bb;
9872       gimple_stmt_iterator gsi_to;
9873       tree vdef, new_vdef;
9874       gphi *phi;
9875       tree vectype;
9876       tree zero;
9877 
9878       last = worklist.pop ();
9879       mask = gimple_call_arg (last, 2);
9880       bb = gimple_bb (last);
9881       /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9882 	 to the same loop as if_bb.  That loop can differ from LOOP when a
9883 	 two-level loop nest is vectorized and the mask_store belongs to the
9884 	 inner loop.  */
9885       e = split_block (bb, last);
9886       bb_loop = bb->loop_father;
9887       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9888       join_bb = e->dest;
9889       store_bb = create_empty_bb (bb);
9890       add_bb_to_loop (store_bb, bb_loop);
9891       e->flags = EDGE_TRUE_VALUE;
9892       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9893       /* Put STORE_BB on the unlikely path.  */
9894       efalse->probability = profile_probability::unlikely ();
9895       store_bb->count = efalse->count ();
9896       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9897       if (dom_info_available_p (CDI_DOMINATORS))
9898 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9899       if (dump_enabled_p ())
9900 	dump_printf_loc (MSG_NOTE, vect_location,
9901 			 "Create new block %d to sink mask stores.",
9902 			 store_bb->index);
9903       /* Create vector comparison with boolean result.  */
9904       vectype = TREE_TYPE (mask);
9905       zero = build_zero_cst (vectype);
9906       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9907       gsi = gsi_last_bb (bb);
9908       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
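      /* Since E (to JOIN_BB) carries EDGE_TRUE_VALUE and EFALSE leads to
	 STORE_BB, this comparison makes execution skip STORE_BB exactly when
	 the mask is all-zero.  */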
9909       /* Create new PHI node for vdef of the last masked store:
9910 	 .MEM_2 = VDEF <.MEM_1>
9911 	 will be converted to
9912 	 .MEM_3 = VDEF <.MEM_1>
9913 	 and new PHI node will be created in join bb
9914 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
9915       */
9916       vdef = gimple_vdef (last);
9917       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9918       gimple_set_vdef (last, new_vdef);
9919       phi = create_phi_node (vdef, join_bb);
9920       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
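      /* The PHI argument for edge E (the path taken when the mask is
	 all-zero) is added once the stores sharing this mask have been sunk;
	 see the add_phi_arg call after the inner loop below.  */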
9921 
9922       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9923       while (true)
9924 	{
9925 	  gimple_stmt_iterator gsi_from;
9926 	  gimple *stmt1 = NULL;
9927 
9928 	  /* Move masked store to STORE_BB.  */
9929 	  last_store = last;
9930 	  gsi = gsi_for_stmt (last);
9931 	  gsi_from = gsi;
9932 	  /* Shift GSI to the previous stmt for further traversal.  */
9933 	  gsi_prev (&gsi);
9934 	  gsi_to = gsi_start_bb (store_bb);
9935 	  gsi_move_before (&gsi_from, &gsi_to);
9936 	  /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9937 	  gsi_to = gsi_start_bb (store_bb);
9938 	  if (dump_enabled_p ())
9939 	    dump_printf_loc (MSG_NOTE, vect_location,
9940 			     "Move stmt to created bb\n%G", last);
9941 	  /* Move all stored value producers if possible.  */
9942 	  while (!gsi_end_p (gsi))
9943 	    {
9944 	      tree lhs;
9945 	      imm_use_iterator imm_iter;
9946 	      use_operand_p use_p;
9947 	      bool res;
9948 
9949 	      /* Skip debug statements.  */
9950 	      if (is_gimple_debug (gsi_stmt (gsi)))
9951 		{
9952 		  gsi_prev (&gsi);
9953 		  continue;
9954 		}
9955 	      stmt1 = gsi_stmt (gsi);
9956 	      /* Do not consider statements writing to memory or having
9957 		 a volatile operand.  */
9958 	      if (gimple_vdef (stmt1)
9959 		  || gimple_has_volatile_ops (stmt1))
9960 		break;
9961 	      gsi_from = gsi;
9962 	      gsi_prev (&gsi);
9963 	      lhs = gimple_get_lhs (stmt1);
9964 	      if (!lhs)
9965 		break;
9966 
9967 	      /* LHS of vectorized stmt must be SSA_NAME.  */
9968 	      if (TREE_CODE (lhs) != SSA_NAME)
9969 		break;
9970 
9971 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9972 		{
9973 		  /* Remove dead scalar statement.  */
9974 		  if (has_zero_uses (lhs))
9975 		    {
9976 		      gsi_remove (&gsi_from, true);
9977 		      continue;
9978 		    }
9979 		}
9980 
9981 	      /* Check that LHS does not have uses outside of STORE_BB.  */
9982 	      res = true;
9983 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9984 		{
9985 		  gimple *use_stmt;
9986 		  use_stmt = USE_STMT (use_p);
9987 		  if (is_gimple_debug (use_stmt))
9988 		    continue;
9989 		  if (gimple_bb (use_stmt) != store_bb)
9990 		    {
9991 		      res = false;
9992 		      break;
9993 		    }
9994 		}
9995 	      if (!res)
9996 		break;
9997 
9998 	      if (gimple_vuse (stmt1)
9999 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
10000 		break;
10001 
10002 	      /* Can move STMT1 to STORE_BB.  */
10003 	      if (dump_enabled_p ())
10004 		dump_printf_loc (MSG_NOTE, vect_location,
10005 				 "Move stmt to created bb\n%G", stmt1);
10006 	      gsi_move_before (&gsi_from, &gsi_to);
10007 	      /* Shift GSI_TO for further insertion.  */
10008 	      gsi_prev (&gsi_to);
10009 	    }
10010 	  /* Put other masked stores with the same mask to STORE_BB.  */
10011 	  if (worklist.is_empty ()
10012 	      || gimple_call_arg (worklist.last (), 2) != mask
10013 	      || worklist.last () != stmt1)
10014 	    break;
10015 	  last = worklist.pop ();
10016 	}
10017       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10018     }
10019 }
10020 
10021 /* Decide whether it is possible to use a zero-based induction variable
10022    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10023    the value that the induction variable must be able to hold in order
10024    to ensure that the rgroups eventually have no active vector elements.
10025    Return -1 otherwise.  */
10026 
10027 widest_int
10028 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10029 {
10030   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10031   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10032   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10033 
10034   /* Calculate the value that the induction variable must be able
10035      to hit in order to ensure that we end the loop with an all-false mask.
10036      This involves adding the maximum number of inactive trailing scalar
10037      iterations.  */
10038   widest_int iv_limit = -1;
10039   if (max_loop_iterations (loop, &iv_limit))
10040     {
10041       if (niters_skip)
10042 	{
10043 	  /* Add the maximum number of skipped iterations to the
10044 	     maximum iteration count.  */
10045 	  if (TREE_CODE (niters_skip) == INTEGER_CST)
10046 	    iv_limit += wi::to_widest (niters_skip);
10047 	  else
10048 	    iv_limit += max_vf - 1;
10049 	}
10050       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10051 	/* The exact count is not known here, so make a conservatively-correct
	   assumption, as above.  */
10052 	iv_limit += max_vf - 1;
10053 
10054       /* IV_LIMIT is the maximum number of latch iterations, which is also
10055 	 the maximum in-range IV value.  Round this value down to the previous
10056 	 vector alignment boundary and then add an extra full iteration.  */
10057       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10058       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
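      /* Purely illustrative example: with a constant VF of 4 (so MAX_VF is
	 also 4), no skipped iterations and a maximum latch iteration count
	 of 10, this computes (10 & -4) + 4 = 12, so the IV must be able to
	 hold the value 12.  */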
10059     }
10060   return iv_limit;
10061 }
10062 
10063 /* For the given rgroup_controls RGC, check whether an induction variable
10064    would ever hit a value that produces a set of all-false masks or zero
10065    lengths before wrapping around.  Return true if it's possible to wrap
10066    around before hitting the desired value, otherwise return false.  */
10067 
10068 bool
10069 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10070 {
10071   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10072 
10073   if (iv_limit == -1)
10074     return true;
10075 
10076   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10077   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10078   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10079 
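  /* Purely illustrative example: iv_limit = 12 and nitems = 4 mean the IV
     must be able to represent 12 * 4 = 48, which needs 6 bits, so any
     compare type with a precision of at least 6 bits cannot wrap here.  */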
10080   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10081     return true;
10082 
10083   return false;
10084 }
10085