1 /* Loop Vectorization
2    Copyright (C) 2003-2020 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56 
57 /* Loop Vectorization Pass.
58 
59    This pass tries to vectorize loops.
60 
61    For example, the vectorizer transforms the following simple loop:
62 
63         short a[N]; short b[N]; short c[N]; int i;
64 
65         for (i=0; i<N; i++){
66           a[i] = b[i] + c[i];
67         }
68 
69    as if it had been manually vectorized by rewriting the source code into:
70 
71         typedef int __attribute__((mode(V8HI))) v8hi;
72         short a[N];  short b[N]; short c[N];   int i;
73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74         v8hi va, vb, vc;
75 
76         for (i=0; i<N/8; i++){
77           vb = pb[i];
78           vc = pc[i];
79           va = vb + vc;
80           pa[i] = va;
81         }
82 
83         The main entry to this pass is vectorize_loops(), in which
84    the vectorizer applies a set of analyses on a given set of loops,
85    followed by the actual vectorization transformation for the loops that
86    had successfully passed the analysis phase.
87         Throughout this pass we make a distinction between two types of
88    data: scalars (which are represented by SSA_NAMES), and memory references
89    ("data-refs").  These two types of data require different handling both
90    during analysis and transformation. The types of data-refs that the
91    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93    accesses are required to have a simple (consecutive) access pattern.
94 
95    Analysis phase:
96    ===============
97         The driver for the analysis phase is vect_analyze_loop().
98    It applies a set of analyses, some of which rely on the scalar evolution
99    analyzer (scev) developed by Sebastian Pop.
100 
101         During the analysis phase the vectorizer records some information
102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103    loop, as well as general information about the loop as a whole, which is
104    recorded in a "loop_vec_info" struct attached to each loop.
105 
106    Transformation phase:
107    =====================
108         The loop transformation phase scans all the stmts in the loop, and
109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110    the loop that needs to be vectorized.  It inserts the vector code sequence
111    just before the scalar stmt S, and records a pointer to the vector code
112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113    attached to S).  This pointer will be used for the vectorization of following
114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115    otherwise, we rely on dead code elimination for removing it.
116 
117         For example, say stmt S1 was vectorized into stmt VS1:
118 
119    VS1: vb = px[i];
120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121    S2:  a = b;
122 
123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
126    resulting sequence would be:
127 
128    VS1: vb = px[i];
129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130    VS2: va = vb;
131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132 
133         Operands that are not SSA_NAMEs are data-refs that appear in
134    load/store operations (like 'x[i]' in S1), and are handled differently.
135 
136    Target modeling:
137    =================
138         Currently the only target specific information that is used is the
139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140    Targets that can support different vector sizes will, for now, need to
141    specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
142    flexibility will be added in the future.
143 
144         Since we only vectorize operations whose vector form can be
145    expressed using existing tree codes, to verify that an operation is
146    supported, the vectorizer checks the relevant optab at the relevant
147    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
148    the value found is CODE_FOR_nothing, then there's no target support, and
149    we can't vectorize the stmt.
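
   For example (an illustrative sketch, not code quoted from this pass),
   checking whether the target can add two V8HI vectors boils down to
   something like:

      if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
        ... no target support, the stmt cannot be vectorized ...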
150 
151    For additional information on this project see:
152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
154 
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 					       bool *, bool *);
158 
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161    may already be set for general statements (not just data refs).  */
162 
163 static opt_result
164 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
165 			      bool vectype_maybe_set_p,
166 			      poly_uint64 *vf)
167 {
168   gimple *stmt = stmt_info->stmt;
169 
170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171        && !STMT_VINFO_LIVE_P (stmt_info))
172       || gimple_clobber_p (stmt))
173     {
174       if (dump_enabled_p ())
175 	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176       return opt_result::success ();
177     }
178 
179   tree stmt_vectype, nunits_vectype;
180   opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 						   &nunits_vectype);
182   if (!res)
183     return res;
184 
185   if (stmt_vectype)
186     {
187       if (STMT_VINFO_VECTYPE (stmt_info))
188 	/* The only case when a vectype had been already set is for stmts
189 	   that contain a data ref, or for "pattern-stmts" (stmts generated
190 	   by the vectorizer to represent/replace a certain idiom).  */
191 	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 		     || vectype_maybe_set_p)
193 		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194       else
195 	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
196     }
197 
198   if (nunits_vectype)
199     vect_update_max_nunits (vf, nunits_vectype);
200 
201   return opt_result::success ();
202 }
203 
204 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
205    types of STMT_INFO and all attached pattern statements and update
206    the vectorization factor VF accordingly.  Return true on success
207    or false if something prevented vectorization.  */
208 
209 static opt_result
210 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
211 {
212   vec_info *vinfo = stmt_info->vinfo;
213   if (dump_enabled_p ())
214     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
215 		     stmt_info->stmt);
216   opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
217   if (!res)
218     return res;
219 
220   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
221       && STMT_VINFO_RELATED_STMT (stmt_info))
222     {
223       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
224       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
225 
226       /* If a pattern statement has def stmts, analyze them too.  */
227       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
228 	   !gsi_end_p (si); gsi_next (&si))
229 	{
230 	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
231 	  if (dump_enabled_p ())
232 	    dump_printf_loc (MSG_NOTE, vect_location,
233 			     "==> examining pattern def stmt: %G",
234 			     def_stmt_info->stmt);
235 	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
236 	  if (!res)
237 	    return res;
238 	}
239 
240       if (dump_enabled_p ())
241 	dump_printf_loc (MSG_NOTE, vect_location,
242 			 "==> examining pattern statement: %G",
243 			 stmt_info->stmt);
244       res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
245       if (!res)
246 	return res;
247     }
248 
249   return opt_result::success ();
250 }
251 
252 /* Function vect_determine_vectorization_factor
253 
254    Determine the vectorization factor (VF).  VF is the number of data elements
255    that are operated upon in parallel in a single iteration of the vectorized
256    loop.  For example, when vectorizing a loop that operates on 4-byte elements
257    on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
258    elements can fit in a single vector register.
259 
260    We currently support vectorization of loops in which all types operated upon
261    are of the same size.  Therefore this function currently sets VF according to
262    the size of the types operated upon, and fails if there are multiple sizes
263    in the loop.
264 
265    VF is also the factor by which the loop iterations are strip-mined, e.g.:
266    original loop:
267         for (i=0; i<N; i++){
268           a[i] = b[i] + c[i];
269         }
270 
271    vectorized loop:
272         for (i=0; i<N; i+=VF){
273           a[i:VF] = b[i:VF] + c[i:VF];
274         }
275 */
276 
277 static opt_result
278 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
279 {
280   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
281   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
282   unsigned nbbs = loop->num_nodes;
283   poly_uint64 vectorization_factor = 1;
284   tree scalar_type = NULL_TREE;
285   gphi *phi;
286   tree vectype;
287   stmt_vec_info stmt_info;
288   unsigned i;
289 
290   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
291 
292   for (i = 0; i < nbbs; i++)
293     {
294       basic_block bb = bbs[i];
295 
296       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
297 	   gsi_next (&si))
298 	{
299 	  phi = si.phi ();
300 	  stmt_info = loop_vinfo->lookup_stmt (phi);
301 	  if (dump_enabled_p ())
302 	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
303 			     phi);
304 
305 	  gcc_assert (stmt_info);
306 
307 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
308 	      || STMT_VINFO_LIVE_P (stmt_info))
309             {
310 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
311               scalar_type = TREE_TYPE (PHI_RESULT (phi));
312 
313 	      if (dump_enabled_p ())
314 		dump_printf_loc (MSG_NOTE, vect_location,
315 				 "get vectype for scalar type:  %T\n",
316 				 scalar_type);
317 
318 	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
319 	      if (!vectype)
320 		return opt_result::failure_at (phi,
321 					       "not vectorized: unsupported "
322 					       "data-type %T\n",
323 					       scalar_type);
324 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
325 
326 	      if (dump_enabled_p ())
327 		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
328 				 vectype);
329 
330 	      if (dump_enabled_p ())
331 		{
332 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
333 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
334 		  dump_printf (MSG_NOTE, "\n");
335 		}
336 
337 	      vect_update_max_nunits (&vectorization_factor, vectype);
338 	    }
339 	}
340 
341       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
342 	   gsi_next (&si))
343 	{
344 	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
345 	  opt_result res
346 	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
347 	  if (!res)
348 	    return res;
349         }
350     }
351 
352   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
353   if (dump_enabled_p ())
354     {
355       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
356       dump_dec (MSG_NOTE, vectorization_factor);
357       dump_printf (MSG_NOTE, "\n");
358     }
359 
360   if (known_le (vectorization_factor, 1U))
361     return opt_result::failure_at (vect_location,
362 				   "not vectorized: unsupported data-type\n");
363   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
364   return opt_result::success ();
365 }
366 
367 
368 /* Function vect_is_simple_iv_evolution.
369 
370    FORNOW: A simple evolution of an induction variable in the loop is
371    considered a polynomial evolution.  */
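
/* For example (an illustrative sketch): for a simple counted loop

       for (i = 0; i < n; i++)
         a[i] = b[i];

   the access function of 'i' computed by scev is the chrec {0, +, 1}_1,
   whose evolution part in the loop is the INTEGER_CST 1 and whose initial
   condition is 0, so the evolution is "simple".  An evolution part that is
   itself a chrec (a polynomial of degree >= 2) is rejected below.  */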
372 
373 static bool
374 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
375                              tree * step)
376 {
377   tree init_expr;
378   tree step_expr;
379   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
380   basic_block bb;
381 
382   /* When there is no evolution in this loop, the evolution function
383      is not "simple".  */
384   if (evolution_part == NULL_TREE)
385     return false;
386 
387   /* When the evolution is a polynomial of degree >= 2
388      the evolution function is not "simple".  */
389   if (tree_is_chrec (evolution_part))
390     return false;
391 
392   step_expr = evolution_part;
393   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
394 
395   if (dump_enabled_p ())
396     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
397 		     step_expr, init_expr);
398 
399   *init = init_expr;
400   *step = step_expr;
401 
402   if (TREE_CODE (step_expr) != INTEGER_CST
403       && (TREE_CODE (step_expr) != SSA_NAME
404 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
405 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
406 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
407 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
408 		  || !flag_associative_math)))
409       && (TREE_CODE (step_expr) != REAL_CST
410 	  || !flag_associative_math))
411     {
412       if (dump_enabled_p ())
413         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414                          "step unknown.\n");
415       return false;
416     }
417 
418   return true;
419 }
420 
421 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
422    what we are assuming is a double reduction.  For example, given
423    a structure like this:
424 
425       outer1:
426 	x_1 = PHI <x_4(outer2), ...>;
427 	...
428 
429       inner:
430 	x_2 = PHI <x_1(outer1), ...>;
431 	...
432 	x_3 = ...;
433 	...
434 
435       outer2:
436 	x_4 = PHI <x_3(inner)>;
437 	...
438 
439    outer loop analysis would treat x_1 as a double reduction phi and
440    this function would then return true for x_2.  */
441 
442 static bool
443 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
444 {
445   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
446   use_operand_p use_p;
447   ssa_op_iter op_iter;
448   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
449     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
450       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
451 	return true;
452   return false;
453 }
454 
455 /* Function vect_analyze_scalar_cycles_1.
456 
457    Examine the cross iteration def-use cycles of scalar variables
458    in LOOP.  LOOP_VINFO represents the loop that is now being
459    considered for vectorization (can be LOOP, or an outer-loop
460    enclosing LOOP).  */
461 
462 static void
463 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
464 {
465   basic_block bb = loop->header;
466   tree init, step;
467   auto_vec<stmt_vec_info, 64> worklist;
468   gphi_iterator gsi;
469   bool double_reduc, reduc_chain;
470 
471   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
472 
473   /* First - identify all inductions.  Reduction detection assumes that all the
474      inductions have been identified; therefore, this order must not be
475      changed.  */
476   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
477     {
478       gphi *phi = gsi.phi ();
479       tree access_fn = NULL;
480       tree def = PHI_RESULT (phi);
481       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
482 
483       if (dump_enabled_p ())
484 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
485 
486       /* Skip virtual PHIs.  The data dependences that are associated with
487          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
488       if (virtual_operand_p (def))
489 	continue;
490 
491       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
492 
493       /* Analyze the evolution function.  */
494       access_fn = analyze_scalar_evolution (loop, def);
495       if (access_fn)
496 	{
497 	  STRIP_NOPS (access_fn);
498 	  if (dump_enabled_p ())
499 	    dump_printf_loc (MSG_NOTE, vect_location,
500 			     "Access function of PHI: %T\n", access_fn);
501 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
502 	    = initial_condition_in_loop_num (access_fn, loop->num);
503 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
504 	    = evolution_part_in_loop_num (access_fn, loop->num);
505 	}
506 
507       if (!access_fn
508 	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
509 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
510 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
511 	      && TREE_CODE (step) != INTEGER_CST))
512 	{
513 	  worklist.safe_push (stmt_vinfo);
514 	  continue;
515 	}
516 
517       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
518 		  != NULL_TREE);
519       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
520 
521       if (dump_enabled_p ())
522 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
523       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
524     }
525 
526 
527   /* Second - identify all reductions and nested cycles.  */
528   while (worklist.length () > 0)
529     {
530       stmt_vec_info stmt_vinfo = worklist.pop ();
531       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
532       tree def = PHI_RESULT (phi);
533 
534       if (dump_enabled_p ())
535 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
536 
537       gcc_assert (!virtual_operand_p (def)
538 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
539 
540       stmt_vec_info reduc_stmt_info
541 	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
542 				    &reduc_chain);
543       if (reduc_stmt_info)
544         {
545 	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
546 	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
547 	  if (double_reduc)
548 	    {
549 	      if (dump_enabled_p ())
550 		dump_printf_loc (MSG_NOTE, vect_location,
551 				 "Detected double reduction.\n");
552 
553               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
554 	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
555             }
556           else
557             {
558               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
559                 {
560                   if (dump_enabled_p ())
561                     dump_printf_loc (MSG_NOTE, vect_location,
562 				     "Detected vectorizable nested cycle.\n");
563 
564                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
565                 }
566               else
567                 {
568                   if (dump_enabled_p ())
569                     dump_printf_loc (MSG_NOTE, vect_location,
570 				     "Detected reduction.\n");
571 
572                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
573 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
574                   /* Store the reduction cycles for possible vectorization in
575                      loop-aware SLP if it was not detected as reduction
576 		     chain.  */
577 		  if (! reduc_chain)
578 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
579 		      (reduc_stmt_info);
580                 }
581             }
582         }
583       else
584         if (dump_enabled_p ())
585           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
586 			   "Unknown def-use cycle pattern.\n");
587     }
588 }
589 
590 
591 /* Function vect_analyze_scalar_cycles.
592 
593    Examine the cross iteration def-use cycles of scalar variables, by
594    analyzing the loop-header PHIs of scalar variables.  Classify each
595    cycle as one of the following: invariant, induction, reduction, unknown.
596    We do that for the loop represented by LOOP_VINFO, and also for its
597    inner loop, if one exists.
598    Examples for scalar cycles:
599 
600    Example1: reduction:
601 
602               loop1:
603               for (i=0; i<N; i++)
604                  sum += a[i];
605 
606    Example2: induction:
607 
608               loop2:
609               for (i=0; i<N; i++)
610                  a[i] = i;  */
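
/* In GIMPLE form, the cross-iteration cycle of the reduction in Example1
   looks roughly like this (an illustrative sketch):

      loop1 header:
        sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>;
        ...
        _3 = a[i_4];
        sum_2 = sum_1 + _3;

   It is this loop-header PHI cycle through sum_1 and sum_2 that
   vect_analyze_scalar_cycles_1 classifies as a reduction.  */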
611 
612 static void
613 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
614 {
615   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
616 
617   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
618 
619   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
620      Reductions in such inner-loop therefore have different properties than
621      the reductions in the nest that gets vectorized:
622      1. When vectorized, they are executed in the same order as in the original
623         scalar loop, so we can't change the order of computation when
624         vectorizing them.
625      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
626         current checks are too strict.  */
627 
628   if (loop->inner)
629     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
630 }
631 
632 /* Transfer group and reduction information from STMT_INFO to its
633    pattern stmt.  */
634 
635 static void
636 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
637 {
638   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
639   stmt_vec_info stmtp;
640   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
641 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
642   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
643   do
644     {
645       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
646       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
647 			   == STMT_VINFO_DEF_TYPE (stmt_info));
648       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
649       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
650       if (stmt_info)
651 	REDUC_GROUP_NEXT_ELEMENT (stmtp)
652 	  = STMT_VINFO_RELATED_STMT (stmt_info);
653     }
654   while (stmt_info);
655 }
656 
657 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
658 
659 static void
660 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
661 {
662   stmt_vec_info first;
663   unsigned i;
664 
665   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
666     if (STMT_VINFO_IN_PATTERN_P (first))
667       {
668 	stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
669 	while (next)
670 	  {
671 	    if (! STMT_VINFO_IN_PATTERN_P (next)
672 		|| STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
673 	      break;
674 	    next = REDUC_GROUP_NEXT_ELEMENT (next);
675 	  }
676 	/* If not all stmts in the chain are patterns, or if we failed
677 	   to update STMT_VINFO_REDUC_IDX, try to handle the chain
678 	   without patterns.  */
679 	if (! next
680 	    && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
681 	  {
682 	    vect_fixup_reduc_chain (first);
683 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
684 	      = STMT_VINFO_RELATED_STMT (first);
685 	  }
686       }
687 }
688 
689 /* Function vect_get_loop_niters.
690 
691    Determine the number of iterations the loop executes and place it
692    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
693    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
694    niter information holds in ASSUMPTIONS.
695 
696    Return the loop exit condition.  */
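
/* For example (illustrative): a loop whose body executes four times takes
   its latch edge three times, so NUMBER_OF_ITERATIONSM1 is 3 while
   NUMBER_OF_ITERATIONS, the number of loop header executions, is 4.  */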
697 
698 
699 static gcond *
700 vect_get_loop_niters (class loop *loop, tree *assumptions,
701 		      tree *number_of_iterations, tree *number_of_iterationsm1)
702 {
703   edge exit = single_exit (loop);
704   class tree_niter_desc niter_desc;
705   tree niter_assumptions, niter, may_be_zero;
706   gcond *cond = get_loop_exit_condition (loop);
707 
708   *assumptions = boolean_true_node;
709   *number_of_iterationsm1 = chrec_dont_know;
710   *number_of_iterations = chrec_dont_know;
711   DUMP_VECT_SCOPE ("get_loop_niters");
712 
713   if (!exit)
714     return cond;
715 
716   may_be_zero = NULL_TREE;
717   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
718       || chrec_contains_undetermined (niter_desc.niter))
719     return cond;
720 
721   niter_assumptions = niter_desc.assumptions;
722   may_be_zero = niter_desc.may_be_zero;
723   niter = niter_desc.niter;
724 
725   if (may_be_zero && integer_zerop (may_be_zero))
726     may_be_zero = NULL_TREE;
727 
728   if (may_be_zero)
729     {
730       if (COMPARISON_CLASS_P (may_be_zero))
731 	{
732 	  /* Try to combine may_be_zero with assumptions, this can simplify
733 	     computation of niter expression.  */
734 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
735 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
736 					     niter_assumptions,
737 					     fold_build1 (TRUTH_NOT_EXPR,
738 							  boolean_type_node,
739 							  may_be_zero));
740 	  else
741 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
742 				 build_int_cst (TREE_TYPE (niter), 0),
743 				 rewrite_to_non_trapping_overflow (niter));
744 
745 	  may_be_zero = NULL_TREE;
746 	}
747       else if (integer_nonzerop (may_be_zero))
748 	{
749 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
750 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
751 	  return cond;
752 	}
753       else
754 	return cond;
755     }
756 
757   *assumptions = niter_assumptions;
758   *number_of_iterationsm1 = niter;
759 
760   /* We want the number of loop header executions which is the number
761      of latch executions plus one.
762      ???  For UINT_MAX latch executions this number overflows to zero
763      for loops like do { n++; } while (n != 0);  */
764   if (niter && !chrec_contains_undetermined (niter))
765     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
766 			  build_int_cst (TREE_TYPE (niter), 1));
767   *number_of_iterations = niter;
768 
769   return cond;
770 }
771 
772 /* Function bb_in_loop_p
773 
774    Used as predicate for dfs order traversal of the loop bbs.  */
775 
776 static bool
777 bb_in_loop_p (const_basic_block bb, const void *data)
778 {
779   const class loop *const loop = (const class loop *)data;
780   if (flow_bb_inside_loop_p (loop, bb))
781     return true;
782   return false;
783 }
784 
785 
786 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
787    stmt_vec_info structs for all the stmts in LOOP_IN.  */
788 
789 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
790   : vec_info (vec_info::loop, init_cost (loop_in), shared),
791     loop (loop_in),
792     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
793     num_itersm1 (NULL_TREE),
794     num_iters (NULL_TREE),
795     num_iters_unchanged (NULL_TREE),
796     num_iters_assumptions (NULL_TREE),
797     th (0),
798     versioning_threshold (0),
799     vectorization_factor (0),
800     max_vectorization_factor (0),
801     mask_skip_niters (NULL_TREE),
802     mask_compare_type (NULL_TREE),
803     simd_if_cond (NULL_TREE),
804     unaligned_dr (NULL),
805     peeling_for_alignment (0),
806     ptr_mask (0),
807     ivexpr_map (NULL),
808     scan_map (NULL),
809     slp_unrolling_factor (1),
810     single_scalar_iteration_cost (0),
811     vec_outside_cost (0),
812     vec_inside_cost (0),
813     vectorizable (false),
814     can_fully_mask_p (true),
815     fully_masked_p (false),
816     peeling_for_gaps (false),
817     peeling_for_niter (false),
818     no_data_dependencies (false),
819     has_mask_store (false),
820     scalar_loop_scaling (profile_probability::uninitialized ()),
821     scalar_loop (NULL),
822     orig_loop_info (NULL)
823 {
824   /* CHECKME: We want to visit all BBs before their successors (except for
825      latch blocks, for which this assertion wouldn't hold).  In the simple
826      case of the loop forms we allow, a dfs order of the BBs would be the same
827      as reversed postorder traversal, so we are safe.  */
828 
829   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
830 					  bbs, loop->num_nodes, loop);
831   gcc_assert (nbbs == loop->num_nodes);
832 
833   for (unsigned int i = 0; i < nbbs; i++)
834     {
835       basic_block bb = bbs[i];
836       gimple_stmt_iterator si;
837 
838       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
839 	{
840 	  gimple *phi = gsi_stmt (si);
841 	  gimple_set_uid (phi, 0);
842 	  add_stmt (phi);
843 	}
844 
845       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
846 	{
847 	  gimple *stmt = gsi_stmt (si);
848 	  gimple_set_uid (stmt, 0);
849 	  add_stmt (stmt);
850 	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
851 	     third argument is the #pragma omp simd if (x) condition.  When it is 0,
852 	     the loop should not be vectorized; when it is a non-zero constant, it
853 	     should be vectorized normally; otherwise the loop is versioned, with the
854 	     vectorized copy used only if the condition is non-zero at runtime.  */
855 	  if (loop_in->simduid
856 	      && is_gimple_call (stmt)
857 	      && gimple_call_internal_p (stmt)
858 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
859 	      && gimple_call_num_args (stmt) >= 3
860 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
861 	      && (loop_in->simduid
862 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
863 	    {
864 	      tree arg = gimple_call_arg (stmt, 2);
865 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
866 		simd_if_cond = arg;
867 	      else
868 		gcc_assert (integer_nonzerop (arg));
869 	    }
870 	}
871     }
872 
873   epilogue_vinfos.create (6);
874 }
875 
876 /* Free all levels of MASKS.  */
877 
878 void
879 release_vec_loop_masks (vec_loop_masks *masks)
880 {
881   rgroup_masks *rgm;
882   unsigned int i;
883   FOR_EACH_VEC_ELT (*masks, i, rgm)
884     rgm->masks.release ();
885   masks->release ();
886 }
887 
888 /* Free all memory used by the _loop_vec_info, as well as all the
889    stmt_vec_info structs of all the stmts in the loop.  */
890 
891 _loop_vec_info::~_loop_vec_info ()
892 {
893   free (bbs);
894 
895   release_vec_loop_masks (&masks);
896   delete ivexpr_map;
897   delete scan_map;
898   epilogue_vinfos.release ();
899 
900   loop->aux = NULL;
901 }
902 
903 /* Return an invariant or register for EXPR and emit necessary
904    computations in the LOOP_VINFO loop preheader.  */
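
/* For example (an illustrative sketch): asked for an expression such as
   'n_5 * 4', this emits something like '_9 = n_5 * 4;' on the preheader
   edge the first time and returns _9; later calls with an equal expression
   return the cached _9 rather than emitting the computation again.  */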
905 
906 tree
907 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
908 {
909   if (is_gimple_reg (expr)
910       || is_gimple_min_invariant (expr))
911     return expr;
912 
913   if (! loop_vinfo->ivexpr_map)
914     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
915   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
916   if (! cached)
917     {
918       gimple_seq stmts = NULL;
919       cached = force_gimple_operand (unshare_expr (expr),
920 				     &stmts, true, NULL_TREE);
921       if (stmts)
922 	{
923 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
924 	  gsi_insert_seq_on_edge_immediate (e, stmts);
925 	}
926     }
927   return cached;
928 }
929 
930 /* Return true if we can use CMP_TYPE as the comparison type to produce
931    all masks required to mask LOOP_VINFO.  */
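
/* Informally, IFN_WHILE_ULT (START, END) produces a mask whose element I
   is set iff START + I < END; the check below asks whether the target can
   do that comparison in CMP_TYPE for every mask type the loop needs.  */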
932 
933 static bool
934 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
935 {
936   rgroup_masks *rgm;
937   unsigned int i;
938   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
939     if (rgm->mask_type != NULL_TREE
940 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
941 					    cmp_type, rgm->mask_type,
942 					    OPTIMIZE_FOR_SPEED))
943       return false;
944   return true;
945 }
946 
947 /* Calculate the maximum number of scalars per iteration for every
948    rgroup in LOOP_VINFO.  */
949 
950 static unsigned int
951 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
952 {
953   unsigned int res = 1;
954   unsigned int i;
955   rgroup_masks *rgm;
956   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
957     res = MAX (res, rgm->max_nscalars_per_iter);
958   return res;
959 }
960 
961 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
962    whether we can actually generate the masks required.  Return true if so,
963    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
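
/* A worked example (illustrative): if the loop is known to execute at most
   1000 iterations and the largest rgroup handles 2 scalars per iteration,
   the limit to represent is at most 2000, which needs 11 bits; any
   supported integer mode of at least that width for which WHILE_ULT can
   produce all the required mask types is then a candidate comparison
   type.  */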
964 
965 static bool
966 vect_verify_full_masking (loop_vec_info loop_vinfo)
967 {
968   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
969   unsigned int min_ni_width;
970   unsigned int max_nscalars_per_iter
971     = vect_get_max_nscalars_per_iter (loop_vinfo);
972 
973   /* Use a normal loop if there are no statements that need masking.
974      This only happens in rare degenerate cases: it means that the loop
975      has no loads, no stores, and no live-out values.  */
976   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
977     return false;
978 
979   /* Get the maximum number of iterations that is representable
980      in the counter type.  */
981   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
982   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
983 
984   /* Get a more refined estimate for the number of iterations.  */
985   widest_int max_back_edges;
986   if (max_loop_iterations (loop, &max_back_edges))
987     max_ni = wi::smin (max_ni, max_back_edges + 1);
988 
989   /* Account for rgroup masks, in which each bit is replicated N times.  */
990   max_ni *= max_nscalars_per_iter;
991 
992   /* Work out how many bits we need to represent the limit.  */
993   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
994 
995   /* Find a scalar mode for which WHILE_ULT is supported.  */
996   opt_scalar_int_mode cmp_mode_iter;
997   tree cmp_type = NULL_TREE;
998   tree iv_type = NULL_TREE;
999   widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1000   unsigned int iv_precision = UINT_MAX;
1001 
1002   if (iv_limit != -1)
1003     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1004 				      UNSIGNED);
1005 
1006   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1007     {
1008       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1009       if (cmp_bits >= min_ni_width
1010 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1011 	{
1012 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1013 	  if (this_type
1014 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1015 	    {
1016 	      /* Although we could stop as soon as we find a valid mode,
1017 		 there are at least two reasons why that's not always the
1018 		 best choice:
1019 
1020 		 - An IV that's Pmode or wider is more likely to be reusable
1021 		   in address calculations than an IV that's narrower than
1022 		   Pmode.
1023 
1024 		 - Doing the comparison in IV_PRECISION or wider allows
1025 		   a natural 0-based IV, whereas using a narrower comparison
1026 		   type requires mitigations against wrap-around.
1027 
1028 		 Conversely, if the IV limit is variable, doing the comparison
1029 		 in a wider type than the original type can introduce
1030 		 unnecessary extensions, so picking the widest valid mode
1031 		 is not always a good choice either.
1032 
1033 		 Here we prefer the first IV type that's Pmode or wider,
1034 		 and the first comparison type that's IV_PRECISION or wider.
1035 		 (The comparison type must be no wider than the IV type,
1036 		 to avoid extensions in the vector loop.)
1037 
1038 		 ??? We might want to try continuing beyond Pmode for ILP32
1039 		 targets if CMP_BITS < IV_PRECISION.  */
1040 	      iv_type = this_type;
1041 	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1042 		cmp_type = this_type;
1043 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1044 		break;
1045 	    }
1046 	}
1047     }
1048 
1049   if (!cmp_type)
1050     return false;
1051 
1052   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1053   LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1054   return true;
1055 }
1056 
1057 /* Calculate the cost of one scalar iteration of the loop.  */
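
/* For example (an illustrative sketch): for the loop 'a[i] = b[i] + c[i]'
   one scalar iteration is recorded as roughly two scalar_load costs, one
   scalar_stmt cost for the addition and one scalar_store cost, each
   weighted by FACTOR (50 for stmts inside an inner loop, 1 otherwise) and
   then accumulated through the target's cost hooks.  */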
1058 static void
1059 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1060 {
1061   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1062   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1063   int nbbs = loop->num_nodes, factor;
1064   int innerloop_iters, i;
1065 
1066   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1067 
1068   /* Gather costs for statements in the scalar loop.  */
1069 
1070   /* FORNOW.  */
1071   innerloop_iters = 1;
1072   if (loop->inner)
1073     innerloop_iters = 50; /* FIXME */
1074 
1075   for (i = 0; i < nbbs; i++)
1076     {
1077       gimple_stmt_iterator si;
1078       basic_block bb = bbs[i];
1079 
1080       if (bb->loop_father == loop->inner)
1081         factor = innerloop_iters;
1082       else
1083         factor = 1;
1084 
1085       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1086         {
1087 	  gimple *stmt = gsi_stmt (si);
1088 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1089 
1090           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1091             continue;
1092 
1093           /* Skip stmts that are not vectorized inside the loop.  */
1094 	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1095           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1096               && (!STMT_VINFO_LIVE_P (vstmt_info)
1097                   || !VECTORIZABLE_CYCLE_DEF
1098 			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1099             continue;
1100 
1101 	  vect_cost_for_stmt kind;
1102           if (STMT_VINFO_DATA_REF (stmt_info))
1103             {
1104               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1105                kind = scalar_load;
1106              else
1107                kind = scalar_store;
1108             }
1109 	  else if (vect_nop_conversion_p (stmt_info))
1110 	    continue;
1111 	  else
1112             kind = scalar_stmt;
1113 
1114 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1115 			    factor, kind, stmt_info, 0, vect_prologue);
1116         }
1117     }
1118 
1119   /* Now accumulate cost.  */
1120   void *target_cost_data = init_cost (loop);
1121   stmt_info_for_cost *si;
1122   int j;
1123   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1124 		    j, si)
1125     (void) add_stmt_cost (target_cost_data, si->count,
1126 			  si->kind, si->stmt_info, si->misalign,
1127 			  vect_body);
1128   unsigned dummy, body_cost = 0;
1129   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1130   destroy_cost_data (target_cost_data);
1131   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1132 }
1133 
1134 
1135 /* Function vect_analyze_loop_form_1.
1136 
1137    Verify that certain CFG restrictions hold, including:
1138    - the loop has a pre-header
1139    - the loop has a single entry and exit
1140    - the loop exit condition is simple enough
1141    - the number of iterations can be analyzed, i.e., a countable loop.  The
1142      niter could be analyzed under some assumptions.  */
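
/* For instance (illustrative), a search loop such as

       for (i = 0; i < n; i++)
         if (a[i] == key)
           break;

   has two exits and fails the single-exit restriction below, whereas a
   plain counted loop with its exit test at the end satisfies it.  */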
1143 
1144 opt_result
1145 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1146 			  tree *assumptions, tree *number_of_iterationsm1,
1147 			  tree *number_of_iterations, gcond **inner_loop_cond)
1148 {
1149   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1150 
1151   /* Different restrictions apply when we are considering an inner-most loop,
1152      vs. an outer (nested) loop.
1153      (FORNOW. May want to relax some of these restrictions in the future).  */
1154 
1155   if (!loop->inner)
1156     {
1157       /* Inner-most loop.  We currently require that the number of BBs is
1158 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1159 	 look like this:
1160 
1161                         (pre-header)
1162                            |
1163                           header <--------+
1164                            | |            |
1165                            | +--> latch --+
1166                            |
1167                         (exit-bb)  */
1168 
1169       if (loop->num_nodes != 2)
1170 	return opt_result::failure_at (vect_location,
1171 				       "not vectorized:"
1172 				       " control flow in loop.\n");
1173 
1174       if (empty_block_p (loop->header))
1175 	return opt_result::failure_at (vect_location,
1176 				       "not vectorized: empty loop.\n");
1177     }
1178   else
1179     {
1180       class loop *innerloop = loop->inner;
1181       edge entryedge;
1182 
1183       /* Nested loop. We currently require that the loop is doubly-nested,
1184 	 contains a single inner loop, and the number of BBs is exactly 5.
1185 	 Vectorizable outer-loops look like this:
1186 
1187 			(pre-header)
1188 			   |
1189 			  header <---+
1190 			   |         |
1191 		          inner-loop |
1192 			   |         |
1193 			  tail ------+
1194 			   |
1195 		        (exit-bb)
1196 
1197 	 The inner-loop has the properties expected of inner-most loops
1198 	 as described above.  */
1199 
1200       if ((loop->inner)->inner || (loop->inner)->next)
1201 	return opt_result::failure_at (vect_location,
1202 				       "not vectorized:"
1203 				       " multiple nested loops.\n");
1204 
1205       if (loop->num_nodes != 5)
1206 	return opt_result::failure_at (vect_location,
1207 				       "not vectorized:"
1208 				       " control flow in loop.\n");
1209 
1210       entryedge = loop_preheader_edge (innerloop);
1211       if (entryedge->src != loop->header
1212 	  || !single_exit (innerloop)
1213 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1214 	return opt_result::failure_at (vect_location,
1215 				       "not vectorized:"
1216 				       " unsupported outerloop form.\n");
1217 
1218       /* Analyze the inner-loop.  */
1219       tree inner_niterm1, inner_niter, inner_assumptions;
1220       opt_result res
1221 	= vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1222 				    &inner_assumptions, &inner_niterm1,
1223 				    &inner_niter, NULL);
1224       if (!res)
1225 	{
1226 	  if (dump_enabled_p ())
1227 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 			     "not vectorized: Bad inner loop.\n");
1229 	  return res;
1230 	}
1231 
1232       /* Don't support analyzing niter under assumptions for inner
1233 	 loop.  */
1234       if (!integer_onep (inner_assumptions))
1235 	return opt_result::failure_at (vect_location,
1236 				       "not vectorized: Bad inner loop.\n");
1237 
1238       if (!expr_invariant_in_loop_p (loop, inner_niter))
1239 	return opt_result::failure_at (vect_location,
1240 				       "not vectorized: inner-loop count not"
1241 				       " invariant.\n");
1242 
1243       if (dump_enabled_p ())
1244         dump_printf_loc (MSG_NOTE, vect_location,
1245 			 "Considering outer-loop vectorization.\n");
1246     }
1247 
1248   if (!single_exit (loop))
1249     return opt_result::failure_at (vect_location,
1250 				   "not vectorized: multiple exits.\n");
1251   if (EDGE_COUNT (loop->header->preds) != 2)
1252     return opt_result::failure_at (vect_location,
1253 				   "not vectorized:"
1254 				   " too many incoming edges.\n");
1255 
1256   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1257      that the loop is represented as a do-while (with a proper if-guard
1258      before the loop if needed), where the loop header contains all the
1259      executable statements, and the latch is empty.  */
1260   if (!empty_block_p (loop->latch)
1261       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1262     return opt_result::failure_at (vect_location,
1263 				   "not vectorized: latch block not empty.\n");
1264 
1265   /* Make sure the exit is not abnormal.  */
1266   edge e = single_exit (loop);
1267   if (e->flags & EDGE_ABNORMAL)
1268     return opt_result::failure_at (vect_location,
1269 				   "not vectorized:"
1270 				   " abnormal loop exit edge.\n");
1271 
1272   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1273 				     number_of_iterationsm1);
1274   if (!*loop_cond)
1275     return opt_result::failure_at
1276       (vect_location,
1277        "not vectorized: complicated exit condition.\n");
1278 
1279   if (integer_zerop (*assumptions)
1280       || !*number_of_iterations
1281       || chrec_contains_undetermined (*number_of_iterations))
1282     return opt_result::failure_at
1283       (*loop_cond,
1284        "not vectorized: number of iterations cannot be computed.\n");
1285 
1286   if (integer_zerop (*number_of_iterations))
1287     return opt_result::failure_at
1288       (*loop_cond,
1289        "not vectorized: number of iterations = 0.\n");
1290 
1291   return opt_result::success ();
1292 }
1293 
1294 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1295 
1296 opt_loop_vec_info
1297 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1298 {
1299   tree assumptions, number_of_iterations, number_of_iterationsm1;
1300   gcond *loop_cond, *inner_loop_cond = NULL;
1301 
1302   opt_result res
1303     = vect_analyze_loop_form_1 (loop, &loop_cond,
1304 				&assumptions, &number_of_iterationsm1,
1305 				&number_of_iterations, &inner_loop_cond);
1306   if (!res)
1307     return opt_loop_vec_info::propagate_failure (res);
1308 
1309   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1310   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1311   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1312   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1313   if (!integer_onep (assumptions))
1314     {
1315       /* We consider vectorizing this loop by versioning it under
1316 	 some assumptions.  In order to do this, we need to clear
1317 	 existing information computed by scev and niter analyzer.  */
1318       scev_reset_htab ();
1319       free_numbers_of_iterations_estimates (loop);
1320       /* Also set flag for this loop so that following scev and niter
1321 	 analysis are done under the assumptions.  */
1322       loop_constraint_set (loop, LOOP_C_FINITE);
1323       /* Also record the assumptions for versioning.  */
1324       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1325     }
1326 
1327   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1328     {
1329       if (dump_enabled_p ())
1330         {
1331           dump_printf_loc (MSG_NOTE, vect_location,
1332 			   "Symbolic number of iterations is ");
1333 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1334           dump_printf (MSG_NOTE, "\n");
1335         }
1336     }
1337 
1338   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1339   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1340   if (inner_loop_cond)
1341     {
1342       stmt_vec_info inner_loop_cond_info
1343 	= loop_vinfo->lookup_stmt (inner_loop_cond);
1344       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1345     }
1346 
1347   gcc_assert (!loop->aux);
1348   loop->aux = loop_vinfo;
1349   return opt_loop_vec_info::success (loop_vinfo);
1350 }
1351 
1352 
1353 
1354 /* Scan the loop stmts and, depending on whether there are any non-SLP
1355    statements, update the vectorization factor.  */
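
/* For example (illustrative): if the loop-based vectorization factor
   computed earlier is 4 and the SLP instances require an unrolling factor
   of 6, the updated VF is a common multiple of the two, i.e. 12; if every
   stmt in the loop is pure SLP, the VF simply becomes the SLP unrolling
   factor.  */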
1356 
1357 static void
1358 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1359 {
1360   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1361   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1362   int nbbs = loop->num_nodes;
1363   poly_uint64 vectorization_factor;
1364   int i;
1365 
1366   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1367 
1368   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1369   gcc_assert (known_ne (vectorization_factor, 0U));
1370 
1371   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1372      vectorization factor of the loop is the unrolling factor required by
1373      the SLP instances.  If that unrolling factor is 1, we say that we
1374      perform pure SLP on the loop; cross-iteration parallelism is not
1375      exploited.  */
1376   bool only_slp_in_loop = true;
1377   for (i = 0; i < nbbs; i++)
1378     {
1379       basic_block bb = bbs[i];
1380       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1381 	   gsi_next (&si))
1382 	{
1383 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1384 	  if (!stmt_info)
1385 	    continue;
1386 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1387 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1388 	      && !PURE_SLP_STMT (stmt_info))
1389 	    /* STMT needs both SLP and loop-based vectorization.  */
1390 	    only_slp_in_loop = false;
1391 	}
1392       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1393 	   gsi_next (&si))
1394 	{
1395 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1396 	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1397 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399 	      && !PURE_SLP_STMT (stmt_info))
1400 	    /* STMT needs both SLP and loop-based vectorization.  */
1401 	    only_slp_in_loop = false;
1402 	}
1403     }
1404 
1405   if (only_slp_in_loop)
1406     {
1407       if (dump_enabled_p ())
1408 	dump_printf_loc (MSG_NOTE, vect_location,
1409 			 "Loop contains only SLP stmts\n");
1410       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1411     }
1412   else
1413     {
1414       if (dump_enabled_p ())
1415 	dump_printf_loc (MSG_NOTE, vect_location,
1416 			 "Loop contains SLP and non-SLP stmts\n");
1417       /* Both the vectorization factor and unroll factor have the form
1418 	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1419 	 so they must have a common multiple.  */
1420       vectorization_factor
1421 	= force_common_multiple (vectorization_factor,
1422 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1423     }
1424 
1425   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1426   if (dump_enabled_p ())
1427     {
1428       dump_printf_loc (MSG_NOTE, vect_location,
1429 		       "Updating vectorization factor to ");
1430       dump_dec (MSG_NOTE, vectorization_factor);
1431       dump_printf (MSG_NOTE, ".\n");
1432     }
1433 }
1434 
1435 /* Return true if STMT_INFO describes a double reduction phi and if
1436    the other phi in the reduction is also relevant for vectorization.
1437    This rejects cases such as:
1438 
1439       outer1:
1440 	x_1 = PHI <x_3(outer2), ...>;
1441 	...
1442 
1443       inner:
1444 	x_2 = ...;
1445 	...
1446 
1447       outer2:
1448 	x_3 = PHI <x_2(inner)>;
1449 
1450    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1451 
1452 static bool
1453 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1454 {
1455   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1456     return false;
1457 
1458   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1459 }
1460 
1461 /* Function vect_analyze_loop_operations.
1462 
1463    Scan the loop stmts and make sure they are all vectorizable.  */
1464 
1465 static opt_result
1466 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1467 {
1468   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1469   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1470   int nbbs = loop->num_nodes;
1471   int i;
1472   stmt_vec_info stmt_info;
1473   bool need_to_vectorize = false;
1474   bool ok;
1475 
1476   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1477 
1478   auto_vec<stmt_info_for_cost> cost_vec;
1479 
1480   for (i = 0; i < nbbs; i++)
1481     {
1482       basic_block bb = bbs[i];
1483 
1484       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1485 	   gsi_next (&si))
1486         {
1487           gphi *phi = si.phi ();
1488           ok = true;
1489 
1490 	  stmt_info = loop_vinfo->lookup_stmt (phi);
1491           if (dump_enabled_p ())
1492 	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1493 	  if (virtual_operand_p (gimple_phi_result (phi)))
1494 	    continue;
1495 
1496           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1497              (i.e., a phi in the tail of the outer-loop).  */
1498           if (! is_loop_header_bb_p (bb))
1499             {
1500               /* FORNOW: we currently don't support the case where these phis
1501                  are not used in the outer loop (unless it is a double
1502                  reduction, i.e., this phi is vect_reduction_def), because
1503                  that case would require us to actually do something here.  */
1504               if (STMT_VINFO_LIVE_P (stmt_info)
1505 		  && !vect_active_double_reduction_p (stmt_info))
1506 		return opt_result::failure_at (phi,
1507 					       "Unsupported loop-closed phi"
1508 					       " in outer-loop.\n");
1509 
1510               /* If PHI is used in the outer loop, we check that its operand
1511                  is defined in the inner loop.  */
1512               if (STMT_VINFO_RELEVANT_P (stmt_info))
1513                 {
1514                   tree phi_op;
1515 
1516                   if (gimple_phi_num_args (phi) != 1)
1517                     return opt_result::failure_at (phi, "unsupported phi");
1518 
1519                   phi_op = PHI_ARG_DEF (phi, 0);
1520 		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1521 		  if (!op_def_info)
1522 		    return opt_result::failure_at (phi, "unsupported phi\n");
1523 
1524 		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1525 		      && (STMT_VINFO_RELEVANT (op_def_info)
1526 			  != vect_used_in_outer_by_reduction))
1527 		    return opt_result::failure_at (phi, "unsupported phi\n");
1528 
1529 		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1530 		       || (STMT_VINFO_DEF_TYPE (stmt_info)
1531 			   == vect_double_reduction_def))
1532 		      && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1533 		    return opt_result::failure_at (phi, "unsupported phi\n");
1534                 }
1535 
1536               continue;
1537             }
1538 
1539           gcc_assert (stmt_info);
1540 
1541           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1542                || STMT_VINFO_LIVE_P (stmt_info))
1543               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1544 	    /* A scalar-dependence cycle that we don't support.  */
1545 	    return opt_result::failure_at (phi,
1546 					   "not vectorized:"
1547 					   " scalar dependence cycle.\n");
1548 
1549           if (STMT_VINFO_RELEVANT_P (stmt_info))
1550             {
1551               need_to_vectorize = true;
1552               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1553 		  && ! PURE_SLP_STMT (stmt_info))
1554 		ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1555 					     &cost_vec);
1556 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1557 			|| (STMT_VINFO_DEF_TYPE (stmt_info)
1558 			    == vect_double_reduction_def)
1559 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1560 		       && ! PURE_SLP_STMT (stmt_info))
1561 		ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1562             }
1563 
1564 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1565 	  if (ok
1566 	      && STMT_VINFO_LIVE_P (stmt_info)
1567 	      && !PURE_SLP_STMT (stmt_info))
1568 	    ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1569 					      -1, false, &cost_vec);
1570 
1571           if (!ok)
1572 	    return opt_result::failure_at (phi,
1573 					   "not vectorized: relevant phi not "
1574 					   "supported: %G",
1575 					   static_cast <gimple *> (phi));
1576         }
1577 
1578       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 	   gsi_next (&si))
1580         {
1581 	  gimple *stmt = gsi_stmt (si);
1582 	  if (!gimple_clobber_p (stmt))
1583 	    {
1584 	      opt_result res
1585 		= vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1586 				     &need_to_vectorize,
1587 				     NULL, NULL, &cost_vec);
1588 	      if (!res)
1589 		return res;
1590 	    }
1591         }
1592     } /* bbs */
1593 
1594   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1595 
1596   /* All operations in the loop are either irrelevant (they deal with loop
1597      control, or are dead), or only used outside the loop and can be moved
1598      out of it (e.g. invariants, inductions).  The loop can then be
1599      optimized away by scalar optimizations.  We're better off not
1600      touching this loop.  */
1601   if (!need_to_vectorize)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_NOTE, vect_location,
1605 			 "All the computation can be taken out of the loop.\n");
1606       return opt_result::failure_at
1607 	(vect_location,
1608 	 "not vectorized: redundant loop. no profit to vectorize.\n");
1609     }
1610 
1611   return opt_result::success ();
1612 }
1613 
1614 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1615    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1616    definitely no, or -1 if it's worth retrying.  */
1617 
1618 static int
1619 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1620 {
1621   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1622   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1623 
1624   /* Only fully-masked loops can have iteration counts less than the
1625      vectorization factor.  */
1626   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1627     {
1628       HOST_WIDE_INT max_niter;
1629 
1630       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1631 	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1632       else
1633 	max_niter = max_stmt_executions_int (loop);
1634 
1635       if (max_niter != -1
1636 	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1637 	{
1638 	  if (dump_enabled_p ())
1639 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640 			     "not vectorized: iteration count smaller than "
1641 			     "vectorization factor.\n");
1642 	  return 0;
1643 	}
1644     }
1645 
1646   int min_profitable_iters, min_profitable_estimate;
1647   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1648 				      &min_profitable_estimate);
1649 
1650   if (min_profitable_iters < 0)
1651     {
1652       if (dump_enabled_p ())
1653 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 			 "not vectorized: vectorization not profitable.\n");
1655       if (dump_enabled_p ())
1656 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657 			 "not vectorized: vector version will never be "
1658 			 "profitable.\n");
1659       return -1;
1660     }
1661 
1662   int min_scalar_loop_bound = (param_min_vect_loop_bound
1663 			       * assumed_vf);
1664 
1665   /* Use the cost model only if it is more conservative than user specified
1666      threshold.  */
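  /* For instance (illustrative numbers): if the user-specified bound works
     out to 16 scalar iterations while the cost model only needs 10
     profitable iterations, the threshold TH below becomes 16.  */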
1667   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1668 				    min_profitable_iters);
1669 
1670   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1671 
1672   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1673       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1674     {
1675       if (dump_enabled_p ())
1676 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 			 "not vectorized: vectorization not profitable.\n");
1678       if (dump_enabled_p ())
1679 	dump_printf_loc (MSG_NOTE, vect_location,
1680 			 "not vectorized: iteration count smaller than user "
1681 			 "specified loop bound parameter or minimum profitable "
1682 			 "iterations (whichever is more conservative).\n");
1683       return 0;
1684     }
1685 
1686   /* The static profitability threshold min_profitable_estimate includes
1687      the cost of having to check at runtime whether the scalar loop
1688      should be used instead.  If it turns out that we don't need or want
1689      such a check, the threshold we should use for the static estimate
1690      is simply the point at which the vector loop becomes more profitable
1691      than the scalar loop.  */
1692   if (min_profitable_estimate > min_profitable_iters
1693       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1694       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1695       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1696       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1697     {
1698       if (dump_enabled_p ())
1699 	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1700 			 " choice between the scalar and vector loops\n");
1701       min_profitable_estimate = min_profitable_iters;
1702     }
1703 
1704   HOST_WIDE_INT estimated_niter;
1705 
1706   /* If we are vectorizing an epilogue then we know the maximum number of
1707      scalar iterations it will cover is at least one lower than the
1708      vectorization factor of the main loop.  */
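  /* For example, if the main loop was vectorized with a vectorization factor
     of 8, this epilogue covers at most 7 scalar iterations.  */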
1709   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1710     estimated_niter
1711       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1712   else
1713     {
1714       estimated_niter = estimated_stmt_executions_int (loop);
1715       if (estimated_niter == -1)
1716 	estimated_niter = likely_max_stmt_executions_int (loop);
1717     }
1718   if (estimated_niter != -1
1719       && ((unsigned HOST_WIDE_INT) estimated_niter
1720 	  < MAX (th, (unsigned) min_profitable_estimate)))
1721     {
1722       if (dump_enabled_p ())
1723 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 			 "not vectorized: estimated iteration count too "
1725 			 "small.\n");
1726       if (dump_enabled_p ())
1727 	dump_printf_loc (MSG_NOTE, vect_location,
1728 			 "not vectorized: estimated iteration count smaller "
1729 			 "than specified loop bound parameter or minimum "
1730 			 "profitable iterations (whichever is more "
1731 			 "conservative).\n");
1732       return -1;
1733     }
1734 
1735   return 1;
1736 }
1737 
1738 static opt_result
1739 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1740 			   vec<data_reference_p> *datarefs,
1741 			   unsigned int *n_stmts)
1742 {
1743   *n_stmts = 0;
1744   for (unsigned i = 0; i < loop->num_nodes; i++)
1745     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1746 	 !gsi_end_p (gsi); gsi_next (&gsi))
1747       {
1748 	gimple *stmt = gsi_stmt (gsi);
1749 	if (is_gimple_debug (stmt))
1750 	  continue;
1751 	++(*n_stmts);
1752 	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1753 	if (!res)
1754 	  {
1755 	    if (is_gimple_call (stmt) && loop->safelen)
1756 	      {
1757 		tree fndecl = gimple_call_fndecl (stmt), op;
1758 		if (fndecl != NULL_TREE)
1759 		  {
1760 		    cgraph_node *node = cgraph_node::get (fndecl);
1761 		    if (node != NULL && node->simd_clones != NULL)
1762 		      {
1763 			unsigned int j, n = gimple_call_num_args (stmt);
1764 			for (j = 0; j < n; j++)
1765 			  {
1766 			    op = gimple_call_arg (stmt, j);
1767 			    if (DECL_P (op)
1768 				|| (REFERENCE_CLASS_P (op)
1769 				    && get_base_address (op)))
1770 			      break;
1771 			  }
1772 			op = gimple_call_lhs (stmt);
1773 			/* Ignore #pragma omp declare simd functions
1774 			   if they don't have data references in the
1775 			   call stmt itself.  */
1776 			if (j == n
1777 			    && !(op
1778 				 && (DECL_P (op)
1779 				     || (REFERENCE_CLASS_P (op)
1780 					 && get_base_address (op)))))
1781 			  continue;
1782 		      }
1783 		  }
1784 	      }
1785 	    return res;
1786 	  }
1787 	/* If dependence analysis will give up due to the limit on the
1788 	   number of datarefs, stop here and fail fatally.  */
1789 	if (datarefs->length ()
1790 	    > (unsigned)param_loop_max_datarefs_for_datadeps)
1791 	  return opt_result::failure_at (stmt, "exceeded param "
1792 					 "loop-max-datarefs-for-datadeps\n");
1793       }
1794   return opt_result::success ();
1795 }
1796 
1797 /* Look for SLP-only access groups and turn each individual access into its own
1798    group.  */
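/* Illustrative example: a group of four interleaved accesses that is only
   usable by SLP is split into four single-element groups; each keeps a gap
   of three elements so the original stride is still described (strided
   accesses keep a gap of zero instead).  */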
1799 static void
1800 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1801 {
1802   unsigned int i;
1803   struct data_reference *dr;
1804 
1805   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1806 
1807   vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1808   FOR_EACH_VEC_ELT (datarefs, i, dr)
1809     {
1810       gcc_assert (DR_REF (dr));
1811       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1812 
1813       /* Check if the load is a part of an interleaving chain.  */
1814       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1815 	{
1816 	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1817 	  unsigned int group_size = DR_GROUP_SIZE (first_element);
1818 
1819 	  /* Check for an SLP-only group.  */
1820 	  if (!STMT_SLP_TYPE (stmt_info)
1821 	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
1822 	    {
1823 	      /* Dissolve the group.  */
1824 	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1825 
1826 	      stmt_vec_info vinfo = first_element;
1827 	      while (vinfo)
1828 		{
1829 		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1830 		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1831 		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1832 		  DR_GROUP_SIZE (vinfo) = 1;
1833 		  if (STMT_VINFO_STRIDED_P (first_element))
1834 		    DR_GROUP_GAP (vinfo) = 0;
1835 		  else
1836 		    DR_GROUP_GAP (vinfo) = group_size - 1;
1837 		  vinfo = next;
1838 		}
1839 	    }
1840 	}
1841     }
1842 }
1843 
1844 
1845 /* Decides whether we need to create an epilogue loop to handle
1846    remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
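/* Illustrative example: with 100 known iterations, no peeling for alignment
   or gaps, and a vectorization factor of 8, the remaining 100 % 8 = 4
   iterations need a scalar epilogue, so the flag is set to true.  */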
1847 
1848 void
1849 determine_peel_for_niter (loop_vec_info loop_vinfo)
1850 {
1851   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1852 
1853   unsigned HOST_WIDE_INT const_vf;
1854   HOST_WIDE_INT max_niter
1855     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1856 
1857   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1858   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1859     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1860 					  (loop_vinfo));
1861 
1862   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1863     /* The main loop handles all iterations.  */
1864     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1865   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1866 	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1867     {
1868       /* Work out the (constant) number of iterations that need to be
1869 	 peeled for reasons other than niters.  */
1870       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1871       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1872 	peel_niter += 1;
1873       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1874 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1875 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1876     }
1877   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1878 	   /* ??? When peeling for gaps but not alignment, we could
1879 	      try to check whether the (variable) niters is known to be
1880 	      VF * N + 1.  That's something of a niche case though.  */
1881 	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1882 	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1883 	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1884 		< (unsigned) exact_log2 (const_vf))
1885 	       /* In case of versioning, check if the maximum number of
1886 		  iterations is greater than th.  If they are identical,
1887 		  the epilogue is unnecessary.  */
1888 	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1889 		   || ((unsigned HOST_WIDE_INT) max_niter
1890 		       > (th / const_vf) * const_vf))))
1891     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1892 }
1893 
1894 
1895 /* Function vect_analyze_loop_2.
1896 
1897    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1898    for it.  The different analyses will record information in the
1899    loop_vec_info struct.  */
1900 static opt_result
1901 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1902 {
1903   opt_result ok = opt_result::success ();
1904   int res;
1905   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1906   poly_uint64 min_vf = 2;
1907   loop_vec_info orig_loop_vinfo = NULL;
1908 
1909   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1910      loop_vec_info of the first vectorized loop.  */
1911   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1912     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1913   else
1914     orig_loop_vinfo = loop_vinfo;
1915   gcc_assert (orig_loop_vinfo);
1916 
1917   /* The first group of checks is independent of the vector size.  */
1918   fatal = true;
1919 
1920   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1921       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1922     return opt_result::failure_at (vect_location,
1923 				   "not vectorized: simd if(0)\n");
1924 
1925   /* Find all data references in the loop (which correspond to vdefs/vuses)
1926      and analyze their evolution in the loop.  */
1927 
1928   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1929 
1930   /* Gather the data references and count stmts in the loop.  */
1931   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1932     {
1933       opt_result res
1934 	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1935 				     &LOOP_VINFO_DATAREFS (loop_vinfo),
1936 				     n_stmts);
1937       if (!res)
1938 	{
1939 	  if (dump_enabled_p ())
1940 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 			     "not vectorized: loop contains function "
1942 			     "calls or data references that cannot "
1943 			     "be analyzed\n");
1944 	  return res;
1945 	}
1946       loop_vinfo->shared->save_datarefs ();
1947     }
1948   else
1949     loop_vinfo->shared->check_datarefs ();
1950 
1951   /* Analyze the data references and also adjust the minimal
1952      vectorization factor according to the loads and stores.  */
1953 
1954   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1955   if (!ok)
1956     {
1957       if (dump_enabled_p ())
1958 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 			 "bad data references.\n");
1960       return ok;
1961     }
1962 
1963   /* Classify all cross-iteration scalar data-flow cycles.
1964      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1965   vect_analyze_scalar_cycles (loop_vinfo);
1966 
1967   vect_pattern_recog (loop_vinfo);
1968 
1969   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1970 
1971   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1972      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1973 
1974   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1975   if (!ok)
1976     {
1977       if (dump_enabled_p ())
1978 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1979 			 "bad data access.\n");
1980       return ok;
1981     }
1982 
1983   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1984 
1985   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1986   if (!ok)
1987     {
1988       if (dump_enabled_p ())
1989 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990 			 "unexpected pattern.\n");
1991       return ok;
1992     }
1993 
1994   /* The rest of the analysis depends on the vector size, so failures below are not fatal.  */
1995   fatal = false;
1996 
1997   /* Analyze data dependences between the data-refs in the loop
1998      and adjust the maximum vectorization factor according to
1999      the dependences.
2000      FORNOW: fail at the first data dependence that we encounter.  */
2001 
2002   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2003   if (!ok)
2004     {
2005       if (dump_enabled_p ())
2006 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2007 			 "bad data dependence.\n");
2008       return ok;
2009     }
2010   if (max_vf != MAX_VECTORIZATION_FACTOR
2011       && maybe_lt (max_vf, min_vf))
2012     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2013   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2014 
2015   ok = vect_determine_vectorization_factor (loop_vinfo);
2016   if (!ok)
2017     {
2018       if (dump_enabled_p ())
2019 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 			 "can't determine vectorization factor.\n");
2021       return ok;
2022     }
2023   if (max_vf != MAX_VECTORIZATION_FACTOR
2024       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2025     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2026 
2027   /* Compute the scalar iteration cost.  */
2028   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2029 
2030   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2031 
2032   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2033   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2034   if (!ok)
2035     return ok;
2036 
2037   /* If there are any SLP instances mark them as pure_slp.  */
2038   bool slp = vect_make_slp_decision (loop_vinfo);
2039   if (slp)
2040     {
2041       /* Find stmts that need to be both vectorized and SLPed.  */
2042       vect_detect_hybrid_slp (loop_vinfo);
2043 
2044       /* Update the vectorization factor based on the SLP decision.  */
2045       vect_update_vf_for_slp (loop_vinfo);
2046     }
2047 
2048   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2049 
2050   /* We don't expect to have to roll back to anything other than an empty
2051      set of rgroups.  */
2052   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2053 
2054   /* This is the point where we can re-start analysis with SLP forced off.  */
2055 start_over:
2056 
2057   /* Now the vectorization factor is final.  */
2058   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059   gcc_assert (known_ne (vectorization_factor, 0U));
2060 
2061   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2062     {
2063       dump_printf_loc (MSG_NOTE, vect_location,
2064 		       "vectorization_factor = ");
2065       dump_dec (MSG_NOTE, vectorization_factor);
2066       dump_printf (MSG_NOTE, ", niters = %wd\n",
2067 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2068     }
2069 
2070   /* Analyze the alignment of the data-refs in the loop.
2071      Fail if a data reference is found that cannot be vectorized.  */
2072 
2073   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2074   if (!ok)
2075     {
2076       if (dump_enabled_p ())
2077 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 			 "bad data alignment.\n");
2079       return ok;
2080     }
2081 
2082   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2083      It is important to call pruning after vect_analyze_data_ref_accesses,
2084      since we use grouping information gathered by interleaving analysis.  */
2085   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2086   if (!ok)
2087     return ok;
2088 
2089   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2090      vectorization, since we do not want to add extra peeling or
2091      add versioning for alignment.  */
2092   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2093     /* This pass will decide on using loop versioning and/or loop peeling in
2094        order to enhance the alignment of data references in the loop.  */
2095     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2096   else
2097     ok = vect_verify_datarefs_alignment (loop_vinfo);
2098   if (!ok)
2099     return ok;
2100 
2101   if (slp)
2102     {
2103       /* Analyze operations in the SLP instances.  Note this may
2104 	 remove unsupported SLP instances which makes the above
2105 	 SLP kind detection invalid.  */
2106       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2107       vect_slp_analyze_operations (loop_vinfo);
2108       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2109 	{
2110 	  ok = opt_result::failure_at (vect_location,
2111 				       "unsupported SLP instances\n");
2112 	  goto again;
2113 	}
2114     }
2115 
2116   /* Dissolve SLP-only groups.  */
2117   vect_dissolve_slp_only_groups (loop_vinfo);
2118 
2119   /* Scan all the remaining operations in the loop that are not subject
2120      to SLP and make sure they are vectorizable.  */
2121   ok = vect_analyze_loop_operations (loop_vinfo);
2122   if (!ok)
2123     {
2124       if (dump_enabled_p ())
2125 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2126 			 "bad operation or unsupported loop bound.\n");
2127       return ok;
2128     }
2129 
2130   /* Decide whether to use a fully-masked loop for this vectorization
2131      factor.  */
2132   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2133     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2134        && vect_verify_full_masking (loop_vinfo));
2135   if (dump_enabled_p ())
2136     {
2137       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2138 	dump_printf_loc (MSG_NOTE, vect_location,
2139 			 "using a fully-masked loop.\n");
2140       else
2141 	dump_printf_loc (MSG_NOTE, vect_location,
2142 			 "not using a fully-masked loop.\n");
2143     }
2144 
2145   /* If epilog loop is required because of data accesses with gaps,
2146      one additional iteration needs to be peeled.  Check whether there are
2147      enough iterations for vectorization.  */
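  /* For instance, with a VF of 8 and exactly 8 known iterations, only 7
     remain once the gap iteration is peeled, which is less than one full
     vector iteration, so vectorization is rejected below.  */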
2148   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2149       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2150       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2151     {
2152       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2153       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2154 
2155       if (known_lt (wi::to_widest (scalar_niters), vf))
2156 	return opt_result::failure_at (vect_location,
2157 				       "loop does not have enough iterations to"
2158 				       " support peeling for gaps.\n");
2159     }
2160 
2161   /* If we're vectorizing an epilogue loop, we either need a fully-masked
2162      loop or a loop that has a lower VF than the main loop.  */
2163   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2164       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2165       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2166 		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2167     return opt_result::failure_at (vect_location,
2168 				   "Vectorization factor too high for"
2169 				   " epilogue loop.\n");
2170 
2171   /* Check the costings of the loop make vectorizing worthwhile.  */
2172   res = vect_analyze_loop_costing (loop_vinfo);
2173   if (res < 0)
2174     {
2175       ok = opt_result::failure_at (vect_location,
2176 				   "Loop costings may not be worthwhile.\n");
2177       goto again;
2178     }
2179   if (!res)
2180     return opt_result::failure_at (vect_location,
2181 				   "Loop costings not worthwhile.\n");
2182 
2183   determine_peel_for_niter (loop_vinfo);
2184   /* If an epilogue loop is required make sure we can create one.  */
2185   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2186       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2187     {
2188       if (dump_enabled_p ())
2189         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2190       if (!vect_can_advance_ivs_p (loop_vinfo)
2191 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2192 					   single_exit (LOOP_VINFO_LOOP
2193 							 (loop_vinfo))))
2194         {
2195 	  ok = opt_result::failure_at (vect_location,
2196 				       "not vectorized: can't create required "
2197 				       "epilog loop\n");
2198           goto again;
2199         }
2200     }
2201 
2202   /* During peeling, we need to check if number of loop iterations is
2203      enough for both peeled prolog loop and vector loop.  This check
2204      can be merged along with threshold check of loop versioning, so
2205      increase threshold for this case if necessary.
2206 
2207      If we are analyzing an epilogue we still want to check what its
2208      versioning threshold would be.  If we decide to vectorize the epilogues we
2209      will want to use the lowest versioning threshold of all epilogues and main
2210      loop.  This will enable us to enter a vectorized epilogue even when
2211      versioning the loop.  We can't simply check whether the epilogue requires
2212      versioning though since we may have skipped some versioning checks when
2213      analyzing the epilogue.  For instance, checks for alias versioning will be
2214      skipped when dealing with epilogues as we assume we already checked them
2215      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
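  /* Illustrative example of the computation below: peeling 3 prologue
     iterations for alignment, a vectorization factor of 8 and peeling for
     gaps give a versioning threshold of 3 + 8 + 1 = 12 iterations, possibly
     raised further to the cost-model threshold TH.  */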
2216   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2217     {
2218       poly_uint64 niters_th = 0;
2219       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2220 
2221       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2222 	{
2223 	  /* Niters for peeled prolog loop.  */
2224 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2225 	    {
2226 	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2227 	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2228 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2229 	    }
2230 	  else
2231 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2232 	}
2233 
2234       /* Niters for at least one iteration of vectorized loop.  */
2235       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2236 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2237       /* One additional iteration because of peeling for gap.  */
2238       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2239 	niters_th += 1;
2240 
2241       /*  Use the same condition as vect_transform_loop to decide when to use
2242 	  the cost to determine a versioning threshold.  */
2243       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2244 	  && ordered_p (th, niters_th))
2245 	niters_th = ordered_max (poly_uint64 (th), niters_th);
2246 
2247       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2248     }
2249 
2250   gcc_assert (known_eq (vectorization_factor,
2251 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2252 
2253   /* Ok to vectorize!  */
2254   return opt_result::success ();
2255 
2256 again:
2257   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2258   gcc_assert (!ok);
2259 
2260   /* Try again with SLP forced off but if we didn't do any SLP there is
2261      no point in re-trying.  */
2262   if (!slp)
2263     return ok;
2264 
2265   /* If there are reduction chains re-trying will fail anyway.  */
2266   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2267     return ok;
2268 
2269   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2270      via interleaving or lane instructions.  */
2271   slp_instance instance;
2272   slp_tree node;
2273   unsigned i, j;
2274   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2275     {
2276       stmt_vec_info vinfo;
2277       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2278       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2279 	continue;
2280       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2281       unsigned int size = DR_GROUP_SIZE (vinfo);
2282       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2283       if (! vect_store_lanes_supported (vectype, size, false)
2284 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2285 	 && ! vect_grouped_store_supported (vectype, size))
2286 	return opt_result::failure_at (vinfo->stmt,
2287 				       "unsupported grouped store\n");
2288       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2289 	{
2290 	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2291 	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2292 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2293 	  size = DR_GROUP_SIZE (vinfo);
2294 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2295 	  if (! vect_load_lanes_supported (vectype, size, false)
2296 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2297 						size))
2298 	    return opt_result::failure_at (vinfo->stmt,
2299 					   "unsupported grouped load\n");
2300 	}
2301     }
2302 
2303   if (dump_enabled_p ())
2304     dump_printf_loc (MSG_NOTE, vect_location,
2305 		     "re-trying with SLP disabled\n");
2306 
2307   /* Roll back state appropriately.  No SLP this time.  */
2308   slp = false;
2309   /* Restore vectorization factor as it were without SLP.  */
2310   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2311   /* Free the SLP instances.  */
2312   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2313     vect_free_slp_instance (instance, false);
2314   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2315   /* Reset SLP type to loop_vect on all stmts.  */
2316   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2317     {
2318       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2319       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2320 	   !gsi_end_p (si); gsi_next (&si))
2321 	{
2322 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2323 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2324 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2325 	      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2326 	    {
2327 	      /* vectorizable_reduction adjusts reduction stmt def-types,
2328 		 restore them to that of the PHI.  */
2329 	      STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2330 		= STMT_VINFO_DEF_TYPE (stmt_info);
2331 	      STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2332 					(STMT_VINFO_REDUC_DEF (stmt_info)))
2333 		= STMT_VINFO_DEF_TYPE (stmt_info);
2334 	    }
2335 	}
2336       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2337 	   !gsi_end_p (si); gsi_next (&si))
2338 	{
2339 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2340 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2341 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2342 	    {
2343 	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2344 	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2345 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2346 	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2347 		   !gsi_end_p (pi); gsi_next (&pi))
2348 		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2349 		  = loop_vect;
2350 	    }
2351 	}
2352     }
2353   /* Free optimized alias test DDRS.  */
2354   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2355   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2356   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2357   /* Reset target cost data.  */
2358   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2359   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2360     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2361   /* Reset accumulated rgroup information.  */
2362   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2363   /* Reset assorted flags.  */
2364   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2365   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2366   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2367   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2368   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2369 
2370   goto start_over;
2371 }
2372 
2373 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2374    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2375    OLD_LOOP_VINFO is better unless something specifically indicates
2376    otherwise.
2377 
2378    Note that this deliberately isn't a partial order.  */
2379 
2380 static bool
2381 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2382 			  loop_vec_info old_loop_vinfo)
2383 {
2384   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2385   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2386 
2387   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2388   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2389 
2390   /* Always prefer a VF of loop->simdlen over any other VF.  */
2391   if (loop->simdlen)
2392     {
2393       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2394       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2395       if (new_simdlen_p != old_simdlen_p)
2396 	return new_simdlen_p;
2397     }
2398 
2399   /* Limit the VFs to what is likely to be the maximum number of iterations,
2400      to handle cases in which at least one loop_vinfo is fully-masked.  */
2401   HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2402   if (estimated_max_niter != -1)
2403     {
2404       if (known_le (estimated_max_niter, new_vf))
2405 	new_vf = estimated_max_niter;
2406       if (known_le (estimated_max_niter, old_vf))
2407 	old_vf = estimated_max_niter;
2408     }
2409 
2410   /* Check whether the (fractional) cost per scalar iteration is lower
2411      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
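  /* Worked example (illustrative): an inside cost of 20 at VF 8 against an
     inside cost of 12 at VF 4 gives rel_new = 20 * 4 = 80 and
     rel_old = 12 * 8 = 96, i.e. 2.5 vs. 3 cost units per scalar iteration,
     so the new loop_vinfo is preferred.  */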
2412   poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2413 			     * poly_widest_int (old_vf));
2414   poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2415 			     * poly_widest_int (new_vf));
2416   if (maybe_lt (rel_old, rel_new))
2417     {
2418       /* When old_loop_vinfo uses a variable vectorization factor,
2419 	 we know that it has a lower cost for at least one runtime VF.
2420 	 However, we don't know how likely that VF is.
2421 
2422 	 One option would be to compare the costs for the estimated VFs.
2423 	 The problem is that that can put too much pressure on the cost
2424 	 model.  E.g. if the estimated VF is also the lowest possible VF,
2425 	 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2426 	 for the estimated VF, we'd then choose new_loop_vinfo even
2427 	 though (a) new_loop_vinfo might not actually be better than
2428 	 old_loop_vinfo for that VF and (b) it would be significantly
2429 	 worse at larger VFs.
2430 
2431 	 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2432 	 no more expensive than old_loop_vinfo even after doubling the
2433 	 estimated old_loop_vinfo VF.  For all but trivial loops, this
2434 	 ensures that we only pick new_loop_vinfo if it is significantly
2435 	 better than old_loop_vinfo at the estimated VF.  */
2436       if (rel_new.is_constant ())
2437 	return false;
2438 
2439       HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2440       HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2441       widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2442 				      * widest_int (old_estimated_vf));
2443       widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2444 				      * widest_int (new_estimated_vf));
2445       return estimated_rel_new * 2 <= estimated_rel_old;
2446     }
2447   if (known_lt (rel_new, rel_old))
2448     return true;
2449 
2450   /* If there's nothing to choose between the loop bodies, see whether
2451      there's a difference in the prologue and epilogue costs.  */
2452   if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2453     return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2454 
2455   return false;
2456 }
2457 
2458 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2459    true if we should.  */
2460 
2461 static bool
2462 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2463 			loop_vec_info old_loop_vinfo)
2464 {
2465   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2466     return false;
2467 
2468   if (dump_enabled_p ())
2469     dump_printf_loc (MSG_NOTE, vect_location,
2470 		     "***** Preferring vector mode %s to vector mode %s\n",
2471 		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
2472 		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
2473   return true;
2474 }
2475 
2476 /* If LOOP_VINFO is already a main loop, return it unmodified.  Otherwise
2477    try to reanalyze it as a main loop.  Return the loop_vinfo on success
2478    and null on failure.  */
2479 
2480 static loop_vec_info
2481 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2482 {
2483   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2484     return loop_vinfo;
2485 
2486   if (dump_enabled_p ())
2487     dump_printf_loc (MSG_NOTE, vect_location,
2488 		     "***** Reanalyzing as a main loop with vector mode %s\n",
2489 		     GET_MODE_NAME (loop_vinfo->vector_mode));
2490 
2491   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2492   vec_info_shared *shared = loop_vinfo->shared;
2493   opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2494   gcc_assert (main_loop_vinfo);
2495 
2496   main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2497 
2498   bool fatal = false;
2499   bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2500   loop->aux = NULL;
2501   if (!res)
2502     {
2503       if (dump_enabled_p ())
2504 	dump_printf_loc (MSG_NOTE, vect_location,
2505 			 "***** Failed to analyze main loop with vector"
2506 			 " mode %s\n",
2507 			 GET_MODE_NAME (loop_vinfo->vector_mode));
2508       delete main_loop_vinfo;
2509       return NULL;
2510     }
2511   LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2512   return main_loop_vinfo;
2513 }
2514 
2515 /* Function vect_analyze_loop.
2516 
2517    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2518    for it.  The different analyses will record information in the
2519    loop_vec_info struct.  */
2520 opt_loop_vec_info
2521 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2522 {
2523   auto_vector_modes vector_modes;
2524 
2525   /* Autodetect first vector size we try.  */
2526   unsigned int autovec_flags
2527     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2528 						    loop->simdlen != 0);
2529   unsigned int mode_i = 0;
2530 
2531   DUMP_VECT_SCOPE ("analyze_loop_nest");
2532 
2533   if (loop_outer (loop)
2534       && loop_vec_info_for_loop (loop_outer (loop))
2535       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2536     return opt_loop_vec_info::failure_at (vect_location,
2537 					  "outer-loop already vectorized.\n");
2538 
2539   if (!find_loop_nest (loop, &shared->loop_nest))
2540     return opt_loop_vec_info::failure_at
2541       (vect_location,
2542        "not vectorized: loop nest containing two or more consecutive inner"
2543        " loops cannot be vectorized\n");
2544 
2545   unsigned n_stmts = 0;
2546   machine_mode autodetected_vector_mode = VOIDmode;
2547   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2548   machine_mode next_vector_mode = VOIDmode;
2549   poly_uint64 lowest_th = 0;
2550   unsigned vectorized_loops = 0;
2551   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2552 			     && !unlimited_cost_model (loop));
2553 
2554   bool vect_epilogues = false;
2555   opt_result res = opt_result::success ();
2556   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2557   while (1)
2558     {
2559       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2560       opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2561       if (!loop_vinfo)
2562 	{
2563 	  if (dump_enabled_p ())
2564 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2565 			     "bad loop form.\n");
2566 	  gcc_checking_assert (first_loop_vinfo == NULL);
2567 	  return loop_vinfo;
2568 	}
2569       loop_vinfo->vector_mode = next_vector_mode;
2570 
2571       bool fatal = false;
2572 
2573       /* When pick_lowest_cost_p is true, we should in principle iterate
2574 	 over all the loop_vec_infos that LOOP_VINFO could replace and
2575 	 try to vectorize LOOP_VINFO under the same conditions.
2576 	 E.g. when trying to replace an epilogue loop, we should vectorize
2577 	 LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
2578 	 to replace the main loop, we should vectorize LOOP_VINFO as a main
2579 	 loop too.
2580 
2581 	 However, autovectorize_vector_modes is usually sorted as follows:
2582 
2583 	 - Modes that naturally produce lower VFs usually follow modes that
2584 	   naturally produce higher VFs.
2585 
2586 	 - When modes naturally produce the same VF, maskable modes
2587 	   usually follow unmaskable ones, so that the maskable mode
2588 	   can be used to vectorize the epilogue of the unmaskable mode.
2589 
2590 	 This order is preferred because it leads to the maximum
2591 	 epilogue vectorization opportunities.  Targets should only use
2592 	 a different order if they want to make wide modes available while
2593 	 disparaging them relative to earlier, smaller modes.  The assumption
2594 	 in that case is that the wider modes are more expensive in some
2595 	 way that isn't reflected directly in the costs.
2596 
2597 	 There should therefore be few interesting cases in which
2598 	 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2599 	 treated as a standalone loop, and ends up being genuinely cheaper
2600 	 than FIRST_LOOP_VINFO.  */
2601       if (vect_epilogues)
2602 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2603 
2604       res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2605       if (mode_i == 0)
2606 	autodetected_vector_mode = loop_vinfo->vector_mode;
2607       if (dump_enabled_p ())
2608 	{
2609 	  if (res)
2610 	    dump_printf_loc (MSG_NOTE, vect_location,
2611 			     "***** Analysis succeeded with vector mode %s\n",
2612 			     GET_MODE_NAME (loop_vinfo->vector_mode));
2613 	  else
2614 	    dump_printf_loc (MSG_NOTE, vect_location,
2615 			     "***** Analysis failed with vector mode %s\n",
2616 			     GET_MODE_NAME (loop_vinfo->vector_mode));
2617 	}
2618 
2619       loop->aux = NULL;
2620 
2621       if (!fatal)
2622 	while (mode_i < vector_modes.length ()
2623 	       && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2624 	  {
2625 	    if (dump_enabled_p ())
2626 	      dump_printf_loc (MSG_NOTE, vect_location,
2627 			       "***** The result for vector mode %s would"
2628 			       " be the same\n",
2629 			       GET_MODE_NAME (vector_modes[mode_i]));
2630 	    mode_i += 1;
2631 	  }
2632 
2633       if (res)
2634 	{
2635 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2636 	  vectorized_loops++;
2637 
2638 	  /* Once we hit the desired simdlen for the first time,
2639 	     discard any previous attempts.  */
2640 	  if (simdlen
2641 	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2642 	    {
2643 	      delete first_loop_vinfo;
2644 	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
2645 	      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2646 	      simdlen = 0;
2647 	    }
2648 	  else if (pick_lowest_cost_p && first_loop_vinfo)
2649 	    {
2650 	      /* Keep trying to roll back vectorization attempts while the
2651 		 loop_vec_infos they produced were worse than this one.  */
2652 	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2653 	      while (!vinfos.is_empty ()
2654 		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2655 		{
2656 		  gcc_assert (vect_epilogues);
2657 		  delete vinfos.pop ();
2658 		}
2659 	      if (vinfos.is_empty ()
2660 		  && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2661 		{
2662 		  loop_vec_info main_loop_vinfo
2663 		    = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
2664 		  if (main_loop_vinfo == loop_vinfo)
2665 		    {
2666 		      delete first_loop_vinfo;
2667 		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
2668 		    }
2669 		  else if (main_loop_vinfo
2670 			   && vect_joust_loop_vinfos (main_loop_vinfo,
2671 						      first_loop_vinfo))
2672 		    {
2673 		      delete first_loop_vinfo;
2674 		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
2675 		      delete loop_vinfo;
2676 		      loop_vinfo
2677 			= opt_loop_vec_info::success (main_loop_vinfo);
2678 		    }
2679 		  else
2680 		    delete main_loop_vinfo;
2681 		}
2682 	    }
2683 
2684 	  if (first_loop_vinfo == NULL)
2685 	    {
2686 	      first_loop_vinfo = loop_vinfo;
2687 	      lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2688 	    }
2689 	  else if (vect_epilogues
2690 		   /* For now only allow one epilogue loop.  */
2691 		   && first_loop_vinfo->epilogue_vinfos.is_empty ())
2692 	    {
2693 	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2694 	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2695 	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2696 			  || maybe_ne (lowest_th, 0U));
2697 	      /* Keep track of the known smallest versioning
2698 		 threshold.  */
2699 	      if (ordered_p (lowest_th, th))
2700 		lowest_th = ordered_min (lowest_th, th);
2701 	    }
2702 	  else
2703 	    delete loop_vinfo;
2704 
2705 	  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2706 	     enabled, SIMDUID is not set, it is the innermost loop and we have
2707 	     either already found the loop's SIMDLEN or there was no SIMDLEN to
2708 	     begin with.
2709 	     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
2710 	  vect_epilogues = (!simdlen
2711 			    && loop->inner == NULL
2712 			    && param_vect_epilogues_nomask
2713 			    && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2714 			    && !loop->simduid
2715 			    /* For now only allow one epilogue loop, but allow
2716 			       pick_lowest_cost_p to replace it.  */
2717 			    && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2718 				|| pick_lowest_cost_p));
2719 
2720 	  /* Commit to first_loop_vinfo if we have no reason to try
2721 	     alternatives.  */
2722 	  if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2723 	    break;
2724 	}
2725       else
2726 	{
2727 	  delete loop_vinfo;
2728 	  if (fatal)
2729 	    {
2730 	      gcc_checking_assert (first_loop_vinfo == NULL);
2731 	      break;
2732 	    }
2733 	}
2734 
2735       if (mode_i < vector_modes.length ()
2736 	  && VECTOR_MODE_P (autodetected_vector_mode)
2737 	  && (related_vector_mode (vector_modes[mode_i],
2738 				   GET_MODE_INNER (autodetected_vector_mode))
2739 	      == autodetected_vector_mode)
2740 	  && (related_vector_mode (autodetected_vector_mode,
2741 				   GET_MODE_INNER (vector_modes[mode_i]))
2742 	      == vector_modes[mode_i]))
2743 	{
2744 	  if (dump_enabled_p ())
2745 	    dump_printf_loc (MSG_NOTE, vect_location,
2746 			     "***** Skipping vector mode %s, which would"
2747 			     " repeat the analysis for %s\n",
2748 			     GET_MODE_NAME (vector_modes[mode_i]),
2749 			     GET_MODE_NAME (autodetected_vector_mode));
2750 	  mode_i += 1;
2751 	}
2752 
2753       if (mode_i == vector_modes.length ()
2754 	  || autodetected_vector_mode == VOIDmode)
2755 	break;
2756 
2757       /* Try the next biggest vector size.  */
2758       next_vector_mode = vector_modes[mode_i++];
2759       if (dump_enabled_p ())
2760 	dump_printf_loc (MSG_NOTE, vect_location,
2761 			 "***** Re-trying analysis with vector mode %s\n",
2762 			 GET_MODE_NAME (next_vector_mode));
2763     }
2764 
2765   if (first_loop_vinfo)
2766     {
2767       loop->aux = (loop_vec_info) first_loop_vinfo;
2768       if (dump_enabled_p ())
2769 	dump_printf_loc (MSG_NOTE, vect_location,
2770 			 "***** Choosing vector mode %s\n",
2771 			 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2772       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2773       return first_loop_vinfo;
2774     }
2775 
2776   return opt_loop_vec_info::propagate_failure (res);
2777 }
2778 
2779 /* Return true if there is an in-order reduction function for CODE, storing
2780    it in *REDUC_FN if so.  */
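/* An in-order (fold-left) reduction evaluates the scalar operations in their
   original order, e.g. ((((init + x0) + x1) + x2) + x3), rather than summing
   partial vector results, which matters for non-associative floating-point
   math.  */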
2781 
2782 static bool
2783 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2784 {
2785   switch (code)
2786     {
2787     case PLUS_EXPR:
2788       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2789       return true;
2790 
2791     default:
2792       return false;
2793     }
2794 }
2795 
2796 /* Function reduction_fn_for_scalar_code
2797 
2798    Input:
2799    CODE - tree_code of the reduction operation.
2800 
2801    Output:
2802    REDUC_FN - the corresponding internal function to be used to reduce the
2803       vector of partial results into a single scalar result, or IFN_LAST
2804       if the operation is a supported reduction operation, but does not have
2805       such an internal function.
2806 
2807    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
2808 
2809 static bool
2810 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2811 {
2812   switch (code)
2813     {
2814       case MAX_EXPR:
2815         *reduc_fn = IFN_REDUC_MAX;
2816         return true;
2817 
2818       case MIN_EXPR:
2819         *reduc_fn = IFN_REDUC_MIN;
2820         return true;
2821 
2822       case PLUS_EXPR:
2823         *reduc_fn = IFN_REDUC_PLUS;
2824         return true;
2825 
2826       case BIT_AND_EXPR:
2827 	*reduc_fn = IFN_REDUC_AND;
2828 	return true;
2829 
2830       case BIT_IOR_EXPR:
2831 	*reduc_fn = IFN_REDUC_IOR;
2832 	return true;
2833 
2834       case BIT_XOR_EXPR:
2835 	*reduc_fn = IFN_REDUC_XOR;
2836 	return true;
2837 
2838       case MULT_EXPR:
2839       case MINUS_EXPR:
2840         *reduc_fn = IFN_LAST;
2841         return true;
2842 
2843       default:
2844        return false;
2845     }
2846 }
2847 
2848 /* If there is a neutral value X such that SLP reduction NODE would not
2849    be affected by the introduction of additional X elements, return that X,
2850    otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
2851    is the vector type that would hold element X.  REDUC_CHAIN is true if
2852    the SLP statements perform a single reduction, false if each statement
2853    performs an independent reduction.  */
2854 
2855 static tree
2856 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2857 			      tree_code code, bool reduc_chain)
2858 {
2859   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2860   stmt_vec_info stmt_vinfo = stmts[0];
2861   tree scalar_type = TREE_TYPE (vector_type);
2862   class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2863   gcc_assert (loop);
2864 
2865   switch (code)
2866     {
2867     case WIDEN_SUM_EXPR:
2868     case DOT_PROD_EXPR:
2869     case SAD_EXPR:
2870     case PLUS_EXPR:
2871     case MINUS_EXPR:
2872     case BIT_IOR_EXPR:
2873     case BIT_XOR_EXPR:
2874       return build_zero_cst (scalar_type);
2875 
2876     case MULT_EXPR:
2877       return build_one_cst (scalar_type);
2878 
2879     case BIT_AND_EXPR:
2880       return build_all_ones_cst (scalar_type);
2881 
2882     case MAX_EXPR:
2883     case MIN_EXPR:
2884       /* For MIN/MAX the initial values are neutral.  A reduction chain
2885 	 has only a single initial value, so that value is neutral for
2886 	 all statements.  */
2887       if (reduc_chain)
2888 	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2889 				      loop_preheader_edge (loop));
2890       return NULL_TREE;
2891 
2892     default:
2893       return NULL_TREE;
2894     }
2895 }
2896 
2897 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2898    STMT is printed with a message MSG. */
2899 
2900 static void
2901 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2902 {
2903   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2904 }
2905 
2906 /* Return true if we need an in-order reduction for operation CODE
2907    on type TYPE, i.e. if the scalar operations cannot safely be
2908    reassociated or might trap on overflow.  */
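/* For example, a floating-point "sum += a[i]" reduction compiled without
   -fassociative-math must be computed in order (via a fold-left reduction),
   whereas MIN and MAX reductions are insensitive to evaluation order and
   need no such handling.  */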
2909 
2910 bool
2911 needs_fold_left_reduction_p (tree type, tree_code code)
2912 {
2913   /* CHECKME: check for !flag_finite_math_only too?  */
2914   if (SCALAR_FLOAT_TYPE_P (type))
2915     switch (code)
2916       {
2917       case MIN_EXPR:
2918       case MAX_EXPR:
2919 	return false;
2920 
2921       default:
2922 	return !flag_associative_math;
2923       }
2924 
2925   if (INTEGRAL_TYPE_P (type))
2926     {
2927       if (!operation_no_trapping_overflow (type, code))
2928 	return true;
2929       return false;
2930     }
2931 
2932   if (SAT_FIXED_POINT_TYPE_P (type))
2933     return true;
2934 
2935   return false;
2936 }
2937 
2938 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2939    has a handled computation expression.  Store the main reduction
2940    operation in *CODE.  */
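
/* As a rough illustration (SSA names made up): given the loop body

     s_2 = s_1 + a[i];
     s_3 = s_2 + b[i];

   with reduction PHI result s_1 and latch argument s_3, the path that
   is recorded walks s_3 -> s_2 -> s_1 and *CODE is set to PLUS_EXPR.  */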
2941 
2942 static bool
2943 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2944 		      tree loop_arg, enum tree_code *code,
2945 		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2946 {
2947   auto_bitmap visited;
2948   tree lookfor = PHI_RESULT (phi);
2949   ssa_op_iter curri;
2950   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2951   while (USE_FROM_PTR (curr) != loop_arg)
2952     curr = op_iter_next_use (&curri);
2953   curri.i = curri.numops;
2954   do
2955     {
2956       path.safe_push (std::make_pair (curri, curr));
2957       tree use = USE_FROM_PTR (curr);
2958       if (use == lookfor)
2959 	break;
2960       gimple *def = SSA_NAME_DEF_STMT (use);
2961       if (gimple_nop_p (def)
2962 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2963 	{
2964 pop:
2965 	  do
2966 	    {
2967 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2968 	      curri = x.first;
2969 	      curr = x.second;
2970 	      do
2971 		curr = op_iter_next_use (&curri);
2972 	      /* Skip already visited or non-SSA operands (from iterating
2973 	         over PHI args).  */
2974 	      while (curr != NULL_USE_OPERAND_P
2975 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2976 			 || ! bitmap_set_bit (visited,
2977 					      SSA_NAME_VERSION
2978 					        (USE_FROM_PTR (curr)))));
2979 	    }
2980 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2981 	  if (curr == NULL_USE_OPERAND_P)
2982 	    break;
2983 	}
2984       else
2985 	{
2986 	  if (gimple_code (def) == GIMPLE_PHI)
2987 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2988 	  else
2989 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2990 	  while (curr != NULL_USE_OPERAND_P
2991 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2992 		     || ! bitmap_set_bit (visited,
2993 					  SSA_NAME_VERSION
2994 					    (USE_FROM_PTR (curr)))))
2995 	    curr = op_iter_next_use (&curri);
2996 	  if (curr == NULL_USE_OPERAND_P)
2997 	    goto pop;
2998 	}
2999     }
3000   while (1);
3001   if (dump_file && (dump_flags & TDF_DETAILS))
3002     {
3003       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3004       unsigned i;
3005       std::pair<ssa_op_iter, use_operand_p> *x;
3006       FOR_EACH_VEC_ELT (path, i, x)
3007 	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3008       dump_printf (MSG_NOTE, "\n");
3009     }
3010 
3011   /* Check whether the reduction path detected is valid.  */
3012   bool fail = path.length () == 0;
3013   bool neg = false;
3014   int sign = -1;
3015   *code = ERROR_MARK;
3016   for (unsigned i = 1; i < path.length (); ++i)
3017     {
3018       gimple *use_stmt = USE_STMT (path[i].second);
3019       tree op = USE_FROM_PTR (path[i].second);
3020       if (! is_gimple_assign (use_stmt)
3021 	  /* The following makes sure we can compute the operand index
3022 	     easily, plus it mostly disallows chaining via COND_EXPR
3023 	     condition operands.  */
3024 	  || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3025 	      && (gimple_num_ops (use_stmt) <= 2
3026 		  || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3027 	      && (gimple_num_ops (use_stmt) <= 3
3028 		  || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3029 	{
3030 	  fail = true;
3031 	  break;
3032 	}
3033       tree_code use_code = gimple_assign_rhs_code (use_stmt);
3034       if (use_code == MINUS_EXPR)
3035 	{
3036 	  use_code = PLUS_EXPR;
3037 	  /* Track whether we negate the reduction value each iteration.  */
3038 	  if (gimple_assign_rhs2 (use_stmt) == op)
3039 	    neg = ! neg;
3040 	}
3041       if (CONVERT_EXPR_CODE_P (use_code)
3042 	  && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3043 				    TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3044 	;
3045       else if (*code == ERROR_MARK)
3046 	{
3047 	  *code = use_code;
3048 	  sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3049 	}
3050       else if (use_code != *code)
3051 	{
3052 	  fail = true;
3053 	  break;
3054 	}
3055       else if ((use_code == MIN_EXPR
3056 		|| use_code == MAX_EXPR)
3057 	       && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3058 	{
3059 	  fail = true;
3060 	  break;
3061 	}
3062       /* Check that there's only a single stmt the op is used on.  For the
3063 	 non-value-changing tail and the last stmt allow out-of-loop uses.
3064 	 ???  We could relax this and handle arbitrary live stmts by
3065 	 forcing a scalar epilogue for example.  */
3066       imm_use_iterator imm_iter;
3067       gimple *op_use_stmt;
3068       unsigned cnt = 0;
3069       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3070 	if (!is_gimple_debug (op_use_stmt)
3071 	    && (*code != ERROR_MARK
3072 		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3073 	  {
3074 	    /* We want to allow x + x but not x < 1 ? x : 2.  */
3075 	    if (is_gimple_assign (op_use_stmt)
3076 		&& gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3077 	      {
3078 		use_operand_p use_p;
3079 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3080 		  cnt++;
3081 	      }
3082 	    else
3083 	      cnt++;
3084 	  }
3085       if (cnt != 1)
3086 	{
3087 	  fail = true;
3088 	  break;
3089 	}
3090     }
3091   return ! fail && ! neg && *code != ERROR_MARK;
3092 }
3093 
3094 bool
3095 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3096 		      tree loop_arg, enum tree_code code)
3097 {
3098   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3099   enum tree_code code_;
3100   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3101 	  && code_ == code);
3102 }
3103 
3104 
3105 
3106 /* Function vect_is_simple_reduction
3107 
3108    (1) Detect a cross-iteration def-use cycle that represents a simple
3109    reduction computation.  We look for the following pattern:
3110 
3111    loop_header:
3112      a1 = phi < a0, a2 >
3113      a3 = ...
3114      a2 = operation (a3, a1)
3115 
3116    or
3117 
3118    a3 = ...
3119    loop_header:
3120      a1 = phi < a0, a2 >
3121      a2 = operation (a3, a1)
3122 
3123    such that:
3124    1. operation is commutative and associative and it is safe to
3125       change the order of the computation
3126    2. no uses for a2 in the loop (a2 is used out of the loop)
3127    3. no uses of a1 in the loop besides the reduction operation
3128    4. no uses of a1 outside the loop.
3129 
3130    Conditions 1,4 are tested here.
3131    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3132 
3133    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3134    nested cycles.
3135 
3136    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3137    reductions:
3138 
3139      a1 = phi < a0, a2 >
3140      inner loop (def of a3)
3141      a2 = phi < a3 >
3142 
3143    (4) Detect condition expressions, i.e.:
3144      for (int i = 0; i < N; i++)
3145        if (a[i] < val)
3146 	ret_val = a[i];
3147 
3148 */
3149 
3150 static stmt_vec_info
3151 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3152 			  bool *double_reduc, bool *reduc_chain_p)
3153 {
3154   gphi *phi = as_a <gphi *> (phi_info->stmt);
3155   gimple *phi_use_stmt = NULL;
3156   imm_use_iterator imm_iter;
3157   use_operand_p use_p;
3158 
3159   *double_reduc = false;
3160   *reduc_chain_p = false;
3161   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3162 
3163   tree phi_name = PHI_RESULT (phi);
3164   /* ???  If there are no uses of the PHI result the inner loop reduction
3165      won't be detected as possibly double-reduction by vectorizable_reduction
3166      because that tries to walk the PHI arg from the preheader edge which
3167      can be constant.  See PR60382.  */
3168   if (has_zero_uses (phi_name))
3169     return NULL;
3170   class loop *loop = (gimple_bb (phi))->loop_father;
3171   unsigned nphi_def_loop_uses = 0;
3172   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3173     {
3174       gimple *use_stmt = USE_STMT (use_p);
3175       if (is_gimple_debug (use_stmt))
3176 	continue;
3177 
3178       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3179         {
3180           if (dump_enabled_p ())
3181 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3182 			     "intermediate value used outside loop.\n");
3183 
3184           return NULL;
3185         }
3186 
3187       nphi_def_loop_uses++;
3188       phi_use_stmt = use_stmt;
3189     }
3190 
3191   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3192   if (TREE_CODE (latch_def) != SSA_NAME)
3193     {
3194       if (dump_enabled_p ())
3195 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3196 			 "reduction: not ssa_name: %T\n", latch_def);
3197       return NULL;
3198     }
3199 
3200   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3201   if (!def_stmt_info
3202       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3203     return NULL;
3204 
3205   bool nested_in_vect_loop
3206     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3207   unsigned nlatch_def_loop_uses = 0;
3208   auto_vec<gphi *, 3> lcphis;
3209   bool inner_loop_of_double_reduc = false;
3210   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3211     {
3212       gimple *use_stmt = USE_STMT (use_p);
3213       if (is_gimple_debug (use_stmt))
3214 	continue;
3215       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3216 	nlatch_def_loop_uses++;
3217       else
3218 	{
3219 	  /* We can have more than one loop-closed PHI.  */
3220 	  lcphis.safe_push (as_a <gphi *> (use_stmt));
3221 	  if (nested_in_vect_loop
3222 	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3223 		  == vect_double_reduction_def))
3224 	    inner_loop_of_double_reduc = true;
3225 	}
3226     }
3227 
3228   /* If we are vectorizing an inner reduction, we execute it in the
3229      original order only when we are not dealing with a double
3230      reduction.  */
3231   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3232     {
3233       if (dump_enabled_p ())
3234 	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3235 			"detected nested cycle: ");
3236       return def_stmt_info;
3237     }
3238 
3239   /* When the inner loop of a double reduction ends up with more than
3240      one loop-closed PHI we have failed to classify alternate such
3241      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3242   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3243     {
3244       if (dump_enabled_p ())
3245 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3246 			 "unhandled double reduction\n");
3247       return NULL;
3248     }
3249 
3250   /* If this isn't a nested cycle or if the nested cycle reduction value
3251      is used outside of the inner loop we cannot handle uses of the reduction
3252      value.  */
3253   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3254     {
3255       if (dump_enabled_p ())
3256 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3257 			 "reduction used in loop.\n");
3258       return NULL;
3259     }
3260 
3261   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3262      defined in the inner loop.  */
3263   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3264     {
3265       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3266       if (gimple_phi_num_args (def_stmt) != 1
3267           || TREE_CODE (op1) != SSA_NAME)
3268         {
3269           if (dump_enabled_p ())
3270 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3271 			     "unsupported phi node definition.\n");
3272 
3273           return NULL;
3274         }
3275 
3276       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3277       if (gimple_bb (def1)
3278 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3279           && loop->inner
3280           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3281           && is_gimple_assign (def1)
3282 	  && is_a <gphi *> (phi_use_stmt)
3283 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3284         {
3285           if (dump_enabled_p ())
3286             report_vect_op (MSG_NOTE, def_stmt,
3287 			    "detected double reduction: ");
3288 
3289           *double_reduc = true;
3290 	  return def_stmt_info;
3291         }
3292 
3293       return NULL;
3294     }
3295 
3296   /* Look for the expression computing latch_def from the loop PHI result.  */
3297   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3298   enum tree_code code;
3299   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3300 			    path))
3301     {
3302       STMT_VINFO_REDUC_CODE (phi_info) = code;
3303       if (code == COND_EXPR && !nested_in_vect_loop)
3304 	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3305 
3306       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3307 	 reduction chain for which the additional restriction is that
3308 	 all operations in the chain are the same.  */
3309       auto_vec<stmt_vec_info, 8> reduc_chain;
3310       unsigned i;
3311       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3312       for (i = path.length () - 1; i >= 1; --i)
3313 	{
3314 	  gimple *stmt = USE_STMT (path[i].second);
3315 	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3316 	  STMT_VINFO_REDUC_IDX (stmt_info)
3317 	    = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3318 	  enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3319 	  bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3320 				     && (i == 1 || i == path.length () - 1));
3321 	  if ((stmt_code != code && !leading_conversion)
3322 	      /* We can only handle the final value in epilogue
3323 		 generation for reduction chains.  */
3324 	      || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3325 	    is_slp_reduc = false;
3326 	  /* For reduction chains we support trailing/leading
3327 	     conversions.  We do not store those in the actual chain.  */
3328 	  if (leading_conversion)
3329 	    continue;
3330 	  reduc_chain.safe_push (stmt_info);
3331 	}
3332       if (is_slp_reduc && reduc_chain.length () > 1)
3333 	{
3334 	  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3335 	    {
3336 	      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3337 	      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3338 	    }
3339 	  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3340 	  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3341 
3342 	  /* Save the chain for further analysis in SLP detection.  */
3343 	  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3344 	  REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3345 
3346 	  *reduc_chain_p = true;
3347 	  if (dump_enabled_p ())
3348 	    dump_printf_loc (MSG_NOTE, vect_location,
3349 			    "reduction: detected reduction chain\n");
3350 	}
3351       else if (dump_enabled_p ())
3352 	dump_printf_loc (MSG_NOTE, vect_location,
3353 			 "reduction: detected reduction\n");
3354 
3355       return def_stmt_info;
3356     }
3357 
3358   if (dump_enabled_p ())
3359     dump_printf_loc (MSG_NOTE, vect_location,
3360 		     "reduction: unknown pattern\n");
3361 
3362   return NULL;
3363 }
3364 
3365 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3366 int
3367 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3368                              int *peel_iters_epilogue,
3369                              stmt_vector_for_cost *scalar_cost_vec,
3370 			     stmt_vector_for_cost *prologue_cost_vec,
3371 			     stmt_vector_for_cost *epilogue_cost_vec)
3372 {
3373   int retval = 0;
3374   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3375 
3376   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3377     {
3378       *peel_iters_epilogue = assumed_vf / 2;
3379       if (dump_enabled_p ())
3380         dump_printf_loc (MSG_NOTE, vect_location,
3381 			 "cost model: epilogue peel iters set to vf/2 "
3382 			 "because loop iterations are unknown.\n");
3383 
3384       /* If peeled iterations are known but the number of scalar loop
3385          iterations is unknown, count a taken branch per peeled loop.  */
3386       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3387 				 NULL, 0, vect_prologue);
3388       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3389 				  NULL, 0, vect_epilogue);
3390     }
3391   else
3392     {
3393       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3394       peel_iters_prologue = niters < peel_iters_prologue ?
3395                             niters : peel_iters_prologue;
3396       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
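      /* Worked example with illustrative numbers: NITERS == 100,
	 PEEL_ITERS_PROLOGUE == 3 and ASSUMED_VF == 8 leave
	 (100 - 3) % 8 == 1 iteration for the epilogue.  */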
3397       /* If we need to peel for gaps, but no peeling is required, we have to
3398 	 peel VF iterations.  */
3399       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3400 	*peel_iters_epilogue = assumed_vf;
3401     }
3402 
3403   stmt_info_for_cost *si;
3404   int j;
3405   if (peel_iters_prologue)
3406     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3407       retval += record_stmt_cost (prologue_cost_vec,
3408 				  si->count * peel_iters_prologue,
3409 				  si->kind, si->stmt_info, si->misalign,
3410 				  vect_prologue);
3411   if (*peel_iters_epilogue)
3412     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3413       retval += record_stmt_cost (epilogue_cost_vec,
3414 				  si->count * *peel_iters_epilogue,
3415 				  si->kind, si->stmt_info, si->misalign,
3416 				  vect_epilogue);
3417 
3418   return retval;
3419 }
3420 
3421 /* Function vect_estimate_min_profitable_iters
3422 
3423    Return the number of iterations required for the vector version of the
3424    loop to be profitable relative to the cost of the scalar version of the
3425    loop.
3426 
3427    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3428    of iterations for vectorization.  A value of -1 means loop vectorization
3429    is not profitable.  This returned value may be used for a dynamic
3430    profitability check.
3431 
3432    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3433    for static check against estimated number of iterations.  */
3434 
3435 static void
3436 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3437 				    int *ret_min_profitable_niters,
3438 				    int *ret_min_profitable_estimate)
3439 {
3440   int min_profitable_iters;
3441   int min_profitable_estimate;
3442   int peel_iters_prologue;
3443   int peel_iters_epilogue;
3444   unsigned vec_inside_cost = 0;
3445   int vec_outside_cost = 0;
3446   unsigned vec_prologue_cost = 0;
3447   unsigned vec_epilogue_cost = 0;
3448   int scalar_single_iter_cost = 0;
3449   int scalar_outside_cost = 0;
3450   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3451   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3452   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3453 
3454   /* Cost model disabled.  */
3455   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3456     {
3457       if (dump_enabled_p ())
3458 	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3459       *ret_min_profitable_niters = 0;
3460       *ret_min_profitable_estimate = 0;
3461       return;
3462     }
3463 
3464   /* Requires loop versioning tests to handle misalignment.  */
3465   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3466     {
3467       /*  FIXME: Make cost depend on complexity of individual check.  */
3468       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3469       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3470 			    vect_prologue);
3471       if (dump_enabled_p ())
3472 	dump_printf (MSG_NOTE,
3473 		     "cost model: Adding cost of checks for loop "
3474 		     "versioning to treat misalignment.\n");
3475     }
3476 
3477   /* Requires loop versioning with alias checks.  */
3478   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3479     {
3480       /*  FIXME: Make cost depend on complexity of individual check.  */
3481       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3482       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3483 			    vect_prologue);
3484       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3485       if (len)
3486 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3487 	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3488 			      NULL, 0, vect_prologue);
3489       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3490       if (len)
3491 	{
3492 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3493 	  unsigned int nstmts = len * 2 - 1;
3494 	  /* +1 for each bias that needs adding.  */
3495 	  for (unsigned int i = 0; i < len; ++i)
3496 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3497 	      nstmts += 1;
3498 	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3499 				NULL, 0, vect_prologue);
3500 	}
3501       if (dump_enabled_p ())
3502 	dump_printf (MSG_NOTE,
3503 		     "cost model: Adding cost of checks for loop "
3504 		     "versioning aliasing.\n");
3505     }
3506 
3507   /* Requires loop versioning with niter checks.  */
3508   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3509     {
3510       /*  FIXME: Make cost depend on complexity of individual check.  */
3511       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3512 			    vect_prologue);
3513       if (dump_enabled_p ())
3514 	dump_printf (MSG_NOTE,
3515 		     "cost model: Adding cost of checks for loop "
3516 		     "versioning niters.\n");
3517     }
3518 
3519   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3520     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3521 			  vect_prologue);
3522 
3523   /* Count statements in scalar loop.  Using this as scalar cost for a single
3524      iteration for now.
3525 
3526      TODO: Add outer loop support.
3527 
3528      TODO: Consider assigning different costs to different scalar
3529      statements.  */
3530 
3531   scalar_single_iter_cost
3532     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3533 
3534   /* Add additional cost for the peeled instructions in prologue and epilogue
3535      loop.  (For fully-masked loops there will be no peeling.)
3536 
3537      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3538      at compile time, we assume it's vf/2 (the worst case would be vf-1).
3539 
3540      TODO: Build an expression that represents peel_iters for prologue and
3541      epilogue to be used in a run-time test.  */
3542 
3543   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3544     {
3545       peel_iters_prologue = 0;
3546       peel_iters_epilogue = 0;
3547 
3548       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3549 	{
3550 	  /* We need to peel exactly one iteration.  */
3551 	  peel_iters_epilogue += 1;
3552 	  stmt_info_for_cost *si;
3553 	  int j;
3554 	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3555 			    j, si)
3556 	    (void) add_stmt_cost (target_cost_data, si->count,
3557 				  si->kind, si->stmt_info, si->misalign,
3558 				  vect_epilogue);
3559 	}
3560 
3561       /* Calculate how many masks we need to generate.  */
3562       unsigned int num_masks = 0;
3563       rgroup_masks *rgm;
3564       unsigned int num_vectors_m1;
3565       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3566 	if (rgm->mask_type)
3567 	  num_masks += num_vectors_m1 + 1;
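      /* For instance (purely illustrative): if mask rgroups exist at
	 indices 0 and 1, i.e. groups needing one and two vectors per
	 iteration, this counts 1 + 2 == 3 masks.  */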
3568       gcc_assert (num_masks > 0);
3569 
3570       /* In the worst case, we need to generate each mask in the prologue
3571 	 and in the loop body.  One of the loop body mask instructions
3572 	 replaces the comparison in the scalar loop, and since we don't
3573 	 count the scalar comparison against the scalar body, we shouldn't
3574 	 count that vector instruction against the vector body either.
3575 
3576 	 Sometimes we can use unpacks instead of generating prologue
3577 	 masks and sometimes the prologue mask will fold to a constant,
3578 	 so the actual prologue cost might be smaller.  However, it's
3579 	 simpler and safer to use the worst-case cost; if this ends up
3580 	 being the tie-breaker between vectorizing or not, then it's
3581 	 probably better not to vectorize.  */
3582       (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3583 			    NULL, 0, vect_prologue);
3584       (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3585 			    NULL, 0, vect_body);
3586     }
3587   else if (npeel < 0)
3588     {
3589       peel_iters_prologue = assumed_vf / 2;
3590       if (dump_enabled_p ())
3591 	dump_printf (MSG_NOTE, "cost model: "
3592 		     "prologue peel iters set to vf/2.\n");
3593 
3594       /* If peeling for alignment is unknown, the loop bound of the main
3595          loop becomes unknown.  */
3596       peel_iters_epilogue = assumed_vf / 2;
3597       if (dump_enabled_p ())
3598 	dump_printf (MSG_NOTE, "cost model: "
3599 		     "epilogue peel iters set to vf/2 because "
3600 		     "peeling for alignment is unknown.\n");
3601 
3602       /* If peeled iterations are unknown, count a taken branch and a not taken
3603          branch per peeled loop. Even if scalar loop iterations are known,
3604          vector iterations are not known since peeled prologue iterations are
3605          not known. Hence guards remain the same.  */
3606       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3607 			    NULL, 0, vect_prologue);
3608       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3609 			    NULL, 0, vect_prologue);
3610       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3611 			    NULL, 0, vect_epilogue);
3612       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3613 			    NULL, 0, vect_epilogue);
3614       stmt_info_for_cost *si;
3615       int j;
3616       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3617 	{
3618 	  (void) add_stmt_cost (target_cost_data,
3619 				si->count * peel_iters_prologue,
3620 				si->kind, si->stmt_info, si->misalign,
3621 				vect_prologue);
3622 	  (void) add_stmt_cost (target_cost_data,
3623 				si->count * peel_iters_epilogue,
3624 				si->kind, si->stmt_info, si->misalign,
3625 				vect_epilogue);
3626 	}
3627     }
3628   else
3629     {
3630       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3631       stmt_info_for_cost *si;
3632       int j;
3633       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3634 
3635       prologue_cost_vec.create (2);
3636       epilogue_cost_vec.create (2);
3637       peel_iters_prologue = npeel;
3638 
3639       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3640 					  &peel_iters_epilogue,
3641 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3642 					    (loop_vinfo),
3643 					  &prologue_cost_vec,
3644 					  &epilogue_cost_vec);
3645 
3646       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3647 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3648 			      si->misalign, vect_prologue);
3649 
3650       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3651 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3652 			      si->misalign, vect_epilogue);
3653 
3654       prologue_cost_vec.release ();
3655       epilogue_cost_vec.release ();
3656     }
3657 
3658   /* FORNOW: The scalar outside cost is incremented in one of the
3659      following ways:
3660 
3661      1. The vectorizer checks for alignment and aliasing and generates
3662      a condition that allows dynamic vectorization.  A cost model
3663      check is ANDed with the versioning condition.  Hence the scalar code
3664      path now has the added cost of the versioning check.
3665 
3666        if (cost > th & versioning_check)
3667          jmp to vector code
3668 
3669      Hence run-time scalar is incremented by not-taken branch cost.
3670 
3671      2. The vectorizer then checks if a prologue is required.  If the
3672      cost model check was not done before during versioning, it has to
3673      be done before the prologue check.
3674 
3675        if (cost <= th)
3676          prologue = scalar_iters
3677        if (prologue == 0)
3678          jmp to vector code
3679        else
3680          execute prologue
3681        if (prologue == num_iters)
3682 	 go to exit
3683 
3684      Hence the run-time scalar cost is incremented by a taken branch,
3685      plus a not-taken branch, plus a taken branch cost.
3686 
3687      3. The vectorizer then checks if an epilogue is required.  If the
3688      cost model check was not done before during prologue check, it
3689      has to be done with the epilogue check.
3690 
3691        if (prologue == 0)
3692          jmp to vector code
3693        else
3694          execute prologue
3695        if (prologue == num_iters)
3696 	 go to exit
3697        vector code:
3698          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3699            jmp to epilogue
3700 
3701      Hence the run-time scalar cost should be incremented by 2 taken
3702      branches.
3703 
3704      TODO: The back end may reorder the BBs differently and reverse
3705      conditions/branch directions.  Change the estimates below to
3706      something more reasonable.  */
3707 
3708   /* If the number of iterations is known and we do not do versioning, we can
3709      decide whether to vectorize at compile time.  Hence the scalar version
3710      does not carry cost model guard costs.  */
3711   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3712       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3713     {
3714       /* Cost model check occurs at versioning.  */
3715       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3716 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3717       else
3718 	{
3719 	  /* Cost model check occurs at prologue generation.  */
3720 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3721 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3722 	      + vect_get_stmt_cost (cond_branch_not_taken);
3723 	  /* Cost model check occurs at epilogue generation.  */
3724 	  else
3725 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3726 	}
3727     }
3728 
3729   /* Complete the target-specific cost calculations.  */
3730   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3731 	       &vec_inside_cost, &vec_epilogue_cost);
3732 
3733   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3734 
3735   /* Stash the costs so that we can compare two loop_vec_infos.  */
3736   loop_vinfo->vec_inside_cost = vec_inside_cost;
3737   loop_vinfo->vec_outside_cost = vec_outside_cost;
3738 
3739   if (dump_enabled_p ())
3740     {
3741       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3742       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3743                    vec_inside_cost);
3744       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3745                    vec_prologue_cost);
3746       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3747                    vec_epilogue_cost);
3748       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3749                    scalar_single_iter_cost);
3750       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3751                    scalar_outside_cost);
3752       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3753                    vec_outside_cost);
3754       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3755                    peel_iters_prologue);
3756       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3757                    peel_iters_epilogue);
3758     }
3759 
3760   /* Calculate number of iterations required to make the vector version
3761      profitable, relative to the loop bodies only.  The following condition
3762      must hold true:
3763      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3764      where
3765      SIC = scalar iteration cost, VIC = vector iteration cost,
3766      VOC = vector outside cost, VF = vectorization factor,
3767      NPEEL = prologue iterations + epilogue iterations,
3768      SOC = scalar outside cost for run time cost model check.  */
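
  /* Worked example with made-up costs: SIC == 4, VIC == 6 and VF == 4
     give a saving per vector iteration of 4 * 4 - 6 == 10, i.e. each
     vector iteration is estimated to save 10 cost units over the four
     scalar iterations it replaces; those savings then have to amortize
     the outside costs accounted for below.  */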
3769 
3770   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3771 			  - vec_inside_cost);
3772   if (saving_per_viter <= 0)
3773     {
3774       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3775 	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3776 		    "vectorization did not happen for a simd loop");
3777 
3778       if (dump_enabled_p ())
3779         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3780 			 "cost model: the vector iteration cost = %d "
3781 			 "divided by the scalar iteration cost = %d "
3782 			 "is greater or equal to the vectorization factor = %d"
3783                          ".\n",
3784 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3785       *ret_min_profitable_niters = -1;
3786       *ret_min_profitable_estimate = -1;
3787       return;
3788     }
3789 
3790   /* ??? The "if" arm is written to handle all cases; see below for what
3791      we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
3792   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3793     {
3794       /* Rewriting the condition above in terms of the number of
3795 	 vector iterations (vniters) rather than the number of
3796 	 scalar iterations (niters) gives:
3797 
3798 	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3799 
3800 	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3801 
3802 	 For integer N, X and Y when X > 0:
3803 
3804 	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
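      /* E.g. (illustrative numbers) Y == 17 and X == 5 give
	 N >= 17 / 5 + 1 == 4: indeed 4 * 5 == 20 > 17, whereas
	 3 * 5 == 15 is not.  */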
3805       int outside_overhead = (vec_outside_cost
3806 			      - scalar_single_iter_cost * peel_iters_prologue
3807 			      - scalar_single_iter_cost * peel_iters_epilogue
3808 			      - scalar_outside_cost);
3809       /* We're only interested in cases that require at least one
3810 	 vector iteration.  */
3811       int min_vec_niters = 1;
3812       if (outside_overhead > 0)
3813 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3814 
3815       if (dump_enabled_p ())
3816 	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
3817 		     min_vec_niters);
3818 
3819       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3820 	{
3821 	  /* Now that we know the minimum number of vector iterations,
3822 	     find the minimum niters for which the scalar cost is larger:
3823 
3824 	     SIC * niters > VIC * vniters + VOC - SOC
3825 
3826 	     We know that the minimum niters is no more than
3827 	     vniters * VF + NPEEL, but it might be (and often is) less
3828 	     than that if a partial vector iteration is cheaper than the
3829 	     equivalent scalar code.  */
3830 	  int threshold = (vec_inside_cost * min_vec_niters
3831 			   + vec_outside_cost
3832 			   - scalar_outside_cost);
3833 	  if (threshold <= 0)
3834 	    min_profitable_iters = 1;
3835 	  else
3836 	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3837 	}
3838       else
3839 	/* Convert the number of vector iterations into a number of
3840 	   scalar iterations.  */
3841 	min_profitable_iters = (min_vec_niters * assumed_vf
3842 				+ peel_iters_prologue
3843 				+ peel_iters_epilogue);
3844     }
3845   else
3846     {
3847       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3848 			      * assumed_vf
3849 			      - vec_inside_cost * peel_iters_prologue
3850 			      - vec_inside_cost * peel_iters_epilogue);
3851       if (min_profitable_iters <= 0)
3852         min_profitable_iters = 0;
3853       else
3854 	{
3855 	  min_profitable_iters /= saving_per_viter;
3856 
3857 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3858 	      <= (((int) vec_inside_cost * min_profitable_iters)
3859 		  + (((int) vec_outside_cost - scalar_outside_cost)
3860 		     * assumed_vf)))
3861 	    min_profitable_iters++;
3862 	}
3863     }
3864 
3865   if (dump_enabled_p ())
3866     dump_printf (MSG_NOTE,
3867 		 "  Calculated minimum iters for profitability: %d\n",
3868 		 min_profitable_iters);
3869 
3870   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3871       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3872     /* We want the vectorized loop to execute at least once.  */
3873     min_profitable_iters = assumed_vf + peel_iters_prologue;
3874 
3875   if (dump_enabled_p ())
3876     dump_printf_loc (MSG_NOTE, vect_location,
3877                      "  Runtime profitability threshold = %d\n",
3878                      min_profitable_iters);
3879 
3880   *ret_min_profitable_niters = min_profitable_iters;
3881 
3882   /* Calculate number of iterations required to make the vector version
3883      profitable, relative to the loop bodies only.
3884 
3885      The non-vectorized variant costs SIC * niters and must win over the
3886      vector variant on the expected loop trip count, i.e. we need:
3887      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3888 
3889   if (vec_outside_cost <= 0)
3890     min_profitable_estimate = 0;
3891   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3892     {
3893       /* This is a repeat of the code above, but with + SOC rather
3894 	 than - SOC.  */
3895       int outside_overhead = (vec_outside_cost
3896 			      - scalar_single_iter_cost * peel_iters_prologue
3897 			      - scalar_single_iter_cost * peel_iters_epilogue
3898 			      + scalar_outside_cost);
3899       int min_vec_niters = 1;
3900       if (outside_overhead > 0)
3901 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3902 
3903       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3904 	{
3905 	  int threshold = (vec_inside_cost * min_vec_niters
3906 			   + vec_outside_cost
3907 			   + scalar_outside_cost);
3908 	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3909 	}
3910       else
3911 	min_profitable_estimate = (min_vec_niters * assumed_vf
3912 				   + peel_iters_prologue
3913 				   + peel_iters_epilogue);
3914     }
3915   else
3916     {
3917       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3918 				 * assumed_vf
3919 				 - vec_inside_cost * peel_iters_prologue
3920 				 - vec_inside_cost * peel_iters_epilogue)
3921 				 / ((scalar_single_iter_cost * assumed_vf)
3922 				   - vec_inside_cost);
3923     }
3924   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3925   if (dump_enabled_p ())
3926     dump_printf_loc (MSG_NOTE, vect_location,
3927 		     "  Static estimate profitability threshold = %d\n",
3928 		     min_profitable_estimate);
3929 
3930   *ret_min_profitable_estimate = min_profitable_estimate;
3931 }
3932 
3933 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3934    vector elements (not bits) for a vector with NELT elements.  */
3935 static void
3936 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3937 			      vec_perm_builder *sel)
3938 {
3939   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3940      by vec_perm_indices.  */
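  /* Illustration: OFFSET == 2 and NELT == 8 push {2, 3, 4}; the stepped
     encoding then stands for the full selector {2, 3, ..., 9}, i.e.
     lane I of the result reads element I + 2 of the (conceptually
     concatenated) input vectors.  */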
3941   sel->new_vector (nelt, 1, 3);
3942   for (unsigned int i = 0; i < 3; i++)
3943     sel->quick_push (i + offset);
3944 }
3945 
3946 /* Checks whether the target supports whole-vector shifts for vectors of mode
3947    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3948    it supports vec_perm_const with masks for all necessary shift amounts.  */
3949 static bool
3950 have_whole_vector_shift (machine_mode mode)
3951 {
3952   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3953     return true;
3954 
3955   /* Variable-length vectors should be handled via the optab.  */
3956   unsigned int nelt;
3957   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3958     return false;
3959 
3960   vec_perm_builder sel;
3961   vec_perm_indices indices;
3962   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3963     {
3964       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3965       indices.new_vector (sel, 2, nelt);
3966       if (!can_vec_perm_const_p (mode, indices, false))
3967 	return false;
3968     }
3969   return true;
3970 }
3971 
3972 /* TODO: The vect_model_*_cost and vectorizable_* functions are closely
3973    coupled; design this better to avoid maintenance issues.  */
3974 
3975 /* Function vect_model_reduction_cost.
3976 
3977    Models cost for a reduction operation, including the vector ops
3978    generated within the strip-mine loop in some cases, the initial
3979    definition before the loop, and the epilogue code that must be generated.  */
3980 
3981 static void
3982 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3983 			   vect_reduction_type reduction_type,
3984 			   int ncopies, stmt_vector_for_cost *cost_vec)
3985 {
3986   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3987   enum tree_code code;
3988   optab optab;
3989   tree vectype;
3990   machine_mode mode;
3991   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3992   class loop *loop = NULL;
3993 
3994   if (loop_vinfo)
3995     loop = LOOP_VINFO_LOOP (loop_vinfo);
3996 
3997   /* Condition reductions generate two reductions in the loop.  */
3998   if (reduction_type == COND_REDUCTION)
3999     ncopies *= 2;
4000 
4001   vectype = STMT_VINFO_VECTYPE (stmt_info);
4002   mode = TYPE_MODE (vectype);
4003   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4004 
4005   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4006 
4007   if (reduction_type == EXTRACT_LAST_REDUCTION)
4008     /* No extra instructions are needed in the prologue.  The loop body
4009        operations are costed in vectorizable_condition.  */
4010     inside_cost = 0;
4011   else if (reduction_type == FOLD_LEFT_REDUCTION)
4012     {
4013       /* No extra instructions needed in the prologue.  */
4014       prologue_cost = 0;
4015 
4016       if (reduc_fn != IFN_LAST)
4017 	/* Count one reduction-like operation per vector.  */
4018 	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4019 					stmt_info, 0, vect_body);
4020       else
4021 	{
4022 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4023 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4024 	  inside_cost = record_stmt_cost (cost_vec, nelements,
4025 					  vec_to_scalar, stmt_info, 0,
4026 					  vect_body);
4027 	  inside_cost += record_stmt_cost (cost_vec, nelements,
4028 					   scalar_stmt, stmt_info, 0,
4029 					   vect_body);
4030 	}
4031     }
4032   else
4033     {
4034       /* Add in cost for initial definition.
4035 	 For cond reduction we have four vectors: initial index, step,
4036 	 initial result of the data reduction, initial value of the index
4037 	 reduction.  */
4038       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4039       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4040 					 scalar_to_vec, stmt_info, 0,
4041 					 vect_prologue);
4042     }
4043 
4044   /* Determine cost of epilogue code.
4045 
4046      We have a reduction operator that will reduce the vector in one statement.
4047      Also requires scalar extract.  */
4048 
4049   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4050     {
4051       if (reduc_fn != IFN_LAST)
4052 	{
4053 	  if (reduction_type == COND_REDUCTION)
4054 	    {
4055 	      /* An EQ stmt and a COND_EXPR stmt.  */
4056 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4057 						 vector_stmt, stmt_info, 0,
4058 						 vect_epilogue);
4059 	      /* Reduction of the max index and a reduction of the found
4060 		 values.  */
4061 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4062 						 vec_to_scalar, stmt_info, 0,
4063 						 vect_epilogue);
4064 	      /* A broadcast of the max value.  */
4065 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4066 						 scalar_to_vec, stmt_info, 0,
4067 						 vect_epilogue);
4068 	    }
4069 	  else
4070 	    {
4071 	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4072 						 stmt_info, 0, vect_epilogue);
4073 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4074 						 vec_to_scalar, stmt_info, 0,
4075 						 vect_epilogue);
4076 	    }
4077 	}
4078       else if (reduction_type == COND_REDUCTION)
4079 	{
4080 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4081 	  /* Extraction of scalar elements.  */
4082 	  epilogue_cost += record_stmt_cost (cost_vec,
4083 					     2 * estimated_nunits,
4084 					     vec_to_scalar, stmt_info, 0,
4085 					     vect_epilogue);
4086 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4087 	  epilogue_cost += record_stmt_cost (cost_vec,
4088 					     2 * estimated_nunits - 3,
4089 					     scalar_stmt, stmt_info, 0,
4090 					     vect_epilogue);
4091 	}
4092       else if (reduction_type == EXTRACT_LAST_REDUCTION
4093 	       || reduction_type == FOLD_LEFT_REDUCTION)
4094 	/* No extra instructions are needed in the epilogue.  */
4095 	;
4096       else
4097 	{
4098 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4099 	  tree bitsize =
4100 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4101 	  int element_bitsize = tree_to_uhwi (bitsize);
4102 	  int nelements = vec_size_in_bits / element_bitsize;
4103 
4104 	  if (code == COND_EXPR)
4105 	    code = MAX_EXPR;
4106 
4107 	  optab = optab_for_tree_code (code, vectype, optab_default);
4108 
4109 	  /* We have a whole vector shift available.  */
4110 	  if (optab != unknown_optab
4111 	      && VECTOR_MODE_P (mode)
4112 	      && optab_handler (optab, mode) != CODE_FOR_nothing
4113 	      && have_whole_vector_shift (mode))
4114 	    {
4115 	      /* Final reduction via vector shifts and the reduction operator.
4116 		 Also requires scalar extract.  */
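	      /* E.g. (illustrative) NELEMENTS == 8 gives
		 exact_log2 (8) * 2 == 6 vector statements here, plus the
		 scalar extract costed just below.  */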
4117 	      epilogue_cost += record_stmt_cost (cost_vec,
4118 						 exact_log2 (nelements) * 2,
4119 						 vector_stmt, stmt_info, 0,
4120 						 vect_epilogue);
4121 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4122 						 vec_to_scalar, stmt_info, 0,
4123 						 vect_epilogue);
4124 	    }
4125 	  else
4126 	    /* Use extracts and reduction op for final reduction.  For N
4127 	       elements, we have N extracts and N-1 reduction ops.  */
4128 	    epilogue_cost += record_stmt_cost (cost_vec,
4129 					       nelements + nelements - 1,
4130 					       vector_stmt, stmt_info, 0,
4131 					       vect_epilogue);
4132 	}
4133     }
4134 
4135   if (dump_enabled_p ())
4136     dump_printf (MSG_NOTE,
4137                  "vect_model_reduction_cost: inside_cost = %d, "
4138                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4139                  prologue_cost, epilogue_cost);
4140 }
4141 
4142 
4143 /* Function vect_model_induction_cost.
4144 
4145    Models cost for induction operations.  */
4146 
4147 static void
4148 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4149 			   stmt_vector_for_cost *cost_vec)
4150 {
4151   unsigned inside_cost, prologue_cost;
4152 
4153   if (PURE_SLP_STMT (stmt_info))
4154     return;
4155 
4156   /* loop cost for vec_loop.  */
4157   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4158 				  stmt_info, 0, vect_body);
4159 
4160   /* prologue cost for vec_init and vec_step.  */
4161   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4162 				    stmt_info, 0, vect_prologue);
4163 
4164   if (dump_enabled_p ())
4165     dump_printf_loc (MSG_NOTE, vect_location,
4166                      "vect_model_induction_cost: inside_cost = %d, "
4167                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4168 }
4169 
4170 
4171 
4172 /* Function get_initial_def_for_reduction
4173 
4174    Input:
4175    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4176    INIT_VAL - the initial value of the reduction variable
4177 
4178    Output:
4179    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4180         of the reduction (used for adjusting the epilog - see below).
4181    Return a vector variable, initialized according to the operation that
4182 	STMT_VINFO performs. This vector will be used as the initial value
4183 	of the vector of partial results.
4184 
4185    Option1 (adjust in epilog): Initialize the vector as follows:
4186      add/bit or/xor:    [0,0,...,0,0]
4187      mult/bit and:      [1,1,...,1,1]
4188      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4189    and when necessary (e.g. add/mult case) let the caller know
4190    that it needs to adjust the result by init_val.
4191 
4192    Option2: Initialize the vector as follows:
4193      add/bit or/xor:    [init_val,0,0,...,0]
4194      mult/bit and:      [init_val,1,1,...,1]
4195      min/max/cond_expr: [init_val,init_val,...,init_val]
4196    and no adjustments are needed.
4197 
4198    For example, for the following code:
4199 
4200    s = init_val;
4201    for (i=0;i<n;i++)
4202      s = s + a[i];
4203 
4204    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4205    For a vector of 4 units, we want to return either [0,0,0,init_val],
4206    or [0,0,0,0] and let the caller know that it needs to adjust
4207    the result at the end by 'init_val'.
4208 
4209    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4210    initialization vector is simpler (same element in all entries), if
4211    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4212 
4213    A cost model should help decide between these two schemes.  */
4214 
4215 static tree
4216 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4217 			       enum tree_code code, tree init_val,
4218                                tree *adjustment_def)
4219 {
4220   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4221   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4222   tree scalar_type = TREE_TYPE (init_val);
4223   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4224   tree def_for_init;
4225   tree init_def;
4226   REAL_VALUE_TYPE real_init_val = dconst0;
4227   int int_init_val = 0;
4228   gimple_seq stmts = NULL;
4229 
4230   gcc_assert (vectype);
4231 
4232   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4233 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4234 
4235   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4236 	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4237 
4238   /* ADJUSTMENT_DEF is NULL when called from
4239      vect_create_epilog_for_reduction to vectorize double reduction.  */
4240   if (adjustment_def)
4241     *adjustment_def = NULL;
4242 
4243   switch (code)
4244     {
4245     case WIDEN_SUM_EXPR:
4246     case DOT_PROD_EXPR:
4247     case SAD_EXPR:
4248     case PLUS_EXPR:
4249     case MINUS_EXPR:
4250     case BIT_IOR_EXPR:
4251     case BIT_XOR_EXPR:
4252     case MULT_EXPR:
4253     case BIT_AND_EXPR:
4254       {
4255         if (code == MULT_EXPR)
4256           {
4257             real_init_val = dconst1;
4258             int_init_val = 1;
4259           }
4260 
4261         if (code == BIT_AND_EXPR)
4262           int_init_val = -1;
4263 
4264         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4265           def_for_init = build_real (scalar_type, real_init_val);
4266         else
4267           def_for_init = build_int_cst (scalar_type, int_init_val);
4268 
4269 	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4270 	  {
4271 	    /* Option1: the first element is '0' or '1' as well.  */
4272 	    if (!operand_equal_p (def_for_init, init_val, 0))
4273 	      *adjustment_def = init_val;
4274 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4275 						     def_for_init);
4276 	  }
4277 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4278 	  {
4279 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4280 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4281 						     def_for_init);
4282 	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4283 				     vectype, init_def, init_val);
4284 	  }
4285 	else
4286 	  {
4287 	    /* Option2: the first element is INIT_VAL.  */
4288 	    tree_vector_builder elts (vectype, 1, 2);
4289 	    elts.quick_push (init_val);
4290 	    elts.quick_push (def_for_init);
4291 	    init_def = gimple_build_vector (&stmts, &elts);
4292 	  }
4293       }
4294       break;
4295 
4296     case MIN_EXPR:
4297     case MAX_EXPR:
4298     case COND_EXPR:
4299       {
4300 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4301 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4302       }
4303       break;
4304 
4305     default:
4306       gcc_unreachable ();
4307     }
4308 
4309   if (stmts)
4310     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4311   return init_def;
4312 }
4313 
4314 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4315    NUMBER_OF_VECTORS is the number of vector defs to create.
4316    If NEUTRAL_OP is nonnull, introducing extra elements of that
4317    value will not change the result.  */
4318 
4319 static void
4320 get_initial_defs_for_reduction (slp_tree slp_node,
4321 				vec<tree> *vec_oprnds,
4322 				unsigned int number_of_vectors,
4323 				bool reduc_chain, tree neutral_op)
4324 {
4325   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4326   stmt_vec_info stmt_vinfo = stmts[0];
4327   vec_info *vinfo = stmt_vinfo->vinfo;
4328   unsigned HOST_WIDE_INT nunits;
4329   unsigned j, number_of_places_left_in_vector;
4330   tree vector_type;
4331   unsigned int group_size = stmts.length ();
4332   unsigned int i;
4333   class loop *loop;
4334 
4335   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4336 
4337   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4338 
4339   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4340   gcc_assert (loop);
4341   edge pe = loop_preheader_edge (loop);
4342 
4343   gcc_assert (!reduc_chain || neutral_op);
4344 
4345   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4346      created vectors. It is greater than 1 if unrolling is performed.
4347 
4348      For example, we have two scalar operands, s1 and s2 (e.g., group of
4349      strided accesses of size two), while NUNITS is four (i.e., four scalars
4350      of this type can be packed in a vector).  The output vector will contain
4351      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4352      will be 2).
4353 
4354      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4355      vectors containing the operands.
4356 
4357      For example, NUNITS is four as before, and the group size is 8
4358      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4359      {s5, s6, s7, s8}.  */
4360 
4361   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4362     nunits = group_size;
4363 
4364   number_of_places_left_in_vector = nunits;
4365   bool constant_p = true;
4366   tree_vector_builder elts (vector_type, nunits, 1);
4367   elts.quick_grow (nunits);
4368   gimple_seq ctor_seq = NULL;
4369   for (j = 0; j < nunits * number_of_vectors; ++j)
4370     {
4371       tree op;
4372       i = j % group_size;
4373       stmt_vinfo = stmts[i];
4374 
4375       /* Get the def before the loop.  In a reduction chain we have only
4376 	 one initial value.  Otherwise we have one per PHI in the group.  */
4377       if (reduc_chain)
4378 	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4379       else if (((vec_oprnds->length () + 1) * nunits
4380 		- number_of_places_left_in_vector >= group_size)
4381 	       && neutral_op)
4382 	op = neutral_op;
4383       else
4384 	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4385 
4386       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4387       number_of_places_left_in_vector--;
4388       elts[nunits - number_of_places_left_in_vector - 1] = op;
4389       if (!CONSTANT_CLASS_P (op))
4390 	constant_p = false;
4391 
4392       if (number_of_places_left_in_vector == 0)
4393 	{
4394 	  tree init;
4395 	  if (constant_p && !neutral_op
4396 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4397 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4398 	    /* Build the vector directly from ELTS.  */
4399 	    init = gimple_build_vector (&ctor_seq, &elts);
4400 	  else if (neutral_op)
4401 	    {
4402 	      /* Build a vector of the neutral value and shift the
4403 		 other elements into place.  */
4404 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4405 						   neutral_op);
4406 	      int k = nunits;
4407 	      while (k > 0 && elts[k - 1] == neutral_op)
4408 		k -= 1;
4409 	      while (k > 0)
4410 		{
4411 		  k -= 1;
4412 		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4413 				       vector_type, init, elts[k]);
4414 		}
4415 	    }
4416 	  else
4417 	    {
4418 	      /* First time round, duplicate ELTS to fill the
4419 		 required number of vectors.  */
4420 	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4421 					number_of_vectors, *vec_oprnds);
4422 	      break;
4423 	    }
4424 	  vec_oprnds->quick_push (init);
4425 
4426 	  number_of_places_left_in_vector = nunits;
4427 	  elts.new_vector (vector_type, nunits, 1);
4428 	  elts.quick_grow (nunits);
4429 	  constant_p = true;
4430 	}
4431     }
4432   if (ctor_seq != NULL)
4433     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4434 }
4435 
4436 /* For a statement STMT_INFO taking part in a reduction operation return
4437    the stmt_vec_info the meta information is stored on.  */
4438 
4439 stmt_vec_info
4440 info_for_reduction (stmt_vec_info stmt_info)
4441 {
4442   stmt_info = vect_orig_stmt (stmt_info);
4443   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4444   if (!is_a <gphi *> (stmt_info->stmt)
4445       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4446     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4447   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4448   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4449     {
4450       if (gimple_phi_num_args (phi) == 1)
4451 	stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4452     }
4453   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4454     {
4455       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4456       stmt_vec_info info
4457 	  = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4458       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4459 	stmt_info = info;
4460     }
4461   return stmt_info;
4462 }
4463 
4464 /* Function vect_create_epilog_for_reduction
4465 
4466    Create code at the loop-epilog to finalize the result of a reduction
4467    computation.
4468 
4469    STMT_INFO is the scalar reduction stmt that is being vectorized.
4470    SLP_NODE is an SLP node containing a group of reduction statements. The
4471      first one in this group is STMT_INFO.
4472    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4473    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4474      (counting from 0)
4475 
4476    This function:
4477    1. Completes the reduction def-use cycles.
4478    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4479       by calling the function specified by REDUC_FN if available, or by
4480       other means (whole-vector shifts or a scalar loop).
4481       The function also creates a new phi node at the loop exit to preserve
4482       loop-closed form, as illustrated below.
4483 
4484      The flow at the entry to this function:
4485 
4486         loop:
4487           vec_def = phi <vec_init, null>        # REDUCTION_PHI
4488           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4489           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4490         loop_exit:
4491           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4492           use <s_out0>
4493           use <s_out0>
4494 
4495      The above is transformed by this function into:
4496 
4497         loop:
4498           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4499           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4500           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4501         loop_exit:
4502           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4503           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4504           v_out2 = reduce <v_out1>
4505           s_out3 = extract_field <v_out2, 0>
4506           s_out4 = adjust_result <s_out3>
4507           use <s_out4>
4508           use <s_out4>
4509 */
4510 
4511 static void
4512 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4513 				  slp_tree slp_node,
4514 				  slp_instance slp_node_instance)
4515 {
4516   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4517   gcc_assert (reduc_info->is_reduc_info);
4518   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4519   /* For double reductions we need to get at the inner loop reduction
4520      stmt which has the meta info attached.  Our stmt_info is that of the
4521      loop-closed PHI of the inner loop which we remember as
4522      def for the reduction PHI generation.  */
4523   bool double_reduc = false;
4524   stmt_vec_info rdef_info = stmt_info;
4525   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4526     {
4527       gcc_assert (!slp_node);
4528       double_reduc = true;
4529       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4530 					    (stmt_info->stmt, 0));
4531       stmt_info = vect_stmt_to_vectorize (stmt_info);
4532     }
4533   gphi *reduc_def_stmt
4534     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4535   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4536   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4537   stmt_vec_info prev_phi_info;
4538   tree vectype;
4539   machine_mode mode;
4540   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4541   basic_block exit_bb;
4542   tree scalar_dest;
4543   tree scalar_type;
4544   gimple *new_phi = NULL, *phi;
4545   stmt_vec_info phi_info;
4546   gimple_stmt_iterator exit_gsi;
4547   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4548   gimple *epilog_stmt = NULL;
4549   gimple *exit_phi;
4550   tree bitsize;
4551   tree def;
4552   tree orig_name, scalar_result;
4553   imm_use_iterator imm_iter, phi_imm_iter;
4554   use_operand_p use_p, phi_use_p;
4555   gimple *use_stmt;
4556   bool nested_in_vect_loop = false;
4557   auto_vec<gimple *> new_phis;
4558   int j, i;
4559   auto_vec<tree> scalar_results;
4560   unsigned int group_size = 1, k;
4561   auto_vec<gimple *> phis;
4562   bool slp_reduc = false;
4563   bool direct_slp_reduc;
4564   tree new_phi_result;
4565   tree induction_index = NULL_TREE;
4566 
4567   if (slp_node)
4568     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4569 
4570   if (nested_in_vect_loop_p (loop, stmt_info))
4571     {
4572       outer_loop = loop;
4573       loop = loop->inner;
4574       nested_in_vect_loop = true;
4575       gcc_assert (!slp_node);
4576     }
4577   gcc_assert (!nested_in_vect_loop || double_reduc);
4578 
4579   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4580   gcc_assert (vectype);
4581   mode = TYPE_MODE (vectype);
4582 
4583   tree initial_def = NULL;
4584   tree induc_val = NULL_TREE;
4585   tree adjustment_def = NULL;
4586   if (slp_node)
4587     ;
4588   else
4589     {
4590       /* Get at the scalar def before the loop, that defines the initial value
4591 	 of the reduction variable.  */
4592       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4593 					   loop_preheader_edge (loop));
4594       /* Optimize: for induction condition reduction, if we can't use zero
4595          for induc_val, use initial_def.  */
4596       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4597 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4598       else if (double_reduc)
4599 	;
4600       else if (nested_in_vect_loop)
4601 	;
4602       else
4603 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4604     }
4605 
4606   unsigned vec_num;
4607   int ncopies;
4608   if (slp_node)
4609     {
4610       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4611       ncopies = 1;
4612     }
4613   else
4614     {
4615       vec_num = 1;
4616       ncopies = 0;
4617       phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4618       do
4619 	{
4620 	  ncopies++;
4621 	  phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4622 	}
4623       while (phi_info);
4624     }
4625 
4626   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4627      which is updated with the current index of the loop for every match of
4628      the original loop's cond_expr (VEC_STMT).  This results in a vector
4629      containing the last time the condition passed for that vector lane.
4630      The first match will be a 1 to allow 0 to be used for non-matching
4631      indexes.  If there are no matches at all then the vector will be all
4632      zeroes.
4633 
4634      PR92772: This algorithm is broken for architectures that support
4635      masked vectors, but do not provide fold_extract_last.  */
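  /* For example, with four lanes and two vector iterations in which the
     condition matches only in lane 1 and then only in lane 3, the index
     vector evolves as {0, 0, 0, 0} -> {0, 2, 0, 0} -> {0, 2, 0, 8}:
     each lane records the induction value ({1, 2, 3, 4}, {5, 6, 7, 8}, ...)
     of the last iteration in which its condition held, and stays 0 if it
     never held.  */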
4636   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4637     {
4638       auto_vec<std::pair<tree, bool>, 2> ccompares;
4639       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4640       cond_info = vect_stmt_to_vectorize (cond_info);
4641       while (cond_info != reduc_info)
4642 	{
4643 	  if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4644 	    {
4645 	      gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4646 	      gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4647 	      ccompares.safe_push
4648 		(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4649 				 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4650 	    }
4651 	  cond_info
4652 	    = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4653 						 1 + STMT_VINFO_REDUC_IDX
4654 							(cond_info)));
4655 	  cond_info = vect_stmt_to_vectorize (cond_info);
4656 	}
4657       gcc_assert (ccompares.length () != 0);
4658 
4659       tree indx_before_incr, indx_after_incr;
4660       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4661       int scalar_precision
4662 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4663       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4664       tree cr_index_vector_type = get_related_vectype_for_scalar_type
4665 	(TYPE_MODE (vectype), cr_index_scalar_type,
4666 	 TYPE_VECTOR_SUBPARTS (vectype));
4667 
4668       /* First we create a simple vector induction variable which starts
4669 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4670 	 vector size (STEP).  */
4671 
4672       /* Create a {1,2,3,...} vector.  */
4673       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4674 
4675       /* Create a vector of the step value.  */
4676       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4677       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4678 
4679       /* Create an induction variable.  */
4680       gimple_stmt_iterator incr_gsi;
4681       bool insert_after;
4682       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4683       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4684 		 insert_after, &indx_before_incr, &indx_after_incr);
4685 
4686       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4687 	 filled with zeros (VEC_ZERO).  */
4688 
4689       /* Create a vector of 0s.  */
4690       tree zero = build_zero_cst (cr_index_scalar_type);
4691       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4692 
4693       /* Create a vector phi node.  */
4694       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4695       new_phi = create_phi_node (new_phi_tree, loop->header);
4696       loop_vinfo->add_stmt (new_phi);
4697       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4698 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4699 
4700       /* Now take the condition from the loop's original cond_exprs
4701 	 and produce new cond_exprs (INDEX_COND_EXPR) which for
4702 	 every match use values from the induction variable
4703 	 (INDEX_BEFORE_INCR) and otherwise use values from the phi node
4704 	 (NEW_PHI_TREE).
4705 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4706 	 the new cond_expr (INDEX_COND_EXPR).  */
4707       gimple_seq stmts = NULL;
4708       for (int i = ccompares.length () - 1; i != -1; --i)
4709 	{
4710 	  tree ccompare = ccompares[i].first;
4711 	  if (ccompares[i].second)
4712 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4713 					 cr_index_vector_type,
4714 					 ccompare,
4715 					 indx_before_incr, new_phi_tree);
4716 	  else
4717 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4718 					 cr_index_vector_type,
4719 					 ccompare,
4720 					 new_phi_tree, indx_before_incr);
4721 	}
4722       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4723       stmt_vec_info index_vec_info
4724 	= loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4725       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4726 
4727       /* Update the phi with the vec cond.  */
4728       induction_index = new_phi_tree;
4729       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4730 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
4731     }
4732 
4733   /* 2. Create epilog code.
4734         The reduction epilog code operates across the elements of the vector
4735         of partial results computed by the vectorized loop.
4736         The reduction epilog code consists of:
4737 
4738         step 1: compute the scalar result in a vector (v_out2)
4739         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4740         step 3: adjust the scalar result (s_out3) if needed.
4741 
4742         Step 1 can be accomplished using one of the following three schemes:
4743           (scheme 1) using reduc_fn, if available.
4744           (scheme 2) using whole-vector shifts, if available.
4745           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4746                      combined.
4747 
4748           The overall epilog code looks like this:
4749 
4750           s_out0 = phi <s_loop>         # original EXIT_PHI
4751           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4752           v_out2 = reduce <v_out1>              # step 1
4753           s_out3 = extract_field <v_out2, 0>    # step 2
4754           s_out4 = adjust_result <s_out3>       # step 3
4755 
4756           (step 3 is optional, and steps 1 and 2 may be combined).
4757           Lastly, the uses of s_out0 are replaced by s_out4.  */
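  /* As a concrete example, for a PLUS reduction whose initial value 5 is
     handled via 'adjustment_def': if the loop exits with the partial-sum
     vector v_out1 = {10, 20, 30, 40}, step 1 reduces it to 100, step 2
     extracts s_out3 = 100 and step 3 computes s_out4 = 100 + 5 = 105,
     which then replaces the scalar loop's result.  */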
4758 
4759 
4760   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4761          v_out1 = phi <VECT_DEF>
4762          Store them in NEW_PHIS.  */
4763   if (double_reduc)
4764     loop = outer_loop;
4765   exit_bb = single_exit (loop)->dest;
4766   prev_phi_info = NULL;
4767   new_phis.create (slp_node ? vec_num : ncopies);
4768   for (unsigned i = 0; i < vec_num; i++)
4769     {
4770       if (slp_node)
4771 	def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4772       else
4773 	def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4774       for (j = 0; j < ncopies; j++)
4775         {
4776 	  tree new_def = copy_ssa_name (def);
4777           phi = create_phi_node (new_def, exit_bb);
4778 	  stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4779           if (j == 0)
4780             new_phis.quick_push (phi);
4781           else
4782 	    {
4783 	      def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4784 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4785 	    }
4786 
4787           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4788 	  prev_phi_info = phi_info;
4789         }
4790     }
4791 
4792   exit_gsi = gsi_after_labels (exit_bb);
4793 
4794   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4795          (i.e. when reduc_fn is not available) and in the final adjustment
4796 	 code (if needed).  Also get the original scalar reduction variable as
4797          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4798          represents a reduction pattern), the tree-code and scalar-def are
4799          taken from the original stmt that the pattern-stmt (STMT) replaces.
4800          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4801          are taken from STMT.  */
4802 
4803   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4804   if (orig_stmt_info != stmt_info)
4805     {
4806       /* Reduction pattern  */
4807       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4808       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4809     }
4810 
4811   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4812   scalar_type = TREE_TYPE (scalar_dest);
4813   scalar_results.create (group_size);
4814   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4815   bitsize = TYPE_SIZE (scalar_type);
4816 
4817   /* SLP reduction without reduction chain, e.g.,
4818      # a1 = phi <a2, a0>
4819      # b1 = phi <b2, b0>
4820      a2 = operation (a1)
4821      b2 = operation (b1)  */
4822   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4823 
4824   /* True if we should implement SLP_REDUC using native reduction operations
4825      instead of scalar operations.  */
4826   direct_slp_reduc = (reduc_fn != IFN_LAST
4827 		      && slp_reduc
4828 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4829 
4830   /* In case of reduction chain, e.g.,
4831      # a1 = phi <a3, a0>
4832      a2 = operation (a1)
4833      a3 = operation (a2),
4834 
4835      we may end up with more than one vector result.  Here we reduce them to
4836      one vector.  */
4837   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4838     {
4839       gimple_seq stmts = NULL;
4840       tree first_vect = PHI_RESULT (new_phis[0]);
4841       first_vect = gimple_convert (&stmts, vectype, first_vect);
4842       for (k = 1; k < new_phis.length (); k++)
4843         {
4844 	  gimple *next_phi = new_phis[k];
4845           tree second_vect = PHI_RESULT (next_phi);
4846 	  second_vect = gimple_convert (&stmts, vectype, second_vect);
4847           first_vect = gimple_build (&stmts, code, vectype,
4848 				     first_vect, second_vect);
4849         }
4850       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4851 
4852       new_phi_result = first_vect;
4853       new_phis.truncate (0);
4854       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4855     }
4856   /* Likewise if we couldn't use a single def-use cycle.  */
4857   else if (ncopies > 1)
4858     {
4859       gcc_assert (new_phis.length () == 1);
4860       gimple_seq stmts = NULL;
4861       tree first_vect = PHI_RESULT (new_phis[0]);
4862       first_vect = gimple_convert (&stmts, vectype, first_vect);
4863       stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4864       for (int k = 1; k < ncopies; ++k)
4865 	{
4866 	  next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4867 	  tree second_vect = PHI_RESULT (next_phi_info->stmt);
4868 	  second_vect = gimple_convert (&stmts, vectype, second_vect);
4869 	  first_vect = gimple_build (&stmts, code, vectype,
4870 				     first_vect, second_vect);
4871 	}
4872       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4873       new_phi_result = first_vect;
4874       new_phis.truncate (0);
4875       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4876     }
4877   else
4878     new_phi_result = PHI_RESULT (new_phis[0]);
4879 
4880   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4881       && reduc_fn != IFN_LAST)
4882     {
4883       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4884 	 various data values where the condition matched and another vector
4885 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
4886 	 need to extract the last matching index (which will be the index with
4887 	 the highest value) and use this to index into the data vector.
4888 	 For the case where there were no matches, the data vector will contain
4889 	 all default values and the index vector will be all zeros.  */
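      /* For example, if the data vector is {17, 42, 17, 93} and the index
	 vector is {0, 2, 0, 4}, the last match happened in the lane holding
	 index 4, so the reduced result is 93.  If no lane ever matched, the
	 index vector is all zeros and every data lane holds the default
	 value, so any lane may be used.  */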
4890 
4891       /* Get various versions of the type of the vector of indexes.  */
4892       tree index_vec_type = TREE_TYPE (induction_index);
4893       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4894       tree index_scalar_type = TREE_TYPE (index_vec_type);
4895       tree index_vec_cmp_type = truth_type_for (index_vec_type);
4896 
4897       /* Get an unsigned integer version of the type of the data vector.  */
4898       int scalar_precision
4899 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4900       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4901       tree vectype_unsigned = build_vector_type
4902 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4903 
4904       /* First we need to create a vector (ZERO_VEC) of zeros and another
4905 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4906 	 can create using a MAX reduction and then expanding.
4907 	 In the case where the loop never made any matches, the max index will
4908 	 be zero.  */
4909 
4910       /* Vector of {0, 0, 0,...}.  */
4911       tree zero_vec = build_zero_cst (vectype);
4912 
4913       gimple_seq stmts = NULL;
4914       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4915       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4916 
4917       /* Find maximum value from the vector of found indexes.  */
4918       tree max_index = make_ssa_name (index_scalar_type);
4919       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4920 							  1, induction_index);
4921       gimple_call_set_lhs (max_index_stmt, max_index);
4922       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4923 
4924       /* Vector of {max_index, max_index, max_index,...}.  */
4925       tree max_index_vec = make_ssa_name (index_vec_type);
4926       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4927 						      max_index);
4928       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4929 							max_index_vec_rhs);
4930       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4931 
4932       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4933 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4934 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4935 	 otherwise.  Only one value should match, resulting in a vector
4936 	 (VEC_COND) with one data value and the rest zeros.
4937 	 In the case where the loop never made any matches, every index will
4938 	 match, resulting in a vector with all data values (which will all be
4939 	 the default value).  */
4940 
4941       /* Compare the max index vector to the vector of found indexes to find
4942 	 the position of the max value.  */
4943       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4944       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4945 						      induction_index,
4946 						      max_index_vec);
4947       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4948 
4949       /* Use the compare to choose either values from the data vector or
4950 	 zero.  */
4951       tree vec_cond = make_ssa_name (vectype);
4952       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4953 						   vec_compare, new_phi_result,
4954 						   zero_vec);
4955       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4956 
4957       /* Finally we need to extract the data value from the vector (VEC_COND)
4958 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4959 	 reduction, but because this doesn't exist, we can use a MAX reduction
4960 	 instead.  The data value might be signed or a float so we need to cast
4961 	 it first.
4962 	 In the case where the loop never made any matches, the data values are
4963 	 all identical, and so will reduce down correctly.  */
4964 
4965       /* Make the matched data values unsigned.  */
4966       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4967       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4968 				       vec_cond);
4969       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4970 							VIEW_CONVERT_EXPR,
4971 							vec_cond_cast_rhs);
4972       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4973 
4974       /* Reduce down to a scalar value.  */
4975       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4976       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4977 							   1, vec_cond_cast);
4978       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4979       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4980 
4981       /* Convert the reduced value back to the result type and set as the
4982 	 result.  */
4983       stmts = NULL;
4984       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4985 			       data_reduc);
4986       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4987       scalar_results.safe_push (new_temp);
4988     }
4989   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4990 	   && reduc_fn == IFN_LAST)
4991     {
4992       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4993 	 idx = 0;
4994          idx_val = induction_index[0];
4995 	 val = data_reduc[0];
4996          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4997 	   if (induction_index[i] > idx_val)
4998 	     val = data_reduc[i], idx_val = induction_index[i];
4999 	 return val;  */
5000 
5001       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5002       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5003       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5004       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5005       /* Enforced by vectorizable_reduction, which ensures we have target
5006 	 support before allowing a conditional reduction on variable-length
5007 	 vectors.  */
5008       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5009       tree idx_val = NULL_TREE, val = NULL_TREE;
5010       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5011 	{
5012 	  tree old_idx_val = idx_val;
5013 	  tree old_val = val;
5014 	  idx_val = make_ssa_name (idx_eltype);
5015 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5016 					     build3 (BIT_FIELD_REF, idx_eltype,
5017 						     induction_index,
5018 						     bitsize_int (el_size),
5019 						     bitsize_int (off)));
5020 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5021 	  val = make_ssa_name (data_eltype);
5022 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5023 					     build3 (BIT_FIELD_REF,
5024 						     data_eltype,
5025 						     new_phi_result,
5026 						     bitsize_int (el_size),
5027 						     bitsize_int (off)));
5028 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5029 	  if (off != 0)
5030 	    {
5031 	      tree new_idx_val = idx_val;
5032 	      if (off != v_size - el_size)
5033 		{
5034 		  new_idx_val = make_ssa_name (idx_eltype);
5035 		  epilog_stmt = gimple_build_assign (new_idx_val,
5036 						     MAX_EXPR, idx_val,
5037 						     old_idx_val);
5038 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5039 		}
5040 	      tree new_val = make_ssa_name (data_eltype);
5041 	      epilog_stmt = gimple_build_assign (new_val,
5042 						 COND_EXPR,
5043 						 build2 (GT_EXPR,
5044 							 boolean_type_node,
5045 							 idx_val,
5046 							 old_idx_val),
5047 						 val, old_val);
5048 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5049 	      idx_val = new_idx_val;
5050 	      val = new_val;
5051 	    }
5052 	}
5053       /* Convert the reduced value back to the result type and set as the
5054 	 result.  */
5055       gimple_seq stmts = NULL;
5056       val = gimple_convert (&stmts, scalar_type, val);
5057       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5058       scalar_results.safe_push (val);
5059     }
5060 
5061   /* 2.3 Create the reduction code, using one of the three schemes described
5062          above. In SLP we simply need to extract all the elements from the
5063          vector (without reducing them), so we extract each element with
               scalar code.  */
5064   else if (reduc_fn != IFN_LAST && !slp_reduc)
5065     {
5066       tree tmp;
5067       tree vec_elem_type;
5068 
5069       /* Case 1:  Create:
5070          v_out2 = reduc_expr <v_out1>  */
5071 
5072       if (dump_enabled_p ())
5073         dump_printf_loc (MSG_NOTE, vect_location,
5074 			 "Reduce using direct vector reduction.\n");
5075 
5076       gimple_seq stmts = NULL;
5077       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5078       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5079       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5080 			       vec_elem_type, new_phi_result);
5081       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5082       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5083 
5084       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5085 	  && induc_val)
5086 	{
5087 	  /* Earlier we set the initial value to be a vector of induc_val
5088 	     values.  Check the result and if it is induc_val then replace it
5089 	     with the original initial value, unless induc_val is
5090 	     the same as initial_def already.  */
5091 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5092 				  induc_val);
5093 
5094 	  tmp = make_ssa_name (new_scalar_dest);
5095 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5096 					     initial_def, new_temp);
5097 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098 	  new_temp = tmp;
5099 	}
5100 
5101       scalar_results.safe_push (new_temp);
5102     }
5103   else if (direct_slp_reduc)
5104     {
5105       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5106 	 with the elements for other SLP statements replaced with the
5107 	 neutral value.  We can then do a normal reduction on each vector.  */
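      /* For example, with REDUC_GROUP_SIZE 2, a PLUS reduction with neutral
	 value 0 and partial-result vector {a0, b0, a1, b1}, we build
	 {a0, 0, a1, 0} and {0, b0, 0, b1} and reduce each with REDUC_FN,
	 giving the two scalar results a0 + a1 and b0 + b1.  */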
5108 
5109       /* Enforced by vectorizable_reduction.  */
5110       gcc_assert (new_phis.length () == 1);
5111       gcc_assert (pow2p_hwi (group_size));
5112 
5113       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5114       vec<stmt_vec_info> orig_phis
5115 	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5116       gimple_seq seq = NULL;
5117 
5118       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5119 	 and the same element size as VECTYPE.  */
5120       tree index = build_index_vector (vectype, 0, 1);
5121       tree index_type = TREE_TYPE (index);
5122       tree index_elt_type = TREE_TYPE (index_type);
5123       tree mask_type = truth_type_for (index_type);
5124 
5125       /* Create a vector that, for each element, identifies which of
5126 	 the REDUC_GROUP_SIZE results should use it.  */
5127       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5128       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5129 			    build_vector_from_val (index_type, index_mask));
5130 
5131       /* Get a neutral vector value.  This is simply a splat of the neutral
5132 	 scalar value if we have one, otherwise the initial scalar value
5133 	 is itself a neutral value.  */
5134       tree vector_identity = NULL_TREE;
5135       tree neutral_op = NULL_TREE;
5136       if (slp_node)
5137 	{
5138 	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5139 	  neutral_op
5140 	    = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5141 					    vectype, code, first != NULL);
5142 	}
5143       if (neutral_op)
5144 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5145 							neutral_op);
5146       for (unsigned int i = 0; i < group_size; ++i)
5147 	{
5148 	  /* If there's no universal neutral value, we can use the
5149 	     initial scalar value from the original PHI.  This is used
5150 	     for MIN and MAX reduction, for example.  */
5151 	  if (!neutral_op)
5152 	    {
5153 	      tree scalar_value
5154 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5155 					 loop_preheader_edge (loop));
5156 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5157 					     scalar_value);
5158 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5159 							      scalar_value);
5160 	    }
5161 
5162 	  /* Calculate the equivalent of:
5163 
5164 	     sel[j] = (index[j] == i);
5165 
5166 	     which selects the elements of NEW_PHI_RESULT that should
5167 	     be included in the result.  */
5168 	  tree compare_val = build_int_cst (index_elt_type, i);
5169 	  compare_val = build_vector_from_val (index_type, compare_val);
5170 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5171 				   index, compare_val);
5172 
5173 	  /* Calculate the equivalent of:
5174 
5175 	     vec = seq ? new_phi_result : vector_identity;
5176 
5177 	     VEC is now suitable for a full vector reduction.  */
5178 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5179 				   sel, new_phi_result, vector_identity);
5180 
5181 	  /* Do the reduction and convert it to the appropriate type.  */
5182 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5183 				      TREE_TYPE (vectype), vec);
5184 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5185 	  scalar_results.safe_push (scalar);
5186 	}
5187       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5188     }
5189   else
5190     {
5191       bool reduce_with_shift;
5192       tree vec_temp;
5193 
5194       gcc_assert (slp_reduc || new_phis.length () == 1);
5195 
5196       /* See if the target wants to do the final (shift) reduction
5197 	 in a vector mode of smaller size and first reduce upper/lower
5198 	 halves against each other.  */
5199       enum machine_mode mode1 = mode;
5200       tree stype = TREE_TYPE (vectype);
5201       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5202       unsigned nunits1 = nunits;
5203       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5204 	  && new_phis.length () == 1)
5205 	{
5206 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5207 	  /* For SLP reductions we have to make sure lanes match up, but
5208 	     since we're doing an individual-element final reduction,
5209 	     reducing the vector width here is even more important.
5210 	     ???  We can also separate lanes with permutes, for the common
5211 	     case of power-of-two group-size odd/even extracts would work.  */
5212 	  if (slp_reduc && nunits != nunits1)
5213 	    {
5214 	      nunits1 = least_common_multiple (nunits1, group_size);
5215 	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5216 	    }
5217 	}
5218       if (!slp_reduc
5219 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5220 	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5221 
5222       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5223 							   stype, nunits1);
5224       reduce_with_shift = have_whole_vector_shift (mode1);
5225       if (!VECTOR_MODE_P (mode1))
5226 	reduce_with_shift = false;
5227       else
5228 	{
5229 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5230 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5231 	    reduce_with_shift = false;
5232 	}
5233 
5234       /* First reduce the vector to the desired vector size we should
5235 	 do shift reduction on by combining upper and lower halves.  */
5236       new_temp = new_phi_result;
5237       while (nunits > nunits1)
5238 	{
5239 	  nunits /= 2;
5240 	  vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5241 							  stype, nunits);
5242 	  unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5243 
5244 	  /* The target has to make sure we support lowpart/highpart
5245 	     extraction, either via direct vector extract or through
5246 	     punning to an integer vector mode.  */
5247 	  tree dst1, dst2;
5248 	  if (convert_optab_handler (vec_extract_optab,
5249 				     TYPE_MODE (TREE_TYPE (new_temp)),
5250 				     TYPE_MODE (vectype1))
5251 	      != CODE_FOR_nothing)
5252 	    {
5253 	      /* Extract sub-vectors directly once vec_extract becomes
5254 		 a conversion optab.  */
5255 	      dst1 = make_ssa_name (vectype1);
5256 	      epilog_stmt
5257 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5258 					 build3 (BIT_FIELD_REF, vectype1,
5259 						 new_temp, TYPE_SIZE (vectype1),
5260 						 bitsize_int (0)));
5261 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5262 	      dst2 =  make_ssa_name (vectype1);
5263 	      epilog_stmt
5264 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5265 					 build3 (BIT_FIELD_REF, vectype1,
5266 						 new_temp, TYPE_SIZE (vectype1),
5267 						 bitsize_int (bitsize)));
5268 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5269 	    }
5270 	  else
5271 	    {
5272 	      /* Extract via punning to appropriately sized integer mode
5273 		 vector.  */
5274 	      tree eltype = build_nonstandard_integer_type (bitsize, 1);
5275 	      tree etype = build_vector_type (eltype, 2);
5276 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5277 						 TYPE_MODE (etype),
5278 						 TYPE_MODE (eltype))
5279 			  != CODE_FOR_nothing);
5280 	      tree tem = make_ssa_name (etype);
5281 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5282 						 build1 (VIEW_CONVERT_EXPR,
5283 							 etype, new_temp));
5284 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5285 	      new_temp = tem;
5286 	      tem = make_ssa_name (eltype);
5287 	      epilog_stmt
5288 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5289 					 build3 (BIT_FIELD_REF, eltype,
5290 						 new_temp, TYPE_SIZE (eltype),
5291 						 bitsize_int (0)));
5292 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293 	      dst1 = make_ssa_name (vectype1);
5294 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5295 						 build1 (VIEW_CONVERT_EXPR,
5296 							 vectype1, tem));
5297 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5298 	      tem = make_ssa_name (eltype);
5299 	      epilog_stmt
5300 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5301 					 build3 (BIT_FIELD_REF, eltype,
5302 						 new_temp, TYPE_SIZE (eltype),
5303 						 bitsize_int (bitsize)));
5304 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 	      dst2 =  make_ssa_name (vectype1);
5306 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5307 						 build1 (VIEW_CONVERT_EXPR,
5308 							 vectype1, tem));
5309 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5310 	    }
5311 
5312 	  new_temp = make_ssa_name (vectype1);
5313 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5314 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5315 	  new_phis[0] = epilog_stmt;
5316 	}
5317 
5318       if (reduce_with_shift && !slp_reduc)
5319 	{
5320 	  int element_bitsize = tree_to_uhwi (bitsize);
5321 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5322 	     for variable-length vectors and also requires direct target support
5323 	     for loop reductions.  */
5324 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5325 	  int nelements = vec_size_in_bits / element_bitsize;
5326 	  vec_perm_builder sel;
5327 	  vec_perm_indices indices;
5328 
5329           int elt_offset;
5330 
5331           tree zero_vec = build_zero_cst (vectype1);
5332           /* Case 2: Create:
5333              for (offset = nelements/2; offset >= 1; offset/=2)
5334                 {
5335                   Create:  va' = vec_shift <va, offset>
5336                   Create:  va = vop <va, va'>
5337                 }  */
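          /* For example, for a four-element PLUS reduction of {a, b, c, d}:
             shifting by two lanes gives {c, d, 0, 0} and adding gives
             {a+c, b+d, c, d}; shifting that by one lane and adding again
             leaves a+b+c+d in element zero, which step 2.4 below extracts
             as the scalar result.  */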
5338 
5339           tree rhs;
5340 
5341           if (dump_enabled_p ())
5342             dump_printf_loc (MSG_NOTE, vect_location,
5343 			     "Reduce using vector shifts\n");
5344 
5345 	  gimple_seq stmts = NULL;
5346 	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
5347           for (elt_offset = nelements / 2;
5348                elt_offset >= 1;
5349                elt_offset /= 2)
5350             {
5351 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5352 	      indices.new_vector (sel, 2, nelements);
5353 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5354 	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5355 				       new_temp, zero_vec, mask);
5356 	      new_temp = gimple_build (&stmts, code,
5357 				       vectype1, new_name, new_temp);
5358             }
5359 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5360 
5361 	  /* 2.4  Extract the final scalar result.  Create:
5362 	     s_out3 = extract_field <v_out2, bitpos>  */
5363 
5364 	  if (dump_enabled_p ())
5365 	    dump_printf_loc (MSG_NOTE, vect_location,
5366 			     "extract scalar result\n");
5367 
5368 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5369 			bitsize, bitsize_zero_node);
5370 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5371 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5372 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5373 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5374 	  scalar_results.safe_push (new_temp);
5375         }
5376       else
5377         {
5378           /* Case 3: Create:
5379              s = extract_field <v_out2, 0>
5380              for (offset = element_size;
5381                   offset < vector_size;
5382                   offset += element_size;)
5383                {
5384                  Create:  s' = extract_field <v_out2, offset>
5385                  Create:  s = op <s, s'>  // For non SLP cases
5386                }  */
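          /* For example, for a non-SLP PLUS reduction of a four-element
             vector {a, b, c, d} this extracts each element in turn and
             accumulates a + b + c + d into a single scalar, which becomes
             the only entry of SCALAR_RESULTS.  */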
5387 
5388           if (dump_enabled_p ())
5389             dump_printf_loc (MSG_NOTE, vect_location,
5390 			     "Reduce using scalar code.\n");
5391 
5392 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5393 	  int element_bitsize = tree_to_uhwi (bitsize);
5394 	  tree compute_type = TREE_TYPE (vectype);
5395 	  gimple_seq stmts = NULL;
5396           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5397             {
5398               int bit_offset;
5399               if (gimple_code (new_phi) == GIMPLE_PHI)
5400                 vec_temp = PHI_RESULT (new_phi);
5401               else
5402                 vec_temp = gimple_assign_lhs (new_phi);
5403 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5404 				       vec_temp, bitsize, bitsize_zero_node);
5405 
5406               /* In SLP we don't need to apply the reduction operation, so we
                      just collect s' values in SCALAR_RESULTS.  */
5408               if (slp_reduc)
5409                 scalar_results.safe_push (new_temp);
5410 
5411               for (bit_offset = element_bitsize;
5412                    bit_offset < vec_size_in_bits;
5413                    bit_offset += element_bitsize)
5414                 {
5415                   tree bitpos = bitsize_int (bit_offset);
5416 		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
5417 					   compute_type, vec_temp,
5418 					   bitsize, bitpos);
5419                   if (slp_reduc)
5420                     {
5421                       /* In SLP we don't need to apply the reduction operation, so
5422                          we just collect s' values in SCALAR_RESULTS.  */
5423                       new_temp = new_name;
5424                       scalar_results.safe_push (new_name);
5425                     }
5426                   else
5427 		    new_temp = gimple_build (&stmts, code, compute_type,
5428 					     new_name, new_temp);
5429                 }
5430             }
5431 
5432           /* The only case where we need to reduce scalar results in SLP is
5433              unrolling.  If the size of SCALAR_RESULTS is greater than
5434              REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5435              REDUC_GROUP_SIZE.  */
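          /* For example, with REDUC_GROUP_SIZE 2 and SCALAR_RESULTS
             {a0, b0, a1, b1} (two unrolled copies of the group), a PLUS
             reduction leaves the two results a0 + a1 and b0 + b1.  */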
5436           if (slp_reduc)
5437             {
5438               tree res, first_res, new_res;
5439 
5440               /* Reduce multiple scalar results in case of SLP unrolling.  */
5441               for (j = group_size; scalar_results.iterate (j, &res);
5442                    j++)
5443                 {
5444                   first_res = scalar_results[j % group_size];
5445 		  new_res = gimple_build (&stmts, code, compute_type,
5446 					  first_res, res);
5447                   scalar_results[j % group_size] = new_res;
5448                 }
5449 	      for (k = 0; k < group_size; k++)
5450 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
5451 						    scalar_results[k]);
5452             }
5453           else
5454 	    {
5455 	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5456 	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5457 	      scalar_results.safe_push (new_temp);
5458 	    }
5459 
5460 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5461         }
5462 
5463       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5464 	  && induc_val)
5465 	{
5466 	  /* Earlier we set the initial value to be a vector of induc_val
5467 	     values.  Check the result and if it is induc_val then replace it
5468 	     with the original initial value, unless induc_val is
5469 	     the same as initial_def already.  */
5470 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5471 				  induc_val);
5472 
5473 	  tree tmp = make_ssa_name (new_scalar_dest);
5474 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5475 					     initial_def, new_temp);
5476 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5477 	  scalar_results[0] = tmp;
5478 	}
5479     }
5480 
5481   /* 2.5 Adjust the final result by the initial value of the reduction
5482 	 variable. (When such adjustment is not needed, then
5483 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
5484 	 new_temp = loop_exit_def + adjustment_def  */
5485 
5486   if (adjustment_def)
5487     {
5488       gcc_assert (!slp_reduc);
5489       gimple_seq stmts = NULL;
5490       if (nested_in_vect_loop)
5491 	{
5492           new_phi = new_phis[0];
5493 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5494 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5495 	  new_temp = gimple_build (&stmts, code, vectype,
5496 				   PHI_RESULT (new_phi), adjustment_def);
5497 	}
5498       else
5499 	{
5500           new_temp = scalar_results[0];
5501 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5502 	  adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5503 	  new_temp = gimple_build (&stmts, code, scalar_type,
5504 				   new_temp, adjustment_def);
5505 	}
5506 
5507       epilog_stmt = gimple_seq_last_stmt (stmts);
5508       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5509       if (nested_in_vect_loop)
5510         {
5511 	  stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5512 	  STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5513 	    = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5514 
5515           if (!double_reduc)
5516             scalar_results.quick_push (new_temp);
5517           else
5518             scalar_results[0] = new_temp;
5519         }
5520       else
5521         scalar_results[0] = new_temp;
5522 
5523       new_phis[0] = epilog_stmt;
5524     }
5525 
5526   if (double_reduc)
5527     loop = loop->inner;
5528 
5529   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5530           phis with new adjusted scalar results, i.e., replace use <s_out0>
5531           with use <s_out4>.
5532 
5533      Transform:
5534         loop_exit:
5535           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5536           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5537           v_out2 = reduce <v_out1>
5538           s_out3 = extract_field <v_out2, 0>
5539           s_out4 = adjust_result <s_out3>
5540           use <s_out0>
5541           use <s_out0>
5542 
5543      into:
5544 
5545         loop_exit:
5546           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5547           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5548           v_out2 = reduce <v_out1>
5549           s_out3 = extract_field <v_out2, 0>
5550           s_out4 = adjust_result <s_out3>
5551           use <s_out4>
5552           use <s_out4> */
5553 
5554 
5555   /* In an SLP reduction chain we reduce the vector results into one
5556      vector if necessary; hence we set REDUC_GROUP_SIZE to 1 here.
5557      SCALAR_DEST is the LHS of the last stmt in the reduction chain, since
5558      we are looking for the loop exit phi node.  */
5559   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5560     {
5561       stmt_vec_info dest_stmt_info
5562 	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5563       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5564       group_size = 1;
5565     }
5566 
5567   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5568      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5569      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5570      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5571      correspond to the first vector stmt, etc.
5572      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5573   if (group_size > new_phis.length ())
5574     gcc_assert (!(group_size % new_phis.length ()));
5575 
5576   for (k = 0; k < group_size; k++)
5577     {
5578       if (slp_reduc)
5579         {
5580 	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5581 
5582 	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5583 	  /* SLP statements can't participate in patterns.  */
5584 	  gcc_assert (!orig_stmt_info);
5585 	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5586         }
5587 
5588       if (nested_in_vect_loop)
5589         {
5590           if (double_reduc)
5591             loop = outer_loop;
5592           else
5593 	    gcc_unreachable ();
5594         }
5595 
5596       phis.create (3);
5597       /* Find the loop-closed-use at the loop exit of the original scalar
5598          result.  (The reduction result is expected to have two immediate uses,
5599          one at the latch block, and one at the loop exit).  For double
5600          reductions we are looking for exit phis of the outer loop.  */
5601       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5602         {
5603           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5604 	    {
5605 	      if (!is_gimple_debug (USE_STMT (use_p)))
5606 		phis.safe_push (USE_STMT (use_p));
5607 	    }
5608           else
5609             {
5610               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5611                 {
5612                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5613 
5614                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5615                     {
5616                       if (!flow_bb_inside_loop_p (loop,
5617                                              gimple_bb (USE_STMT (phi_use_p)))
5618 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
5619                         phis.safe_push (USE_STMT (phi_use_p));
5620                     }
5621                 }
5622             }
5623         }
5624 
5625       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5626         {
5627           /* Replace the uses:  */
5628           orig_name = PHI_RESULT (exit_phi);
5629           scalar_result = scalar_results[k];
5630           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5631 	    {
5632 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5633 		SET_USE (use_p, scalar_result);
5634 	      update_stmt (use_stmt);
5635 	    }
5636         }
5637 
5638       phis.release ();
5639     }
5640 }
5641 
5642 /* Return a vector of type VECTYPE that is equal to the vector select
5643    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5644    before GSI.  */
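/* For example, with MASK {1, 0, 1, 0}, VEC {a, b, c, d} and IDENTITY
   {0, 0, 0, 0} the returned vector is {a, 0, c, 0}.  */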
5645 
5646 static tree
5647 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5648 		     tree vec, tree identity)
5649 {
5650   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5651   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5652 					  mask, vec, identity);
5653   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5654   return cond;
5655 }
5656 
5657 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5658    order, starting with LHS.  Insert the extraction statements before GSI and
5659    associate the new scalar SSA names with variable SCALAR_DEST.
5660    Return the SSA name for the result.  */
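/* For example, with LHS s0, CODE PLUS_EXPR and a four-element VECTOR_RHS
   {v0, v1, v2, v3} this emits s1 = s0 + v0; s2 = s1 + v1; s3 = s2 + v2;
   s4 = s3 + v3 (names illustrative) and returns s4, preserving the strict
   left-to-right evaluation order.  */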
5661 
5662 static tree
5663 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5664 		       tree_code code, tree lhs, tree vector_rhs)
5665 {
5666   tree vectype = TREE_TYPE (vector_rhs);
5667   tree scalar_type = TREE_TYPE (vectype);
5668   tree bitsize = TYPE_SIZE (scalar_type);
5669   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5670   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5671 
5672   for (unsigned HOST_WIDE_INT bit_offset = 0;
5673        bit_offset < vec_size_in_bits;
5674        bit_offset += element_bitsize)
5675     {
5676       tree bitpos = bitsize_int (bit_offset);
5677       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5678 			 bitsize, bitpos);
5679 
5680       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5681       rhs = make_ssa_name (scalar_dest, stmt);
5682       gimple_assign_set_lhs (stmt, rhs);
5683       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5684 
5685       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5686       tree new_name = make_ssa_name (scalar_dest, stmt);
5687       gimple_assign_set_lhs (stmt, new_name);
5688       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5689       lhs = new_name;
5690     }
5691   return lhs;
5692 }
5693 
5694 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
5695    type of the vector input.  */
5696 
5697 static internal_fn
5698 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5699 {
5700   internal_fn mask_reduc_fn;
5701 
5702   switch (reduc_fn)
5703     {
5704     case IFN_FOLD_LEFT_PLUS:
5705       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5706       break;
5707 
5708     default:
5709       return IFN_LAST;
5710     }
5711 
5712   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5713 				      OPTIMIZE_FOR_SPEED))
5714     return mask_reduc_fn;
5715   return IFN_LAST;
5716 }
5717 
5718 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5719    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5720    statement.  CODE is the operation performed by STMT_INFO and OPS are
5721    its scalar operands.  REDUC_INDEX is the index of the operand in
5722    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5723    implements in-order reduction, or IFN_LAST if we should open-code it.
5724    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5725    that should be used to control the operation in a fully-masked loop.  */
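/* As a sketch, for a single vector def x_1 and IFN_FOLD_LEFT_PLUS in a
   fully-masked loop this emits a call of the form:

     res_2 = .MASK_FOLD_LEFT_PLUS (res_1, x_1, loop_mask_3);

   whereas if no in-order reduction function is available the vector is
   expanded element by element via vect_expand_fold_left below.  The SSA
   names are illustrative only.  */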
5726 
5727 static bool
5728 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5729 			       gimple_stmt_iterator *gsi,
5730 			       stmt_vec_info *vec_stmt, slp_tree slp_node,
5731 			       gimple *reduc_def_stmt,
5732 			       tree_code code, internal_fn reduc_fn,
5733 			       tree ops[3], tree vectype_in,
5734 			       int reduc_index, vec_loop_masks *masks)
5735 {
5736   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5737   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5738   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5739   stmt_vec_info new_stmt_info = NULL;
5740   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5741 
5742   int ncopies;
5743   if (slp_node)
5744     ncopies = 1;
5745   else
5746     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5747 
5748   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5749   gcc_assert (ncopies == 1);
5750   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5751 
5752   if (slp_node)
5753     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5754 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
5755 
5756   tree op0 = ops[1 - reduc_index];
5757 
5758   int group_size = 1;
5759   stmt_vec_info scalar_dest_def_info;
5760   auto_vec<tree> vec_oprnds0;
5761   if (slp_node)
5762     {
5763       auto_vec<vec<tree> > vec_defs (2);
5764       vect_get_slp_defs (slp_node, &vec_defs);
5765       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5766       vec_defs[0].release ();
5767       vec_defs[1].release ();
5768       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5769       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5770     }
5771   else
5772     {
5773       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5774       vec_oprnds0.create (1);
5775       vec_oprnds0.quick_push (loop_vec_def0);
5776       scalar_dest_def_info = stmt_info;
5777     }
5778 
5779   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5780   tree scalar_type = TREE_TYPE (scalar_dest);
5781   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5782 
5783   int vec_num = vec_oprnds0.length ();
5784   gcc_assert (vec_num == 1 || slp_node);
5785   tree vec_elem_type = TREE_TYPE (vectype_out);
5786   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5787 
5788   tree vector_identity = NULL_TREE;
5789   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5790     vector_identity = build_zero_cst (vectype_out);
5791 
5792   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5793   int i;
5794   tree def0;
5795   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5796     {
5797       gimple *new_stmt;
5798       tree mask = NULL_TREE;
5799       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5800 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5801 
5802       /* Handle MINUS by adding the negative.  */
5803       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5804 	{
5805 	  tree negated = make_ssa_name (vectype_out);
5806 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5807 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5808 	  def0 = negated;
5809 	}
5810 
5811       if (mask && mask_reduc_fn == IFN_LAST)
5812 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5813 				    vector_identity);
5814 
5815       /* On the first iteration the input is simply the scalar phi
5816 	 result, and for subsequent iterations it is the output of
5817 	 the preceding operation.  */
5818       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5819 	{
5820 	  if (mask && mask_reduc_fn != IFN_LAST)
5821 	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5822 						   def0, mask);
5823 	  else
5824 	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5825 						   def0);
5826 	  /* For chained SLP reductions the output of the previous reduction
5827 	     operation serves as the input of the next. For the final statement
5828 	     the output cannot be a temporary - we reuse the original
5829 	     scalar destination of the last statement.  */
5830 	  if (i != vec_num - 1)
5831 	    {
5832 	      gimple_set_lhs (new_stmt, scalar_dest_var);
5833 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5834 	      gimple_set_lhs (new_stmt, reduc_var);
5835 	    }
5836 	}
5837       else
5838 	{
5839 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5840 					     reduc_var, def0);
5841 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5842 	  /* Remove the statement, so that we can use the same code paths
5843 	     as for statements that we've just created.  */
5844 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5845 	  gsi_remove (&tmp_gsi, true);
5846 	}
5847 
5848       if (i == vec_num - 1)
5849 	{
5850 	  gimple_set_lhs (new_stmt, scalar_dest);
5851 	  new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5852 						    new_stmt);
5853 	}
5854       else
5855 	new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5856 						     new_stmt, gsi);
5857 
5858       if (slp_node)
5859 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5860     }
5861 
5862   if (!slp_node)
5863     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5864 
5865   return true;
5866 }
5867 
5868 /* Function is_nonwrapping_integer_induction.
5869 
5870    Check whether STMT_VINFO (which is part of loop LOOP) is an integer
5871    induction whose value is known not to overflow.  */
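/* For example, an induction with base 0 and step 4 in a 16-bit unsigned
   type is only accepted if the maximum number of iterations provably
   keeps base + step * niters representable in 16 bits; otherwise the
   value could wrap and we return false (unless overflow is undefined
   for the type, in which case wrapping cannot validly occur).  */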
5872 
5873 static bool
5874 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5875 {
5876   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5877   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5878   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5879   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5880   widest_int ni, max_loop_value, lhs_max;
5881   wi::overflow_type overflow = wi::OVF_NONE;
5882 
5883   /* Make sure the induction has a constant integer base and step.  */
5884   if (TREE_CODE (base) != INTEGER_CST
5885       || TREE_CODE (step) != INTEGER_CST)
5886     return false;
5887 
5888   /* Check that the maximum value reached by the induction will not wrap.  */
5889 
5890   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5891     return true;
5892 
5893   if (! max_stmt_executions (loop, &ni))
5894     return false;
5895 
5896   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5897 			    &overflow);
5898   if (overflow)
5899     return false;
5900 
5901   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5902 			    TYPE_SIGN (lhs_type), &overflow);
5903   if (overflow)
5904     return false;
5905 
5906   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5907 	  <= TYPE_PRECISION (lhs_type));
5908 }
5909 
5910 /* Check if masking can be supported by inserting a conditional expression.
5911    CODE is the code for the operation.  COND_FN is the conditional internal
5912    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
5913 static bool
5914 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5915 			 tree vectype_in)
5916 {
5917   if (cond_fn != IFN_LAST
5918       && direct_internal_fn_supported_p (cond_fn, vectype_in,
5919 					 OPTIMIZE_FOR_SPEED))
5920     return false;
5921 
5922   switch (code)
5923     {
5924     case DOT_PROD_EXPR:
5925     case SAD_EXPR:
5926       return true;
5927 
5928     default:
5929       return false;
5930     }
5931 }
5932 
5933 /* Insert a conditional expression to enable masked vectorization.  CODE is the
5934    code for the operation.  VOP is the array of operands.  MASK is the loop
5935    mask.  GSI is a statement iterator used to place the new conditional
5936    expression.  */
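/* For example (a sketch), for DOT_PROD_EXPR the multiplied operand is
   replaced by

     masked_op1 = VEC_COND_EXPR <mask, op1, { 0, ... }>;

   so that inactive lanes add zero to the accumulator, while for SAD_EXPR
   op1 is replaced by "mask ? op1 : op0" so that the absolute difference
   in inactive lanes is zero.  */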
5937 static void
5938 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5939 		      gimple_stmt_iterator *gsi)
5940 {
5941   switch (code)
5942     {
5943     case DOT_PROD_EXPR:
5944       {
5945 	tree vectype = TREE_TYPE (vop[1]);
5946 	tree zero = build_zero_cst (vectype);
5947 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5948 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5949 					       mask, vop[1], zero);
5950 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
5951 	vop[1] = masked_op1;
5952 	break;
5953       }
5954 
5955     case SAD_EXPR:
5956       {
5957 	tree vectype = TREE_TYPE (vop[1]);
5958 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5959 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5960 					       mask, vop[1], vop[0]);
5961 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
5962 	vop[1] = masked_op1;
5963 	break;
5964       }
5965 
5966     default:
5967       gcc_unreachable ();
5968     }
5969 }
5970 
5971 /* Function vectorizable_reduction.
5972 
5973    Check if STMT_INFO performs a reduction operation that can be vectorized.
5974    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5975    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5976    Return true if STMT_INFO is vectorizable in this way.
5977 
5978    This function also handles reduction idioms (patterns) that have been
5979    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
5980    may be of this form:
5981      X = pattern_expr (arg0, arg1, ..., X)
5982    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5983    sequence that had been detected and replaced by the pattern-stmt
5984    (STMT_INFO).
5985 
5986    This function also handles reduction of condition expressions, for example:
5987      for (int i = 0; i < N; i++)
5988        if (a[i] < value)
5989 	 last = a[i];
5990    This is handled by vectorising the loop and creating an additional vector
5991    containing the loop indexes for which "a[i] < value" was true.  In the
5992    function epilogue this is reduced to a single max value and then used to
5993    index into the vector of results.
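   As a sketch (not the exact statements we emit), each vector iteration
   conceptually computes:

     cond = a_vec < value_vec;
     last_vec = VEC_COND_EXPR <cond, a_vec, last_vec>;
     idx_vec  = VEC_COND_EXPR <cond, cur_index_vec, idx_vec>;

   and the epilogue reduces idx_vec with a maximum reduction to find the
   lane of the last match, which is then used to extract the result from
   last_vec.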
5994 
5995    In some cases of reduction patterns, the type of the reduction variable X is
5996    different than the type of the other arguments of STMT_INFO.
5997    In such cases, the vectype that is used when transforming STMT_INFO into
5998    a vector stmt is different than the vectype that is used to determine the
5999    vectorization factor, because it consists of a different number of elements
6000    than the actual number of elements that are being operated upon in parallel.
6001 
6002    For example, consider an accumulation of shorts into an int accumulator.
6003    On some targets it's possible to vectorize this pattern operating on 8
6004    shorts at a time (hence, the vectype for purposes of determining the
6005    vectorization factor should be V8HI); on the other hand, the vectype that
6006    is used to create the vector form is actually V4SI (the type of the result).
6007 
6008    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6009    indicates the actual level of parallelism (V8HI in the example), so that
6010    the right vectorization factor can be derived.  This vectype
6011    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6012    be used to create the vectorized stmt.  The right vectype for the vectorized
6013    stmt is obtained from the type of the result X:
6014       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6015 
6016    This means that, contrary to "regular" reductions (or "regular" stmts in
6017    general), the following equation:
6018       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6019    does *NOT* necessarily hold for reduction patterns.  */
6020 
6021 bool
6022 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
6023 			slp_instance slp_node_instance,
6024 			stmt_vector_for_cost *cost_vec)
6025 {
6026   tree scalar_dest;
6027   tree vectype_in = NULL_TREE;
6028   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6029   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6030   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6031   stmt_vec_info cond_stmt_vinfo = NULL;
6032   tree scalar_type;
6033   int i;
6034   int ncopies;
6035   bool single_defuse_cycle = false;
6036   bool nested_cycle = false;
6037   bool double_reduc = false;
6038   int vec_num;
6039   tree tem;
6040   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6041   tree cond_reduc_val = NULL_TREE;
6042 
6043   /* Make sure it was already recognized as a reduction computation.  */
6044   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6045       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6046       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6047     return false;
6048 
6049   /* The stmt we store reduction analysis meta on.  */
6050   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6051   reduc_info->is_reduc_info = true;
6052 
6053   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6054     {
6055       if (is_a <gphi *> (stmt_info->stmt))
6056 	/* Analysis for double-reduction is done on the outer
6057 	   loop PHI; nested cycles have no further restrictions.  */
6058 	STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6059       else
6060 	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6061       return true;
6062     }
6063 
6064   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6065   stmt_vec_info phi_info = stmt_info;
6066   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6067       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6068     {
6069       if (!is_a <gphi *> (stmt_info->stmt))
6070 	{
6071 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6072 	  return true;
6073 	}
6074       if (slp_node)
6075 	{
6076 	  slp_node_instance->reduc_phis = slp_node;
6077 	  /* ???  We're leaving slp_node to point to the PHIs; we only
6078 	     need it to get at the number of vector stmts, which wasn't
6079 	     yet initialized for the instance root.  */
6080 	}
6081       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6082 	stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6083       else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6084 	{
6085 	  use_operand_p use_p;
6086 	  gimple *use_stmt;
6087 	  bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6088 				     &use_p, &use_stmt);
6089 	  gcc_assert (res);
6090 	  phi_info = loop_vinfo->lookup_stmt (use_stmt);
6091 	  stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6092 	}
6093     }
6094 
6095   /* PHIs should not participate in patterns.  */
6096   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6097   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6098 
6099   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6100      and compute the reduction chain length.  */
6101   tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6102 					  loop_latch_edge (loop));
6103   unsigned reduc_chain_length = 0;
6104   bool only_slp_reduc_chain = true;
6105   stmt_info = NULL;
6106   while (reduc_def != PHI_RESULT (reduc_def_phi))
6107     {
6108       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6109       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6110       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6111 	{
6112 	  if (dump_enabled_p ())
6113 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6114 			     "reduction chain broken by patterns.\n");
6115 	  return false;
6116 	}
6117       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6118 	only_slp_reduc_chain = false;
6119       /* ???  For epilogue generation live members of the chain need
6120          to point back to the PHI via their original stmt for
6121 	 info_for_reduction to work.  */
6122       if (STMT_VINFO_LIVE_P (vdef))
6123 	STMT_VINFO_REDUC_DEF (def) = phi_info;
6124       gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6125       if (!assign)
6126 	{
6127 	  if (dump_enabled_p ())
6128 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6129 			     "reduction chain includes calls.\n");
6130 	  return false;
6131 	}
6132       if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6133 	{
6134 	  if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6135 				      TREE_TYPE (gimple_assign_rhs1 (assign))))
6136 	    {
6137 	      if (dump_enabled_p ())
6138 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6139 				 "conversion in the reduction chain.\n");
6140 	      return false;
6141 	    }
6142 	}
6143       else if (!stmt_info)
6144 	/* First non-conversion stmt.  */
6145 	stmt_info = vdef;
6146       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6147       reduc_chain_length++;
6148     }
6149   /* PHIs should not participate in patterns.  */
6150   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6151 
6152   if (nested_in_vect_loop_p (loop, stmt_info))
6153     {
6154       loop = loop->inner;
6155       nested_cycle = true;
6156     }
6157 
6158   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6159      element.  */
6160   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6161     {
6162       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6163       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6164     }
6165   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6166     gcc_assert (slp_node
6167 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6168 
6169   /* 1. Is vectorizable reduction?  */
6170   /* Not supportable if the reduction variable is used in the loop, unless
6171      it's a reduction chain.  */
6172   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6173       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6174     return false;
6175 
6176   /* Reductions that are not used even in an enclosing outer-loop,
6177      are expected to be "live" (used out of the loop).  */
6178   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6179       && !STMT_VINFO_LIVE_P (stmt_info))
6180     return false;
6181 
6182   /* 2. Has this been recognized as a reduction pattern?
6183 
6184      Check if STMT represents a pattern that has been recognized
6185      in earlier analysis stages.  For stmts that represent a pattern,
6186      the STMT_VINFO_RELATED_STMT field records the last stmt in
6187      the original sequence that constitutes the pattern.  */
6188 
6189   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6190   if (orig_stmt_info)
6191     {
6192       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6193       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6194     }
6195 
6196   /* 3. Check the operands of the operation.  The first operands are defined
6197         inside the loop body. The last operand is the reduction variable,
6198         which is defined by the loop-header-phi.  */
6199 
6200   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6201   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6202   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6203   enum tree_code code = gimple_assign_rhs_code (stmt);
6204   bool lane_reduc_code_p
6205     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6206   int op_type = TREE_CODE_LENGTH (code);
6207 
6208   scalar_dest = gimple_assign_lhs (stmt);
6209   scalar_type = TREE_TYPE (scalar_dest);
6210   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6211       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6212     return false;
6213 
6214   /* Do not try to vectorize bit-precision reductions.  */
6215   if (!type_has_mode_precision_p (scalar_type))
6216     return false;
6217 
6218   /* For lane-reducing ops we're reducing the number of reduction PHIs, so
6219      the only use of the PHI result may be in the lane-reducing operation.  */
6220   if (lane_reduc_code_p
6221       && reduc_chain_length != 1
6222       && !only_slp_reduc_chain)
6223     {
6224       if (dump_enabled_p ())
6225 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226 			 "lane-reducing reduction with extra stmts.\n");
6227       return false;
6228     }
6229 
6230   /* All uses but the last are expected to be defined in the loop.
6231      The last use is the reduction variable.  In case of nested cycle this
6232      assumption is not true: we use reduc_index to record the index of the
6233      reduction variable.  */
6234   reduc_def = PHI_RESULT (reduc_def_phi);
6235   for (i = 0; i < op_type; i++)
6236     {
6237       tree op = gimple_op (stmt, i + 1);
6238       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6239       if (i == 0 && code == COND_EXPR)
6240         continue;
6241 
6242       stmt_vec_info def_stmt_info;
6243       enum vect_def_type dt;
6244       if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6245 			       &def_stmt_info))
6246 	{
6247 	  if (dump_enabled_p ())
6248 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6249 			     "use not simple.\n");
6250 	  return false;
6251 	}
6252       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6253 	continue;
6254 
6255       /* There should be only one cycle def in the stmt, the one
6256          leading to reduc_def.  */
6257       if (VECTORIZABLE_CYCLE_DEF (dt))
6258 	return false;
6259 
6260       /* To properly compute ncopies we are interested in the widest
6261 	 non-reduction input type in case we're looking at a widening
6262 	 accumulation that we later handle in vect_transform_reduction.  */
6263       if (lane_reduc_code_p
6264 	  && tem
6265 	  && (!vectype_in
6266 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6267 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6268 	vectype_in = tem;
6269 
6270       if (code == COND_EXPR)
6271 	{
6272 	  /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6273 	  if (dt == vect_constant_def)
6274 	    {
6275 	      cond_reduc_dt = dt;
6276 	      cond_reduc_val = op;
6277 	    }
6278 	  if (dt == vect_induction_def
6279 	      && def_stmt_info
6280 	      && is_nonwrapping_integer_induction (def_stmt_info, loop))
6281 	    {
6282 	      cond_reduc_dt = dt;
6283 	      cond_stmt_vinfo = def_stmt_info;
6284 	    }
6285 	}
6286     }
6287   if (!vectype_in)
6288     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6289   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6290 
6291   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6292   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6293   /* If we have a condition reduction, see if we can simplify it further.  */
6294   if (v_reduc_type == COND_REDUCTION)
6295     {
6296       if (slp_node)
6297 	return false;
6298 
6299       /* When the condition uses the reduction value in the condition, fail.  */
6300       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6301 	{
6302 	  if (dump_enabled_p ())
6303 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6304 			     "condition depends on previous iteration\n");
6305 	  return false;
6306 	}
6307 
6308       if (reduc_chain_length == 1
6309 	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6310 					     vectype_in, OPTIMIZE_FOR_SPEED))
6311 	{
6312 	  if (dump_enabled_p ())
6313 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6314 			     "optimizing condition reduction with"
6315 			     " FOLD_EXTRACT_LAST.\n");
6316 	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6317 	}
6318       else if (cond_reduc_dt == vect_induction_def)
6319 	{
6320 	  tree base
6321 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6322 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6323 
6324 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6325 		      && TREE_CODE (step) == INTEGER_CST);
6326 	  cond_reduc_val = NULL_TREE;
6327 	  enum tree_code cond_reduc_op_code = ERROR_MARK;
6328 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6329 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6330 	    ;
6331 	  /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6332 	     MIN_EXPR; for now punt if BASE is the minimum value of the type
6333 	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6334 	  else if (tree_int_cst_sgn (step) == -1)
6335 	    {
6336 	      cond_reduc_op_code = MIN_EXPR;
6337 	      if (tree_int_cst_sgn (base) == -1)
6338 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6339 	      else if (tree_int_cst_lt (base,
6340 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6341 		cond_reduc_val
6342 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6343 	    }
6344 	  else
6345 	    {
6346 	      cond_reduc_op_code = MAX_EXPR;
6347 	      if (tree_int_cst_sgn (base) == 1)
6348 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6349 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6350 					base))
6351 		cond_reduc_val
6352 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6353 	    }
6354 	  if (cond_reduc_val)
6355 	    {
6356 	      if (dump_enabled_p ())
6357 		dump_printf_loc (MSG_NOTE, vect_location,
6358 				 "condition expression based on "
6359 				 "integer induction.\n");
6360 	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6361 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6362 		= cond_reduc_val;
6363 	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6364 	    }
6365 	}
6366       else if (cond_reduc_dt == vect_constant_def)
6367 	{
6368 	  enum vect_def_type cond_initial_dt;
6369 	  tree cond_initial_val
6370 	    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6371 
6372 	  gcc_assert (cond_reduc_val != NULL_TREE);
6373 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6374 	  if (cond_initial_dt == vect_constant_def
6375 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6376 				     TREE_TYPE (cond_reduc_val)))
6377 	    {
6378 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6379 				    cond_initial_val, cond_reduc_val);
6380 	      if (e && (integer_onep (e) || integer_zerop (e)))
6381 		{
6382 		  if (dump_enabled_p ())
6383 		    dump_printf_loc (MSG_NOTE, vect_location,
6384 				     "condition expression based on "
6385 				     "compile time constant.\n");
6386 		  /* Record reduction code at analysis stage.  */
6387 		  STMT_VINFO_REDUC_CODE (reduc_info)
6388 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6389 		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6390 		}
6391 	    }
6392 	}
6393     }
6394 
6395   if (STMT_VINFO_LIVE_P (phi_info))
6396     return false;
6397 
6398   if (slp_node)
6399     ncopies = 1;
6400   else
6401     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6402 
6403   gcc_assert (ncopies >= 1);
6404 
6405   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6406 
6407   if (nested_cycle)
6408     {
6409       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6410 		  == vect_double_reduction_def);
6411       double_reduc = true;
6412     }
6413 
6414   /* 4.2. Check support for the epilog operation.
6415 
6416           If STMT represents a reduction pattern, then the type of the
6417           reduction variable may be different than the type of the rest
6418           of the arguments.  For example, consider the case of accumulation
6419           of shorts into an int accumulator; The original code:
6420                         S1: int_a = (int) short_a;
6421           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6422 
6423           was replaced with:
6424                         STMT: int_acc = widen_sum <short_a, int_acc>
6425 
6426           This means that:
6427           1. The tree-code that is used to create the vector operation in the
6428              epilog code (that reduces the partial results) is not the
6429              tree-code of STMT, but is rather the tree-code of the original
6430              stmt from the pattern that STMT is replacing.  I.e, in the example
6431              above we want to use 'widen_sum' in the loop, but 'plus' in the
6432              epilog.
6433           2. The type (mode) we use to check available target support
6434              for the vector operation to be created in the *epilog*, is
6435              determined by the type of the reduction variable (in the example
6436              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6437              However the type (mode) we use to check available target support
6438              for the vector operation to be created *inside the loop*, is
6439              determined by the type of the other arguments to STMT (in the
6440              example we'd check this: optab_handler (widen_sum_optab,
6441 	     vect_short_mode)).
6442 
6443           This is contrary to "regular" reductions, in which the types of all
6444           the arguments are the same as the type of the reduction variable.
6445           For "regular" reductions we can therefore use the same vector type
6446           (and also the same tree-code) when generating the epilog code and
6447           when generating the code inside the loop.  */
6448 
6449   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6450   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6451 
6452   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6453   if (reduction_type == TREE_CODE_REDUCTION)
6454     {
6455       /* Check whether it's ok to change the order of the computation.
6456 	 Generally, when vectorizing a reduction we change the order of the
6457 	 computation.  This may change the behavior of the program in some
6458 	 cases, so we need to check that this is ok.  One exception is when
6459 	 vectorizing an outer-loop: the inner-loop is executed sequentially,
6460 	 and therefore vectorizing reductions in the inner-loop during
6461 	 outer-loop vectorization is safe.  */
6462       if (needs_fold_left_reduction_p (scalar_type, orig_code))
6463 	{
6464 	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
6465 	     is not directly used in stmt.  */
6466 	  if (!only_slp_reduc_chain
6467 	      && reduc_chain_length != 1)
6468 	    {
6469 	      if (dump_enabled_p ())
6470 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471 				 "in-order reduction chain without SLP.\n");
6472 	      return false;
6473 	    }
6474 	  STMT_VINFO_REDUC_TYPE (reduc_info)
6475 	    = reduction_type = FOLD_LEFT_REDUCTION;
6476 	}
6477       else if (!commutative_tree_code (orig_code)
6478 	       || !associative_tree_code (orig_code))
6479 	{
6480 	  if (dump_enabled_p ())
6481 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6482 			    "reduction: not commutative/associative");
6483 	  return false;
6484 	}
6485     }
6486 
6487   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6488       && ncopies > 1)
6489     {
6490       if (dump_enabled_p ())
6491 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 			 "multiple types in double reduction or condition "
6493 			 "reduction or fold-left reduction.\n");
6494       return false;
6495     }
6496 
6497   internal_fn reduc_fn = IFN_LAST;
6498   if (reduction_type == TREE_CODE_REDUCTION
6499       || reduction_type == FOLD_LEFT_REDUCTION
6500       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6501       || reduction_type == CONST_COND_REDUCTION)
6502     {
6503       if (reduction_type == FOLD_LEFT_REDUCTION
6504 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
6505 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6506 	{
6507 	  if (reduc_fn != IFN_LAST
6508 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6509 						  OPTIMIZE_FOR_SPEED))
6510 	    {
6511 	      if (dump_enabled_p ())
6512 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 				 "reduc op not supported by target.\n");
6514 
6515 	      reduc_fn = IFN_LAST;
6516 	    }
6517 	}
6518       else
6519 	{
6520 	  if (!nested_cycle || double_reduc)
6521 	    {
6522 	      if (dump_enabled_p ())
6523 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6524 				 "no reduc code for scalar code.\n");
6525 
6526 	      return false;
6527 	    }
6528 	}
6529     }
6530   else if (reduction_type == COND_REDUCTION)
6531     {
6532       int scalar_precision
6533 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6534       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6535       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6536 						nunits_out);
6537 
6538       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6539 					  OPTIMIZE_FOR_SPEED))
6540 	reduc_fn = IFN_REDUC_MAX;
6541     }
6542   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6543 
6544   if (reduction_type != EXTRACT_LAST_REDUCTION
6545       && (!nested_cycle || double_reduc)
6546       && reduc_fn == IFN_LAST
6547       && !nunits_out.is_constant ())
6548     {
6549       if (dump_enabled_p ())
6550 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551 			 "missing target support for reduction on"
6552 			 " variable-length vectors.\n");
6553       return false;
6554     }
6555 
6556   /* For SLP reductions, see if there is a neutral value we can use.  */
6557   tree neutral_op = NULL_TREE;
6558   if (slp_node)
6559     neutral_op = neutral_op_for_slp_reduction
6560       (slp_node_instance->reduc_phis, vectype_out, orig_code,
6561        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6562 
6563   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6564     {
6565       /* We can't support in-order reductions of code such as this:
6566 
6567 	   for (int i = 0; i < n1; ++i)
6568 	     for (int j = 0; j < n2; ++j)
6569 	       l += a[j];
6570 
6571 	 since GCC effectively transforms the loop when vectorizing:
6572 
6573 	   for (int i = 0; i < n1 / VF; ++i)
6574 	     for (int j = 0; j < n2; ++j)
6575 	       for (int k = 0; k < VF; ++k)
6576 		 l += a[j];
6577 
6578 	 which is a reassociation of the original operation.  */
6579       if (dump_enabled_p ())
6580 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 			 "in-order double reduction not supported.\n");
6582 
6583       return false;
6584     }
6585 
6586   if (reduction_type == FOLD_LEFT_REDUCTION
6587       && slp_node
6588       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6589     {
6590       /* We cannot use in-order reductions in this case because there is
6591 	 an implicit reassociation of the operations involved.  */
6592       if (dump_enabled_p ())
6593 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6594 			 "in-order unchained SLP reductions not supported.\n");
6595       return false;
6596     }
6597 
6598   /* For double reductions, and for SLP reductions with a neutral value,
6599      we construct a variable-length initial vector by loading a vector
6600      full of the neutral value and then shift-and-inserting the start
6601      values into the low-numbered elements.  */
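  /* A sketch for a single start value INIT and a plus reduction whose
     neutral value is zero:

       init_vec = { 0, 0, ... };
       init_vec = .VEC_SHL_INSERT (init_vec, INIT);

     which yields { INIT, 0, 0, ... } for whatever the runtime vector
     length turns out to be.  */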
6602   if ((double_reduc || neutral_op)
6603       && !nunits_out.is_constant ()
6604       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6605 					  vectype_out, OPTIMIZE_FOR_SPEED))
6606     {
6607       if (dump_enabled_p ())
6608 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609 			 "reduction on variable-length vectors requires"
6610 			 " target support for a vector-shift-and-insert"
6611 			 " operation.\n");
6612       return false;
6613     }
6614 
6615   /* Check extra constraints for variable-length unchained SLP reductions.  */
6616   if (STMT_SLP_TYPE (stmt_info)
6617       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6618       && !nunits_out.is_constant ())
6619     {
6620       /* We checked above that we could build the initial vector when
6621 	 there's a neutral element value.  Check here for the case in
6622 	 which each SLP statement has its own initial value and in which
6623 	 that value needs to be repeated for every instance of the
6624 	 statement within the initial vector.  */
6625       unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6626       if (!neutral_op
6627 	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6628 					      TREE_TYPE (vectype_out)))
6629 	{
6630 	  if (dump_enabled_p ())
6631 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6632 			     "unsupported form of SLP reduction for"
6633 			     " variable-length vectors: cannot build"
6634 			     " initial vector.\n");
6635 	  return false;
6636 	}
6637       /* The epilogue code relies on the number of elements being a multiple
6638 	 of the group size.  The duplicate-and-interleave approach to setting
6639 	 up the initial vector does too.  */
6640       if (!multiple_p (nunits_out, group_size))
6641 	{
6642 	  if (dump_enabled_p ())
6643 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6644 			     "unsupported form of SLP reduction for"
6645 			     " variable-length vectors: the vector size"
6646 			     " is not a multiple of the number of results.\n");
6647 	  return false;
6648 	}
6649     }
6650 
6651   if (reduction_type == COND_REDUCTION)
6652     {
6653       widest_int ni;
6654 
6655       if (! max_loop_iterations (loop, &ni))
6656 	{
6657 	  if (dump_enabled_p ())
6658 	    dump_printf_loc (MSG_NOTE, vect_location,
6659 			     "loop count not known, cannot create cond "
6660 			     "reduction.\n");
6661 	  return false;
6662 	}
6663       /* Convert backedges to iterations.  */
6664       ni += 1;
6665 
6666       /* The additional index will be the same type as the condition.  Check
6667 	 that the iteration count fits into this type less one (we use up the
6668 	 zero slot for the case in which there are no matches).  */
6669       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6670       if (wi::geu_p (ni, wi::to_widest (max_index)))
6671 	{
6672 	  if (dump_enabled_p ())
6673 	    dump_printf_loc (MSG_NOTE, vect_location,
6674 			     "loop size is greater than data size.\n");
6675 	  return false;
6676 	}
6677     }
6678 
6679   /* In case the vectorization factor (VF) is bigger than the number
6680      of elements that we can fit in a vectype (nunits), we have to generate
6681      more than one vector stmt - i.e - we need to "unroll" the
6682      vector stmt by a factor VF/nunits.  For more details see documentation
6683      in vectorizable_operation.  */
6684 
6685   /* If the reduction is used in an outer loop we need to generate
6686      VF intermediate results, like so (e.g. for ncopies=2):
6687 	r0 = phi (init, r0)
6688 	r1 = phi (init, r1)
6689 	r0 = x0 + r0;
6690         r1 = x1 + r1;
6691     (i.e. we generate VF results in 2 registers).
6692     In this case we have a separate def-use cycle for each copy, and therefore
6693     for each copy we get the vector def for the reduction variable from the
6694     respective phi node created for this copy.
6695 
6696     Otherwise (the reduction is unused in the loop nest), we can combine
6697     together intermediate results, like so (e.g. for ncopies=2):
6698 	r = phi (init, r)
6699 	r = x0 + r;
6700 	r = x1 + r;
6701    (i.e. we generate VF/2 results in a single register).
6702    In this case for each copy we get the vector def for the reduction variable
6703    from the vectorized reduction operation generated in the previous iteration.
6704 
6705    This only works when we see both the reduction PHI and its only consumer
6706    in vectorizable_reduction and there are no intermediate stmts
6707    participating.  */
6708   if (ncopies > 1
6709       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6710       && reduc_chain_length == 1)
6711     single_defuse_cycle = true;
6712 
6713   if (single_defuse_cycle || lane_reduc_code_p)
6714     {
6715       gcc_assert (code != COND_EXPR);
6716 
6717       /* 4. Supportable by target?  */
6718       bool ok = true;
6719 
6720       /* 4.1. check support for the operation in the loop  */
6721       optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6722       if (!optab)
6723 	{
6724 	  if (dump_enabled_p ())
6725 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6726 			     "no optab.\n");
6727 	  ok = false;
6728         }
6729 
6730       machine_mode vec_mode = TYPE_MODE (vectype_in);
6731       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6732         {
6733           if (dump_enabled_p ())
6734             dump_printf (MSG_NOTE, "op not supported by target.\n");
6735 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6736 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6737 	    ok = false;
6738 	  else
6739 	    if (dump_enabled_p ())
6740 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6741         }
6742 
6743       /* Worthwhile without SIMD support?  */
6744       if (ok
6745 	  && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6746 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6747         {
6748           if (dump_enabled_p ())
6749 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6750 			     "not worthwhile without SIMD support.\n");
6751 	  ok = false;
6752         }
6753 
6754       /* lane-reducing operations have to go through vect_transform_reduction.
6755          For the other cases try without the single cycle optimization.  */
6756       if (!ok)
6757 	{
6758 	  if (lane_reduc_code_p)
6759 	    return false;
6760 	  else
6761 	    single_defuse_cycle = false;
6762 	}
6763     }
6764   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6765 
6766   /* If the reduction stmt is one of the patterns that have lane
6767      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6768   if ((ncopies > 1 && ! single_defuse_cycle)
6769       && lane_reduc_code_p)
6770     {
6771       if (dump_enabled_p ())
6772 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6773 			 "multi def-use cycle not possible for lane-reducing "
6774 			 "reduction operation\n");
6775       return false;
6776     }
6777 
6778   if (slp_node)
6779     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6780   else
6781     vec_num = 1;
6782 
6783   vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6784 			     cost_vec);
6785   /* Cost the reduction op inside the loop if transformed via
6786      vect_transform_reduction.  Otherwise this is costed by the
6787      separate vectorizable_* routines.  */
6788   if (single_defuse_cycle
6789       || code == DOT_PROD_EXPR
6790       || code == WIDEN_SUM_EXPR
6791       || code == SAD_EXPR)
6792     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
6793 
6794   if (dump_enabled_p ()
6795       && reduction_type == FOLD_LEFT_REDUCTION)
6796     dump_printf_loc (MSG_NOTE, vect_location,
6797 		     "using an in-order (fold-left) reduction.\n");
6798   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6799   /* All but single defuse-cycle optimized, lane-reducing and fold-left
6800      reductions go through their own vectorizable_* routines.  */
6801   if (!single_defuse_cycle
6802       && code != DOT_PROD_EXPR
6803       && code != WIDEN_SUM_EXPR
6804       && code != SAD_EXPR
6805       && reduction_type != FOLD_LEFT_REDUCTION)
6806     {
6807       stmt_vec_info tem
6808 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6809       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6810 	{
6811 	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6812 	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6813 	}
6814       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6815       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6816     }
6817   else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6818     {
6819       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6820       internal_fn cond_fn = get_conditional_internal_fn (code);
6821 
6822       if (reduction_type != FOLD_LEFT_REDUCTION
6823 	  && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6824 	  && (cond_fn == IFN_LAST
6825 	      || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6826 						  OPTIMIZE_FOR_SPEED)))
6827 	{
6828 	  if (dump_enabled_p ())
6829 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6830 			     "can't use a fully-masked loop because no"
6831 			     " conditional operation is available.\n");
6832 	  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6833 	}
6834       else if (reduction_type == FOLD_LEFT_REDUCTION
6835 	       && reduc_fn == IFN_LAST
6836 	       && !expand_vec_cond_expr_p (vectype_in,
6837 					   truth_type_for (vectype_in),
6838 					   SSA_NAME))
6839 	{
6840 	  if (dump_enabled_p ())
6841 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842 			     "can't use a fully-masked loop because no"
6843 			     " conditional operation is available.\n");
6844 	  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6845 	}
6846       else
6847 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6848 			       vectype_in, NULL);
6849     }
6850   return true;
6851 }
6852 
6853 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6854    value.  */
6855 
6856 bool
6857 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6858 			  stmt_vec_info *vec_stmt, slp_tree slp_node)
6859 {
6860   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6861   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6862   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6863   int i;
6864   int ncopies;
6865   int j;
6866   int vec_num;
6867 
6868   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6869   gcc_assert (reduc_info->is_reduc_info);
6870 
6871   if (nested_in_vect_loop_p (loop, stmt_info))
6872     {
6873       loop = loop->inner;
6874       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6875     }
6876 
6877   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6878   enum tree_code code = gimple_assign_rhs_code (stmt);
6879   int op_type = TREE_CODE_LENGTH (code);
6880 
6881   /* Flatten RHS.  */
6882   tree ops[3];
6883   switch (get_gimple_rhs_class (code))
6884     {
6885     case GIMPLE_TERNARY_RHS:
6886       ops[2] = gimple_assign_rhs3 (stmt);
6887       /* Fall thru.  */
6888     case GIMPLE_BINARY_RHS:
6889       ops[0] = gimple_assign_rhs1 (stmt);
6890       ops[1] = gimple_assign_rhs2 (stmt);
6891       break;
6892     default:
6893       gcc_unreachable ();
6894     }
6895 
6896   /* All uses but the last are expected to be defined in the loop.
6897      The last use is the reduction variable.  In case of nested cycle this
6898      assumption is not true: we use reduc_index to record the index of the
6899      reduction variable.  */
6900   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6901   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6902   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6903   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6904 
6905   if (slp_node)
6906     {
6907       ncopies = 1;
6908       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6909     }
6910   else
6911     {
6912       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6913       vec_num = 1;
6914     }
6915 
6916   internal_fn cond_fn = get_conditional_internal_fn (code);
6917   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6918   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6919 
6920   /* Transform.  */
6921   stmt_vec_info new_stmt_info = NULL;
6922   stmt_vec_info prev_stmt_info;
6923   tree new_temp = NULL_TREE;
6924   auto_vec<tree> vec_oprnds0;
6925   auto_vec<tree> vec_oprnds1;
6926   auto_vec<tree> vec_oprnds2;
6927   tree def0;
6928 
6929   if (dump_enabled_p ())
6930     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6931 
6932   /* FORNOW: Multiple types are not supported for condition.  */
6933   if (code == COND_EXPR)
6934     gcc_assert (ncopies == 1);
6935 
6936   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6937 
6938   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6939   if (reduction_type == FOLD_LEFT_REDUCTION)
6940     {
6941       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6942       return vectorize_fold_left_reduction
6943 	  (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6944 	   reduc_fn, ops, vectype_in, reduc_index, masks);
6945     }
6946 
6947   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6948   gcc_assert (single_defuse_cycle
6949 	      || code == DOT_PROD_EXPR
6950 	      || code == WIDEN_SUM_EXPR
6951 	      || code == SAD_EXPR);
6952 
6953   /* Create the destination vector  */
6954   tree scalar_dest = gimple_assign_lhs (stmt);
6955   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6956 
6957   prev_stmt_info = NULL;
6958   if (!slp_node)
6959     {
6960       vec_oprnds0.create (1);
6961       vec_oprnds1.create (1);
6962       if (op_type == ternary_op)
6963         vec_oprnds2.create (1);
6964     }
6965 
6966   for (j = 0; j < ncopies; j++)
6967     {
6968       /* Handle uses.  */
6969       if (j == 0)
6970         {
6971 	  if (slp_node)
6972 	    {
6973 	      /* Get vec defs for all the operands except the reduction index,
6974 		 ensuring the ordering of the ops in the vector is kept.  */
6975 	      auto_vec<vec<tree>, 3> vec_defs;
6976 	      vect_get_slp_defs (slp_node, &vec_defs);
6977 	      vec_oprnds0.safe_splice (vec_defs[0]);
6978 	      vec_defs[0].release ();
6979 	      vec_oprnds1.safe_splice (vec_defs[1]);
6980 	      vec_defs[1].release ();
6981 	      if (op_type == ternary_op)
6982 		{
6983 		  vec_oprnds2.safe_splice (vec_defs[2]);
6984 		  vec_defs[2].release ();
6985 		}
6986 	    }
6987           else
6988 	    {
6989               vec_oprnds0.quick_push
6990 		(vect_get_vec_def_for_operand (ops[0], stmt_info));
6991               vec_oprnds1.quick_push
6992 		(vect_get_vec_def_for_operand (ops[1], stmt_info));
6993               if (op_type == ternary_op)
6994 		vec_oprnds2.quick_push
6995 		  (vect_get_vec_def_for_operand (ops[2], stmt_info));
6996 	    }
6997         }
6998       else
6999         {
7000           if (!slp_node)
7001             {
7002 	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7003 
7004 	      if (single_defuse_cycle && reduc_index == 0)
7005 		vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7006 	      else
7007 		vec_oprnds0[0]
7008 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7009 						    vec_oprnds0[0]);
7010 	      if (single_defuse_cycle && reduc_index == 1)
7011 		vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7012 	      else
7013 		vec_oprnds1[0]
7014 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7015 						    vec_oprnds1[0]);
7016 	      if (op_type == ternary_op)
7017 		{
7018 		  if (single_defuse_cycle && reduc_index == 2)
7019 		    vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7020 		  else
7021 		    vec_oprnds2[0]
7022 		      = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7023 							vec_oprnds2[0]);
7024 		}
7025             }
7026         }
7027 
7028       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7029         {
7030 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7031 	  if (masked_loop_p && !mask_by_cond_expr)
7032 	    {
7033 	      /* Make sure that the reduction accumulator is vop[0].  */
7034 	      if (reduc_index == 1)
7035 		{
7036 		  gcc_assert (commutative_tree_code (code));
7037 		  std::swap (vop[0], vop[1]);
7038 		}
7039 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7040 					      vectype_in, i * ncopies + j);
7041 	      gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7042 							vop[0], vop[1],
7043 							vop[0]);
7044 	      new_temp = make_ssa_name (vec_dest, call);
7045 	      gimple_call_set_lhs (call, new_temp);
7046 	      gimple_call_set_nothrow (call, true);
7047 	      new_stmt_info
7048 		= vect_finish_stmt_generation (stmt_info, call, gsi);
7049 	    }
7050 	  else
7051 	    {
7052 	      if (op_type == ternary_op)
7053 		vop[2] = vec_oprnds2[i];
7054 
7055 	      if (masked_loop_p && mask_by_cond_expr)
7056 		{
7057 		  tree mask = vect_get_loop_mask (gsi, masks,
7058 						  vec_num * ncopies,
7059 						  vectype_in, i * ncopies + j);
7060 		  build_vect_cond_expr (code, vop, mask, gsi);
7061 		}
7062 
7063 	      gassign *new_stmt = gimple_build_assign (vec_dest, code,
7064 						       vop[0], vop[1], vop[2]);
7065 	      new_temp = make_ssa_name (vec_dest, new_stmt);
7066 	      gimple_assign_set_lhs (new_stmt, new_temp);
7067 	      new_stmt_info
7068 		= vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7069 	    }
7070 
7071           if (slp_node)
7072 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7073         }
7074 
7075       if (slp_node || single_defuse_cycle)
7076         continue;
7077 
7078       if (j == 0)
7079 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7080       else
7081 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7082 
7083       prev_stmt_info = new_stmt_info;
7084     }
7085 
7086   if (single_defuse_cycle && !slp_node)
7087     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7088 
7089   return true;
7090 }
7091 
7092 /* Transform phase of a cycle PHI.  */
7093 
7094 bool
7095 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7096 			  slp_tree slp_node, slp_instance slp_node_instance)
7097 {
7098   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7099   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7100   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7101   int i;
7102   int ncopies;
7103   stmt_vec_info prev_phi_info;
7104   int j;
7105   bool nested_cycle = false;
7106   int vec_num;
7107 
7108   if (nested_in_vect_loop_p (loop, stmt_info))
7109     {
7110       loop = loop->inner;
7111       nested_cycle = true;
7112     }
7113 
7114   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7115   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7116   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7117   gcc_assert (reduc_info->is_reduc_info);
7118 
7119   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7120       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7121     /* Leave the scalar phi in place.  */
7122     return true;
7123 
7124   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7125   /* For a nested cycle we do not fill the above.  */
7126   if (!vectype_in)
7127     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7128   gcc_assert (vectype_in);
7129 
7130   if (slp_node)
7131     {
7132       /* The size vect_schedule_slp_instance computes is off for us.  */
7133       vec_num = vect_get_num_vectors
7134 	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7135 	   * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7136       ncopies = 1;
7137     }
7138   else
7139     {
7140       vec_num = 1;
7141       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7142     }
7143 
7144   /* Check whether we should use a single PHI node and accumulate
7145      vectors to one before the backedge.  */
7146   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7147     ncopies = 1;
7148 
7149   /* Create the destination vector  */
7150   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7151   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7152 					       vectype_out);
7153 
7154   /* Get the loop-entry arguments.  */
7155   tree vec_initial_def;
7156   auto_vec<tree> vec_initial_defs;
7157   if (slp_node)
7158     {
7159       vec_initial_defs.reserve (vec_num);
7160       gcc_assert (slp_node == slp_node_instance->reduc_phis);
7161       stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7162       tree neutral_op
7163 	= neutral_op_for_slp_reduction (slp_node, vectype_out,
7164 					STMT_VINFO_REDUC_CODE (reduc_info),
7165 					first != NULL);
7166       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7167 				      &vec_initial_defs, vec_num,
7168 				      first != NULL, neutral_op);
7169     }
7170   else
7171     {
7172       /* Get at the scalar def before the loop, that defines the initial
7173 	 value of the reduction variable.  */
7174       tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7175 						loop_preheader_edge (loop));
7176       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7177 	 and we can't use zero for induc_val, use initial_def.  Similarly
7178 	 for REDUC_MIN and initial_def larger than the base.  */
7179       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7180 	{
7181 	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7182 	  if (TREE_CODE (initial_def) == INTEGER_CST
7183 	      && !integer_zerop (induc_val)
7184 	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7185 		   && tree_int_cst_lt (initial_def, induc_val))
7186 		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7187 		      && tree_int_cst_lt (induc_val, initial_def))))
7188 	    {
7189 	      induc_val = initial_def;
7190 	      /* Communicate we used the initial_def to epilogue
7191 		 generation.  */
7192 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7193 	    }
7194 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7195 	}
7196       else if (nested_cycle)
7197 	{
7198 	  /* Do not use an adjustment def as that case is not supported
7199 	     correctly if ncopies is not one.  */
7200 	  vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7201 							  reduc_stmt_info);
7202 	}
7203       else
7204 	{
7205 	  tree adjustment_def = NULL_TREE;
7206 	  tree *adjustment_defp = &adjustment_def;
7207 	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7208 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7209 	    adjustment_defp = NULL;
7210 	  vec_initial_def
7211 	    = get_initial_def_for_reduction (reduc_stmt_info, code,
7212 					     initial_def, adjustment_defp);
7213 	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7214 	}
7215       vec_initial_defs.create (1);
7216       vec_initial_defs.quick_push (vec_initial_def);
7217     }
7218 
7219   /* Generate the reduction PHIs upfront.  */
7220   prev_phi_info = NULL;
7221   for (i = 0; i < vec_num; i++)
7222     {
7223       tree vec_init_def = vec_initial_defs[i];
7224       for (j = 0; j < ncopies; j++)
7225 	{
7226 	  /* Create the reduction-phi that defines the reduction
7227 	     operand.  */
7228 	  gphi *new_phi = create_phi_node (vec_dest, loop->header);
7229 	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7230 
7231 	  /* Set the loop-entry arg of the reduction-phi.  */
7232 	  if (j != 0 && nested_cycle)
7233 	    vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7234 							   vec_init_def);
7235 	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7236 		       UNKNOWN_LOCATION);
7237 
7238 	  /* The loop-latch arg is set in epilogue processing.  */
7239 
7240 	  if (slp_node)
7241 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7242 	  else
7243 	    {
7244 	      if (j == 0)
7245 		STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7246 	      else
7247 		STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7248 	      prev_phi_info = new_phi_info;
7249 	    }
7250 	}
7251     }
7252 
7253   return true;
7254 }
7255 
7256 /* Vectorizes LC PHIs.  */
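/* Rough sketch of what this handles: a loop-closed SSA PHI such as

        x_2 = PHI <x_1(loop-exit-edge)>

   is mirrored by a single-argument vector PHI

        vect_x_2 = PHI <vect_x_1(loop-exit-edge)>

   one such PHI per vector copy (ncopies) or per SLP vector statement.  */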
7257 
7258 bool
7259 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7260 		     slp_tree slp_node)
7261 {
7262   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7263   if (!loop_vinfo
7264       || !is_a <gphi *> (stmt_info->stmt)
7265       || gimple_phi_num_args (stmt_info->stmt) != 1)
7266     return false;
7267 
7268   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7269       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7270     return false;
7271 
7272   if (!vec_stmt) /* transformation not required.  */
7273     {
7274       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7275       return true;
7276     }
7277 
7278   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7279   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7280   basic_block bb = gimple_bb (stmt_info->stmt);
7281   edge e = single_pred_edge (bb);
7282   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7283   vec<tree> vec_oprnds = vNULL;
7284   vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7285 		     stmt_info, &vec_oprnds, NULL, slp_node);
7286   if (slp_node)
7287     {
7288       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7289       gcc_assert (vec_oprnds.length () == vec_num);
7290       for (unsigned i = 0; i < vec_num; i++)
7291 	{
7292 	  /* Create the vectorized LC PHI node.  */
7293 	  gphi *new_phi = create_phi_node (vec_dest, bb);
7294 	  add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7295 	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7296 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7297 	}
7298     }
7299   else
7300     {
7301       unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7302       stmt_vec_info prev_phi_info = NULL;
7303       for (unsigned i = 0; i < ncopies; i++)
7304 	{
7305 	  if (i != 0)
7306 	    vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7307 	  /* Create the vectorized LC PHI node.  */
7308 	  gphi *new_phi = create_phi_node (vec_dest, bb);
7309 	  add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7310 	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7311 	  if (i == 0)
7312 	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7313 	  else
7314 	    STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7315 	  prev_phi_info = new_phi_info;
7316 	}
7317     }
7318   vec_oprnds.release ();
7319 
7320   return true;
7321 }
7322 
7323 
7324 /* Function vect_min_worthwhile_factor.
7325 
7326    For a loop where we could vectorize the operation indicated by CODE,
7327    return the minimum vectorization factor that makes it worthwhile
7328    to use generic vectors.  */
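/* For instance, the switch below means that emulating a vector addition
   with word-mode (generic) vectors is only assumed profitable when at
   least four scalar additions are replaced, while the cheaper bitwise
   operations are assumed to pay off already at a factor of two.  The
   numbers are heuristic thresholds, not measured costs.  */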
7329 static unsigned int
7330 vect_min_worthwhile_factor (enum tree_code code)
7331 {
7332   switch (code)
7333     {
7334     case PLUS_EXPR:
7335     case MINUS_EXPR:
7336     case NEGATE_EXPR:
7337       return 4;
7338 
7339     case BIT_AND_EXPR:
7340     case BIT_IOR_EXPR:
7341     case BIT_XOR_EXPR:
7342     case BIT_NOT_EXPR:
7343       return 2;
7344 
7345     default:
7346       return INT_MAX;
7347     }
7348 }
7349 
7350 /* Return true if VINFO indicates we are doing loop vectorization and if
7351    it is worth decomposing CODE operations into scalar operations for
7352    that loop's vectorization factor.  */
7353 
7354 bool
7355 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7356 {
7357   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7358   unsigned HOST_WIDE_INT value;
7359   return (loop_vinfo
7360 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7361 	  && value >= vect_min_worthwhile_factor (code));
7362 }
7363 
7364 /* Function vectorizable_induction
7365 
7366    Check if STMT_INFO performs an induction computation that can be vectorized.
7367    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7368    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7369    Return true if STMT_INFO is vectorizable in this way.  */
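/* A minimal example of an induction this handles (sketch only):

        for (i = 0; i < N; i++)
          {
            a[i] = j;
            j += 4;
          }

   Here j is an induction variable with step 4; its scalar PHI is
   replaced by a vector IV as built in the transform code below.  */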
7370 
7371 bool
7372 vectorizable_induction (stmt_vec_info stmt_info,
7373 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7374 			stmt_vec_info *vec_stmt, slp_tree slp_node,
7375 			stmt_vector_for_cost *cost_vec)
7376 {
7377   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7378   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7379   unsigned ncopies;
7380   bool nested_in_vect_loop = false;
7381   class loop *iv_loop;
7382   tree vec_def;
7383   edge pe = loop_preheader_edge (loop);
7384   basic_block new_bb;
7385   tree new_vec, vec_init, vec_step, t;
7386   tree new_name;
7387   gimple *new_stmt;
7388   gphi *induction_phi;
7389   tree induc_def, vec_dest;
7390   tree init_expr, step_expr;
7391   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7392   unsigned i;
7393   tree expr;
7394   gimple_seq stmts;
7395   imm_use_iterator imm_iter;
7396   use_operand_p use_p;
7397   gimple *exit_phi;
7398   edge latch_e;
7399   tree loop_arg;
7400   gimple_stmt_iterator si;
7401 
7402   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7403   if (!phi)
7404     return false;
7405 
7406   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7407     return false;
7408 
7409   /* Make sure it was recognized as induction computation.  */
7410   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7411     return false;
7412 
7413   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7414   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7415 
7416   if (slp_node)
7417     ncopies = 1;
7418   else
7419     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7420   gcc_assert (ncopies >= 1);
7421 
7422   /* FORNOW. These restrictions should be relaxed.  */
7423   if (nested_in_vect_loop_p (loop, stmt_info))
7424     {
7425       imm_use_iterator imm_iter;
7426       use_operand_p use_p;
7427       gimple *exit_phi;
7428       edge latch_e;
7429       tree loop_arg;
7430 
7431       if (ncopies > 1)
7432 	{
7433 	  if (dump_enabled_p ())
7434 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7435 			     "multiple types in nested loop.\n");
7436 	  return false;
7437 	}
7438 
7439       /* FORNOW: outer loop induction with SLP not supported.  */
7440       if (STMT_SLP_TYPE (stmt_info))
7441 	return false;
7442 
7443       exit_phi = NULL;
7444       latch_e = loop_latch_edge (loop->inner);
7445       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7446       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7447 	{
7448 	  gimple *use_stmt = USE_STMT (use_p);
7449 	  if (is_gimple_debug (use_stmt))
7450 	    continue;
7451 
7452 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7453 	    {
7454 	      exit_phi = use_stmt;
7455 	      break;
7456 	    }
7457 	}
7458       if (exit_phi)
7459 	{
7460 	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7461 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7462 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7463 	    {
7464 	      if (dump_enabled_p ())
7465 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7466 				 "inner-loop induction only used outside "
7467 				 "of the outer vectorized loop.\n");
7468 	      return false;
7469 	    }
7470 	}
7471 
7472       nested_in_vect_loop = true;
7473       iv_loop = loop->inner;
7474     }
7475   else
7476     iv_loop = loop;
7477   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7478 
7479   if (slp_node && !nunits.is_constant ())
7480     {
7481       /* The current SLP code creates the initial value element-by-element.  */
7482       if (dump_enabled_p ())
7483 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7484 			 "SLP induction not supported for variable-length"
7485 			 " vectors.\n");
7486       return false;
7487     }
7488 
7489   if (!vec_stmt) /* transformation not required.  */
7490     {
7491       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7492       DUMP_VECT_SCOPE ("vectorizable_induction");
7493       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7494       return true;
7495     }
7496 
7497   /* Transform.  */
7498 
7499   /* Compute a vector variable, initialized with the first VF values of
7500      the induction variable.  E.g., for an iv with IV_PHI='X' and
7501      evolution S, for a vector of 4 units, we want to compute:
7502      [X, X + S, X + 2*S, X + 3*S].  */
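  /* Concretely (assuming X = 0, S = 1 and a 4-element vector), this builds
     vec_init = {0, 1, 2, 3} in the preheader and a loop PHI that is advanced
     by vec_step = {4, 4, 4, 4} (VF*S in each lane) on every iteration of the
     vectorized loop.  */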
7503 
7504   if (dump_enabled_p ())
7505     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7506 
7507   latch_e = loop_latch_edge (iv_loop);
7508   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7509 
7510   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7511   gcc_assert (step_expr != NULL_TREE);
7512   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7513 
7514   pe = loop_preheader_edge (iv_loop);
7515   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7516 				     loop_preheader_edge (iv_loop));
7517 
7518   stmts = NULL;
7519   if (!nested_in_vect_loop)
7520     {
7521       /* Convert the initial value to the IV update type.  */
7522       tree new_type = TREE_TYPE (step_expr);
7523       init_expr = gimple_convert (&stmts, new_type, init_expr);
7524 
7525       /* If we are using the loop mask to "peel" for alignment then we need
7526 	 to adjust the start value here.  */
7527       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7528       if (skip_niters != NULL_TREE)
7529 	{
7530 	  if (FLOAT_TYPE_P (vectype))
7531 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7532 					skip_niters);
7533 	  else
7534 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7535 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7536 					 skip_niters, step_expr);
7537 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7538 				    init_expr, skip_step);
7539 	}
7540     }
7541 
7542   if (stmts)
7543     {
7544       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7545       gcc_assert (!new_bb);
7546     }
7547 
7548   /* Find the first insertion point in the BB.  */
7549   basic_block bb = gimple_bb (phi);
7550   si = gsi_after_labels (bb);
7551 
7552   /* For SLP induction we have to generate several IVs as for example
7553      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7554      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7555      [VF*S, VF*S, VF*S, VF*S] for all.  */
7556   if (slp_node)
7557     {
7558       /* Enforced above.  */
7559       unsigned int const_nunits = nunits.to_constant ();
7560 
7561       /* Generate [VF*S, VF*S, ... ].  */
7562       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7563 	{
7564 	  expr = build_int_cst (integer_type_node, vf);
7565 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7566 	}
7567       else
7568 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7569       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7570 			      expr, step_expr);
7571       if (! CONSTANT_CLASS_P (new_name))
7572 	new_name = vect_init_vector (stmt_info, new_name,
7573 				     TREE_TYPE (step_expr), NULL);
7574       new_vec = build_vector_from_val (step_vectype, new_name);
7575       vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7576 
7577       /* Now generate the IVs.  */
7578       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7579       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7580       unsigned elts = const_nunits * nvects;
7581       unsigned nivs = least_common_multiple (group_size,
7582 					     const_nunits) / const_nunits;
7583       gcc_assert (elts % group_size == 0);
7584       tree elt = init_expr;
7585       unsigned ivn;
7586       for (ivn = 0; ivn < nivs; ++ivn)
7587 	{
7588 	  tree_vector_builder elts (step_vectype, const_nunits, 1);
7589 	  stmts = NULL;
7590 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7591 	    {
7592 	      if (ivn*const_nunits + eltn >= group_size
7593 		  && (ivn * const_nunits + eltn) % group_size == 0)
7594 		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7595 				    elt, step_expr);
7596 	      elts.quick_push (elt);
7597 	    }
7598 	  vec_init = gimple_build_vector (&stmts, &elts);
7599 	  vec_init = gimple_convert (&stmts, vectype, vec_init);
7600 	  if (stmts)
7601 	    {
7602 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7603 	      gcc_assert (!new_bb);
7604 	    }
7605 
7606 	  /* Create the induction-phi that defines the induction-operand.  */
7607 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7608 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7609 	  stmt_vec_info induction_phi_info
7610 	    = loop_vinfo->add_stmt (induction_phi);
7611 	  induc_def = PHI_RESULT (induction_phi);
7612 
7613 	  /* Create the iv update inside the loop  */
7614 	  gimple_seq stmts = NULL;
7615 	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7616 	  vec_def = gimple_build (&stmts,
7617 				  PLUS_EXPR, step_vectype, vec_def, vec_step);
7618 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
7619 	  loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7620 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7621 
7622 	  /* Set the arguments of the phi node:  */
7623 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7624 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7625 		       UNKNOWN_LOCATION);
7626 
7627 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7628 	}
7629 
7630       /* Re-use IVs when we can.  */
7631       if (ivn < nvects)
7632 	{
7633 	  unsigned vfp
7634 	    = least_common_multiple (group_size, const_nunits) / group_size;
7635 	  /* Generate [VF'*S, VF'*S, ... ].  */
7636 	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7637 	    {
7638 	      expr = build_int_cst (integer_type_node, vfp);
7639 	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7640 	    }
7641 	  else
7642 	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7643 	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7644 				  expr, step_expr);
7645 	  if (! CONSTANT_CLASS_P (new_name))
7646 	    new_name = vect_init_vector (stmt_info, new_name,
7647 					 TREE_TYPE (step_expr), NULL);
7648 	  new_vec = build_vector_from_val (step_vectype, new_name);
7649 	  vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7650 	  for (; ivn < nvects; ++ivn)
7651 	    {
7652 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7653 	      tree def;
7654 	      if (gimple_code (iv) == GIMPLE_PHI)
7655 		def = gimple_phi_result (iv);
7656 	      else
7657 		def = gimple_assign_lhs (iv);
7658 	      gimple_seq stmts = NULL;
7659 	      def = gimple_convert (&stmts, step_vectype, def);
7660 	      def = gimple_build (&stmts,
7661 				  PLUS_EXPR, step_vectype, def, vec_step);
7662 	      def = gimple_convert (&stmts, vectype, def);
7663 	      if (gimple_code (iv) == GIMPLE_PHI)
7664 		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7665 	      else
7666 		{
7667 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7668 		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7669 		}
7670 	      SLP_TREE_VEC_STMTS (slp_node).quick_push
7671 		(loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7672 	    }
7673 	}
7674 
7675       return true;
7676     }
7677 
7678   /* Create the vector that holds the initial_value of the induction.  */
7679   if (nested_in_vect_loop)
7680     {
7681       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7682 	 been created during vectorization of previous stmts.  We obtain it
7683 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7684       vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7685       /* If the initial value is not of proper type, convert it.  */
7686       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7687 	{
7688 	  new_stmt
7689 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7690 							  vect_simple_var,
7691 							  "vec_iv_"),
7692 				   VIEW_CONVERT_EXPR,
7693 				   build1 (VIEW_CONVERT_EXPR, vectype,
7694 					   vec_init));
7695 	  vec_init = gimple_assign_lhs (new_stmt);
7696 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7697 						 new_stmt);
7698 	  gcc_assert (!new_bb);
7699 	  loop_vinfo->add_stmt (new_stmt);
7700 	}
7701     }
7702   else
7703     {
7704       /* iv_loop is the loop to be vectorized. Create:
7705 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7706       stmts = NULL;
7707       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7708 
7709       unsigned HOST_WIDE_INT const_nunits;
7710       if (nunits.is_constant (&const_nunits))
7711 	{
7712 	  tree_vector_builder elts (step_vectype, const_nunits, 1);
7713 	  elts.quick_push (new_name);
7714 	  for (i = 1; i < const_nunits; i++)
7715 	    {
7716 	      /* Create: new_name_i = new_name + step_expr  */
7717 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7718 				       new_name, step_expr);
7719 	      elts.quick_push (new_name);
7720 	    }
7721 	  /* Create a vector from [new_name_0, new_name_1, ...,
7722 	     new_name_nunits-1]  */
7723 	  vec_init = gimple_build_vector (&stmts, &elts);
7724 	}
7725       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7726 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7727 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7728 				 new_name, step_expr);
7729       else
7730 	{
7731 	  /* Build:
7732 	        [base, base, base, ...]
7733 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7734 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7735 	  gcc_assert (flag_associative_math);
7736 	  tree index = build_index_vector (step_vectype, 0, 1);
7737 	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7738 							new_name);
7739 	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7740 							step_expr);
7741 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7742 	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7743 				   vec_init, step_vec);
7744 	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7745 				   vec_init, base_vec);
7746 	}
7747       vec_init = gimple_convert (&stmts, vectype, vec_init);
7748 
7749       if (stmts)
7750 	{
7751 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7752 	  gcc_assert (!new_bb);
7753 	}
7754     }
7755 
7756 
7757   /* Create the vector that holds the step of the induction.  */
7758   if (nested_in_vect_loop)
7759     /* iv_loop is nested in the loop to be vectorized. Generate:
7760        vec_step = [S, S, S, S]  */
7761     new_name = step_expr;
7762   else
7763     {
7764       /* iv_loop is the loop to be vectorized. Generate:
7765 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7766       gimple_seq seq = NULL;
7767       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7768 	{
7769 	  expr = build_int_cst (integer_type_node, vf);
7770 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7771 	}
7772       else
7773 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7774       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7775 			       expr, step_expr);
7776       if (seq)
7777 	{
7778 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7779 	  gcc_assert (!new_bb);
7780 	}
7781     }
7782 
7783   t = unshare_expr (new_name);
7784   gcc_assert (CONSTANT_CLASS_P (new_name)
7785 	      || TREE_CODE (new_name) == SSA_NAME);
7786   new_vec = build_vector_from_val (step_vectype, t);
7787   vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7788 
7789 
7790   /* Create the following def-use cycle:
7791      loop prolog:
7792          vec_init = ...
7793 	 vec_step = ...
7794      loop:
7795          vec_iv = PHI <vec_init, vec_loop>
7796          ...
7797          STMT
7798          ...
7799          vec_loop = vec_iv + vec_step;  */
7800 
7801   /* Create the induction-phi that defines the induction-operand.  */
7802   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7803   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7804   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7805   induc_def = PHI_RESULT (induction_phi);
7806 
7807   /* Create the iv update inside the loop  */
7808   stmts = NULL;
7809   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7810   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7811   vec_def = gimple_convert (&stmts, vectype, vec_def);
7812   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7813   new_stmt = SSA_NAME_DEF_STMT (vec_def);
7814   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7815 
7816   /* Set the arguments of the phi node:  */
7817   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7818   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7819 	       UNKNOWN_LOCATION);
7820 
7821   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7822 
7823   /* In case that vectorization factor (VF) is bigger than the number
7824      of elements that we can fit in a vectype (nunits), we have to generate
7825      more than one vector stmt - i.e - we need to "unroll" the
7826      vector stmt by a factor VF/nunits.  For more details see documentation
7827      in vectorizable_operation.  */
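  /* Rough example: with VF = 8 and 4-element vectors we need ncopies = 2;
     the second copy is obtained below by adding a step vector of
     {nunits*S, ...} = {4*S, 4*S, 4*S, 4*S} to the first vector IV.  */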
7828 
7829   if (ncopies > 1)
7830     {
7831       gimple_seq seq = NULL;
7832       stmt_vec_info prev_stmt_vinfo;
7833       /* FORNOW. This restriction should be relaxed.  */
7834       gcc_assert (!nested_in_vect_loop);
7835 
7836       /* Create the vector that holds the step of the induction.  */
7837       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7838 	{
7839 	  expr = build_int_cst (integer_type_node, nunits);
7840 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7841 	}
7842       else
7843 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7844       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7845 			       expr, step_expr);
7846       if (seq)
7847 	{
7848 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7849 	  gcc_assert (!new_bb);
7850 	}
7851 
7852       t = unshare_expr (new_name);
7853       gcc_assert (CONSTANT_CLASS_P (new_name)
7854 		  || TREE_CODE (new_name) == SSA_NAME);
7855       new_vec = build_vector_from_val (step_vectype, t);
7856       vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7857 
7858       vec_def = induc_def;
7859       prev_stmt_vinfo = induction_phi_info;
7860       for (i = 1; i < ncopies; i++)
7861 	{
7862 	  /* vec_i = vec_prev + vec_step  */
7863 	  gimple_seq stmts = NULL;
7864 	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7865 	  vec_def = gimple_build (&stmts,
7866 				  PLUS_EXPR, step_vectype, vec_def, vec_step);
7867 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
7868 
7869 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7870 	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
7871 	  new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7872 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7873 	  prev_stmt_vinfo = new_stmt_info;
7874 	}
7875     }
7876 
7877   if (nested_in_vect_loop)
7878     {
7879       /* Find the loop-closed exit-phi of the induction, and record
7880          the final vector of induction results:  */
7881       exit_phi = NULL;
7882       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7883         {
7884 	  gimple *use_stmt = USE_STMT (use_p);
7885 	  if (is_gimple_debug (use_stmt))
7886 	    continue;
7887 
7888 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7889 	    {
7890 	      exit_phi = use_stmt;
7891 	      break;
7892 	    }
7893         }
7894       if (exit_phi)
7895 	{
7896 	  stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7897 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
7898 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
7899 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7900 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
7901 
7902 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7903 	  if (dump_enabled_p ())
7904 	    dump_printf_loc (MSG_NOTE, vect_location,
7905 			     "vector of inductions after inner-loop:%G",
7906 			     new_stmt);
7907 	}
7908     }
7909 
7910 
7911   if (dump_enabled_p ())
7912     dump_printf_loc (MSG_NOTE, vect_location,
7913 		     "transform induction: created def-use cycle: %G%G",
7914 		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
7915 
7916   return true;
7917 }
7918 
7919 /* Function vectorizable_live_operation.
7920 
7921    STMT_INFO computes a value that is used outside the loop.  Check if
7922    it can be supported.  */
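/* A rough example of a "live" value:

        for (i = 0; i < N; i++)
          last = a[i];
        use (last);

   After vectorization the scalar result is recovered by extracting the
   final lane of the last vector of a[i] values (or via EXTRACT_LAST when
   the loop is fully masked), as done in the transform code below.  */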
7923 
7924 bool
7925 vectorizable_live_operation (stmt_vec_info stmt_info,
7926 			     gimple_stmt_iterator *gsi,
7927 			     slp_tree slp_node, slp_instance slp_node_instance,
7928 			     int slp_index, bool vec_stmt_p,
7929 			     stmt_vector_for_cost *)
7930 {
7931   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7932   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7933   imm_use_iterator imm_iter;
7934   tree lhs, lhs_type, bitsize, vec_bitsize;
7935   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7936   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7937   int ncopies;
7938   gimple *use_stmt;
7939   auto_vec<tree> vec_oprnds;
7940   int vec_entry = 0;
7941   poly_uint64 vec_index = 0;
7942 
7943   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7944 
7945   /* Due to how we generate code for SLP_TREE_TWO_OPERATORS we cannot
7946      vectorize live operations out of it.  */
7947   if (slp_node && SLP_TREE_TWO_OPERATORS (slp_node))
7948     return false;
7949 
7950   /* If a stmt of a reduction is live, vectorize it via
7951      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
7952      validity so just trigger the transform here.  */
7953   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7954     {
7955       if (!vec_stmt_p)
7956 	return true;
7957       if (slp_node)
7958 	{
7959 	  /* For reduction chains the meta-info is attached to
7960 	     the group leader.  */
7961 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7962 	    stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7963 	  /* For SLP reductions we vectorize the epilogue for
7964 	     all involved stmts together.  */
7965 	  else if (slp_index != 0)
7966 	    return true;
7967 	}
7968       stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7969       gcc_assert (reduc_info->is_reduc_info);
7970       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7971 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7972 	return true;
7973       vect_create_epilog_for_reduction (stmt_info, slp_node,
7974 					slp_node_instance);
7975       return true;
7976     }
7977 
7978   /* FORNOW.  CHECKME.  */
7979   if (nested_in_vect_loop_p (loop, stmt_info))
7980     return false;
7981 
7982   /* If STMT is not relevant and it is a simple assignment and its inputs are
7983      invariant then it can remain in place, unvectorized.  The original last
7984      scalar value that it computes will be used.  */
7985   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7986     {
7987       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7988       if (dump_enabled_p ())
7989 	dump_printf_loc (MSG_NOTE, vect_location,
7990 			 "statement is simple and uses invariant.  Leaving in "
7991 			 "place.\n");
7992       return true;
7993     }
7994 
7995   if (slp_node)
7996     ncopies = 1;
7997   else
7998     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7999 
8000   if (slp_node)
8001     {
8002       gcc_assert (slp_index >= 0);
8003 
8004       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8005       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8006 
8007       /* Get the last occurrence of the scalar index from the concatenation of
8008 	 all the slp vectors. Calculate which slp vector it is and the index
8009 	 within.  */
8010       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8011 
8012       /* Calculate which vector contains the result, and which lane of
8013 	 that vector we need.  */
8014       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8015 	{
8016 	  if (dump_enabled_p ())
8017 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8018 			     "Cannot determine which vector holds the"
8019 			     " final result.\n");
8020 	  return false;
8021 	}
8022     }
8023 
8024   if (!vec_stmt_p)
8025     {
8026       /* No transformation required.  */
8027       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8028 	{
8029 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8030 					       OPTIMIZE_FOR_SPEED))
8031 	    {
8032 	      if (dump_enabled_p ())
8033 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8034 				 "can't use a fully-masked loop because "
8035 				 "the target doesn't support extract last "
8036 				 "reduction.\n");
8037 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8038 	    }
8039 	  else if (slp_node)
8040 	    {
8041 	      if (dump_enabled_p ())
8042 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8043 				 "can't use a fully-masked loop because an "
8044 				 "SLP statement is live after the loop.\n");
8045 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8046 	    }
8047 	  else if (ncopies > 1)
8048 	    {
8049 	      if (dump_enabled_p ())
8050 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051 				 "can't use a fully-masked loop because"
8052 				 " ncopies is greater than 1.\n");
8053 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8054 	    }
8055 	  else
8056 	    {
8057 	      gcc_assert (ncopies == 1 && !slp_node);
8058 	      vect_record_loop_mask (loop_vinfo,
8059 				     &LOOP_VINFO_MASKS (loop_vinfo),
8060 				     1, vectype, NULL);
8061 	    }
8062 	}
8063       return true;
8064     }
8065 
8066   /* Use the lhs of the original scalar statement.  */
8067   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8068 
8069   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8070 	: gimple_get_lhs (stmt);
8071   lhs_type = TREE_TYPE (lhs);
8072 
8073   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8074 	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8075 	     : TYPE_SIZE (TREE_TYPE (vectype)));
8076   vec_bitsize = TYPE_SIZE (vectype);
8077 
8078   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8079   tree vec_lhs, bitstart;
8080   if (slp_node)
8081     {
8082       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8083 
8084       /* Get the correct slp vectorized stmt.  */
8085       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8086       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8087 	vec_lhs = gimple_phi_result (phi);
8088       else
8089 	vec_lhs = gimple_get_lhs (vec_stmt);
8090 
8091       /* Get entry to use.  */
8092       bitstart = bitsize_int (vec_index);
8093       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8094     }
8095   else
8096     {
8097       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8098       vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8099       gcc_checking_assert (ncopies == 1
8100 			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8101 
8102       /* For multiple copies, get the last copy.  */
8103       for (int i = 1; i < ncopies; ++i)
8104 	vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8105 
8106       /* Get the last lane in the vector.  */
8107       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8108     }
8109 
8110   /* To ensure that VEC_LHS, used by the lane extraction stmts, satisfies the
8111      loop-closed PHI requirement, insert one PHI node for it.  It looks like:
8112 	 loop;
8113        BB:
8114 	 # lhs' = PHI <lhs>
8115      ==>
8116 	 loop;
8117        BB:
8118 	 # vec_lhs' = PHI <vec_lhs>
8119 	 new_tree = lane_extract <vec_lhs', ...>;
8120 	 lhs' = new_tree;  */
8121 
8122   basic_block exit_bb = single_exit (loop)->dest;
8123   gcc_assert (single_pred_p (exit_bb));
8124 
8125   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8126   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8127   SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8128 
8129   gimple_seq stmts = NULL;
8130   tree new_tree;
8131   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8132     {
8133       /* Emit:
8134 
8135 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8136 
8137 	 where VEC_LHS is the vectorized live-out result and MASK is
8138 	 the loop mask for the final iteration.  */
8139       gcc_assert (ncopies == 1 && !slp_node);
8140       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8141       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8142 				      vectype, 0);
8143       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8144 				      mask, vec_lhs_phi);
8145 
8146       /* Convert the extracted vector element to the required scalar type.  */
8147       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8148     }
8149   else
8150     {
8151       tree bftype = TREE_TYPE (vectype);
8152       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8153 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8154       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8155       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8156 				       &stmts, true, NULL_TREE);
8157     }
8158 
8159   if (stmts)
8160     {
8161       gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8162       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8163 
8164       /* Remove existing phi from lhs and create one copy from new_tree.  */
8165       tree lhs_phi = NULL_TREE;
8166       gimple_stmt_iterator gsi;
8167       for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8168 	{
8169 	  gimple *phi = gsi_stmt (gsi);
8170 	  if (gimple_phi_arg_def (phi, 0) == lhs)
8171 	    {
8172 	      remove_phi_node (&gsi, false);
8173 	      lhs_phi = gimple_phi_result (phi);
8174 	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8175 	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8176 	      break;
8177 	    }
8178 	}
8179     }
8180 
8181   /* Replace use of lhs with newly computed result.  If the use stmt is a
8182      single arg PHI, just replace all uses of PHI result.  It's necessary
8183      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8184   use_operand_p use_p;
8185   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8186     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8187 	&& !is_gimple_debug (use_stmt))
8188     {
8189       if (gimple_code (use_stmt) == GIMPLE_PHI
8190 	  && gimple_phi_num_args (use_stmt) == 1)
8191 	{
8192 	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8193 	}
8194       else
8195 	{
8196 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8197 	    SET_USE (use_p, new_tree);
8198 	}
8199       update_stmt (use_stmt);
8200     }
8201 
8202   return true;
8203 }
8204 
8205 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8206 
8207 static void
8208 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8209 {
8210   ssa_op_iter op_iter;
8211   imm_use_iterator imm_iter;
8212   def_operand_p def_p;
8213   gimple *ustmt;
8214 
8215   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8216     {
8217       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8218 	{
8219 	  basic_block bb;
8220 
8221 	  if (!is_gimple_debug (ustmt))
8222 	    continue;
8223 
8224 	  bb = gimple_bb (ustmt);
8225 
8226 	  if (!flow_bb_inside_loop_p (loop, bb))
8227 	    {
8228 	      if (gimple_debug_bind_p (ustmt))
8229 		{
8230 		  if (dump_enabled_p ())
8231 		    dump_printf_loc (MSG_NOTE, vect_location,
8232                                      "killing debug use\n");
8233 
8234 		  gimple_debug_bind_reset_value (ustmt);
8235 		  update_stmt (ustmt);
8236 		}
8237 	      else
8238 		gcc_unreachable ();
8239 	    }
8240 	}
8241     }
8242 }
8243 
8244 /* Given loop represented by LOOP_VINFO, return true if computation of
8245    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8246    otherwise.  */
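/* For example, if the latch executes UINT_MAX times and the IV type is a
   32-bit unsigned int, then NITERSM1 == UINT_MAX but NITERS would wrap
   around to 0, and this function is expected to return false.  */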
8247 
8248 static bool
8249 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8250 {
8251   /* Constant case.  */
8252   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8253     {
8254       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8255       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8256 
8257       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8258       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8259       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8260 	return true;
8261     }
8262 
8263   widest_int max;
8264   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8265   /* Check the upper bound of loop niters.  */
8266   if (get_max_loop_iterations (loop, &max))
8267     {
8268       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8269       signop sgn = TYPE_SIGN (type);
8270       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8271       if (max < type_max)
8272 	return true;
8273     }
8274   return false;
8275 }
8276 
8277 /* Return a mask type with half the number of elements as OLD_TYPE,
8278    given that it should have mode NEW_MODE.  */
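/* For instance (hypothetical target modes), a 16-element boolean mask type
   would be narrowed to an 8-element mask type in NEW_MODE; the companion
   function below doubles the element count instead.  */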
8279 
8280 tree
8281 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8282 {
8283   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8284   return build_truth_vector_type_for_mode (nunits, new_mode);
8285 }
8286 
8287 /* Return a mask type with twice as many elements as OLD_TYPE,
8288    given that it should have mode NEW_MODE.  */
8289 
8290 tree
8291 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8292 {
8293   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8294   return build_truth_vector_type_for_mode (nunits, new_mode);
8295 }
8296 
8297 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8298    contain a sequence of NVECTORS masks that each control a vector of type
8299    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
8300    these vector masks with the vector version of SCALAR_MASK.  */
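/* Illustrative use (the values are made up): a masked load of a V4SI vector
   in a loop with VF = 4 would call this with NVECTORS = 1 and that vector
   type, recording in the single-vector rgroup that it needs at least one
   scalar per iteration and a mask type matching V4SI.  */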
8301 
8302 void
8303 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8304 		       unsigned int nvectors, tree vectype, tree scalar_mask)
8305 {
8306   gcc_assert (nvectors != 0);
8307   if (masks->length () < nvectors)
8308     masks->safe_grow_cleared (nvectors);
8309   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8310   /* The number of scalars per iteration and the number of vectors are
8311      both compile-time constants.  */
8312   unsigned int nscalars_per_iter
8313     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8314 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8315 
8316   if (scalar_mask)
8317     {
8318       scalar_cond_masked_key cond (scalar_mask, nvectors);
8319       loop_vinfo->scalar_cond_masked_set.add (cond);
8320     }
8321 
8322   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8323     {
8324       rgm->max_nscalars_per_iter = nscalars_per_iter;
8325       rgm->mask_type = truth_type_for (vectype);
8326     }
8327 }
8328 
8329 /* Given a complete set of masks MASKS, extract mask number INDEX
8330    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8331    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8332 
8333    See the comment above vec_loop_masks for more details about the mask
8334    arrangement.  */
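/* For example (sketch), if an rgroup was recorded with NVECTORS = 2 for
   vectors of type VECTYPE, a caller asks here for mask number 0 or 1 of
   that rgroup; when the requested VECTYPE has a different number of
   elements than the recorded mask type, the mask is view-converted as
   described in the body below.  */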
8335 
8336 tree
8337 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8338 		    unsigned int nvectors, tree vectype, unsigned int index)
8339 {
8340   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8341   tree mask_type = rgm->mask_type;
8342 
8343   /* Populate the rgroup's mask array, if this is the first time we've
8344      used it.  */
8345   if (rgm->masks.is_empty ())
8346     {
8347       rgm->masks.safe_grow_cleared (nvectors);
8348       for (unsigned int i = 0; i < nvectors; ++i)
8349 	{
8350 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8351 	  /* Provide a dummy definition until the real one is available.  */
8352 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8353 	  rgm->masks[i] = mask;
8354 	}
8355     }
8356 
8357   tree mask = rgm->masks[index];
8358   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8359 		TYPE_VECTOR_SUBPARTS (vectype)))
8360     {
8361       /* A loop mask for data type X can be reused for data type Y
8362 	 if X has N times more elements than Y and if Y's elements
8363 	 are N times bigger than X's.  In this case each sequence
8364 	 of N elements in the loop mask will be all-zero or all-one.
8365 	 We can then view-convert the mask so that each sequence of
8366 	 N elements is replaced by a single element.  */
8367       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8368 			      TYPE_VECTOR_SUBPARTS (vectype)));
8369       gimple_seq seq = NULL;
8370       mask_type = truth_type_for (vectype);
8371       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8372       if (seq)
8373 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8374     }
8375   return mask;
8376 }
8377 
8378 /* Scale profiling counters by estimation for LOOP which is vectorized
8379    by factor VF.  */
8380 
8381 static void
8382 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8383 {
8384   edge preheader = loop_preheader_edge (loop);
8385   /* Reduce loop iterations by the vectorization factor.  */
8386   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8387   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8388 
8389   if (freq_h.nonzero_p ())
8390     {
8391       profile_probability p;
8392 
8393       /* Avoid dropping loop body profile counter to 0 because of zero count
8394 	 in loop's preheader.  */
8395       if (!(freq_e == profile_count::zero ()))
8396         freq_e = freq_e.force_nonzero ();
8397       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8398       scale_loop_frequencies (loop, p);
8399     }
8400 
8401   edge exit_e = single_exit (loop);
8402   exit_e->probability = profile_probability::always ()
8403 				 .apply_scale (1, new_est_niter + 1);
8404 
8405   edge exit_l = single_pred_edge (loop->latch);
8406   profile_probability prob = exit_l->probability;
8407   exit_l->probability = exit_e->probability.invert ();
8408   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8409     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8410 }
8411 
8412 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
8413    latch edge values originally defined by it.  */
8414 
8415 static void
8416 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8417 				     stmt_vec_info def_stmt_info)
8418 {
8419   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8420   if (!def || TREE_CODE (def) != SSA_NAME)
8421     return;
8422   stmt_vec_info phi_info;
8423   imm_use_iterator iter;
8424   use_operand_p use_p;
8425   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8426     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8427       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8428 	  && (phi_info = loop_vinfo->lookup_stmt (phi))
8429 	  && STMT_VINFO_RELEVANT_P (phi_info)
8430 	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8431 	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8432 	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8433 	{
8434 	  loop_p loop = gimple_bb (phi)->loop_father;
8435 	  edge e = loop_latch_edge (loop);
8436 	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8437 	    {
8438 	      stmt_vec_info phi_vec_info = STMT_VINFO_VEC_STMT (phi_info);
8439 	      stmt_vec_info def_vec_info = STMT_VINFO_VEC_STMT (def_stmt_info);
8440 	      do
8441 		{
8442 		  add_phi_arg (as_a <gphi *> (phi_vec_info->stmt),
8443 			       gimple_get_lhs (def_vec_info->stmt), e,
8444 			       gimple_phi_arg_location (phi, e->dest_idx));
8445 		  phi_vec_info = STMT_VINFO_RELATED_STMT (phi_vec_info);
8446 		  def_vec_info = STMT_VINFO_RELATED_STMT (def_vec_info);
8447 		}
8448 	      while (phi_vec_info);
8449 	      gcc_assert (!def_vec_info);
8450 	    }
8451 	}
8452 }
8453 
8454 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8455    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8456    stmt_vec_info.  */
8457 
8458 static bool
8459 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8460 			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8461 {
8462   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8463   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8464 
8465   if (dump_enabled_p ())
8466     dump_printf_loc (MSG_NOTE, vect_location,
8467 		     "------>vectorizing statement: %G", stmt_info->stmt);
8468 
8469   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8470     vect_loop_kill_debug_uses (loop, stmt_info);
8471 
8472   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8473       && !STMT_VINFO_LIVE_P (stmt_info))
8474     return false;
8475 
8476   if (STMT_VINFO_VECTYPE (stmt_info))
8477     {
8478       poly_uint64 nunits
8479 	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8480       if (!STMT_SLP_TYPE (stmt_info)
8481 	  && maybe_ne (nunits, vf)
8482 	  && dump_enabled_p ())
8483 	/* For SLP VF is set according to unrolling factor, and not
8484 	   to vector size, hence for SLP this print is not valid.  */
8485 	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8486     }
8487 
8488   /* Pure SLP statements have already been vectorized.  We still need
8489      to apply loop vectorization to hybrid SLP statements.  */
8490   if (PURE_SLP_STMT (stmt_info))
8491     return false;
8492 
8493   if (dump_enabled_p ())
8494     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8495 
8496   if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8497     *seen_store = stmt_info;
8498 
8499   return true;
8500 }
8501 
8502 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8503    in the hash_map with their corresponding values.  */
8504 
8505 static tree
8506 find_in_mapping (tree t, void *context)
8507 {
8508   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8509 
8510   tree *value = mapping->get (t);
8511   return value ? *value : t;
8512 }
8513 
8514 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
8515    original loop that has now been vectorized.
8516 
8517    The inits of the data_references need to be advanced with the number of
8518    iterations of the main loop.  This has been computed in vect_do_peeling and
8519    is stored in parameter ADVANCE.  We first restore the data_references
8520    initial offset with the values recorded in ORIG_DRS_INIT.
8521 
8522    Since the loop_vec_info of this EPILOGUE was constructed for the original
8523    loop, its stmt_vec_infos all point to the original statements.  These need
8524    to be updated to point to their corresponding copies as well as the SSA_NAMES
8525    in their PATTERN_DEF_SEQs and RELATED_STMTs.
8526 
8527    The data_reference's connections also need to be updated.  Their
8528    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8529    stmt_vec_infos, their statements need to point to their corresponding copy,
8530    if they are gather loads or scatter stores then their reference needs to be
8531    updated to point to its corresponding copy and finally we set
8532    'base_misaligned' to false as we have already peeled for alignment in the
8533    prologue of the main loop.  */
8534 
8535 static void
8536 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8537 {
8538   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8539   auto_vec<gimple *> stmt_worklist;
8540   hash_map<tree,tree> mapping;
8541   gimple *orig_stmt, *new_stmt;
8542   gimple_stmt_iterator epilogue_gsi;
8543   gphi_iterator epilogue_phi_gsi;
8544   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8545   basic_block *epilogue_bbs = get_loop_body (epilogue);
8546   unsigned i;
8547 
8548   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8549 
8550   /* Advance the data_references by the number of iterations of the previous
8551      loop and its prologue.  */
8552   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8553 
8554 
8555   /* The EPILOGUE loop is a copy of the original loop so they share the same
8556      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
8557      point to the copied statements.  We also create a mapping of all LHSs in
8558      the original loop and all the LHSs in the EPILOGUE and build worklists to
8559      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
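  /* As a purely illustrative sketch (the SSA names below are made up):
     if the original loop contains a statement  _3 = _1 + _2  and its
     epilogue copy contains  _7 = _5 + _6, both share the same gimple UID,
     so we redirect the stmt_vec_info to the copy and record the pair
     (_3, _7) in MAPPING for the later worklist processing.  */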
8560   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8561     {
8562       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8563 	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8564 	{
8565 	  new_stmt = epilogue_phi_gsi.phi ();
8566 
8567 	  gcc_assert (gimple_uid (new_stmt) > 0);
8568 	  stmt_vinfo
8569 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8570 
8571 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8572 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8573 
8574 	  mapping.put (gimple_phi_result (orig_stmt),
8575 		       gimple_phi_result (new_stmt));
8576 	  /* PHI nodes cannot have patterns or related statements.  */
8577 	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8578 		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8579 	}
8580 
8581       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8582 	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8583 	{
8584 	  new_stmt = gsi_stmt (epilogue_gsi);
8585 
8586 	  gcc_assert (gimple_uid (new_stmt) > 0);
8587 	  stmt_vinfo
8588 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8589 
8590 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8591 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8592 
8593 	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
8594 	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8595 
8596 	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8597 	    {
8598 	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8599 	      for (gimple_stmt_iterator gsi = gsi_start (seq);
8600 		   !gsi_end_p (gsi); gsi_next (&gsi))
8601 		stmt_worklist.safe_push (gsi_stmt (gsi));
8602 	    }
8603 
8604 	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8605 	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8606 	    {
8607 	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8608 	      stmt_worklist.safe_push (stmt);
8609 	      /* Set BB such that the assert in
8610 		'get_initial_def_for_reduction' is able to determine that
8611 		the BB of the related stmt is inside this loop.  */
8612 	      gimple_set_bb (stmt,
8613 			     gimple_bb (new_stmt));
8614 	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8615 	      gcc_assert (related_vinfo == NULL
8616 			  || related_vinfo == stmt_vinfo);
8617 	    }
8618 	}
8619     }
8620 
8621   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8622      using the original main loop and thus need to be updated to refer to the
8623      cloned variables used in the epilogue.  */
8624   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8625     {
8626       gimple *stmt = stmt_worklist[i];
8627       tree *new_op;
8628 
8629       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8630 	{
8631 	  tree op = gimple_op (stmt, j);
8632 	  if ((new_op = mapping.get(op)))
8633 	    gimple_set_op (stmt, j, *new_op);
8634 	  else
8635 	    {
8636 	      /* PR92429: The last argument of simplify_replace_tree disables
8637 		 folding when replacing arguments.  This is required as
8638 		 otherwise you might end up with different statements than the
8639 		 ones analyzed in vect_analyze_loop, leading to different
8640 		 vectorization.  */
8641 	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8642 					  &find_in_mapping, &mapping, false);
8643 	      gimple_set_op (stmt, j, op);
8644 	    }
8645 	}
8646     }
8647 
8648   struct data_reference *dr;
8649   vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8650   FOR_EACH_VEC_ELT (datarefs, i, dr)
8651     {
8652       orig_stmt = DR_STMT (dr);
8653       gcc_assert (gimple_uid (orig_stmt) > 0);
8654       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8655       /* Data references for gather loads and scatter stores do not use the
8656 	 updated offset we set using ADVANCE.  Instead we have to make sure the
8657 	 reference in the data reference points to the corresponding copy of
8658 	 the original in the epilogue.  */
8659       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8660 	  == VMAT_GATHER_SCATTER)
8661 	{
8662 	  DR_REF (dr)
8663 	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8664 				     &find_in_mapping, &mapping);
8665 	  DR_BASE_ADDRESS (dr)
8666 	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8667 				     &find_in_mapping, &mapping);
8668 	}
8669       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8670       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8671       /* The vector size of the epilogue is smaller than that of the main loop,
8672 	 so the alignment is either the same or lower.  This means the dr will
8673 	 by definition be aligned.  */
8674       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8675     }
8676 
8677   epilogue_vinfo->shared->datarefs_copy.release ();
8678   epilogue_vinfo->shared->save_datarefs ();
8679 }
8680 
8681 /* Function vect_transform_loop.
8682 
8683    The analysis phase has determined that the loop is vectorizable.
8684    Vectorize the loop: create vectorized stmts to replace the scalar
8685    stmts in the loop, and update the loop exit condition.
8686    Returns the scalar epilogue loop, if any.  */
8687 
8688 class loop *
8689 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8690 {
8691   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8692   class loop *epilogue = NULL;
8693   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8694   int nbbs = loop->num_nodes;
8695   int i;
8696   tree niters_vector = NULL_TREE;
8697   tree step_vector = NULL_TREE;
8698   tree niters_vector_mult_vf = NULL_TREE;
8699   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8700   unsigned int lowest_vf = constant_lower_bound (vf);
8701   gimple *stmt;
8702   bool check_profitability = false;
8703   unsigned int th;
8704 
8705   DUMP_VECT_SCOPE ("vec_transform_loop");
8706 
8707   loop_vinfo->shared->check_datarefs ();
8708 
8709   /* Use the more conservative vectorization threshold.  If the number
8710      of iterations is constant, assume the cost check has been performed
8711      by our caller.  If the threshold makes all loops profitable that
8712      run at least the (estimated) vectorization factor number of times,
8713      checking is pointless, too.  */
8714   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8715   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8716     {
8717       if (dump_enabled_p ())
8718 	dump_printf_loc (MSG_NOTE, vect_location,
8719 			 "Profitability threshold is %d loop iterations.\n",
8720 			 th);
8721       check_profitability = true;
8722     }
8723 
8724   /* Make sure there exists a single-predecessor exit bb.  Do this before
8725      versioning.   */
8726   edge e = single_exit (loop);
8727   if (! single_pred_p (e->dest))
8728     {
8729       split_loop_exit_edge (e, true);
8730       if (dump_enabled_p ())
8731 	dump_printf (MSG_NOTE, "split exit edge\n");
8732     }
8733 
8734   /* Version the loop first, if required, so the profitability check
8735      comes first.  */
8736 
8737   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8738     {
8739       class loop *sloop
8740 	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8741       sloop->force_vectorize = false;
8742       check_profitability = false;
8743     }
8744 
8745   /* Make sure there exists a single-predecessor exit bb also on the
8746      scalar loop copy.  Do this after versioning but before peeling,
8747      so the CFG structure is fine for both the scalar and the if-converted
8748      loop, and slpeel_duplicate_current_defs_from_edges sees matched
8749      loop-closed PHI nodes on the exit.  */
8750   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8751     {
8752       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8753       if (! single_pred_p (e->dest))
8754 	{
8755 	  split_loop_exit_edge (e, true);
8756 	  if (dump_enabled_p ())
8757 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8758 	}
8759     }
8760 
8761   tree niters = vect_build_loop_niters (loop_vinfo);
8762   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8763   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8764   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8765   tree advance;
8766   drs_init_vec orig_drs_init;
8767 
8768   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8769 			      &step_vector, &niters_vector_mult_vf, th,
8770 			      check_profitability, niters_no_overflow,
8771 			      &advance);
8772 
8773   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8774       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8775     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8776 			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8777 
8778   if (niters_vector == NULL_TREE)
8779     {
8780       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8781 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8782 	  && known_eq (lowest_vf, vf))
8783 	{
8784 	  niters_vector
8785 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8786 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8787 	  step_vector = build_one_cst (TREE_TYPE (niters));
8788 	}
8789       else
8790 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8791 				     &step_vector, niters_no_overflow);
8792     }
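
  /* As a purely illustrative example (values made up): if NITERS_VECTOR was
     not already computed by vect_do_peeling, the vector loop is entered with
     103 scalar iterations and the vectorization factor is a compile-time
     constant 4, then the known-NITERS branch above sets NITERS_VECTOR to
     103 / 4 = 25 and STEP_VECTOR to 1; the remaining 3 scalar iterations are
     handled by the epilogue loop.  */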
8793 
8794   /* 1) Make sure the loop header has exactly two entries
8795      2) Make sure we have a preheader basic block.  */
8796 
8797   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8798 
8799   split_edge (loop_preheader_edge (loop));
8800 
8801   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8802       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8803     /* This will deal with any possible peeling.  */
8804     vect_prepare_for_masked_peels (loop_vinfo);
8805 
8806   /* Schedule the SLP instances first, then handle loop vectorization
8807      below.  */
8808   if (!loop_vinfo->slp_instances.is_empty ())
8809     {
8810       DUMP_VECT_SCOPE ("scheduling SLP instances");
8811       vect_schedule_slp (loop_vinfo);
8812     }
8813 
8814   /* FORNOW: the vectorizer supports only loops whose body consists
8815      of one basic block (header + empty latch).  When the vectorizer
8816      supports more involved loop forms, the order in which the BBs are
8817      traversed will need to be reconsidered.  */
8818 
8819   for (i = 0; i < nbbs; i++)
8820     {
8821       basic_block bb = bbs[i];
8822       stmt_vec_info stmt_info;
8823 
8824       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8825 	   gsi_next (&si))
8826 	{
8827 	  gphi *phi = si.phi ();
8828 	  if (dump_enabled_p ())
8829 	    dump_printf_loc (MSG_NOTE, vect_location,
8830 			     "------>vectorizing phi: %G", phi);
8831 	  stmt_info = loop_vinfo->lookup_stmt (phi);
8832 	  if (!stmt_info)
8833 	    continue;
8834 
8835 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8836 	    vect_loop_kill_debug_uses (loop, stmt_info);
8837 
8838 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8839 	      && !STMT_VINFO_LIVE_P (stmt_info))
8840 	    continue;
8841 
8842 	  if (STMT_VINFO_VECTYPE (stmt_info)
8843 	      && (maybe_ne
8844 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8845 	      && dump_enabled_p ())
8846 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8847 
8848 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8849 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8850 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8851 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8852 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8853 	      && ! PURE_SLP_STMT (stmt_info))
8854 	    {
8855 	      if (dump_enabled_p ())
8856 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8857 	      vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8858 	    }
8859 	}
8860 
8861       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8862 	   gsi_next (&si))
8863 	{
8864 	  gphi *phi = si.phi ();
8865 	  stmt_info = loop_vinfo->lookup_stmt (phi);
8866 	  if (!stmt_info)
8867 	    continue;
8868 
8869 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8870 	      && !STMT_VINFO_LIVE_P (stmt_info))
8871 	    continue;
8872 
8873 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8874 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8875 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8876 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8877 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8878 	      && ! PURE_SLP_STMT (stmt_info))
8879 	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
8880 	}
8881 
8882       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8883 	   !gsi_end_p (si);)
8884 	{
8885 	  stmt = gsi_stmt (si);
8886 	  /* During vectorization remove existing clobber stmts.  */
8887 	  if (gimple_clobber_p (stmt))
8888 	    {
8889 	      unlink_stmt_vdef (stmt);
8890 	      gsi_remove (&si, true);
8891 	      release_defs (stmt);
8892 	    }
8893 	  else
8894 	    {
8895 	      stmt_info = loop_vinfo->lookup_stmt (stmt);
8896 
8897 	      /* vector stmts created in the outer-loop during vectorization of
8898 		 stmts in an inner-loop may not have a stmt_info, and do not
8899 		 need to be vectorized.  */
8900 	      stmt_vec_info seen_store = NULL;
8901 	      if (stmt_info)
8902 		{
8903 		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8904 		    {
8905 		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8906 		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8907 			   !gsi_end_p (subsi); gsi_next (&subsi))
8908 			{
8909 			  stmt_vec_info pat_stmt_info
8910 			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8911 			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8912 						    &si, &seen_store);
8913 			}
8914 		      stmt_vec_info pat_stmt_info
8915 			= STMT_VINFO_RELATED_STMT (stmt_info);
8916 		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8917 						    &si, &seen_store))
8918 			maybe_set_vectorized_backedge_value (loop_vinfo,
8919 							     pat_stmt_info);
8920 		    }
8921 		  else
8922 		    {
8923 		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8924 						    &seen_store))
8925 			maybe_set_vectorized_backedge_value (loop_vinfo,
8926 							     stmt_info);
8927 		    }
8928 		}
8929 	      gsi_next (&si);
8930 	      if (seen_store)
8931 		{
8932 		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8933 		    /* Interleaving.  The vectorization of the
8934 		       interleaving chain was completed; free all
8935 		       the stores in the chain.  */
8936 		    vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8937 		  else
8938 		    /* Free the attached stmt_vec_info and remove the stmt.  */
8939 		    loop_vinfo->remove_stmt (stmt_info);
8940 		}
8941 	    }
8942 	}
8943 
8944       /* Stub out scalar statements that must not survive vectorization.
8945 	 Doing this here helps with grouped statements, or statements that
8946 	 are involved in patterns.  */
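      /* As a purely illustrative sketch (SSA names made up): a leftover
	 scalar  _5 = .MASK_LOAD (ptr, align, mask)  is replaced below by
	 _5 = 0, and a leftover scalar conditional call such as
	 _6 = .COND_ADD (cond, a, b, else)  is replaced by  _6 = else.  */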
8947       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8948 	   !gsi_end_p (gsi); gsi_next (&gsi))
8949 	{
8950 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8951 	  if (!call || !gimple_call_internal_p (call))
8952 	    continue;
8953 	  internal_fn ifn = gimple_call_internal_fn (call);
8954 	  if (ifn == IFN_MASK_LOAD)
8955 	    {
8956 	      tree lhs = gimple_get_lhs (call);
8957 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8958 		{
8959 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
8960 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
8961 		  gsi_replace (&gsi, new_stmt, true);
8962 		}
8963 	    }
8964 	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
8965 	    {
8966 	      tree lhs = gimple_get_lhs (call);
8967 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8968 		{
8969 		  tree else_arg
8970 		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
8971 		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
8972 		  gsi_replace (&gsi, new_stmt, true);
8973 		}
8974 	    }
8975 	}
8976     }				/* BBs in loop */
8977 
8978   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8979      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8980   if (integer_onep (step_vector))
8981     niters_no_overflow = true;
8982   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8983 			   niters_vector_mult_vf, !niters_no_overflow);
8984 
8985   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8986   scale_profile_for_vect_loop (loop, assumed_vf);
8987 
8988   /* True if the final iteration might not handle a full vector's
8989      worth of scalar iterations.  */
8990   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8991   /* The minimum number of iterations performed by the epilogue.  This
8992      is 1 when peeling for gaps because we always need a final scalar
8993      iteration.  */
8994   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8995   /* +1 to convert latch counts to loop iteration counts,
8996      -min_epilogue_iters to remove iterations that cannot be performed
8997        by the vector code.  */
8998   int bias_for_lowest = 1 - min_epilogue_iters;
8999   int bias_for_assumed = bias_for_lowest;
9000   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9001   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9002     {
9003       /* When the amount of peeling is known at compile time, the first
9004 	 iteration will have exactly alignment_npeels active elements.
9005 	 In the worst case it will have at least one.  */
9006       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9007       bias_for_lowest += lowest_vf - min_first_active;
9008       bias_for_assumed += assumed_vf - min_first_active;
9009     }
9010   /* In these calculations the "- 1" converts loop iteration counts
9011      back to latch counts.  */
9012   if (loop->any_upper_bound)
9013     loop->nb_iterations_upper_bound
9014       = (final_iter_may_be_partial
9015 	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9016 			  lowest_vf) - 1
9017 	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9018 			   lowest_vf) - 1);
9019   if (loop->any_likely_upper_bound)
9020     loop->nb_iterations_likely_upper_bound
9021       = (final_iter_may_be_partial
9022 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9023 			  + bias_for_lowest, lowest_vf) - 1
9024 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9025 			   + bias_for_lowest, lowest_vf) - 1);
9026   if (loop->any_estimate)
9027     loop->nb_iterations_estimate
9028       = (final_iter_may_be_partial
9029 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9030 			  assumed_vf) - 1
9031 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9032 			   assumed_vf) - 1);
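
  /* As a purely illustrative example (values made up): for a scalar loop
     with nb_iterations_upper_bound == 102 (at most 103 iterations), a
     constant VF of 4, no peeling for gaps and no partially-filled final
     vector iteration, bias_for_lowest is 1 and the new upper bound is
     (102 + 1) / 4 - 1 == 24 latch iterations, i.e. at most 25 vector
     iterations.  */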
9033 
9034   if (dump_enabled_p ())
9035     {
9036       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9037 	{
9038 	  dump_printf_loc (MSG_NOTE, vect_location,
9039 			   "LOOP VECTORIZED\n");
9040 	  if (loop->inner)
9041 	    dump_printf_loc (MSG_NOTE, vect_location,
9042 			     "OUTER LOOP VECTORIZED\n");
9043 	  dump_printf (MSG_NOTE, "\n");
9044 	}
9045       else
9046 	dump_printf_loc (MSG_NOTE, vect_location,
9047 			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9048 			 GET_MODE_NAME (loop_vinfo->vector_mode));
9049     }
9050 
9051   /* Loops vectorized with a variable factor won't benefit from
9052      unrolling/peeling.  */
9053   if (!vf.is_constant ())
9054     {
9055       loop->unroll = 1;
9056       if (dump_enabled_p ())
9057 	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9058 			 " variable-length vectorization factor\n");
9059     }
9060   /* Free SLP instances here because otherwise stmt reference counting
9061      won't work.  */
9062   slp_instance instance;
9063   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9064     vect_free_slp_instance (instance, true);
9065   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9066   /* Clear the safelen field since its value is no longer valid after
9067      vectorization: the vectorized loop can have loop-carried dependencies.  */
9068   loop->safelen = 0;
9069 
9070   if (epilogue)
9071     {
9072       update_epilogue_loop_vinfo (epilogue, advance);
9073 
9074       epilogue->simduid = loop->simduid;
9075       epilogue->force_vectorize = loop->force_vectorize;
9076       epilogue->dont_vectorize = false;
9077     }
9078 
9079   return epilogue;
9080 }
9081 
9082 /* The code below tries to perform a simple optimization: revert
9083    if-conversion for masked stores, i.e. if the mask of a store is zero, do
9084    not perform the store and, if possible, also skip the stored value producers.
9085    For example,
9086      for (i=0; i<n; i++)
9087        if (c[i])
9088 	{
9089 	  p1[i] += 1;
9090 	  p2[i] = p3[i] +2;
9091 	}
9092    this transformation will produce the following semi-hammock:
9093 
9094    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9095      {
9096        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9097        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9098        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9099        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9100        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9101        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9102      }
9103 */
9104 
9105 void
9106 optimize_mask_stores (class loop *loop)
9107 {
9108   basic_block *bbs = get_loop_body (loop);
9109   unsigned nbbs = loop->num_nodes;
9110   unsigned i;
9111   basic_block bb;
9112   class loop *bb_loop;
9113   gimple_stmt_iterator gsi;
9114   gimple *stmt;
9115   auto_vec<gimple *> worklist;
9116   auto_purge_vect_location sentinel;
9117 
9118   vect_location = find_loop_location (loop);
9119   /* Pick up all masked stores in loop if any.  */
9120   for (i = 0; i < nbbs; i++)
9121     {
9122       bb = bbs[i];
9123       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9124 	   gsi_next (&gsi))
9125 	{
9126 	  stmt = gsi_stmt (gsi);
9127 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9128 	    worklist.safe_push (stmt);
9129 	}
9130     }
9131 
9132   free (bbs);
9133   if (worklist.is_empty ())
9134     return;
9135 
9136   /* Loop has masked stores.  */
9137   while (!worklist.is_empty ())
9138     {
9139       gimple *last, *last_store;
9140       edge e, efalse;
9141       tree mask;
9142       basic_block store_bb, join_bb;
9143       gimple_stmt_iterator gsi_to;
9144       tree vdef, new_vdef;
9145       gphi *phi;
9146       tree vectype;
9147       tree zero;
9148 
9149       last = worklist.pop ();
9150       mask = gimple_call_arg (last, 2);
9151       bb = gimple_bb (last);
9152       /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9153 	 to the same loop as if_bb.  That loop could be different from LOOP
9154 	 when a two-level loop nest is vectorized and the mask_store belongs
9155 	 to the inner loop.  */
9156       e = split_block (bb, last);
9157       bb_loop = bb->loop_father;
9158       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9159       join_bb = e->dest;
9160       store_bb = create_empty_bb (bb);
9161       add_bb_to_loop (store_bb, bb_loop);
9162       e->flags = EDGE_TRUE_VALUE;
9163       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9164       /* Put STORE_BB on the unlikely path.  */
9165       efalse->probability = profile_probability::unlikely ();
9166       store_bb->count = efalse->count ();
9167       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9168       if (dom_info_available_p (CDI_DOMINATORS))
9169 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9170       if (dump_enabled_p ())
9171 	dump_printf_loc (MSG_NOTE, vect_location,
9172 			 "Create new block %d to sink mask stores.",
9173 			 store_bb->index);
9174       /* Create vector comparison with boolean result.  */
9175       vectype = TREE_TYPE (mask);
9176       zero = build_zero_cst (vectype);
9177       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9178       gsi = gsi_last_bb (bb);
9179       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9180       /* Create new PHI node for vdef of the last masked store:
9181 	 .MEM_2 = VDEF <.MEM_1>
9182 	 will be converted to
9183 	 .MEM.3 = VDEF <.MEM_1>
9184 	 and new PHI node will be created in join bb
9185 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
9186       */
9187       vdef = gimple_vdef (last);
9188       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9189       gimple_set_vdef (last, new_vdef);
9190       phi = create_phi_node (vdef, join_bb);
9191       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9192 
9193       /* Move all masked stores with the same mask to STORE_BB if possible.  */
9194       while (true)
9195 	{
9196 	  gimple_stmt_iterator gsi_from;
9197 	  gimple *stmt1 = NULL;
9198 
9199 	  /* Move masked store to STORE_BB.  */
9200 	  last_store = last;
9201 	  gsi = gsi_for_stmt (last);
9202 	  gsi_from = gsi;
9203 	  /* Shift GSI to the previous stmt for further traversal.  */
9204 	  gsi_prev (&gsi);
9205 	  gsi_to = gsi_start_bb (store_bb);
9206 	  gsi_move_before (&gsi_from, &gsi_to);
9207 	  /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9208 	  gsi_to = gsi_start_bb (store_bb);
9209 	  if (dump_enabled_p ())
9210 	    dump_printf_loc (MSG_NOTE, vect_location,
9211 			     "Move stmt to created bb\n%G", last);
9212 	  /* Move all stored value producers if possible.  */
9213 	  while (!gsi_end_p (gsi))
9214 	    {
9215 	      tree lhs;
9216 	      imm_use_iterator imm_iter;
9217 	      use_operand_p use_p;
9218 	      bool res;
9219 
9220 	      /* Skip debug statements.  */
9221 	      if (is_gimple_debug (gsi_stmt (gsi)))
9222 		{
9223 		  gsi_prev (&gsi);
9224 		  continue;
9225 		}
9226 	      stmt1 = gsi_stmt (gsi);
9227 	      /* Do not consider statements writing to memory or having
9228 		 a volatile operand.  */
9229 	      if (gimple_vdef (stmt1)
9230 		  || gimple_has_volatile_ops (stmt1))
9231 		break;
9232 	      gsi_from = gsi;
9233 	      gsi_prev (&gsi);
9234 	      lhs = gimple_get_lhs (stmt1);
9235 	      if (!lhs)
9236 		break;
9237 
9238 	      /* LHS of vectorized stmt must be SSA_NAME.  */
9239 	      if (TREE_CODE (lhs) != SSA_NAME)
9240 		break;
9241 
9242 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9243 		{
9244 		  /* Remove dead scalar statement.  */
9245 		  if (has_zero_uses (lhs))
9246 		    {
9247 		      gsi_remove (&gsi_from, true);
9248 		      continue;
9249 		    }
9250 		}
9251 
9252 	      /* Check that LHS does not have uses outside of STORE_BB.  */
9253 	      res = true;
9254 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9255 		{
9256 		  gimple *use_stmt;
9257 		  use_stmt = USE_STMT (use_p);
9258 		  if (is_gimple_debug (use_stmt))
9259 		    continue;
9260 		  if (gimple_bb (use_stmt) != store_bb)
9261 		    {
9262 		      res = false;
9263 		      break;
9264 		    }
9265 		}
9266 	      if (!res)
9267 		break;
9268 
9269 	      if (gimple_vuse (stmt1)
9270 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
9271 		break;
9272 
9273 	      /* Can move STMT1 to STORE_BB.  */
9274 	      if (dump_enabled_p ())
9275 		dump_printf_loc (MSG_NOTE, vect_location,
9276 				 "Move stmt to created bb\n%G", stmt1);
9277 	      gsi_move_before (&gsi_from, &gsi_to);
9278 	      /* Shift GSI_TO for further insertion.  */
9279 	      gsi_prev (&gsi_to);
9280 	    }
9281 	  /* Move other masked stores with the same mask to STORE_BB.  */
9282 	  if (worklist.is_empty ()
9283 	      || gimple_call_arg (worklist.last (), 2) != mask
9284 	      || worklist.last () != stmt1)
9285 	    break;
9286 	  last = worklist.pop ();
9287 	}
9288       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9289     }
9290 }
9291 
9292 /* Decide whether it is possible to use a zero-based induction variable
9293    when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
9294    return the value that the induction variable must be able to hold
9295    in order to ensure that the loop ends with an all-false mask.
9296    Return -1 otherwise.  */
9297 widest_int
9298 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9299 {
9300   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9301   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9302   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9303 
9304   /* Calculate the value that the induction variable must be able
9305      to hit in order to ensure that we end the loop with an all-false mask.
9306      This involves adding the maximum number of inactive trailing scalar
9307      iterations.  */
9308   widest_int iv_limit = -1;
9309   if (max_loop_iterations (loop, &iv_limit))
9310     {
9311       if (niters_skip)
9312 	{
9313 	  /* Add the maximum number of skipped iterations to the
9314 	     maximum iteration count.  */
9315 	  if (TREE_CODE (niters_skip) == INTEGER_CST)
9316 	    iv_limit += wi::to_widest (niters_skip);
9317 	  else
9318 	    iv_limit += max_vf - 1;
9319 	}
9320       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9321 	/* Make a conservatively-correct assumption.  */
9322 	iv_limit += max_vf - 1;
9323 
9324       /* IV_LIMIT is the maximum number of latch iterations, which is also
9325 	 the maximum in-range IV value.  Round this value down to the previous
9326 	 vector alignment boundary and then add an extra full iteration.  */
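      /* As a purely illustrative example (values made up): with a maximum
	 latch count of 102, VF == max_vf == 4 and no skipped or peeled
	 iterations, IV_LIMIT is rounded down from 102 to 100 and then gets
	 one extra full iteration, giving 104.  */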
9327       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9328       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9329     }
9330   return iv_limit;
9331 }
9332 
9333