1 /* Loop Vectorization
2    Copyright (C) 2003-2020 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56 
57 #define vec_step vec_step_
58 
59 /* Loop Vectorization Pass.
60 
61    This pass tries to vectorize loops.
62 
63    For example, the vectorizer transforms the following simple loop:
64 
65         short a[N]; short b[N]; short c[N]; int i;
66 
67         for (i=0; i<N; i++){
68           a[i] = b[i] + c[i];
69         }
70 
71    as if it were manually vectorized by rewriting the source code into:
72 
73         typedef int __attribute__((mode(V8HI))) v8hi;
74         short a[N];  short b[N]; short c[N];   int i;
75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76         v8hi va, vb, vc;
77 
78         for (i=0; i<N/8; i++){
79           vb = pb[i];
80           vc = pc[i];
81           va = vb + vc;
82           pa[i] = va;
83         }
84 
85         The main entry to this pass is vectorize_loops(), in which
86    the vectorizer applies a set of analyses on a given set of loops,
87    followed by the actual vectorization transformation for the loops that
88    had successfully passed the analysis phase.
89         Throughout this pass we make a distinction between two types of
90    data: scalars (which are represented by SSA_NAMES), and memory references
91    ("data-refs").  These two types of data require different handling both
92    during analysis and transformation. The types of data-refs that the
93    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95    accesses are required to have a simple (consecutive) access pattern.
96 
97    Analysis phase:
98    ===============
99         The driver for the analysis phase is vect_analyze_loop().
100    It applies a set of analyses, some of which rely on the scalar evolution
101    analyzer (scev) developed by Sebastian Pop.
102 
103         During the analysis phase the vectorizer records some information
104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105    loop, as well as general information about the loop as a whole, which is
106    recorded in a "loop_vec_info" struct attached to each loop.
107 
108    Transformation phase:
109    =====================
110         The loop transformation phase scans all the stmts in the loop, and
111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112    the loop that needs to be vectorized.  It inserts the vector code sequence
113    just before the scalar stmt S, and records a pointer to the vector code
114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115    attached to S).  This pointer will be used for the vectorization of following
116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117    otherwise, we rely on dead code elimination for removing it.
118 
119         For example, say stmt S1 was vectorized into stmt VS1:
120 
121    VS1: vb = px[i];
122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123    S2:  a = b;
124 
125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
128    resulting sequence would be:
129 
130    VS1: vb = px[i];
131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132    VS2: va = vb;
133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 
135         Operands that are not SSA_NAMEs are data-refs that appear in
136    load/store operations (like 'x[i]' in S1), and are handled differently.
137 
138    Target modeling:
139    =================
140         Currently the only target specific information that is used is the
141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142    Targets that can support different vector sizes will, for now, need
143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
144    flexibility will be added in the future.
145 
146         Since we only vectorize operations whose vector form can be
147    expressed using existing tree codes, to verify that an operation is
148    supported, the vectorizer checks the relevant optab at the relevant
149    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
150    the value found is CODE_FOR_nothing, then there's no target support, and
151    we can't vectorize the stmt.
152 
153    For additional information on this project see:
154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
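
/* For illustration only -- a sketch of the target-support query described
   under "Target modeling" above, not code taken from this pass:

	if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
	  ... no target support, the addition cannot be vectorized ...

   where add_optab and V8HImode merely stand for the operation and the
   candidate vector mode of the statement being analyzed.  */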
156 
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
159 					       bool *, bool *);
160 
161 /* Subroutine of vect_determine_vf_for_stmt that handles only one
162    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
163    may already be set for general statements (not just data refs).  */
164 
165 static opt_result
166 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
167 			      bool vectype_maybe_set_p,
168 			      poly_uint64 *vf)
169 {
170   gimple *stmt = stmt_info->stmt;
171 
172   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173        && !STMT_VINFO_LIVE_P (stmt_info))
174       || gimple_clobber_p (stmt))
175     {
176       if (dump_enabled_p ())
177 	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178       return opt_result::success ();
179     }
180 
181   tree stmt_vectype, nunits_vectype;
182   opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 						   &nunits_vectype);
184   if (!res)
185     return res;
186 
187   if (stmt_vectype)
188     {
189       if (STMT_VINFO_VECTYPE (stmt_info))
190 	/* The only case when a vectype has already been set is for stmts
191 	   that contain a data ref, or for "pattern-stmts" (stmts generated
192 	   by the vectorizer to represent/replace a certain idiom).  */
193 	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 		     || vectype_maybe_set_p)
195 		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196       else
197 	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198     }
199 
200   if (nunits_vectype)
201     vect_update_max_nunits (vf, nunits_vectype);
202 
203   return opt_result::success ();
204 }
205 
206 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
207    types of STMT_INFO and all attached pattern statements and update
208    the vectorization factor VF accordingly.  Return true on success
209    or false if something prevented vectorization.  */
210 
211 static opt_result
212 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214   vec_info *vinfo = stmt_info->vinfo;
215   if (dump_enabled_p ())
216     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 		     stmt_info->stmt);
218   opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
219   if (!res)
220     return res;
221 
222   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223       && STMT_VINFO_RELATED_STMT (stmt_info))
224     {
225       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 
228       /* If a pattern statement has def stmts, analyze them too.  */
229       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 	   !gsi_end_p (si); gsi_next (&si))
231 	{
232 	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 	  if (dump_enabled_p ())
234 	    dump_printf_loc (MSG_NOTE, vect_location,
235 			     "==> examining pattern def stmt: %G",
236 			     def_stmt_info->stmt);
237 	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
238 	  if (!res)
239 	    return res;
240 	}
241 
242       if (dump_enabled_p ())
243 	dump_printf_loc (MSG_NOTE, vect_location,
244 			 "==> examining pattern statement: %G",
245 			 stmt_info->stmt);
246       res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
247       if (!res)
248 	return res;
249     }
250 
251   return opt_result::success ();
252 }
253 
254 /* Function vect_determine_vectorization_factor
255 
256    Determine the vectorization factor (VF).  VF is the number of data elements
257    that are operated upon in parallel in a single iteration of the vectorized
258    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
259    on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
260    elements can fit in a single vector register.
261 
262    We currently support vectorization of loops in which all types operated upon
263    are of the same size.  Therefore this function currently sets VF according to
264    the size of the types operated upon, and fails if there are multiple sizes
265    in the loop.
266 
267    VF is also the factor by which the loop iterations are strip-mined, e.g.:
268    original loop:
269         for (i=0; i<N; i++){
270           a[i] = b[i] + c[i];
271         }
272 
273    vectorized loop:
274         for (i=0; i<N; i+=VF){
275           a[i:VF] = b[i:VF] + c[i:VF];
276         }
277 */
278 
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284   unsigned nbbs = loop->num_nodes;
285   poly_uint64 vectorization_factor = 1;
286   tree scalar_type = NULL_TREE;
287   gphi *phi;
288   tree vectype;
289   stmt_vec_info stmt_info;
290   unsigned i;
291 
292   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 
294   for (i = 0; i < nbbs; i++)
295     {
296       basic_block bb = bbs[i];
297 
298       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 	   gsi_next (&si))
300 	{
301 	  phi = si.phi ();
302 	  stmt_info = loop_vinfo->lookup_stmt (phi);
303 	  if (dump_enabled_p ())
304 	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 			     phi);
306 
307 	  gcc_assert (stmt_info);
308 
309 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
310 	      || STMT_VINFO_LIVE_P (stmt_info))
311             {
312 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313               scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 
315 	      if (dump_enabled_p ())
316 		dump_printf_loc (MSG_NOTE, vect_location,
317 				 "get vectype for scalar type:  %T\n",
318 				 scalar_type);
319 
320 	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 	      if (!vectype)
322 		return opt_result::failure_at (phi,
323 					       "not vectorized: unsupported "
324 					       "data-type %T\n",
325 					       scalar_type);
326 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 
328 	      if (dump_enabled_p ())
329 		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 				 vectype);
331 
332 	      if (dump_enabled_p ())
333 		{
334 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 		  dump_printf (MSG_NOTE, "\n");
337 		}
338 
339 	      vect_update_max_nunits (&vectorization_factor, vectype);
340 	    }
341 	}
342 
343       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 	   gsi_next (&si))
345 	{
346 	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
347 	  opt_result res
348 	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
349 	  if (!res)
350 	    return res;
351         }
352     }
353 
354   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
355   if (dump_enabled_p ())
356     {
357       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
358       dump_dec (MSG_NOTE, vectorization_factor);
359       dump_printf (MSG_NOTE, "\n");
360     }
361 
362   if (known_le (vectorization_factor, 1U))
363     return opt_result::failure_at (vect_location,
364 				   "not vectorized: unsupported data-type\n");
365   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
366   return opt_result::success ();
367 }
368 
369 
370 /* Function vect_is_simple_iv_evolution.
371 
372    FORNOW: A simple evolution of an induction variable in the loop is
373    considered a polynomial evolution.  */
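
/* An illustrative sketch (the loop and numbers are only examples): in

	for (i = 0; i < n; i++)
	  a[i] = 0;

   the scalar evolution of 'i' is the chrec {0, +, 1}_loop, so this function
   returns true with INIT = 0 and STEP = 1.  A variable updated as i = i * 2
   has no such affine evolution and is not treated as a simple IV here.  */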
374 
375 static bool
376 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
377                              tree * step)
378 {
379   tree init_expr;
380   tree step_expr;
381   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
382   basic_block bb;
383 
384   /* When there is no evolution in this loop, the evolution function
385      is not "simple".  */
386   if (evolution_part == NULL_TREE)
387     return false;
388 
389   /* When the evolution is a polynomial of degree >= 2
390      the evolution function is not "simple".  */
391   if (tree_is_chrec (evolution_part))
392     return false;
393 
394   step_expr = evolution_part;
395   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
396 
397   if (dump_enabled_p ())
398     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
399 		     step_expr, init_expr);
400 
401   *init = init_expr;
402   *step = step_expr;
403 
404   if (TREE_CODE (step_expr) != INTEGER_CST
405       && (TREE_CODE (step_expr) != SSA_NAME
406 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
407 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
408 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
409 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
410 		  || !flag_associative_math)))
411       && (TREE_CODE (step_expr) != REAL_CST
412 	  || !flag_associative_math))
413     {
414       if (dump_enabled_p ())
415         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416                          "step unknown.\n");
417       return false;
418     }
419 
420   return true;
421 }
422 
423 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
424    what we are assuming is a double reduction.  For example, given
425    a structure like this:
426 
427       outer1:
428 	x_1 = PHI <x_4(outer2), ...>;
429 	...
430 
431       inner:
432 	x_2 = PHI <x_1(outer1), ...>;
433 	...
434 	x_3 = ...;
435 	...
436 
437       outer2:
438 	x_4 = PHI <x_3(inner)>;
439 	...
440 
441    outer loop analysis would treat x_1 as a double reduction phi and
442    this function would then return true for x_2.  */
443 
444 static bool
445 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
446 {
447   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
448   use_operand_p use_p;
449   ssa_op_iter op_iter;
450   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
451     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
452       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
453 	return true;
454   return false;
455 }
456 
457 /* Function vect_analyze_scalar_cycles_1.
458 
459    Examine the cross iteration def-use cycles of scalar variables
460    in LOOP.  LOOP_VINFO represents the loop that is now being
461    considered for vectorization (can be LOOP, or an outer-loop
462    enclosing LOOP).  */
463 
464 static void
465 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
466 {
467   basic_block bb = loop->header;
468   tree init, step;
469   auto_vec<stmt_vec_info, 64> worklist;
470   gphi_iterator gsi;
471   bool double_reduc, reduc_chain;
472 
473   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
474 
475   /* First - identify all inductions.  Reduction detection assumes that all the
476      inductions have been identified; therefore, this order must not be
477      changed.  */
478   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
479     {
480       gphi *phi = gsi.phi ();
481       tree access_fn = NULL;
482       tree def = PHI_RESULT (phi);
483       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
484 
485       if (dump_enabled_p ())
486 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
487 
488       /* Skip virtual phis.  The data dependences that are associated with
489          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
490       if (virtual_operand_p (def))
491 	continue;
492 
493       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
494 
495       /* Analyze the evolution function.  */
496       access_fn = analyze_scalar_evolution (loop, def);
497       if (access_fn)
498 	{
499 	  STRIP_NOPS (access_fn);
500 	  if (dump_enabled_p ())
501 	    dump_printf_loc (MSG_NOTE, vect_location,
502 			     "Access function of PHI: %T\n", access_fn);
503 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
504 	    = initial_condition_in_loop_num (access_fn, loop->num);
505 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
506 	    = evolution_part_in_loop_num (access_fn, loop->num);
507 	}
508 
509       if (!access_fn
510 	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
511 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
512 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
513 	      && TREE_CODE (step) != INTEGER_CST))
514 	{
515 	  worklist.safe_push (stmt_vinfo);
516 	  continue;
517 	}
518 
519       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
520 		  != NULL_TREE);
521       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
522 
523       if (dump_enabled_p ())
524 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
525       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
526     }
527 
528 
529   /* Second - identify all reductions and nested cycles.  */
530   while (worklist.length () > 0)
531     {
532       stmt_vec_info stmt_vinfo = worklist.pop ();
533       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
534       tree def = PHI_RESULT (phi);
535 
536       if (dump_enabled_p ())
537 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
538 
539       gcc_assert (!virtual_operand_p (def)
540 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
541 
542       stmt_vec_info reduc_stmt_info
543 	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
544 				    &reduc_chain);
545       if (reduc_stmt_info)
546         {
547 	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
548 	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
549 	  if (double_reduc)
550 	    {
551 	      if (dump_enabled_p ())
552 		dump_printf_loc (MSG_NOTE, vect_location,
553 				 "Detected double reduction.\n");
554 
555               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
556 	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
557             }
558           else
559             {
560               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
561                 {
562                   if (dump_enabled_p ())
563                     dump_printf_loc (MSG_NOTE, vect_location,
564 				     "Detected vectorizable nested cycle.\n");
565 
566                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
567                 }
568               else
569                 {
570                   if (dump_enabled_p ())
571                     dump_printf_loc (MSG_NOTE, vect_location,
572 				     "Detected reduction.\n");
573 
574                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
575 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
576                   /* Store the reduction cycles for possible vectorization in
577                      loop-aware SLP if it was not detected as a reduction
578 		     chain.  */
579 		  if (! reduc_chain)
580 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
581 		      (reduc_stmt_info);
582                 }
583             }
584         }
585       else
586         if (dump_enabled_p ())
587           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
588 			   "Unknown def-use cycle pattern.\n");
589     }
590 }
591 
592 
593 /* Function vect_analyze_scalar_cycles.
594 
595    Examine the cross iteration def-use cycles of scalar variables, by
596    analyzing the loop-header PHIs of scalar variables.  Classify each
597    cycle as one of the following: invariant, induction, reduction, unknown.
598    We do that for the loop represented by LOOP_VINFO, and also for its
599    inner-loop, if it exists.
600    Examples for scalar cycles:
601 
602    Example1: reduction:
603 
604               loop1:
605               for (i=0; i<N; i++)
606                  sum += a[i];
607 
608    Example2: induction:
609 
610               loop2:
611               for (i=0; i<N; i++)
612                  a[i] = i;  */
613 
614 static void
615 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
616 {
617   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
618 
619   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
620 
621   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
622      Reductions in such inner-loop therefore have different properties than
623      the reductions in the nest that gets vectorized:
624      1. When vectorized, they are executed in the same order as in the original
625         scalar loop, so we can't change the order of computation when
626         vectorizing them.
627      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
628         current checks are too strict.  */
629 
630   if (loop->inner)
631     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
632 }
633 
634 /* Transfer group and reduction information from STMT_INFO to its
635    pattern stmt.  */
636 
637 static void
638 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
639 {
640   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
641   stmt_vec_info stmtp;
642   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
643 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
644   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
645   do
646     {
647       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
648       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
649 			   == STMT_VINFO_DEF_TYPE (stmt_info));
650       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
651       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
652       if (stmt_info)
653 	REDUC_GROUP_NEXT_ELEMENT (stmtp)
654 	  = STMT_VINFO_RELATED_STMT (stmt_info);
655     }
656   while (stmt_info);
657 }
658 
659 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
660 
661 static void
662 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
663 {
664   stmt_vec_info first;
665   unsigned i;
666 
667   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
668     if (STMT_VINFO_IN_PATTERN_P (first))
669       {
670 	stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 	while (next)
672 	  {
673 	    if (! STMT_VINFO_IN_PATTERN_P (next)
674 		|| STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
675 	      break;
676 	    next = REDUC_GROUP_NEXT_ELEMENT (next);
677 	  }
678 	/* If not all stmts in the chain are patterns, or if we failed
679 	   to update STMT_VINFO_REDUC_IDX, try to handle the chain
680 	   without patterns.  */
681 	if (! next
682 	    && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
683 	  {
684 	    vect_fixup_reduc_chain (first);
685 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
686 	      = STMT_VINFO_RELATED_STMT (first);
687 	  }
688       }
689 }
690 
691 /* Function vect_get_loop_niters.
692 
693    Determine the number of iterations the loop executes and place it
694    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
695    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
696    niter information holds in ASSUMPTIONS.
697 
698    Return the loop exit condition.  */
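
/* An illustrative example (the loop and its bound are hypothetical): for

	for (i = 0; i < n; i++)
	  ...

   with n known to be positive, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS, the number of
   times the loop header executes, is n.  */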
699 
700 
701 static gcond *
702 vect_get_loop_niters (class loop *loop, tree *assumptions,
703 		      tree *number_of_iterations, tree *number_of_iterationsm1)
704 {
705   edge exit = single_exit (loop);
706   class tree_niter_desc niter_desc;
707   tree niter_assumptions, niter, may_be_zero;
708   gcond *cond = get_loop_exit_condition (loop);
709 
710   *assumptions = boolean_true_node;
711   *number_of_iterationsm1 = chrec_dont_know;
712   *number_of_iterations = chrec_dont_know;
713   DUMP_VECT_SCOPE ("get_loop_niters");
714 
715   if (!exit)
716     return cond;
717 
718   may_be_zero = NULL_TREE;
719   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
720       || chrec_contains_undetermined (niter_desc.niter))
721     return cond;
722 
723   niter_assumptions = niter_desc.assumptions;
724   may_be_zero = niter_desc.may_be_zero;
725   niter = niter_desc.niter;
726 
727   if (may_be_zero && integer_zerop (may_be_zero))
728     may_be_zero = NULL_TREE;
729 
730   if (may_be_zero)
731     {
732       if (COMPARISON_CLASS_P (may_be_zero))
733 	{
734 	  /* Try to combine may_be_zero with assumptions, this can simplify
735 	     computation of niter expression.  */
736 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
737 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
738 					     niter_assumptions,
739 					     fold_build1 (TRUTH_NOT_EXPR,
740 							  boolean_type_node,
741 							  may_be_zero));
742 	  else
743 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
744 				 build_int_cst (TREE_TYPE (niter), 0),
745 				 rewrite_to_non_trapping_overflow (niter));
746 
747 	  may_be_zero = NULL_TREE;
748 	}
749       else if (integer_nonzerop (may_be_zero))
750 	{
751 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
752 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
753 	  return cond;
754 	}
755       else
756 	return cond;
757     }
758 
759   *assumptions = niter_assumptions;
760   *number_of_iterationsm1 = niter;
761 
762   /* We want the number of loop header executions which is the number
763      of latch executions plus one.
764      ???  For UINT_MAX latch executions this number overflows to zero
765      for loops like do { n++; } while (n != 0);  */
766   if (niter && !chrec_contains_undetermined (niter))
767     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
768 			  build_int_cst (TREE_TYPE (niter), 1));
769   *number_of_iterations = niter;
770 
771   return cond;
772 }
773 
774 /* Function bb_in_loop_p
775 
776    Used as predicate for dfs order traversal of the loop bbs.  */
777 
778 static bool
779 bb_in_loop_p (const_basic_block bb, const void *data)
780 {
781   const class loop *const loop = (const class loop *)data;
782   if (flow_bb_inside_loop_p (loop, bb))
783     return true;
784   return false;
785 }
786 
787 
788 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
789    stmt_vec_info structs for all the stmts in LOOP_IN.  */
790 
791 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
792   : vec_info (vec_info::loop, init_cost (loop_in), shared),
793     loop (loop_in),
794     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
795     num_itersm1 (NULL_TREE),
796     num_iters (NULL_TREE),
797     num_iters_unchanged (NULL_TREE),
798     num_iters_assumptions (NULL_TREE),
799     th (0),
800     versioning_threshold (0),
801     vectorization_factor (0),
802     max_vectorization_factor (0),
803     mask_skip_niters (NULL_TREE),
804     mask_compare_type (NULL_TREE),
805     simd_if_cond (NULL_TREE),
806     unaligned_dr (NULL),
807     peeling_for_alignment (0),
808     ptr_mask (0),
809     ivexpr_map (NULL),
810     scan_map (NULL),
811     slp_unrolling_factor (1),
812     single_scalar_iteration_cost (0),
813     vec_outside_cost (0),
814     vec_inside_cost (0),
815     vectorizable (false),
816     can_fully_mask_p (true),
817     fully_masked_p (false),
818     peeling_for_gaps (false),
819     peeling_for_niter (false),
820     no_data_dependencies (false),
821     has_mask_store (false),
822     scalar_loop_scaling (profile_probability::uninitialized ()),
823     scalar_loop (NULL),
824     orig_loop_info (NULL)
825 {
826   /* CHECKME: We want to visit all BBs before their successors (except for
827      latch blocks, for which this assertion wouldn't hold).  In the simple
828      case of the loop forms we allow, a dfs order of the BBs would be the same
829      as reversed postorder traversal, so we are safe.  */
830 
831   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
832 					  bbs, loop->num_nodes, loop);
833   gcc_assert (nbbs == loop->num_nodes);
834 
835   for (unsigned int i = 0; i < nbbs; i++)
836     {
837       basic_block bb = bbs[i];
838       gimple_stmt_iterator si;
839 
840       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
841 	{
842 	  gimple *phi = gsi_stmt (si);
843 	  gimple_set_uid (phi, 0);
844 	  add_stmt (phi);
845 	}
846 
847       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
848 	{
849 	  gimple *stmt = gsi_stmt (si);
850 	  gimple_set_uid (stmt, 0);
851 	  add_stmt (stmt);
852 	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
853 	     the third argument is the #pragma omp simd if (x) condition.  When
854 	     it is 0 the loop shouldn't be vectorized; when it is a non-zero
855 	     constant it should be vectorized normally; otherwise the loop is
856 	     versioned on the condition being non-zero at run time.  */
857 	  if (loop_in->simduid
858 	      && is_gimple_call (stmt)
859 	      && gimple_call_internal_p (stmt)
860 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
861 	      && gimple_call_num_args (stmt) >= 3
862 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
863 	      && (loop_in->simduid
864 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
865 	    {
866 	      tree arg = gimple_call_arg (stmt, 2);
867 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
868 		simd_if_cond = arg;
869 	      else
870 		gcc_assert (integer_nonzerop (arg));
871 	    }
872 	}
873     }
874 
875   epilogue_vinfos.create (6);
876 }
877 
878 /* Free all levels of MASKS.  */
879 
880 void
881 release_vec_loop_masks (vec_loop_masks *masks)
882 {
883   rgroup_masks *rgm;
884   unsigned int i;
885   FOR_EACH_VEC_ELT (*masks, i, rgm)
886     rgm->masks.release ();
887   masks->release ();
888 }
889 
890 /* Free all memory used by the _loop_vec_info, as well as all the
891    stmt_vec_info structs of all the stmts in the loop.  */
892 
893 _loop_vec_info::~_loop_vec_info ()
894 {
895   free (bbs);
896 
897   release_vec_loop_masks (&masks);
898   delete ivexpr_map;
899   delete scan_map;
900   epilogue_vinfos.release ();
901 
902   loop->aux = NULL;
903 }
904 
905 /* Return an invariant or register for EXPR and emit necessary
906    computations in the LOOP_VINFO loop preheader.  */
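
/* For instance (an illustrative use, not a requirement of the interface):
   if the vectorizer needs an invariant expression such as N * 4 in several
   places, the first call gimplifies it to a new SSA name and inserts the
   computation on the preheader edge, caching the result in IVEXPR_MAP;
   later calls for the same expression return the cached SSA name, so the
   computation is emitted only once.  */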
907 
908 tree
909 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
910 {
911   if (is_gimple_reg (expr)
912       || is_gimple_min_invariant (expr))
913     return expr;
914 
915   if (! loop_vinfo->ivexpr_map)
916     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
917   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
918   if (! cached)
919     {
920       gimple_seq stmts = NULL;
921       cached = force_gimple_operand (unshare_expr (expr),
922 				     &stmts, true, NULL_TREE);
923       if (stmts)
924 	{
925 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
926 	  gsi_insert_seq_on_edge_immediate (e, stmts);
927 	}
928     }
929   return cached;
930 }
931 
932 /* Return true if we can use CMP_TYPE as the comparison type to produce
933    all masks required to mask LOOP_VINFO.  */
934 
935 static bool
936 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
937 {
938   rgroup_masks *rgm;
939   unsigned int i;
940   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
941     if (rgm->mask_type != NULL_TREE
942 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
943 					    cmp_type, rgm->mask_type,
944 					    OPTIMIZE_FOR_SPEED))
945       return false;
946   return true;
947 }
948 
949 /* Calculate the maximum number of scalars per iteration for every
950    rgroup in LOOP_VINFO.  */
951 
952 static unsigned int
953 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
954 {
955   unsigned int res = 1;
956   unsigned int i;
957   rgroup_masks *rgm;
958   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
959     res = MAX (res, rgm->max_nscalars_per_iter);
960   return res;
961 }
962 
963 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
964    whether we can actually generate the masks required.  Return true if so,
965    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
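
/* A worked example with made-up numbers: if the loop is known to iterate
   at most 1000 times and the largest rgroup needs 2 scalars per iteration,
   the limit on the mask index is 2000, which needs
   wi::min_precision (2000, UNSIGNED) == 11 bits; the mode search below then
   starts considering candidates at HImode (16 bits), accepts a candidate
   only if the target supports WHILE_ULT for it and it can produce all the
   required mask types, and still prefers an IV type at least as wide as
   Pmode, as explained in the comment inside the loop.  */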
966 
967 static bool
968 vect_verify_full_masking (loop_vec_info loop_vinfo)
969 {
970   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
971   unsigned int min_ni_width;
972   unsigned int max_nscalars_per_iter
973     = vect_get_max_nscalars_per_iter (loop_vinfo);
974 
975   /* Use a normal loop if there are no statements that need masking.
976      This only happens in rare degenerate cases: it means that the loop
977      has no loads, no stores, and no live-out values.  */
978   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
979     return false;
980 
981   /* Get the maximum number of iterations that is representable
982      in the counter type.  */
983   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
984   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 
986   /* Get a more refined estimate for the number of iterations.  */
987   widest_int max_back_edges;
988   if (max_loop_iterations (loop, &max_back_edges))
989     max_ni = wi::smin (max_ni, max_back_edges + 1);
990 
991   /* Account for rgroup masks, in which each bit is replicated N times.  */
992   max_ni *= max_nscalars_per_iter;
993 
994   /* Work out how many bits we need to represent the limit.  */
995   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
996 
997   /* Find a scalar mode for which WHILE_ULT is supported.  */
998   opt_scalar_int_mode cmp_mode_iter;
999   tree cmp_type = NULL_TREE;
1000   tree iv_type = NULL_TREE;
1001   widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1002   unsigned int iv_precision = UINT_MAX;
1003 
1004   if (iv_limit != -1)
1005     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1006 				      UNSIGNED);
1007 
1008   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1009     {
1010       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1011       if (cmp_bits >= min_ni_width
1012 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1013 	{
1014 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1015 	  if (this_type
1016 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1017 	    {
1018 	      /* Although we could stop as soon as we find a valid mode,
1019 		 there are at least two reasons why that's not always the
1020 		 best choice:
1021 
1022 		 - An IV that's Pmode or wider is more likely to be reusable
1023 		   in address calculations than an IV that's narrower than
1024 		   Pmode.
1025 
1026 		 - Doing the comparison in IV_PRECISION or wider allows
1027 		   a natural 0-based IV, whereas using a narrower comparison
1028 		   type requires mitigations against wrap-around.
1029 
1030 		 Conversely, if the IV limit is variable, doing the comparison
1031 		 in a wider type than the original type can introduce
1032 		 unnecessary extensions, so picking the widest valid mode
1033 		 is not always a good choice either.
1034 
1035 		 Here we prefer the first IV type that's Pmode or wider,
1036 		 and the first comparison type that's IV_PRECISION or wider.
1037 		 (The comparison type must be no wider than the IV type,
1038 		 to avoid extensions in the vector loop.)
1039 
1040 		 ??? We might want to try continuing beyond Pmode for ILP32
1041 		 targets if CMP_BITS < IV_PRECISION.  */
1042 	      iv_type = this_type;
1043 	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1044 		cmp_type = this_type;
1045 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1046 		break;
1047 	    }
1048 	}
1049     }
1050 
1051   if (!cmp_type)
1052     return false;
1053 
1054   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1055   LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1056   return true;
1057 }
1058 
1059 /* Calculate the cost of one scalar iteration of the loop.  */
1060 static void
1061 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1062 {
1063   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1064   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1065   int nbbs = loop->num_nodes, factor;
1066   int innerloop_iters, i;
1067 
1068   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1069 
1070   /* Gather costs for statements in the scalar loop.  */
1071 
1072   /* FORNOW.  */
1073   innerloop_iters = 1;
1074   if (loop->inner)
1075     innerloop_iters = 50; /* FIXME */
1076 
1077   for (i = 0; i < nbbs; i++)
1078     {
1079       gimple_stmt_iterator si;
1080       basic_block bb = bbs[i];
1081 
1082       if (bb->loop_father == loop->inner)
1083         factor = innerloop_iters;
1084       else
1085         factor = 1;
1086 
1087       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1088         {
1089 	  gimple *stmt = gsi_stmt (si);
1090 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1091 
1092           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1093             continue;
1094 
1095           /* Skip stmts that are not vectorized inside the loop.  */
1096 	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1097           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1098               && (!STMT_VINFO_LIVE_P (vstmt_info)
1099                   || !VECTORIZABLE_CYCLE_DEF
1100 			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1101             continue;
1102 
1103 	  vect_cost_for_stmt kind;
1104           if (STMT_VINFO_DATA_REF (stmt_info))
1105             {
1106               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1107                kind = scalar_load;
1108              else
1109                kind = scalar_store;
1110             }
1111 	  else if (vect_nop_conversion_p (stmt_info))
1112 	    continue;
1113 	  else
1114             kind = scalar_stmt;
1115 
1116 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1117 			    factor, kind, stmt_info, 0, vect_prologue);
1118         }
1119     }
1120 
1121   /* Now accumulate cost.  */
1122   void *target_cost_data = init_cost (loop);
1123   stmt_info_for_cost *si;
1124   int j;
1125   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1126 		    j, si)
1127     (void) add_stmt_cost (target_cost_data, si->count,
1128 			  si->kind, si->stmt_info, si->misalign,
1129 			  vect_body);
1130   unsigned dummy, body_cost = 0;
1131   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1132   destroy_cost_data (target_cost_data);
1133   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1134 }
1135 
1136 
1137 /* Function vect_analyze_loop_form_1.
1138 
1139    Verify that certain CFG restrictions hold, including:
1140    - the loop has a pre-header
1141    - the loop has a single entry and exit
1142    - the loop exit condition is simple enough
1143    - the number of iterations can be analyzed, i.e., a countable loop.  The
1144      niter could be analyzed under some assumptions.  */
1145 
1146 opt_result
1147 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1148 			  tree *assumptions, tree *number_of_iterationsm1,
1149 			  tree *number_of_iterations, gcond **inner_loop_cond)
1150 {
1151   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1152 
1153   /* Different restrictions apply when we are considering an inner-most loop,
1154      vs. an outer (nested) loop.
1155      (FORNOW. May want to relax some of these restrictions in the future).  */
1156 
1157   if (!loop->inner)
1158     {
1159       /* Inner-most loop.  We currently require that the number of BBs is
1160 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1161 	 look like this:
1162 
1163                         (pre-header)
1164                            |
1165                           header <--------+
1166                            | |            |
1167                            | +--> latch --+
1168                            |
1169                         (exit-bb)  */
1170 
1171       if (loop->num_nodes != 2)
1172 	return opt_result::failure_at (vect_location,
1173 				       "not vectorized:"
1174 				       " control flow in loop.\n");
1175 
1176       if (empty_block_p (loop->header))
1177 	return opt_result::failure_at (vect_location,
1178 				       "not vectorized: empty loop.\n");
1179     }
1180   else
1181     {
1182       class loop *innerloop = loop->inner;
1183       edge entryedge;
1184 
1185       /* Nested loop. We currently require that the loop is doubly-nested,
1186 	 contains a single inner loop, and the number of BBs is exactly 5.
1187 	 Vectorizable outer-loops look like this:
1188 
1189 			(pre-header)
1190 			   |
1191 			  header <---+
1192 			   |         |
1193 		          inner-loop |
1194 			   |         |
1195 			  tail ------+
1196 			   |
1197 		        (exit-bb)
1198 
1199 	 The inner-loop has the properties expected of inner-most loops
1200 	 as described above.  */
1201 
1202       if ((loop->inner)->inner || (loop->inner)->next)
1203 	return opt_result::failure_at (vect_location,
1204 				       "not vectorized:"
1205 				       " multiple nested loops.\n");
1206 
1207       if (loop->num_nodes != 5)
1208 	return opt_result::failure_at (vect_location,
1209 				       "not vectorized:"
1210 				       " control flow in loop.\n");
1211 
1212       entryedge = loop_preheader_edge (innerloop);
1213       if (entryedge->src != loop->header
1214 	  || !single_exit (innerloop)
1215 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1216 	return opt_result::failure_at (vect_location,
1217 				       "not vectorized:"
1218 				       " unsupported outerloop form.\n");
1219 
1220       /* Analyze the inner-loop.  */
1221       tree inner_niterm1, inner_niter, inner_assumptions;
1222       opt_result res
1223 	= vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1224 				    &inner_assumptions, &inner_niterm1,
1225 				    &inner_niter, NULL);
1226       if (!res)
1227 	{
1228 	  if (dump_enabled_p ())
1229 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 			     "not vectorized: Bad inner loop.\n");
1231 	  return res;
1232 	}
1233 
1234       /* Don't support analyzing niter under assumptions for inner
1235 	 loop.  */
1236       if (!integer_onep (inner_assumptions))
1237 	return opt_result::failure_at (vect_location,
1238 				       "not vectorized: Bad inner loop.\n");
1239 
1240       if (!expr_invariant_in_loop_p (loop, inner_niter))
1241 	return opt_result::failure_at (vect_location,
1242 				       "not vectorized: inner-loop count not"
1243 				       " invariant.\n");
1244 
1245       if (dump_enabled_p ())
1246         dump_printf_loc (MSG_NOTE, vect_location,
1247 			 "Considering outer-loop vectorization.\n");
1248     }
1249 
1250   if (!single_exit (loop))
1251     return opt_result::failure_at (vect_location,
1252 				   "not vectorized: multiple exits.\n");
1253   if (EDGE_COUNT (loop->header->preds) != 2)
1254     return opt_result::failure_at (vect_location,
1255 				   "not vectorized:"
1256 				   " too many incoming edges.\n");
1257 
1258   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1259      that the loop is represented as a do-while (with a proper if-guard
1260      before the loop if needed), where the loop header contains all the
1261      executable statements, and the latch is empty.  */
1262   if (!empty_block_p (loop->latch)
1263       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1264     return opt_result::failure_at (vect_location,
1265 				   "not vectorized: latch block not empty.\n");
1266 
1267   /* Make sure the exit is not abnormal.  */
1268   edge e = single_exit (loop);
1269   if (e->flags & EDGE_ABNORMAL)
1270     return opt_result::failure_at (vect_location,
1271 				   "not vectorized:"
1272 				   " abnormal loop exit edge.\n");
1273 
1274   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1275 				     number_of_iterationsm1);
1276   if (!*loop_cond)
1277     return opt_result::failure_at
1278       (vect_location,
1279        "not vectorized: complicated exit condition.\n");
1280 
1281   if (integer_zerop (*assumptions)
1282       || !*number_of_iterations
1283       || chrec_contains_undetermined (*number_of_iterations))
1284     return opt_result::failure_at
1285       (*loop_cond,
1286        "not vectorized: number of iterations cannot be computed.\n");
1287 
1288   if (integer_zerop (*number_of_iterations))
1289     return opt_result::failure_at
1290       (*loop_cond,
1291        "not vectorized: number of iterations = 0.\n");
1292 
1293   return opt_result::success ();
1294 }
1295 
1296 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1297 
1298 opt_loop_vec_info
1299 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1300 {
1301   tree assumptions, number_of_iterations, number_of_iterationsm1;
1302   gcond *loop_cond, *inner_loop_cond = NULL;
1303 
1304   opt_result res
1305     = vect_analyze_loop_form_1 (loop, &loop_cond,
1306 				&assumptions, &number_of_iterationsm1,
1307 				&number_of_iterations, &inner_loop_cond);
1308   if (!res)
1309     return opt_loop_vec_info::propagate_failure (res);
1310 
1311   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1312   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1313   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1314   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1315   if (!integer_onep (assumptions))
1316     {
1317       /* We consider vectorizing this loop by versioning it under
1318 	 some assumptions.  In order to do this, we need to clear
1319 	 existing information computed by scev and niter analyzer.  */
1320       scev_reset_htab ();
1321       free_numbers_of_iterations_estimates (loop);
1322       /* Also set flag for this loop so that following scev and niter
1323 	 analysis are done under the assumptions.  */
1324       loop_constraint_set (loop, LOOP_C_FINITE);
1325       /* Also record the assumptions for versioning.  */
1326       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1327     }
1328 
1329   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1330     {
1331       if (dump_enabled_p ())
1332         {
1333           dump_printf_loc (MSG_NOTE, vect_location,
1334 			   "Symbolic number of iterations is ");
1335 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1336           dump_printf (MSG_NOTE, "\n");
1337         }
1338     }
1339 
1340   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1341   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1342   if (inner_loop_cond)
1343     {
1344       stmt_vec_info inner_loop_cond_info
1345 	= loop_vinfo->lookup_stmt (inner_loop_cond);
1346       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347     }
1348 
1349   gcc_assert (!loop->aux);
1350   loop->aux = loop_vinfo;
1351   return opt_loop_vec_info::success (loop_vinfo);
1352 }
1353 
1354 
1355 
1356 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1357    statements, update the vectorization factor.  */
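
/* For instance (illustrative numbers): if loop-based analysis chose a
   vectorization factor of 4 but the SLP instances require an unrolling
   factor of 8, the combined factor is their least common multiple, 8;
   if every statement in the loop is covered by SLP, the loop simply uses
   the SLP unrolling factor itself.  */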
1358 
1359 static void
1360 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1361 {
1362   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1363   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1364   int nbbs = loop->num_nodes;
1365   poly_uint64 vectorization_factor;
1366   int i;
1367 
1368   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1369 
1370   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1371   gcc_assert (known_ne (vectorization_factor, 0U));
1372 
1373   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1374      vectorization factor of the loop is the unrolling factor required by
1375      the SLP instances.  If that unrolling factor is 1, we say that we
1376      perform pure SLP on the loop - cross-iteration parallelism is not
1377      exploited.  */
1378   bool only_slp_in_loop = true;
1379   for (i = 0; i < nbbs; i++)
1380     {
1381       basic_block bb = bbs[i];
1382       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1383 	   gsi_next (&si))
1384 	{
1385 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1386 	  if (!stmt_info)
1387 	    continue;
1388 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1389 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1390 	      && !PURE_SLP_STMT (stmt_info))
1391 	    /* STMT needs both SLP and loop-based vectorization.  */
1392 	    only_slp_in_loop = false;
1393 	}
1394       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1395 	   gsi_next (&si))
1396 	{
1397 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1398 	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1399 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1400 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1401 	      && !PURE_SLP_STMT (stmt_info))
1402 	    /* STMT needs both SLP and loop-based vectorization.  */
1403 	    only_slp_in_loop = false;
1404 	}
1405     }
1406 
1407   if (only_slp_in_loop)
1408     {
1409       if (dump_enabled_p ())
1410 	dump_printf_loc (MSG_NOTE, vect_location,
1411 			 "Loop contains only SLP stmts\n");
1412       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413     }
1414   else
1415     {
1416       if (dump_enabled_p ())
1417 	dump_printf_loc (MSG_NOTE, vect_location,
1418 			 "Loop contains SLP and non-SLP stmts\n");
1419       /* Both the vectorization factor and unroll factor have the form
1420 	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1421 	 so they must have a common multiple.  */
1422       vectorization_factor
1423 	= force_common_multiple (vectorization_factor,
1424 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1425     }
1426 
1427   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1428   if (dump_enabled_p ())
1429     {
1430       dump_printf_loc (MSG_NOTE, vect_location,
1431 		       "Updating vectorization factor to ");
1432       dump_dec (MSG_NOTE, vectorization_factor);
1433       dump_printf (MSG_NOTE, ".\n");
1434     }
1435 }
1436 
1437 /* Return true if STMT_INFO describes a double reduction phi and if
1438    the other phi in the reduction is also relevant for vectorization.
1439    This rejects cases such as:
1440 
1441       outer1:
1442 	x_1 = PHI <x_3(outer2), ...>;
1443 	...
1444 
1445       inner:
1446 	x_2 = ...;
1447 	...
1448 
1449       outer2:
1450 	x_3 = PHI <x_2(inner)>;
1451 
1452    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1453 
1454 static bool
1455 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1456 {
1457   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1458     return false;
1459 
1460   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1461 }
1462 
1463 /* Function vect_analyze_loop_operations.
1464 
1465    Scan the loop stmts and make sure they are all vectorizable.  */
1466 
1467 static opt_result
1468 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1469 {
1470   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1471   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1472   int nbbs = loop->num_nodes;
1473   int i;
1474   stmt_vec_info stmt_info;
1475   bool need_to_vectorize = false;
1476   bool ok;
1477 
1478   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1479 
1480   auto_vec<stmt_info_for_cost> cost_vec;
1481 
1482   for (i = 0; i < nbbs; i++)
1483     {
1484       basic_block bb = bbs[i];
1485 
1486       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1487 	   gsi_next (&si))
1488         {
1489           gphi *phi = si.phi ();
1490           ok = true;
1491 
1492 	  stmt_info = loop_vinfo->lookup_stmt (phi);
1493           if (dump_enabled_p ())
1494 	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1495 	  if (virtual_operand_p (gimple_phi_result (phi)))
1496 	    continue;
1497 
1498           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1499              (i.e., a phi in the tail of the outer-loop).  */
1500           if (! is_loop_header_bb_p (bb))
1501             {
1502               /* FORNOW: we currently don't support the case that these phis
1503                  are not used in the outer loop (unless it is a double reduction,
1504                  i.e., this phi is vect_reduction_def), because this case
1505                  requires actually doing something here.  */
1506               if (STMT_VINFO_LIVE_P (stmt_info)
1507 		  && !vect_active_double_reduction_p (stmt_info))
1508 		return opt_result::failure_at (phi,
1509 					       "Unsupported loop-closed phi"
1510 					       " in outer-loop.\n");
1511 
1512               /* If PHI is used in the outer loop, we check that its operand
1513                  is defined in the inner loop.  */
1514               if (STMT_VINFO_RELEVANT_P (stmt_info))
1515                 {
1516                   tree phi_op;
1517 
1518                   if (gimple_phi_num_args (phi) != 1)
1519                     return opt_result::failure_at (phi, "unsupported phi");
1520 
1521                   phi_op = PHI_ARG_DEF (phi, 0);
1522 		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1523 		  if (!op_def_info)
1524 		    return opt_result::failure_at (phi, "unsupported phi\n");
1525 
1526 		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1527 		      && (STMT_VINFO_RELEVANT (op_def_info)
1528 			  != vect_used_in_outer_by_reduction))
1529 		    return opt_result::failure_at (phi, "unsupported phi\n");
1530 
1531 		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1532 		       || (STMT_VINFO_DEF_TYPE (stmt_info)
1533 			   == vect_double_reduction_def))
1534 		      && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1535 		    return opt_result::failure_at (phi, "unsupported phi\n");
1536                 }
1537 
1538               continue;
1539             }
1540 
1541           gcc_assert (stmt_info);
1542 
1543           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1544                || STMT_VINFO_LIVE_P (stmt_info))
1545               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1546 	    /* A scalar-dependence cycle that we don't support.  */
1547 	    return opt_result::failure_at (phi,
1548 					   "not vectorized:"
1549 					   " scalar dependence cycle.\n");
1550 
1551           if (STMT_VINFO_RELEVANT_P (stmt_info))
1552             {
1553               need_to_vectorize = true;
1554               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1555 		  && ! PURE_SLP_STMT (stmt_info))
1556 		ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1557 					     &cost_vec);
1558 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1559 			|| (STMT_VINFO_DEF_TYPE (stmt_info)
1560 			    == vect_double_reduction_def)
1561 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1562 		       && ! PURE_SLP_STMT (stmt_info))
1563 		ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1564             }
1565 
1566 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1567 	  if (ok
1568 	      && STMT_VINFO_LIVE_P (stmt_info)
1569 	      && !PURE_SLP_STMT (stmt_info))
1570 	    ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1571 					      -1, false, &cost_vec);
1572 
1573           if (!ok)
1574 	    return opt_result::failure_at (phi,
1575 					   "not vectorized: relevant phi not "
1576 					   "supported: %G",
1577 					   static_cast <gimple *> (phi));
1578         }
1579 
1580       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1581 	   gsi_next (&si))
1582         {
1583 	  gimple *stmt = gsi_stmt (si);
1584 	  if (!gimple_clobber_p (stmt))
1585 	    {
1586 	      opt_result res
1587 		= vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1588 				     &need_to_vectorize,
1589 				     NULL, NULL, &cost_vec);
1590 	      if (!res)
1591 		return res;
1592 	    }
1593         }
1594     } /* bbs */
1595 
1596   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1597 
1598   /* All operations in the loop are either irrelevant (they deal with loop
1599      control, or are dead), or only used outside the loop and can be moved
1600      out of the loop (e.g. invariants, inductions).  The loop can be
1601      optimized away by scalar optimizations.  We're better off not
1602      touching this loop.  */
1603   if (!need_to_vectorize)
1604     {
1605       if (dump_enabled_p ())
1606         dump_printf_loc (MSG_NOTE, vect_location,
1607 			 "All the computation can be taken out of the loop.\n");
1608       return opt_result::failure_at
1609 	(vect_location,
1610 	 "not vectorized: redundant loop. no profit to vectorize.\n");
1611     }
1612 
1613   return opt_result::success ();
1614 }
1615 
1616 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1617    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1618    definitely no, or -1 if it's worth retrying.  */
1619 
1620 static int
1621 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1622 {
1623   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1624   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1625 
1626   /* Only fully-masked loops can have iteration counts less than the
1627      vectorization factor.  */
1628   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1629     {
1630       HOST_WIDE_INT max_niter;
1631 
1632       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1633 	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1634       else
1635 	max_niter = max_stmt_executions_int (loop);
1636 
1637       if (max_niter != -1
1638 	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1639 	{
1640 	  if (dump_enabled_p ())
1641 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 			     "not vectorized: iteration count smaller than "
1643 			     "vectorization factor.\n");
1644 	  return 0;
1645 	}
1646     }
1647 
1648   int min_profitable_iters, min_profitable_estimate;
1649   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1650 				      &min_profitable_estimate);
1651 
1652   if (min_profitable_iters < 0)
1653     {
1654       if (dump_enabled_p ())
1655 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656 			 "not vectorized: vectorization not profitable.\n");
1657       if (dump_enabled_p ())
1658 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1659 			 "not vectorized: vector version will never be "
1660 			 "profitable.\n");
1661       return -1;
1662     }
1663 
1664   int min_scalar_loop_bound = (param_min_vect_loop_bound
1665 			       * assumed_vf);
1666 
1667   /* Use the cost model only if it is more conservative than user specified
1668      threshold.  */
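  /* Illustrative numbers only: with --param min-vect-loop-bound=2 and an
     assumed VF of 4, min_scalar_loop_bound is 8; if the cost model asks for
     at least 10 profitable iterations, the threshold ends up as 10.  */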
1669   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1670 				    min_profitable_iters);
1671 
1672   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1673 
1674   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1675       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1676     {
1677       if (dump_enabled_p ())
1678 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 			 "not vectorized: vectorization not profitable.\n");
1680       if (dump_enabled_p ())
1681 	dump_printf_loc (MSG_NOTE, vect_location,
1682 			 "not vectorized: iteration count smaller than user "
1683 			 "specified loop bound parameter or minimum profitable "
1684 			 "iterations (whichever is more conservative).\n");
1685       return 0;
1686     }
1687 
1688   /* The static profitability threshold min_profitable_estimate includes
1689      the cost of having to check at runtime whether the scalar loop
1690      should be used instead.  If it turns out that we don't need or want
1691      such a check, the threshold we should use for the static estimate
1692      is simply the point at which the vector loop becomes more profitable
1693      than the scalar loop.  */
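  /* For instance (illustrative numbers only): if the vector loop beats the
     scalar loop from 4 iterations onwards but the runtime guard costs the
     equivalent of 3 further iterations, min_profitable_estimate would be 7;
     when no runtime check is needed we can use the plain break-even point
     of 4 instead.  */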
1694   if (min_profitable_estimate > min_profitable_iters
1695       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1696       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1697       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1698       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1699     {
1700       if (dump_enabled_p ())
1701 	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1702 			 " choice between the scalar and vector loops\n");
1703       min_profitable_estimate = min_profitable_iters;
1704     }
1705 
1706   HOST_WIDE_INT estimated_niter;
1707 
1708   /* If we are vectorizing an epilogue then we know the maximum number of
1709      scalar iterations it will cover is at least one lower than the
1710      vectorization factor of the main loop.  */
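  /* E.g. if the main loop was vectorized with a VF of 16 (illustrative
     value), its epilogue covers at most 15 scalar iterations.  */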
1711   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1712     estimated_niter
1713       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1714   else
1715     {
1716       estimated_niter = estimated_stmt_executions_int (loop);
1717       if (estimated_niter == -1)
1718 	estimated_niter = likely_max_stmt_executions_int (loop);
1719     }
1720   if (estimated_niter != -1
1721       && ((unsigned HOST_WIDE_INT) estimated_niter
1722 	  < MAX (th, (unsigned) min_profitable_estimate)))
1723     {
1724       if (dump_enabled_p ())
1725 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1726 			 "not vectorized: estimated iteration count too "
1727 			 "small.\n");
1728       if (dump_enabled_p ())
1729 	dump_printf_loc (MSG_NOTE, vect_location,
1730 			 "not vectorized: estimated iteration count smaller "
1731 			 "than specified loop bound parameter or minimum "
1732 			 "profitable iterations (whichever is more "
1733 			 "conservative).\n");
1734       return -1;
1735     }
1736 
1737   return 1;
1738 }
1739 
1740 static opt_result
1741 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1742 			   vec<data_reference_p> *datarefs,
1743 			   unsigned int *n_stmts)
1744 {
1745   *n_stmts = 0;
1746   for (unsigned i = 0; i < loop->num_nodes; i++)
1747     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1748 	 !gsi_end_p (gsi); gsi_next (&gsi))
1749       {
1750 	gimple *stmt = gsi_stmt (gsi);
1751 	if (is_gimple_debug (stmt))
1752 	  continue;
1753 	++(*n_stmts);
1754 	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1755 	if (!res)
1756 	  {
1757 	    if (is_gimple_call (stmt) && loop->safelen)
1758 	      {
1759 		tree fndecl = gimple_call_fndecl (stmt), op;
1760 		if (fndecl != NULL_TREE)
1761 		  {
1762 		    cgraph_node *node = cgraph_node::get (fndecl);
1763 		    if (node != NULL && node->simd_clones != NULL)
1764 		      {
1765 			unsigned int j, n = gimple_call_num_args (stmt);
1766 			for (j = 0; j < n; j++)
1767 			  {
1768 			    op = gimple_call_arg (stmt, j);
1769 			    if (DECL_P (op)
1770 				|| (REFERENCE_CLASS_P (op)
1771 				    && get_base_address (op)))
1772 			      break;
1773 			  }
1774 			op = gimple_call_lhs (stmt);
1775 			/* Ignore #pragma omp declare simd functions
1776 			   if they don't have data references in the
1777 			   call stmt itself.  */
1778 			if (j == n
1779 			    && !(op
1780 				 && (DECL_P (op)
1781 				     || (REFERENCE_CLASS_P (op)
1782 					 && get_base_address (op)))))
1783 			  continue;
1784 		      }
1785 		  }
1786 	      }
1787 	    return res;
1788 	  }
1789 	/* If dependence analysis will give up due to the limit on the
1790 	   number of datarefs, stop here and fail fatally.  */
1791 	if (datarefs->length ()
1792 	    > (unsigned)param_loop_max_datarefs_for_datadeps)
1793 	  return opt_result::failure_at (stmt, "exceeded param "
1794 					 "loop-max-datarefs-for-datadeps\n");
1795       }
1796   return opt_result::success ();
1797 }
1798 
1799 /* Look for SLP-only access groups and turn each individual access into its own
1800    group.  */
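/* Illustrative sketch only: suppose two loads a[2*i] and a[2*i + 1] form an
   interleaving group of size 2 that was marked STMT_VINFO_SLP_VECT_ONLY but
   whose statements end up not being SLP'ed.  The function below then splits
   the group into two independent single-element accesses, each with
   DR_GROUP_SIZE 1 and DR_GROUP_GAP covering the elements of the other
   accesses.  */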
1801 static void
1802 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1803 {
1804   unsigned int i;
1805   struct data_reference *dr;
1806 
1807   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1808 
1809   vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1810   FOR_EACH_VEC_ELT (datarefs, i, dr)
1811     {
1812       gcc_assert (DR_REF (dr));
1813       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1814 
1815       /* Check if the load is a part of an interleaving chain.  */
1816       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1817 	{
1818 	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1819 	  unsigned int group_size = DR_GROUP_SIZE (first_element);
1820 
1821 	  /* Check if SLP-only groups.  */
1822 	  if (!STMT_SLP_TYPE (stmt_info)
1823 	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
1824 	    {
1825 	      /* Dissolve the group.  */
1826 	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1827 
1828 	      stmt_vec_info vinfo = first_element;
1829 	      while (vinfo)
1830 		{
1831 		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1832 		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1833 		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1834 		  DR_GROUP_SIZE (vinfo) = 1;
1835 		  if (STMT_VINFO_STRIDED_P (first_element))
1836 		    DR_GROUP_GAP (vinfo) = 0;
1837 		  else
1838 		    DR_GROUP_GAP (vinfo) = group_size - 1;
1839 		  vinfo = next;
1840 		}
1841 	    }
1842 	}
1843     }
1844 }
1845 
1846 
1847 /* Decides whether we need to create an epilogue loop to handle
1848    remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
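/* For example (illustrative numbers only): with a known iteration count of
   10, a vectorization factor of 4, no peeling for alignment or gaps and no
   full masking, 10 is not a multiple of 4, so LOOP_VINFO_PEELING_FOR_NITER
   is set and an epilogue loop handles the remaining 2 iterations.  */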
1849 
1850 void
1851 determine_peel_for_niter (loop_vec_info loop_vinfo)
1852 {
1853   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1854 
1855   unsigned HOST_WIDE_INT const_vf;
1856   HOST_WIDE_INT max_niter
1857     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1858 
1859   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1860   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1861     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1862 					  (loop_vinfo));
1863 
1864   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1865     /* The main loop handles all iterations.  */
1866     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1867   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1868 	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1869     {
1870       /* Work out the (constant) number of iterations that need to be
1871 	 peeled for reasons other than niters.  */
1872       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1873       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1874 	peel_niter += 1;
1875       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1876 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1877 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1878     }
1879   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880 	   /* ??? When peeling for gaps but not alignment, we could
1881 	      try to check whether the (variable) niters is known to be
1882 	      VF * N + 1.  That's something of a niche case though.  */
1883 	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1884 	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1885 	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1886 		< (unsigned) exact_log2 (const_vf))
1887 	       /* In case of versioning, check if the maximum number of
1888 		  iterations is greater than th.  If they are identical,
1889 		  the epilogue is unnecessary.  */
1890 	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1891 		   || ((unsigned HOST_WIDE_INT) max_niter
1892 		       > (th / const_vf) * const_vf))))
1893     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1894 }
1895 
1896 
1897 /* Function vect_analyze_loop_2.
1898 
1899    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1900    for it.  The different analyses will record information in the
1901    loop_vec_info struct.  */
1902 static opt_result
1903 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1904 {
1905   opt_result ok = opt_result::success ();
1906   int res;
1907   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1908   poly_uint64 min_vf = 2;
1909   loop_vec_info orig_loop_vinfo = NULL;
1910 
1911   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1912      loop_vec_info of the first vectorized loop.  */
1913   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1914     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1915   else
1916     orig_loop_vinfo = loop_vinfo;
1917   gcc_assert (orig_loop_vinfo);
1918 
1919   /* The first group of checks is independent of the vector size.  */
1920   fatal = true;
1921 
1922   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1923       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1924     return opt_result::failure_at (vect_location,
1925 				   "not vectorized: simd if(0)\n");
1926 
1927   /* Find all data references in the loop (which correspond to vdefs/vuses)
1928      and analyze their evolution in the loop.  */
1929 
1930   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1931 
1932   /* Gather the data references and count stmts in the loop.  */
1933   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1934     {
1935       opt_result res
1936 	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1937 				     &LOOP_VINFO_DATAREFS (loop_vinfo),
1938 				     n_stmts);
1939       if (!res)
1940 	{
1941 	  if (dump_enabled_p ())
1942 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943 			     "not vectorized: loop contains function "
1944 			     "calls or data references that cannot "
1945 			     "be analyzed\n");
1946 	  return res;
1947 	}
1948       loop_vinfo->shared->save_datarefs ();
1949     }
1950   else
1951     loop_vinfo->shared->check_datarefs ();
1952 
1953   /* Analyze the data references and also adjust the minimal
1954      vectorization factor according to the loads and stores.  */
1955 
1956   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1957   if (!ok)
1958     {
1959       if (dump_enabled_p ())
1960 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1961 			 "bad data references.\n");
1962       return ok;
1963     }
1964 
1965   /* Classify all cross-iteration scalar data-flow cycles.
1966      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1967   vect_analyze_scalar_cycles (loop_vinfo);
1968 
1969   vect_pattern_recog (loop_vinfo);
1970 
1971   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1972 
1973   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1974      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1975 
1976   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1977   if (!ok)
1978     {
1979       if (dump_enabled_p ())
1980 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 			 "bad data access.\n");
1982       return ok;
1983     }
1984 
1985   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1986 
1987   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1988   if (!ok)
1989     {
1990       if (dump_enabled_p ())
1991 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 			 "unexpected pattern.\n");
1993       return ok;
1994     }
1995 
1996   /* The rest of the analysis below depends on the chosen vector size in
       some way, so from here on a failure is no longer considered fatal
       (the caller may retry with a different vector mode).  */
1997   fatal = false;
1998 
1999   /* Analyze data dependences between the data-refs in the loop
2000      and adjust the maximum vectorization factor according to
2001      the dependences.
2002      FORNOW: fail at the first data dependence that we encounter.  */
2003 
2004   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2005   if (!ok)
2006     {
2007       if (dump_enabled_p ())
2008 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 			 "bad data dependence.\n");
2010       return ok;
2011     }
2012   if (max_vf != MAX_VECTORIZATION_FACTOR
2013       && maybe_lt (max_vf, min_vf))
2014     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2015   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2016 
2017   ok = vect_determine_vectorization_factor (loop_vinfo);
2018   if (!ok)
2019     {
2020       if (dump_enabled_p ())
2021 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2022 			 "can't determine vectorization factor.\n");
2023       return ok;
2024     }
2025   if (max_vf != MAX_VECTORIZATION_FACTOR
2026       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2027     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2028 
2029   /* Compute the scalar iteration cost.  */
2030   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2031 
2032   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2033 
2034   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2035   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2036   if (!ok)
2037     return ok;
2038 
2039   /* If there are any SLP instances mark them as pure_slp.  */
2040   bool slp = vect_make_slp_decision (loop_vinfo);
2041   if (slp)
2042     {
2043       /* Find stmts that need to be both vectorized and SLPed.  */
2044       vect_detect_hybrid_slp (loop_vinfo);
2045 
2046       /* Update the vectorization factor based on the SLP decision.  */
2047       vect_update_vf_for_slp (loop_vinfo);
2048     }
2049 
2050   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2051 
2052   /* We don't expect to have to roll back to anything other than an empty
2053      set of rgroups.  */
2054   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2055 
2056   /* This is the point where we can re-start analysis with SLP forced off.  */
2057 start_over:
2058 
2059   /* Now the vectorization factor is final.  */
2060   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2061   gcc_assert (known_ne (vectorization_factor, 0U));
2062 
2063   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2064     {
2065       dump_printf_loc (MSG_NOTE, vect_location,
2066 		       "vectorization_factor = ");
2067       dump_dec (MSG_NOTE, vectorization_factor);
2068       dump_printf (MSG_NOTE, ", niters = %wd\n",
2069 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2070     }
2071 
2072   /* Analyze the alignment of the data-refs in the loop.
2073      Fail if a data reference is found that cannot be vectorized.  */
2074 
2075   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2076   if (!ok)
2077     {
2078       if (dump_enabled_p ())
2079 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080 			 "bad data alignment.\n");
2081       return ok;
2082     }
2083 
2084   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2085      It is important to call pruning after vect_analyze_data_ref_accesses,
2086      since we use grouping information gathered by interleaving analysis.  */
2087   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2088   if (!ok)
2089     return ok;
2090 
2091   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2092      vectorization, since we do not want to add extra peeling or
2093      add versioning for alignment.  */
2094   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2095     /* This pass will decide on using loop versioning and/or loop peeling in
2096        order to enhance the alignment of data references in the loop.  */
2097     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2098   else
2099     ok = vect_verify_datarefs_alignment (loop_vinfo);
2100   if (!ok)
2101     return ok;
2102 
2103   if (slp)
2104     {
2105       /* Analyze operations in the SLP instances.  Note this may
2106 	 remove unsupported SLP instances which makes the above
2107 	 SLP kind detection invalid.  */
2108       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2109       vect_slp_analyze_operations (loop_vinfo);
2110       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2111 	{
2112 	  ok = opt_result::failure_at (vect_location,
2113 				       "unsupported SLP instances\n");
2114 	  goto again;
2115 	}
2116     }
2117 
2118   /* Dissolve SLP-only groups.  */
2119   vect_dissolve_slp_only_groups (loop_vinfo);
2120 
2121   /* Scan all the remaining operations in the loop that are not subject
2122      to SLP and make sure they are vectorizable.  */
2123   ok = vect_analyze_loop_operations (loop_vinfo);
2124   if (!ok)
2125     {
2126       if (dump_enabled_p ())
2127 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 			 "bad operation or unsupported loop bound.\n");
2129       return ok;
2130     }
2131 
2132   /* Decide whether to use a fully-masked loop for this vectorization
2133      factor.  */
2134   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2135     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2136        && vect_verify_full_masking (loop_vinfo));
2137   if (dump_enabled_p ())
2138     {
2139       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2140 	dump_printf_loc (MSG_NOTE, vect_location,
2141 			 "using a fully-masked loop.\n");
2142       else
2143 	dump_printf_loc (MSG_NOTE, vect_location,
2144 			 "not using a fully-masked loop.\n");
2145     }
2146 
2147   /* If an epilogue loop is required because of data accesses with gaps,
2148      one additional iteration needs to be peeled.  Check if there are
2149      enough iterations for vectorization.  */
2150   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2151       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2152       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2153     {
2154       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2155       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2156 
2157       if (known_lt (wi::to_widest (scalar_niters), vf))
2158 	return opt_result::failure_at (vect_location,
2159 				       "loop has no enough iterations to"
2160 				       " support peeling for gaps.\n");
2161     }
2162 
2163   /* If we're vectorizing an epilogue loop, we either need a fully-masked
2164      loop or a loop that has a lower VF than the main loop.  */
2165   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2166       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2167       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2168 		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2169     return opt_result::failure_at (vect_location,
2170 				   "Vectorization factor too high for"
2171 				   " epilogue loop.\n");
2172 
2173   /* Check the costings of the loop make vectorizing worthwhile.  */
2174   res = vect_analyze_loop_costing (loop_vinfo);
2175   if (res < 0)
2176     {
2177       ok = opt_result::failure_at (vect_location,
2178 				   "Loop costings may not be worthwhile.\n");
2179       goto again;
2180     }
2181   if (!res)
2182     return opt_result::failure_at (vect_location,
2183 				   "Loop costings not worthwhile.\n");
2184 
2185   determine_peel_for_niter (loop_vinfo);
2186   /* If an epilogue loop is required make sure we can create one.  */
2187   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2188       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2189     {
2190       if (dump_enabled_p ())
2191         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2192       if (!vect_can_advance_ivs_p (loop_vinfo)
2193 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2194 					   single_exit (LOOP_VINFO_LOOP
2195 							 (loop_vinfo))))
2196         {
2197 	  ok = opt_result::failure_at (vect_location,
2198 				       "not vectorized: can't create required "
2199 				       "epilog loop\n");
2200           goto again;
2201         }
2202     }
2203 
2204   /* During peeling, we need to check whether the number of loop
2205      iterations is enough for both the peeled prolog loop and the vector
2206      loop.  This check can be merged with the threshold check of loop
2207      versioning, so increase the threshold for this case if necessary.
2208 
2209      If we are analyzing an epilogue we still want to check what its
2210      versioning threshold would be.  If we decide to vectorize the epilogues we
2211      will want to use the lowest versioning threshold of all epilogues and main
2212      loop.  This will enable us to enter a vectorized epilogue even when
2213      versioning the loop.  We can't simply check whether the epilogue requires
2214      versioning though since we may have skipped some versioning checks when
2215      analyzing the epilogue.  For instance, checks for alias versioning will be
2216      skipped when dealing with epilogues as we assume we already checked them
2217      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
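  /* Illustrative numbers only: peeling 3 iterations for alignment, a
     vectorization factor of 8 and one extra iteration for gaps give
     niters_th = 3 + 8 + 1 = 12 below; if the cost-model threshold th is
     larger, say 16, niters_th can be raised to 16 instead.  */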
2218   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2219     {
2220       poly_uint64 niters_th = 0;
2221       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2222 
2223       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2224 	{
2225 	  /* Niters for peeled prolog loop.  */
2226 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2227 	    {
2228 	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2229 	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2230 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2231 	    }
2232 	  else
2233 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2234 	}
2235 
2236       /* Niters for at least one iteration of vectorized loop.  */
2237       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2238 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2239       /* One additional iteration because of peeling for gap.  */
2240       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2241 	niters_th += 1;
2242 
2243       /*  Use the same condition as vect_transform_loop to decide when to use
2244 	  the cost to determine a versioning threshold.  */
2245       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2246 	  && ordered_p (th, niters_th))
2247 	niters_th = ordered_max (poly_uint64 (th), niters_th);
2248 
2249       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2250     }
2251 
2252   gcc_assert (known_eq (vectorization_factor,
2253 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2254 
2255   /* Ok to vectorize!  */
2256   return opt_result::success ();
2257 
2258 again:
2259   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2260   gcc_assert (!ok);
2261 
2262   /* Try again with SLP forced off, but if we didn't do any SLP there is
2263      no point in re-trying.  */
2264   if (!slp)
2265     return ok;
2266 
2267   /* If there are reduction chains re-trying will fail anyway.  */
2268   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2269     return ok;
2270 
2271   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2272      via interleaving or lane instructions.  */
2273   slp_instance instance;
2274   slp_tree node;
2275   unsigned i, j;
2276   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2277     {
2278       stmt_vec_info vinfo;
2279       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2280       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2281 	continue;
2282       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2283       unsigned int size = DR_GROUP_SIZE (vinfo);
2284       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2285       if (! vect_store_lanes_supported (vectype, size, false)
2286 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2287 	 && ! vect_grouped_store_supported (vectype, size))
2288 	return opt_result::failure_at (vinfo->stmt,
2289 				       "unsupported grouped store\n");
2290       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2291 	{
2292 	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2293 	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2294 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2295 	  size = DR_GROUP_SIZE (vinfo);
2296 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2297 	  if (! vect_load_lanes_supported (vectype, size, false)
2298 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2299 						size))
2300 	    return opt_result::failure_at (vinfo->stmt,
2301 					   "unsupported grouped load\n");
2302 	}
2303     }
2304 
2305   if (dump_enabled_p ())
2306     dump_printf_loc (MSG_NOTE, vect_location,
2307 		     "re-trying with SLP disabled\n");
2308 
2309   /* Roll back state appropriately.  No SLP this time.  */
2310   slp = false;
2311   /* Restore the vectorization factor as it was without SLP.  */
2312   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2313   /* Free the SLP instances.  */
2314   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2315     vect_free_slp_instance (instance, false);
2316   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2317   /* Reset SLP type to loop_vect on all stmts.  */
2318   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2319     {
2320       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2321       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2322 	   !gsi_end_p (si); gsi_next (&si))
2323 	{
2324 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2325 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2326 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2327 	      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2328 	    {
2329 	      /* vectorizable_reduction adjusts reduction stmt def-types;
2330 		 restore them to that of the PHI.  */
2331 	      STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2332 		= STMT_VINFO_DEF_TYPE (stmt_info);
2333 	      STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2334 					(STMT_VINFO_REDUC_DEF (stmt_info)))
2335 		= STMT_VINFO_DEF_TYPE (stmt_info);
2336 	    }
2337 	}
2338       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2339 	   !gsi_end_p (si); gsi_next (&si))
2340 	{
2341 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2342 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2343 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2344 	    {
2345 	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2346 	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2347 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2348 	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2349 		   !gsi_end_p (pi); gsi_next (&pi))
2350 		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2351 		  = loop_vect;
2352 	    }
2353 	}
2354     }
2355   /* Free optimized alias test DDRS.  */
2356   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2357   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2358   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2359   /* Reset target cost data.  */
2360   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2361   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2362     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2363   /* Reset accumulated rgroup information.  */
2364   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2365   /* Reset assorted flags.  */
2366   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2367   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2368   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2369   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2370   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2371 
2372   goto start_over;
2373 }
2374 
2375 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2376    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2377    OLD_LOOP_VINFO is better unless something specifically indicates
2378    otherwise.
2379 
2380    Note that this deliberately isn't a partial order.  */
2381 
2382 static bool
2383 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2384 			  loop_vec_info old_loop_vinfo)
2385 {
2386   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2387   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2388 
2389   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2390   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2391 
2392   /* Always prefer a VF of loop->simdlen over any other VF.  */
2393   if (loop->simdlen)
2394     {
2395       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2396       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2397       if (new_simdlen_p != old_simdlen_p)
2398 	return new_simdlen_p;
2399     }
2400 
2401   /* Limit the VFs to what is likely to be the maximum number of iterations,
2402      to handle cases in which at least one loop_vinfo is fully-masked.  */
2403   HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2404   if (estimated_max_niter != -1)
2405     {
2406       if (known_le (estimated_max_niter, new_vf))
2407 	new_vf = estimated_max_niter;
2408       if (known_le (estimated_max_niter, old_vf))
2409 	old_vf = estimated_max_niter;
2410     }
2411 
2412   /* Check whether the (fractional) cost per scalar iteration is lower
2413      or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
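  /* E.g. (illustrative numbers only) an inside cost of 10 at a VF of 8
     versus a cost of 6 at a VF of 4 compares 10 * 4 = 40 against 6 * 8 = 48,
     so the VF-8 candidate has the lower cost per scalar iteration.  */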
2414   poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2415 			     * poly_widest_int (old_vf));
2416   poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2417 			     * poly_widest_int (new_vf));
2418   if (maybe_lt (rel_old, rel_new))
2419     {
2420       /* When old_loop_vinfo uses a variable vectorization factor,
2421 	 we know that it has a lower cost for at least one runtime VF.
2422 	 However, we don't know how likely that VF is.
2423 
2424 	 One option would be to compare the costs for the estimated VFs.
2425 	 The problem is that that can put too much pressure on the cost
2426 	 model.  E.g. if the estimated VF is also the lowest possible VF,
2427 	 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2428 	 for the estimated VF, we'd then choose new_loop_vinfo even
2429 	 though (a) new_loop_vinfo might not actually be better than
2430 	 old_loop_vinfo for that VF and (b) it would be significantly
2431 	 worse at larger VFs.
2432 
2433 	 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2434 	 no more expensive than old_loop_vinfo even after doubling the
2435 	 estimated old_loop_vinfo VF.  For all but trivial loops, this
2436 	 ensures that we only pick new_loop_vinfo if it is significantly
2437 	 better than old_loop_vinfo at the estimated VF.  */
2438       if (rel_new.is_constant ())
2439 	return false;
2440 
2441       HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2442       HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2443       widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2444 				      * widest_int (old_estimated_vf));
2445       widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2446 				      * widest_int (new_estimated_vf));
2447       return estimated_rel_new * 2 <= estimated_rel_old;
2448     }
2449   if (known_lt (rel_new, rel_old))
2450     return true;
2451 
2452   /* If there's nothing to choose between the loop bodies, see whether
2453      there's a difference in the prologue and epilogue costs.  */
2454   if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2455     return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2456 
2457   return false;
2458 }
2459 
2460 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2461    true if we should.  */
2462 
2463 static bool
2464 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2465 			loop_vec_info old_loop_vinfo)
2466 {
2467   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2468     return false;
2469 
2470   if (dump_enabled_p ())
2471     dump_printf_loc (MSG_NOTE, vect_location,
2472 		     "***** Preferring vector mode %s to vector mode %s\n",
2473 		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
2474 		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
2475   return true;
2476 }
2477 
2478 /* Function vect_analyze_loop.
2479 
2480    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2481    for it.  The different analyses will record information in the
2482    loop_vec_info struct.  */
2483 opt_loop_vec_info
2484 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2485 {
2486   auto_vector_modes vector_modes;
2487 
2488   /* Autodetect first vector size we try.  */
2489   unsigned int autovec_flags
2490     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2491 						    loop->simdlen != 0);
2492   unsigned int mode_i = 0;
2493 
2494   DUMP_VECT_SCOPE ("analyze_loop_nest");
2495 
2496   if (loop_outer (loop)
2497       && loop_vec_info_for_loop (loop_outer (loop))
2498       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2499     return opt_loop_vec_info::failure_at (vect_location,
2500 					  "outer-loop already vectorized.\n");
2501 
2502   if (!find_loop_nest (loop, &shared->loop_nest))
2503     return opt_loop_vec_info::failure_at
2504       (vect_location,
2505        "not vectorized: loop nest containing two or more consecutive inner"
2506        " loops cannot be vectorized\n");
2507 
2508   unsigned n_stmts = 0;
2509   machine_mode autodetected_vector_mode = VOIDmode;
2510   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2511   machine_mode next_vector_mode = VOIDmode;
2512   poly_uint64 lowest_th = 0;
2513   unsigned vectorized_loops = 0;
2514   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2515 			     && !unlimited_cost_model (loop));
2516 
2517   bool vect_epilogues = false;
2518   opt_result res = opt_result::success ();
2519   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2520   while (1)
2521     {
2522       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2523       opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2524       if (!loop_vinfo)
2525 	{
2526 	  if (dump_enabled_p ())
2527 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 			     "bad loop form.\n");
2529 	  gcc_checking_assert (first_loop_vinfo == NULL);
2530 	  return loop_vinfo;
2531 	}
2532       loop_vinfo->vector_mode = next_vector_mode;
2533 
2534       bool fatal = false;
2535 
2536       /* When pick_lowest_cost_p is true, we should in principle iterate
2537 	 over all the loop_vec_infos that LOOP_VINFO could replace and
2538 	 try to vectorize LOOP_VINFO under the same conditions.
2539 	 E.g. when trying to replace an epilogue loop, we should vectorize
2540 	 LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
2541 	 to replace the main loop, we should vectorize LOOP_VINFO as a main
2542 	 loop too.
2543 
2544 	 However, autovectorize_vector_modes is usually sorted as follows:
2545 
2546 	 - Modes that naturally produce lower VFs usually follow modes that
2547 	   naturally produce higher VFs.
2548 
2549 	 - When modes naturally produce the same VF, maskable modes
2550 	   usually follow unmaskable ones, so that the maskable mode
2551 	   can be used to vectorize the epilogue of the unmaskable mode.
2552 
2553 	 This order is preferred because it leads to the maximum
2554 	 epilogue vectorization opportunities.  Targets should only use
2555 	 a different order if they want to make wide modes available while
2556 	 disparaging them relative to earlier, smaller modes.  The assumption
2557 	 in that case is that the wider modes are more expensive in some
2558 	 way that isn't reflected directly in the costs.
2559 
2560 	 There should therefore be few interesting cases in which
2561 	 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2562 	 treated as a standalone loop, and ends up being genuinely cheaper
2563 	 than FIRST_LOOP_VINFO.  */
2564       if (vect_epilogues)
2565 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2566 
2567       res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2568       if (mode_i == 0)
2569 	autodetected_vector_mode = loop_vinfo->vector_mode;
2570       if (dump_enabled_p ())
2571 	{
2572 	  if (res)
2573 	    dump_printf_loc (MSG_NOTE, vect_location,
2574 			     "***** Analysis succeeded with vector mode %s\n",
2575 			     GET_MODE_NAME (loop_vinfo->vector_mode));
2576 	  else
2577 	    dump_printf_loc (MSG_NOTE, vect_location,
2578 			     "***** Analysis failed with vector mode %s\n",
2579 			     GET_MODE_NAME (loop_vinfo->vector_mode));
2580 	}
2581 
2582       loop->aux = NULL;
2583 
2584       if (!fatal)
2585 	while (mode_i < vector_modes.length ()
2586 	       && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2587 	  {
2588 	    if (dump_enabled_p ())
2589 	      dump_printf_loc (MSG_NOTE, vect_location,
2590 			       "***** The result for vector mode %s would"
2591 			       " be the same\n",
2592 			       GET_MODE_NAME (vector_modes[mode_i]));
2593 	    mode_i += 1;
2594 	  }
2595 
2596       if (res)
2597 	{
2598 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2599 	  vectorized_loops++;
2600 
2601 	  /* Once we hit the desired simdlen for the first time,
2602 	     discard any previous attempts.  */
2603 	  if (simdlen
2604 	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2605 	    {
2606 	      delete first_loop_vinfo;
2607 	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
2608 	      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2609 	      simdlen = 0;
2610 	    }
2611 	  else if (pick_lowest_cost_p && first_loop_vinfo)
2612 	    {
2613 	      /* Keep trying to roll back vectorization attempts while the
2614 		 loop_vec_infos they produced were worse than this one.  */
2615 	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2616 	      while (!vinfos.is_empty ()
2617 		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2618 		{
2619 		  gcc_assert (vect_epilogues);
2620 		  delete vinfos.pop ();
2621 		}
2622 	      if (vinfos.is_empty ()
2623 		  && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2624 		{
2625 		  delete first_loop_vinfo;
2626 		  first_loop_vinfo = opt_loop_vec_info::success (NULL);
2627 		  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2628 		}
2629 	    }
2630 
2631 	  if (first_loop_vinfo == NULL)
2632 	    {
2633 	      first_loop_vinfo = loop_vinfo;
2634 	      lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2635 	    }
2636 	  else if (vect_epilogues
2637 		   /* For now only allow one epilogue loop.  */
2638 		   && first_loop_vinfo->epilogue_vinfos.is_empty ())
2639 	    {
2640 	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2641 	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2642 	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2643 			  || maybe_ne (lowest_th, 0U));
2644 	      /* Keep track of the known smallest versioning
2645 		 threshold.  */
2646 	      if (ordered_p (lowest_th, th))
2647 		lowest_th = ordered_min (lowest_th, th);
2648 	    }
2649 	  else
2650 	    delete loop_vinfo;
2651 
2652 	  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2653 	     enabled, SIMDUID is not set, it is the innermost loop and we have
2654 	     either already found the loop's SIMDLEN or there was no SIMDLEN to
2655 	     begin with.
2656 	     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
2657 	  vect_epilogues = (!simdlen
2658 			    && loop->inner == NULL
2659 			    && param_vect_epilogues_nomask
2660 			    && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2661 			    && !loop->simduid
2662 			    /* For now only allow one epilogue loop, but allow
2663 			       pick_lowest_cost_p to replace it.  */
2664 			    && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2665 				|| pick_lowest_cost_p));
2666 
2667 	  /* Commit to first_loop_vinfo if we have no reason to try
2668 	     alternatives.  */
2669 	  if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2670 	    break;
2671 	}
2672       else
2673 	{
2674 	  delete loop_vinfo;
2675 	  if (fatal)
2676 	    {
2677 	      gcc_checking_assert (first_loop_vinfo == NULL);
2678 	      break;
2679 	    }
2680 	}
2681 
2682       if (mode_i < vector_modes.length ()
2683 	  && VECTOR_MODE_P (autodetected_vector_mode)
2684 	  && (related_vector_mode (vector_modes[mode_i],
2685 				   GET_MODE_INNER (autodetected_vector_mode))
2686 	      == autodetected_vector_mode)
2687 	  && (related_vector_mode (autodetected_vector_mode,
2688 				   GET_MODE_INNER (vector_modes[mode_i]))
2689 	      == vector_modes[mode_i]))
2690 	{
2691 	  if (dump_enabled_p ())
2692 	    dump_printf_loc (MSG_NOTE, vect_location,
2693 			     "***** Skipping vector mode %s, which would"
2694 			     " repeat the analysis for %s\n",
2695 			     GET_MODE_NAME (vector_modes[mode_i]),
2696 			     GET_MODE_NAME (autodetected_vector_mode));
2697 	  mode_i += 1;
2698 	}
2699 
2700       if (mode_i == vector_modes.length ()
2701 	  || autodetected_vector_mode == VOIDmode)
2702 	break;
2703 
2704       /* Try the next biggest vector size.  */
2705       next_vector_mode = vector_modes[mode_i++];
2706       if (dump_enabled_p ())
2707 	dump_printf_loc (MSG_NOTE, vect_location,
2708 			 "***** Re-trying analysis with vector mode %s\n",
2709 			 GET_MODE_NAME (next_vector_mode));
2710     }
2711 
2712   if (first_loop_vinfo)
2713     {
2714       loop->aux = (loop_vec_info) first_loop_vinfo;
2715       if (dump_enabled_p ())
2716 	dump_printf_loc (MSG_NOTE, vect_location,
2717 			 "***** Choosing vector mode %s\n",
2718 			 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2719       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2720       return first_loop_vinfo;
2721     }
2722 
2723   return opt_loop_vec_info::propagate_failure (res);
2724 }
2725 
2726 /* Return true if there is an in-order reduction function for CODE, storing
2727    it in *REDUC_FN if so.  */
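/* An in-order (fold-left) reduction accumulates the elements strictly in
   scalar order, e.g. for a 4-element vector and PLUS_EXPR (illustrative
   pseudo-source):

       res = (((init + v[0]) + v[1]) + v[2]) + v[3];

   which preserves the scalar semantics of non-associative operations such
   as floating-point addition.  */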
2728 
2729 static bool
2730 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2731 {
2732   switch (code)
2733     {
2734     case PLUS_EXPR:
2735       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2736       return true;
2737 
2738     default:
2739       return false;
2740     }
2741 }
2742 
2743 /* Function reduction_fn_for_scalar_code
2744 
2745    Input:
2746    CODE - tree_code of the reduction operation.
2747 
2748    Output:
2749    REDUC_FN - the corresponding internal function to be used to reduce the
2750       vector of partial results into a single scalar result, or IFN_LAST
2751       if the operation is a supported reduction operation, but does not have
2752       such an internal function.
2753 
2754    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
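/* For example, for PLUS_EXPR the reduction epilogue can use IFN_REDUC_PLUS
   to sum the vector of partial results in one operation, whereas for
   MULT_EXPR *REDUC_FN is IFN_LAST and the epilogue has to be open-coded
   (e.g. with vector shifts and element extracts).  */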
2755 
2756 static bool
2757 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2758 {
2759   switch (code)
2760     {
2761       case MAX_EXPR:
2762         *reduc_fn = IFN_REDUC_MAX;
2763         return true;
2764 
2765       case MIN_EXPR:
2766         *reduc_fn = IFN_REDUC_MIN;
2767         return true;
2768 
2769       case PLUS_EXPR:
2770         *reduc_fn = IFN_REDUC_PLUS;
2771         return true;
2772 
2773       case BIT_AND_EXPR:
2774 	*reduc_fn = IFN_REDUC_AND;
2775 	return true;
2776 
2777       case BIT_IOR_EXPR:
2778 	*reduc_fn = IFN_REDUC_IOR;
2779 	return true;
2780 
2781       case BIT_XOR_EXPR:
2782 	*reduc_fn = IFN_REDUC_XOR;
2783 	return true;
2784 
2785       case MULT_EXPR:
2786       case MINUS_EXPR:
2787         *reduc_fn = IFN_LAST;
2788         return true;
2789 
2790       default:
2791        return false;
2792     }
2793 }
2794 
2795 /* If there is a neutral value X such that SLP reduction NODE would not
2796    be affected by the introduction of additional X elements, return that X,
2797    otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
2798    is the vector type that would hold element X.  REDUC_CHAIN is true if
2799    the SLP statements perform a single reduction, false if each statement
2800    performs an independent reduction.  */
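/* For instance, 0 is neutral for a sum reduction: padding the vector of
   partial sums {s0, s1, s2} with an extra 0 element leaves the final result
   s0 + s1 + s2 unchanged.  Likewise 1 is neutral for MULT_EXPR and an
   all-ones value for BIT_AND_EXPR.  */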
2801 
2802 static tree
2803 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2804 			      tree_code code, bool reduc_chain)
2805 {
2806   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2807   stmt_vec_info stmt_vinfo = stmts[0];
2808   tree scalar_type = TREE_TYPE (vector_type);
2809   class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2810   gcc_assert (loop);
2811 
2812   switch (code)
2813     {
2814     case WIDEN_SUM_EXPR:
2815     case DOT_PROD_EXPR:
2816     case SAD_EXPR:
2817     case PLUS_EXPR:
2818     case MINUS_EXPR:
2819     case BIT_IOR_EXPR:
2820     case BIT_XOR_EXPR:
2821       return build_zero_cst (scalar_type);
2822 
2823     case MULT_EXPR:
2824       return build_one_cst (scalar_type);
2825 
2826     case BIT_AND_EXPR:
2827       return build_all_ones_cst (scalar_type);
2828 
2829     case MAX_EXPR:
2830     case MIN_EXPR:
2831       /* For MIN/MAX the initial values are neutral.  A reduction chain
2832 	 has only a single initial value, so that value is neutral for
2833 	 all statements.  */
2834       if (reduc_chain)
2835 	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2836 				      loop_preheader_edge (loop));
2837       return NULL_TREE;
2838 
2839     default:
2840       return NULL_TREE;
2841     }
2842 }
2843 
2844 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2845    STMT is printed with a message MSG. */
2846 
2847 static void
2848 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2849 {
2850   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2851 }
2852 
2853 /* Return true if we need an in-order reduction for operation CODE
2854    on type TYPE.  */
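/* For example, with doubles (1.0 + 1e16) - 1e16 evaluates to 0.0 while
   1.0 + (1e16 - 1e16) evaluates to 1.0, so a floating-point sum must not be
   reassociated unless -fassociative-math is in effect.  */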
2856 
2857 bool
2858 needs_fold_left_reduction_p (tree type, tree_code code)
2859 {
2860   /* CHECKME: check for !flag_finite_math_only too?  */
2861   if (SCALAR_FLOAT_TYPE_P (type))
2862     switch (code)
2863       {
2864       case MIN_EXPR:
2865       case MAX_EXPR:
2866 	return false;
2867 
2868       default:
2869 	return !flag_associative_math;
2870       }
2871 
2872   if (INTEGRAL_TYPE_P (type))
2873     {
2874       if (!operation_no_trapping_overflow (type, code))
2875 	return true;
2876       return false;
2877     }
2878 
2879   if (SAT_FIXED_POINT_TYPE_P (type))
2880     return true;
2881 
2882   return false;
2883 }
2884 
2885 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2886    has a handled computation expression.  Store the main reduction
2887    operation in *CODE.  */
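/* A typical handled path (illustrative GIMPLE, not from a real dump):

       sum_1 = PHI <sum_4(latch), 0.0(preheader)>;
       ...
       sum_3 = sum_1 + a[i];
       sum_4 = sum_3 + b[i];

   Walking from the latch argument sum_4 back through its definitions
   reaches the PHI result sum_1 through PLUS_EXPR statements only, so
   *CODE becomes PLUS_EXPR.  */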
2888 
2889 static bool
2890 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2891 		      tree loop_arg, enum tree_code *code,
2892 		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2893 {
2894   auto_bitmap visited;
2895   tree lookfor = PHI_RESULT (phi);
2896   ssa_op_iter curri;
2897   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2898   while (USE_FROM_PTR (curr) != loop_arg)
2899     curr = op_iter_next_use (&curri);
2900   curri.i = curri.numops;
2901   do
2902     {
2903       path.safe_push (std::make_pair (curri, curr));
2904       tree use = USE_FROM_PTR (curr);
2905       if (use == lookfor)
2906 	break;
2907       gimple *def = SSA_NAME_DEF_STMT (use);
2908       if (gimple_nop_p (def)
2909 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2910 	{
2911 pop:
2912 	  do
2913 	    {
2914 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2915 	      curri = x.first;
2916 	      curr = x.second;
2917 	      do
2918 		curr = op_iter_next_use (&curri);
2919 	      /* Skip already visited or non-SSA operands (from iterating
2920 	         over PHI args).  */
2921 	      while (curr != NULL_USE_OPERAND_P
2922 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2923 			 || ! bitmap_set_bit (visited,
2924 					      SSA_NAME_VERSION
2925 					        (USE_FROM_PTR (curr)))));
2926 	    }
2927 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2928 	  if (curr == NULL_USE_OPERAND_P)
2929 	    break;
2930 	}
2931       else
2932 	{
2933 	  if (gimple_code (def) == GIMPLE_PHI)
2934 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2935 	  else
2936 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2937 	  while (curr != NULL_USE_OPERAND_P
2938 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2939 		     || ! bitmap_set_bit (visited,
2940 					  SSA_NAME_VERSION
2941 					    (USE_FROM_PTR (curr)))))
2942 	    curr = op_iter_next_use (&curri);
2943 	  if (curr == NULL_USE_OPERAND_P)
2944 	    goto pop;
2945 	}
2946     }
2947   while (1);
2948   if (dump_file && (dump_flags & TDF_DETAILS))
2949     {
2950       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2951       unsigned i;
2952       std::pair<ssa_op_iter, use_operand_p> *x;
2953       FOR_EACH_VEC_ELT (path, i, x)
2954 	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2955       dump_printf (MSG_NOTE, "\n");
2956     }
2957 
2958   /* Check whether the reduction path detected is valid.  */
2959   bool fail = path.length () == 0;
2960   bool neg = false;
2961   int sign = -1;
2962   *code = ERROR_MARK;
2963   for (unsigned i = 1; i < path.length (); ++i)
2964     {
2965       gimple *use_stmt = USE_STMT (path[i].second);
2966       tree op = USE_FROM_PTR (path[i].second);
2967       if (! is_gimple_assign (use_stmt)
2968 	  /* The following makes sure we can compute the operand index
2969 	     easily, and it mostly disallows chaining via COND_EXPR
2970 	     condition operands.  */
2971 	  || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2972 	      && (gimple_num_ops (use_stmt) <= 2
2973 		  || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2974 	      && (gimple_num_ops (use_stmt) <= 3
2975 		  || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2976 	{
2977 	  fail = true;
2978 	  break;
2979 	}
2980       /* Check that the op is used in only a single stmt inside
2981          the loop.  */
2982       imm_use_iterator imm_iter;
2983       gimple *op_use_stmt;
2984       unsigned cnt = 0;
2985       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2986 	if (!is_gimple_debug (op_use_stmt)
2987 	    && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2988 	  {
2989 	    /* We want to allow x + x but not x < 1 ? x : 2.  */
2990 	    if (is_gimple_assign (op_use_stmt)
2991 		&& gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2992 	      {
2993 		use_operand_p use_p;
2994 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2995 		  cnt++;
2996 	      }
2997 	    else
2998 	      cnt++;
2999 	  }
3000       if (cnt != 1)
3001 	{
3002 	  fail = true;
3003 	  break;
3004 	}
3005       tree_code use_code = gimple_assign_rhs_code (use_stmt);
3006       if (use_code == MINUS_EXPR)
3007 	{
3008 	  use_code = PLUS_EXPR;
3009 	  /* Track whether we negate the reduction value each iteration.  */
3010 	  if (gimple_assign_rhs2 (use_stmt) == op)
3011 	    neg = ! neg;
3012 	}
3013       if (CONVERT_EXPR_CODE_P (use_code)
3014 	  && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3015 				    TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3016 	;
3017       else if (*code == ERROR_MARK)
3018 	{
3019 	  *code = use_code;
3020 	  sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3021 	}
3022       else if (use_code != *code)
3023 	{
3024 	  fail = true;
3025 	  break;
3026 	}
3027       else if ((use_code == MIN_EXPR
3028 		|| use_code == MAX_EXPR)
3029 	       && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3030 	{
3031 	  fail = true;
3032 	  break;
3033 	}
3034     }
3035   return ! fail && ! neg && *code != ERROR_MARK;
3036 }
3037 
3038 bool
3039 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3040 		      tree loop_arg, enum tree_code code)
3041 {
3042   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3043   enum tree_code code_;
3044   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3045 	  && code_ == code);
3046 }
3047 
3048 
3049 
3050 /* Function vect_is_simple_reduction
3051 
3052    (1) Detect a cross-iteration def-use cycle that represents a simple
3053    reduction computation.  We look for the following pattern:
3054 
3055    loop_header:
3056      a1 = phi < a0, a2 >
3057      a3 = ...
3058      a2 = operation (a3, a1)
3059 
3060    or
3061 
3062    a3 = ...
3063    loop_header:
3064      a1 = phi < a0, a2 >
3065      a2 = operation (a3, a1)
3066 
3067    such that:
3068    1. operation is commutative and associative and it is safe to
3069       change the order of the computation
3070    2. no uses for a2 in the loop (a2 is used out of the loop)
3071    3. no uses of a1 in the loop besides the reduction operation
3072    4. no uses of a1 outside the loop.
3073 
3074    Conditions 1,4 are tested here.
3075    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3076 
3077    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3078    nested cycles.
3079 
3080    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3081    reductions:
3082 
3083      a1 = phi < a0, a2 >
3084      inner loop (def of a3)
3085      a2 = phi < a3 >
3086 
3087    (4) Detect condition expressions, i.e.:
3088      for (int i = 0; i < N; i++)
3089        if (a[i] < val)
3090 	ret_val = a[i];
3091 
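   For example (illustrative), case (3) corresponds to a loop nest such as

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
	 sum += a[i][j];

   where the outer-loop PHI of 'sum' together with the inner-loop cycle
   forms a double reduction when the outer loop is vectorized.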
3092 */
3093 
3094 static stmt_vec_info
3095 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3096 			  bool *double_reduc, bool *reduc_chain_p)
3097 {
3098   gphi *phi = as_a <gphi *> (phi_info->stmt);
3099   gimple *phi_use_stmt = NULL;
3100   imm_use_iterator imm_iter;
3101   use_operand_p use_p;
3102 
3103   *double_reduc = false;
3104   *reduc_chain_p = false;
3105   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3106 
3107   tree phi_name = PHI_RESULT (phi);
3108   /* ???  If there are no uses of the PHI result the inner loop reduction
3109      won't be detected as possibly double-reduction by vectorizable_reduction
3110      because that tries to walk the PHI arg from the preheader edge which
3111      can be constant.  See PR60382.  */
3112   if (has_zero_uses (phi_name))
3113     return NULL;
3114   class loop *loop = (gimple_bb (phi))->loop_father;
3115   unsigned nphi_def_loop_uses = 0;
3116   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3117     {
3118       gimple *use_stmt = USE_STMT (use_p);
3119       if (is_gimple_debug (use_stmt))
3120 	continue;
3121 
3122       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3123         {
3124           if (dump_enabled_p ())
3125 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3126 			     "intermediate value used outside loop.\n");
3127 
3128           return NULL;
3129         }
3130 
3131       nphi_def_loop_uses++;
3132       phi_use_stmt = use_stmt;
3133     }
3134 
3135   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3136   if (TREE_CODE (latch_def) != SSA_NAME)
3137     {
3138       if (dump_enabled_p ())
3139 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140 			 "reduction: not ssa_name: %T\n", latch_def);
3141       return NULL;
3142     }
3143 
3144   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3145   if (!def_stmt_info
3146       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3147     return NULL;
3148 
3149   bool nested_in_vect_loop
3150     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3151   unsigned nlatch_def_loop_uses = 0;
3152   auto_vec<gphi *, 3> lcphis;
3153   bool inner_loop_of_double_reduc = false;
3154   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3155     {
3156       gimple *use_stmt = USE_STMT (use_p);
3157       if (is_gimple_debug (use_stmt))
3158 	continue;
3159       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3160 	nlatch_def_loop_uses++;
3161       else
3162 	{
3163 	  /* We can have more than one loop-closed PHI.  */
3164 	  lcphis.safe_push (as_a <gphi *> (use_stmt));
3165 	  if (nested_in_vect_loop
3166 	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3167 		  == vect_double_reduction_def))
3168 	    inner_loop_of_double_reduc = true;
3169 	}
3170     }
3171 
3172   /* If we are vectorizing an inner reduction, we execute it in the
3173      original order only when we are not dealing with a double
3174      reduction.  */
3175   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3176     {
3177       if (dump_enabled_p ())
3178 	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3179 			"detected nested cycle: ");
3180       return def_stmt_info;
3181     }
3182 
3183   /* If this isn't a nested cycle or if the nested cycle reduction value
3184      is used outside of the inner loop, we cannot handle uses of the reduction
3185      value.  */
3186   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3187     {
3188       if (dump_enabled_p ())
3189 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3190 			 "reduction used in loop.\n");
3191       return NULL;
3192     }
3193 
3194   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3195      defined in the inner loop.  */
3196   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3197     {
3198       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3199       if (gimple_phi_num_args (def_stmt) != 1
3200           || TREE_CODE (op1) != SSA_NAME)
3201         {
3202           if (dump_enabled_p ())
3203 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3204 			     "unsupported phi node definition.\n");
3205 
3206           return NULL;
3207         }
3208 
3209       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3210       if (gimple_bb (def1)
3211 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3212           && loop->inner
3213           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3214           && is_gimple_assign (def1)
3215 	  && is_a <gphi *> (phi_use_stmt)
3216 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3217         {
3218           if (dump_enabled_p ())
3219             report_vect_op (MSG_NOTE, def_stmt,
3220 			    "detected double reduction: ");
3221 
3222           *double_reduc = true;
3223 	  return def_stmt_info;
3224         }
3225 
3226       return NULL;
3227     }
3228 
3229   /* Look for the expression computing latch_def from the loop PHI result.  */
3230   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3231   enum tree_code code;
3232   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3233 			    path))
3234     {
3235       STMT_VINFO_REDUC_CODE (phi_info) = code;
3236       if (code == COND_EXPR && !nested_in_vect_loop)
3237 	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3238 
3239       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3240 	 reduction chain for which the additional restriction is that
3241 	 all operations in the chain are the same.  */
3242       auto_vec<stmt_vec_info, 8> reduc_chain;
3243       unsigned i;
3244       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3245       for (i = path.length () - 1; i >= 1; --i)
3246 	{
3247 	  gimple *stmt = USE_STMT (path[i].second);
3248 	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3249 	  STMT_VINFO_REDUC_IDX (stmt_info)
3250 	    = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3251 	  enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3252 	  bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3253 				     && (i == 1 || i == path.length () - 1));
3254 	  if ((stmt_code != code && !leading_conversion)
3255 	      /* We can only handle the final value in epilogue
3256 		 generation for reduction chains.  */
3257 	      || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3258 	    is_slp_reduc = false;
3259 	  /* For reduction chains we support trailing/leading
3260 	     conversions.  We do not store those in the actual chain.  */
3261 	  if (leading_conversion)
3262 	    continue;
3263 	  reduc_chain.safe_push (stmt_info);
3264 	}
3265       if (is_slp_reduc && reduc_chain.length () > 1)
3266 	{
3267 	  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3268 	    {
3269 	      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3270 	      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3271 	    }
3272 	  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3273 	  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3274 
3275 	  /* Save the chain for further analysis in SLP detection.  */
3276 	  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3277 	  REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3278 
3279 	  *reduc_chain_p = true;
3280 	  if (dump_enabled_p ())
3281 	    dump_printf_loc (MSG_NOTE, vect_location,
3282 			    "reduction: detected reduction chain\n");
3283 	}
3284       else if (dump_enabled_p ())
3285 	dump_printf_loc (MSG_NOTE, vect_location,
3286 			 "reduction: detected reduction\n");
3287 
3288       return def_stmt_info;
3289     }
3290 
3291   if (dump_enabled_p ())
3292     dump_printf_loc (MSG_NOTE, vect_location,
3293 		     "reduction: unknown pattern\n");
3294 
3295   return NULL;
3296 }
3297 
3298 /* Calculate the cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
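/* For example (illustrative): with a known trip count of 100, an assumed
   vectorization factor of 8 and PEEL_ITERS_PROLOGUE == 3, the epilogue
   peels (100 - 3) % 8 == 1 iteration, and the scalar iteration cost is
   charged three times to the prologue and once to the epilogue.  */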
3299 int
3300 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3301                              int *peel_iters_epilogue,
3302                              stmt_vector_for_cost *scalar_cost_vec,
3303 			     stmt_vector_for_cost *prologue_cost_vec,
3304 			     stmt_vector_for_cost *epilogue_cost_vec)
3305 {
3306   int retval = 0;
3307   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3308 
3309   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3310     {
3311       *peel_iters_epilogue = assumed_vf / 2;
3312       if (dump_enabled_p ())
3313         dump_printf_loc (MSG_NOTE, vect_location,
3314 			 "cost model: epilogue peel iters set to vf/2 "
3315 			 "because loop iterations are unknown.\n");
3316 
3317       /* If peeled iterations are known but the number of scalar loop
3318          iterations is unknown, count a taken branch per peeled loop.  */
3319       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3320 				 NULL, 0, vect_prologue);
3321       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3322 				  NULL, 0, vect_epilogue);
3323     }
3324   else
3325     {
3326       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3327       peel_iters_prologue = niters < peel_iters_prologue ?
3328                             niters : peel_iters_prologue;
3329       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3330       /* If we need to peel for gaps, but no peeling is required, we have to
3331 	 peel VF iterations.  */
3332       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3333 	*peel_iters_epilogue = assumed_vf;
3334     }
3335 
3336   stmt_info_for_cost *si;
3337   int j;
3338   if (peel_iters_prologue)
3339     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3340       retval += record_stmt_cost (prologue_cost_vec,
3341 				  si->count * peel_iters_prologue,
3342 				  si->kind, si->stmt_info, si->misalign,
3343 				  vect_prologue);
3344   if (*peel_iters_epilogue)
3345     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3346       retval += record_stmt_cost (epilogue_cost_vec,
3347 				  si->count * *peel_iters_epilogue,
3348 				  si->kind, si->stmt_info, si->misalign,
3349 				  vect_epilogue);
3350 
3351   return retval;
3352 }
3353 
3354 /* Function vect_estimate_min_profitable_iters
3355 
3356    Return the number of iterations required for the vector version of the
3357    loop to be profitable relative to the cost of the scalar version of the
3358    loop.
3359 
3360    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3361    of iterations for vectorization.  A value of -1 means loop
3362    vectorization is not profitable.  This returned value may be used
3363    for a dynamic profitability check.
3364 
3365    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3366    for static check against estimated number of iterations.  */
3367 
3368 static void
3369 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3370 				    int *ret_min_profitable_niters,
3371 				    int *ret_min_profitable_estimate)
3372 {
3373   int min_profitable_iters;
3374   int min_profitable_estimate;
3375   int peel_iters_prologue;
3376   int peel_iters_epilogue;
3377   unsigned vec_inside_cost = 0;
3378   int vec_outside_cost = 0;
3379   unsigned vec_prologue_cost = 0;
3380   unsigned vec_epilogue_cost = 0;
3381   int scalar_single_iter_cost = 0;
3382   int scalar_outside_cost = 0;
3383   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3384   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3385   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3386 
3387   /* Cost model disabled.  */
3388   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3389     {
3390       if (dump_enabled_p ())
3391 	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3392       *ret_min_profitable_niters = 0;
3393       *ret_min_profitable_estimate = 0;
3394       return;
3395     }
3396 
3397   /* Requires loop versioning tests to handle misalignment.  */
3398   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3399     {
3400       /*  FIXME: Make cost depend on complexity of individual check.  */
3401       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3402       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3403 			    vect_prologue);
3404       if (dump_enabled_p ())
3405 	dump_printf (MSG_NOTE,
3406 		     "cost model: Adding cost of checks for loop "
3407 		     "versioning to treat misalignment.\n");
3408     }
3409 
3410   /* Requires loop versioning with alias checks.  */
3411   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3412     {
3413       /*  FIXME: Make cost depend on complexity of individual check.  */
3414       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3415       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3416 			    vect_prologue);
3417       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3418       if (len)
3419 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3420 	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3421 			      NULL, 0, vect_prologue);
3422       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3423       if (len)
3424 	{
3425 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3426 	  unsigned int nstmts = len * 2 - 1;
3427 	  /* +1 for each bias that needs adding.  */
3428 	  for (unsigned int i = 0; i < len; ++i)
3429 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3430 	      nstmts += 1;
3431 	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3432 				NULL, 0, vect_prologue);
3433 	}
3434       if (dump_enabled_p ())
3435 	dump_printf (MSG_NOTE,
3436 		     "cost model: Adding cost of checks for loop "
3437 		     "versioning aliasing.\n");
3438     }
3439 
3440   /* Requires loop versioning with niter checks.  */
3441   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3442     {
3443       /*  FIXME: Make cost depend on complexity of individual check.  */
3444       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3445 			    vect_prologue);
3446       if (dump_enabled_p ())
3447 	dump_printf (MSG_NOTE,
3448 		     "cost model: Adding cost of checks for loop "
3449 		     "versioning niters.\n");
3450     }
3451 
3452   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3453     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3454 			  vect_prologue);
3455 
3456   /* Count statements in scalar loop.  Using this as scalar cost for a single
3457      iteration for now.
3458 
3459      TODO: Add outer loop support.
3460 
3461      TODO: Consider assigning different costs to different scalar
3462      statements.  */
3463 
3464   scalar_single_iter_cost
3465     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3466 
3467   /* Add additional cost for the peeled instructions in prologue and epilogue
3468      loop.  (For fully-masked loops there will be no peeling.)
3469 
3470      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3471      at compile time, we assume it's vf/2 (the worst case would be vf-1).
3472 
3473      TODO: Build an expression that represents peel_iters for prologue and
3474      epilogue to be used in a run-time test.  */
3475 
3476   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3477     {
3478       peel_iters_prologue = 0;
3479       peel_iters_epilogue = 0;
3480 
3481       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3482 	{
3483 	  /* We need to peel exactly one iteration.  */
3484 	  peel_iters_epilogue += 1;
3485 	  stmt_info_for_cost *si;
3486 	  int j;
3487 	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3488 			    j, si)
3489 	    (void) add_stmt_cost (target_cost_data, si->count,
3490 				  si->kind, si->stmt_info, si->misalign,
3491 				  vect_epilogue);
3492 	}
3493 
3494       /* Calculate how many masks we need to generate.  */
3495       unsigned int num_masks = 0;
3496       rgroup_masks *rgm;
3497       unsigned int num_vectors_m1;
3498       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3499 	if (rgm->mask_type)
3500 	  num_masks += num_vectors_m1 + 1;
3501       gcc_assert (num_masks > 0);
3502 
3503       /* In the worst case, we need to generate each mask in the prologue
3504 	 and in the loop body.  One of the loop body mask instructions
3505 	 replaces the comparison in the scalar loop, and since we don't
3506 	 count the scalar comparison against the scalar body, we shouldn't
3507 	 count that vector instruction against the vector body either.
3508 
3509 	 Sometimes we can use unpacks instead of generating prologue
3510 	 masks and sometimes the prologue mask will fold to a constant,
3511 	 so the actual prologue cost might be smaller.  However, it's
3512 	 simpler and safer to use the worst-case cost; if this ends up
3513 	 being the tie-breaker between vectorizing or not, then it's
3514 	 probably better not to vectorize.  */
3515       (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3516 			    NULL, 0, vect_prologue);
3517       (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3518 			    NULL, 0, vect_body);
3519     }
3520   else if (npeel < 0)
3521     {
3522       peel_iters_prologue = assumed_vf / 2;
3523       if (dump_enabled_p ())
3524 	dump_printf (MSG_NOTE, "cost model: "
3525 		     "prologue peel iters set to vf/2.\n");
3526 
3527       /* If peeling for alignment is unknown, the loop bound of the main
3528          loop becomes unknown.  */
3529       peel_iters_epilogue = assumed_vf / 2;
3530       if (dump_enabled_p ())
3531 	dump_printf (MSG_NOTE, "cost model: "
3532 		     "epilogue peel iters set to vf/2 because "
3533 		     "peeling for alignment is unknown.\n");
3534 
3535       /* If peeled iterations are unknown, count a taken branch and a not taken
3536          branch per peeled loop. Even if scalar loop iterations are known,
3537          vector iterations are not known since peeled prologue iterations are
3538          not known. Hence guards remain the same.  */
3539       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3540 			    NULL, 0, vect_prologue);
3541       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3542 			    NULL, 0, vect_prologue);
3543       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3544 			    NULL, 0, vect_epilogue);
3545       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3546 			    NULL, 0, vect_epilogue);
3547       stmt_info_for_cost *si;
3548       int j;
3549       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3550 	{
3551 	  (void) add_stmt_cost (target_cost_data,
3552 				si->count * peel_iters_prologue,
3553 				si->kind, si->stmt_info, si->misalign,
3554 				vect_prologue);
3555 	  (void) add_stmt_cost (target_cost_data,
3556 				si->count * peel_iters_epilogue,
3557 				si->kind, si->stmt_info, si->misalign,
3558 				vect_epilogue);
3559 	}
3560     }
3561   else
3562     {
3563       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3564       stmt_info_for_cost *si;
3565       int j;
3566       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3567 
3568       prologue_cost_vec.create (2);
3569       epilogue_cost_vec.create (2);
3570       peel_iters_prologue = npeel;
3571 
3572       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3573 					  &peel_iters_epilogue,
3574 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3575 					    (loop_vinfo),
3576 					  &prologue_cost_vec,
3577 					  &epilogue_cost_vec);
3578 
3579       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3580 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3581 			      si->misalign, vect_prologue);
3582 
3583       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3584 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3585 			      si->misalign, vect_epilogue);
3586 
3587       prologue_cost_vec.release ();
3588       epilogue_cost_vec.release ();
3589     }
3590 
3591   /* FORNOW: The scalar outside cost is incremented in one of the
3592      following ways:
3593 
3594      1. The vectorizer checks for alignment and aliasing and generates
3595      a condition that allows dynamic vectorization.  A cost model
3596      check is ANDED with the versioning condition.  Hence scalar code
3597      path now has the added cost of the versioning check.
3598 
3599        if (cost > th & versioning_check)
3600          jmp to vector code
3601 
3602      Hence the run-time scalar cost is incremented by the not-taken branch cost.
3603 
3604      2. The vectorizer then checks if a prologue is required.  If the
3605      cost model check was not done before during versioning, it has to
3606      be done before the prologue check.
3607 
3608        if (cost <= th)
3609          prologue = scalar_iters
3610        if (prologue == 0)
3611          jmp to vector code
3612        else
3613          execute prologue
3614        if (prologue == num_iters)
3615 	 go to exit
3616 
3617      Hence the run-time scalar cost is incremented by a taken branch,
3618      plus a not-taken branch, plus a taken branch cost.
3619 
3620      3. The vectorizer then checks if an epilogue is required.  If the
3621      cost model check was not done before during prologue check, it
3622      has to be done with the epilogue check.
3623 
3624        if (prologue == 0)
3625          jmp to vector code
3626        else
3627          execute prologue
3628        if (prologue == num_iters)
3629 	 go to exit
3630        vector code:
3631          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3632            jmp to epilogue
3633 
3634      Hence the run-time scalar cost should be incremented by 2 taken
3635      branches.
3636 
3637      TODO: The back end may reorder the BBs differently and reverse
3638      conditions/branch directions.  Change the estimates below to
3639      something more reasonable.  */
3640 
3641   /* If the number of iterations is known and we do not do versioning, we can
3642      decide whether to vectorize at compile time.  Hence the scalar version
3643      does not carry cost model guard costs.  */
3644   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3645       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3646     {
3647       /* Cost model check occurs at versioning.  */
3648       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3649 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3650       else
3651 	{
3652 	  /* Cost model check occurs at prologue generation.  */
3653 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3654 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3655 	      + vect_get_stmt_cost (cond_branch_not_taken);
3656 	  /* Cost model check occurs at epilogue generation.  */
3657 	  else
3658 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3659 	}
3660     }
3661 
3662   /* Complete the target-specific cost calculations.  */
3663   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3664 	       &vec_inside_cost, &vec_epilogue_cost);
3665 
3666   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3667 
3668   /* Stash the costs so that we can compare two loop_vec_infos.  */
3669   loop_vinfo->vec_inside_cost = vec_inside_cost;
3670   loop_vinfo->vec_outside_cost = vec_outside_cost;
3671 
3672   if (dump_enabled_p ())
3673     {
3674       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3675       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3676                    vec_inside_cost);
3677       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3678                    vec_prologue_cost);
3679       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3680                    vec_epilogue_cost);
3681       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3682                    scalar_single_iter_cost);
3683       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3684                    scalar_outside_cost);
3685       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3686                    vec_outside_cost);
3687       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3688                    peel_iters_prologue);
3689       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3690                    peel_iters_epilogue);
3691     }
3692 
3693   /* Calculate number of iterations required to make the vector version
3694      profitable, relative to the loop bodies only.  The following condition
3695      must hold true:
3696      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3697      where
3698      SIC = scalar iteration cost, VIC = vector iteration cost,
3699      VOC = vector outside cost, VF = vectorization factor,
3700      NPEEL = prologue iterations + epilogue iterations,
3701      SOC = scalar outside cost for run time cost model check.  */
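  /* For example (illustrative numbers): with SIC = 4, VIC = 10, VOC = 20,
     SOC = 0, VF = 8 and NPEEL = 0, the saving per vector iteration is
     4 * 8 - 10 = 22 and the computation below arrives at a runtime
     profitability threshold of 8 scalar iterations.  */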
3702 
3703   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3704 			  - vec_inside_cost);
3705   if (saving_per_viter <= 0)
3706     {
3707       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3708 	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3709 		    "vectorization did not happen for a simd loop");
3710 
3711       if (dump_enabled_p ())
3712         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3713 			 "cost model: the vector iteration cost = %d "
3714 			 "divided by the scalar iteration cost = %d "
3715 			 "is greater or equal to the vectorization factor = %d"
3716                          ".\n",
3717 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3718       *ret_min_profitable_niters = -1;
3719       *ret_min_profitable_estimate = -1;
3720       return;
3721     }
3722 
3723   /* ??? The "if" arm is written to handle all cases; see below for what
3724      we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
3725   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3726     {
3727       /* Rewriting the condition above in terms of the number of
3728 	 vector iterations (vniters) rather than the number of
3729 	 scalar iterations (niters) gives:
3730 
3731 	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3732 
3733 	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3734 
3735 	 For integer N, X and Y when X > 0:
3736 
3737 	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
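      /* E.g. (illustrative) with X = 22 and Y = 45: N * 22 > 45 holds
	 exactly for N >= 45 / 22 + 1 = 3.  */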
3738       int outside_overhead = (vec_outside_cost
3739 			      - scalar_single_iter_cost * peel_iters_prologue
3740 			      - scalar_single_iter_cost * peel_iters_epilogue
3741 			      - scalar_outside_cost);
3742       /* We're only interested in cases that require at least one
3743 	 vector iteration.  */
3744       int min_vec_niters = 1;
3745       if (outside_overhead > 0)
3746 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3747 
3748       if (dump_enabled_p ())
3749 	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
3750 		     min_vec_niters);
3751 
3752       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3753 	{
3754 	  /* Now that we know the minimum number of vector iterations,
3755 	     find the minimum niters for which the scalar cost is larger:
3756 
3757 	     SIC * niters > VIC * vniters + VOC - SOC
3758 
3759 	     We know that the minimum niters is no more than
3760 	     vniters * VF + NPEEL, but it might be (and often is) less
3761 	     than that if a partial vector iteration is cheaper than the
3762 	     equivalent scalar code.  */
3763 	  int threshold = (vec_inside_cost * min_vec_niters
3764 			   + vec_outside_cost
3765 			   - scalar_outside_cost);
3766 	  if (threshold <= 0)
3767 	    min_profitable_iters = 1;
3768 	  else
3769 	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3770 	}
3771       else
3772 	/* Convert the number of vector iterations into a number of
3773 	   scalar iterations.  */
3774 	min_profitable_iters = (min_vec_niters * assumed_vf
3775 				+ peel_iters_prologue
3776 				+ peel_iters_epilogue);
3777     }
3778   else
3779     {
3780       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3781 			      * assumed_vf
3782 			      - vec_inside_cost * peel_iters_prologue
3783 			      - vec_inside_cost * peel_iters_epilogue);
3784       if (min_profitable_iters <= 0)
3785         min_profitable_iters = 0;
3786       else
3787 	{
3788 	  min_profitable_iters /= saving_per_viter;
3789 
3790 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3791 	      <= (((int) vec_inside_cost * min_profitable_iters)
3792 		  + (((int) vec_outside_cost - scalar_outside_cost)
3793 		     * assumed_vf)))
3794 	    min_profitable_iters++;
3795 	}
3796     }
3797 
3798   if (dump_enabled_p ())
3799     dump_printf (MSG_NOTE,
3800 		 "  Calculated minimum iters for profitability: %d\n",
3801 		 min_profitable_iters);
3802 
3803   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3804       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3805     /* We want the vectorized loop to execute at least once.  */
3806     min_profitable_iters = assumed_vf + peel_iters_prologue;
3807 
3808   if (dump_enabled_p ())
3809     dump_printf_loc (MSG_NOTE, vect_location,
3810                      "  Runtime profitability threshold = %d\n",
3811                      min_profitable_iters);
3812 
3813   *ret_min_profitable_niters = min_profitable_iters;
3814 
3815   /* Calculate number of iterations required to make the vector version
3816      profitable, relative to the loop bodies only.
3817 
3818      The non-vectorized variant costs SIC * niters, and it must win over the
3819      vector variant on the expected trip count.  The following must hold true:
3820      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3821 
3822   if (vec_outside_cost <= 0)
3823     min_profitable_estimate = 0;
3824   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3825     {
3826       /* This is a repeat of the code above, but with + SOC rather
3827 	 than - SOC.  */
3828       int outside_overhead = (vec_outside_cost
3829 			      - scalar_single_iter_cost * peel_iters_prologue
3830 			      - scalar_single_iter_cost * peel_iters_epilogue
3831 			      + scalar_outside_cost);
3832       int min_vec_niters = 1;
3833       if (outside_overhead > 0)
3834 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3835 
3836       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3837 	{
3838 	  int threshold = (vec_inside_cost * min_vec_niters
3839 			   + vec_outside_cost
3840 			   + scalar_outside_cost);
3841 	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3842 	}
3843       else
3844 	min_profitable_estimate = (min_vec_niters * assumed_vf
3845 				   + peel_iters_prologue
3846 				   + peel_iters_epilogue);
3847     }
3848   else
3849     {
3850       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3851 				 * assumed_vf
3852 				 - vec_inside_cost * peel_iters_prologue
3853 				 - vec_inside_cost * peel_iters_epilogue)
3854 				 / ((scalar_single_iter_cost * assumed_vf)
3855 				   - vec_inside_cost);
3856     }
3857   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3858   if (dump_enabled_p ())
3859     dump_printf_loc (MSG_NOTE, vect_location,
3860 		     "  Static estimate profitability threshold = %d\n",
3861 		     min_profitable_estimate);
3862 
3863   *ret_min_profitable_estimate = min_profitable_estimate;
3864 }
3865 
3866 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3867    vector elements (not bits) for a vector with NELT elements.  */
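/* For example (illustrative): OFFSET == 2 and NELT == 8 push the stepped
   pattern {2, 3, 4}, which vec_perm_indices extends to the series
   2, 3, ..., 9, i.e. the last six elements of the first input followed by
   the first two elements of the second input.  */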
3868 static void
3869 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3870 			      vec_perm_builder *sel)
3871 {
3872   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3873      by vec_perm_indices.  */
3874   sel->new_vector (nelt, 1, 3);
3875   for (unsigned int i = 0; i < 3; i++)
3876     sel->quick_push (i + offset);
3877 }
3878 
3879 /* Checks whether the target supports whole-vector shifts for vectors of mode
3880    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3881    it supports vec_perm_const with masks for all necessary shift amounts.  */
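/* For example (illustrative): for an 8-element vector mode that lacks
   vec_shr_optab support, this requires permutes implementing shifts by 4,
   2 and 1 elements to all be supported.  */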
3882 static bool
3883 have_whole_vector_shift (machine_mode mode)
3884 {
3885   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3886     return true;
3887 
3888   /* Variable-length vectors should be handled via the optab.  */
3889   unsigned int nelt;
3890   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3891     return false;
3892 
3893   vec_perm_builder sel;
3894   vec_perm_indices indices;
3895   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3896     {
3897       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3898       indices.new_vector (sel, 2, nelt);
3899       if (!can_vec_perm_const_p (mode, indices, false))
3900 	return false;
3901     }
3902   return true;
3903 }
3904 
3905 /* TODO: There is a close dependency between the vect_model_*_cost and
3906    vectorizable_* functions.  Design this better to avoid maintenance issues.  */
3907 
3908 /* Function vect_model_reduction_cost.
3909 
3910    Models cost for a reduction operation, including the vector ops
3911    generated within the strip-mine loop, the initial definition before
3912    the loop, and the epilogue code that must be generated.  */
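/* For example (illustrative): a plain PLUS_EXPR reduction with NCOPIES == 1
   and a target-supported reduction IFN is costed as one scalar_to_vec in
   the prologue (the initial vector), one vector_stmt in the loop body, and
   one vector_stmt plus one vec_to_scalar extract in the epilogue.  */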
3913 
3914 static void
3915 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3916 			   vect_reduction_type reduction_type,
3917 			   int ncopies, stmt_vector_for_cost *cost_vec)
3918 {
3919   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3920   enum tree_code code;
3921   optab optab;
3922   tree vectype;
3923   machine_mode mode;
3924   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3925   class loop *loop = NULL;
3926 
3927   if (loop_vinfo)
3928     loop = LOOP_VINFO_LOOP (loop_vinfo);
3929 
3930   /* Condition reductions generate two reductions in the loop.  */
3931   if (reduction_type == COND_REDUCTION)
3932     ncopies *= 2;
3933 
3934   vectype = STMT_VINFO_VECTYPE (stmt_info);
3935   mode = TYPE_MODE (vectype);
3936   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3937 
3938   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3939 
3940   if (reduction_type == EXTRACT_LAST_REDUCTION)
3941     /* No extra instructions are needed in the prologue.  The loop body
3942        operations are costed in vectorizable_condition.  */
3943     inside_cost = 0;
3944   else if (reduction_type == FOLD_LEFT_REDUCTION)
3945     {
3946       /* No extra instructions needed in the prologue.  */
3947       prologue_cost = 0;
3948 
3949       if (reduc_fn != IFN_LAST)
3950 	/* Count one reduction-like operation per vector.  */
3951 	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3952 					stmt_info, 0, vect_body);
3953       else
3954 	{
3955 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3956 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3957 	  inside_cost = record_stmt_cost (cost_vec, nelements,
3958 					  vec_to_scalar, stmt_info, 0,
3959 					  vect_body);
3960 	  inside_cost += record_stmt_cost (cost_vec, nelements,
3961 					   scalar_stmt, stmt_info, 0,
3962 					   vect_body);
3963 	}
3964     }
3965   else
3966     {
3967       /* Add in cost for initial definition.
3968 	 For cond reduction we have four vectors: initial index, step,
3969 	 initial result of the data reduction, initial value of the index
3970 	 reduction.  */
3971       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3972       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3973 					 scalar_to_vec, stmt_info, 0,
3974 					 vect_prologue);
3975 
3976       /* Cost of reduction op inside loop.  */
3977       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3978 				      stmt_info, 0, vect_body);
3979     }
3980 
3981   /* Determine cost of epilogue code.
3982 
3983      We have a reduction operator that will reduce the vector in one statement.
3984      Also requires scalar extract.  */
3985 
3986   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3987     {
3988       if (reduc_fn != IFN_LAST)
3989 	{
3990 	  if (reduction_type == COND_REDUCTION)
3991 	    {
3992 	      /* An EQ stmt and a COND_EXPR stmt.  */
3993 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
3994 						 vector_stmt, stmt_info, 0,
3995 						 vect_epilogue);
3996 	      /* Reduction of the max index and a reduction of the found
3997 		 values.  */
3998 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
3999 						 vec_to_scalar, stmt_info, 0,
4000 						 vect_epilogue);
4001 	      /* A broadcast of the max value.  */
4002 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4003 						 scalar_to_vec, stmt_info, 0,
4004 						 vect_epilogue);
4005 	    }
4006 	  else
4007 	    {
4008 	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4009 						 stmt_info, 0, vect_epilogue);
4010 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4011 						 vec_to_scalar, stmt_info, 0,
4012 						 vect_epilogue);
4013 	    }
4014 	}
4015       else if (reduction_type == COND_REDUCTION)
4016 	{
4017 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4018 	  /* Extraction of scalar elements.  */
4019 	  epilogue_cost += record_stmt_cost (cost_vec,
4020 					     2 * estimated_nunits,
4021 					     vec_to_scalar, stmt_info, 0,
4022 					     vect_epilogue);
4023 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4024 	  epilogue_cost += record_stmt_cost (cost_vec,
4025 					     2 * estimated_nunits - 3,
4026 					     scalar_stmt, stmt_info, 0,
4027 					     vect_epilogue);
4028 	}
4029       else if (reduction_type == EXTRACT_LAST_REDUCTION
4030 	       || reduction_type == FOLD_LEFT_REDUCTION)
4031 	/* No extra instructions are needed in the epilogue.  */
4032 	;
4033       else
4034 	{
4035 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4036 	  tree bitsize =
4037 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4038 	  int element_bitsize = tree_to_uhwi (bitsize);
4039 	  int nelements = vec_size_in_bits / element_bitsize;
4040 
4041 	  if (code == COND_EXPR)
4042 	    code = MAX_EXPR;
4043 
4044 	  optab = optab_for_tree_code (code, vectype, optab_default);
4045 
4046 	  /* We have a whole vector shift available.  */
4047 	  if (optab != unknown_optab
4048 	      && VECTOR_MODE_P (mode)
4049 	      && optab_handler (optab, mode) != CODE_FOR_nothing
4050 	      && have_whole_vector_shift (mode))
4051 	    {
4052 	      /* Final reduction via vector shifts and the reduction operator.
4053 		 Also requires scalar extract.  */
4054 	      epilogue_cost += record_stmt_cost (cost_vec,
4055 						 exact_log2 (nelements) * 2,
4056 						 vector_stmt, stmt_info, 0,
4057 						 vect_epilogue);
4058 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4059 						 vec_to_scalar, stmt_info, 0,
4060 						 vect_epilogue);
4061 	    }
4062 	  else
4063 	    /* Use extracts and reduction op for final reduction.  For N
4064 	       elements, we have N extracts and N-1 reduction ops.  */
4065 	    epilogue_cost += record_stmt_cost (cost_vec,
4066 					       nelements + nelements - 1,
4067 					       vector_stmt, stmt_info, 0,
4068 					       vect_epilogue);
4069 	}
4070     }
4071 
4072   if (dump_enabled_p ())
4073     dump_printf (MSG_NOTE,
4074                  "vect_model_reduction_cost: inside_cost = %d, "
4075                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4076                  prologue_cost, epilogue_cost);
4077 }
4078 
4079 
4080 /* Function vect_model_induction_cost.
4081 
4082    Models cost for induction operations.  */
4083 
4084 static void
4085 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4086 			   stmt_vector_for_cost *cost_vec)
4087 {
4088   unsigned inside_cost, prologue_cost;
4089 
4090   if (PURE_SLP_STMT (stmt_info))
4091     return;
4092 
4093   /* loop cost for vec_loop.  */
4094   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4095 				  stmt_info, 0, vect_body);
4096 
4097   /* prologue cost for vec_init and vec_step.  */
4098   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4099 				    stmt_info, 0, vect_prologue);
4100 
4101   if (dump_enabled_p ())
4102     dump_printf_loc (MSG_NOTE, vect_location,
4103                      "vect_model_induction_cost: inside_cost = %d, "
4104                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4105 }
4106 
4107 
4108 
4109 /* Function get_initial_def_for_reduction
4110 
4111    Input:
4112    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4113    INIT_VAL - the initial value of the reduction variable
4114 
4115    Output:
4116    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4117         of the reduction (used for adjusting the epilog - see below).
4118    Return a vector variable, initialized according to the operation that
4119 	STMT_VINFO performs. This vector will be used as the initial value
4120 	of the vector of partial results.
4121 
4122    Option1 (adjust in epilog): Initialize the vector as follows:
4123      add/bit or/xor:    [0,0,...,0,0]
4124      mult/bit and:      [1,1,...,1,1]
4125      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4126    and when necessary (e.g. add/mult case) let the caller know
4127    that it needs to adjust the result by init_val.
4128 
4129    Option2: Initialize the vector as follows:
4130      add/bit or/xor:    [init_val,0,0,...,0]
4131      mult/bit and:      [init_val,1,1,...,1]
4132      min/max/cond_expr: [init_val,init_val,...,init_val]
4133    and no adjustments are needed.
4134 
4135    For example, for the following code:
4136 
4137    s = init_val;
4138    for (i=0;i<n;i++)
4139      s = s + a[i];
4140 
4141    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4142    For a vector of 4 units, we want to return either [0,0,0,init_val],
4143    or [0,0,0,0] and let the caller know that it needs to adjust
4144    the result at the end by 'init_val'.
4145 
4146    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4147    is not NULL, because this way the initialization vector is simpler (same
4148    element in all entries), and Option2 otherwise.
4149 
4150    A cost model should help decide between these two schemes.  */
4151 
4152 static tree
4153 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4154 			       enum tree_code code, tree init_val,
4155                                tree *adjustment_def)
4156 {
4157   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4158   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4159   tree scalar_type = TREE_TYPE (init_val);
4160   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4161   tree def_for_init;
4162   tree init_def;
4163   REAL_VALUE_TYPE real_init_val = dconst0;
4164   int int_init_val = 0;
4165   gimple_seq stmts = NULL;
4166 
4167   gcc_assert (vectype);
4168 
4169   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4170 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4171 
4172   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4173 	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4174 
4175   /* ADJUSTMENT_DEF is NULL when called from
4176      vect_create_epilog_for_reduction to vectorize double reduction.  */
4177   if (adjustment_def)
4178     *adjustment_def = NULL;
4179 
4180   switch (code)
4181     {
4182     case WIDEN_SUM_EXPR:
4183     case DOT_PROD_EXPR:
4184     case SAD_EXPR:
4185     case PLUS_EXPR:
4186     case MINUS_EXPR:
4187     case BIT_IOR_EXPR:
4188     case BIT_XOR_EXPR:
4189     case MULT_EXPR:
4190     case BIT_AND_EXPR:
4191       {
4192         if (code == MULT_EXPR)
4193           {
4194             real_init_val = dconst1;
4195             int_init_val = 1;
4196           }
4197 
4198         if (code == BIT_AND_EXPR)
4199           int_init_val = -1;
4200 
4201         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4202           def_for_init = build_real (scalar_type, real_init_val);
4203         else
4204           def_for_init = build_int_cst (scalar_type, int_init_val);
4205 
4206 	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4207 	  {
4208 	    /* Option1: the first element is '0' or '1' as well.  */
4209 	    if (!operand_equal_p (def_for_init, init_val, 0))
4210 	      *adjustment_def = init_val;
4211 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4212 						     def_for_init);
4213 	  }
4214 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4215 	  {
4216 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4217 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4218 						     def_for_init);
4219 	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4220 				     vectype, init_def, init_val);
4221 	  }
4222 	else
4223 	  {
4224 	    /* Option2: the first element is INIT_VAL.  */
4225 	    tree_vector_builder elts (vectype, 1, 2);
4226 	    elts.quick_push (init_val);
4227 	    elts.quick_push (def_for_init);
4228 	    init_def = gimple_build_vector (&stmts, &elts);
4229 	  }
4230       }
4231       break;
4232 
4233     case MIN_EXPR:
4234     case MAX_EXPR:
4235     case COND_EXPR:
4236       {
4237 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4238 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4239       }
4240       break;
4241 
4242     default:
4243       gcc_unreachable ();
4244     }
4245 
4246   if (stmts)
4247     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4248   return init_def;
4249 }
4250 
4251 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4252    NUMBER_OF_VECTORS is the number of vector defs to create.
4253    If NEUTRAL_OP is nonnull, introducing extra elements of that
4254    value will not change the result.  */
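/* For example (illustrative): for a PLUS_EXPR reduction chain with
   4-element vectors, the single initial value INIT is padded with the
   neutral value 0, giving the initial vector {INIT, 0, 0, 0}.  */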
4255 
4256 static void
4257 get_initial_defs_for_reduction (slp_tree slp_node,
4258 				vec<tree> *vec_oprnds,
4259 				unsigned int number_of_vectors,
4260 				bool reduc_chain, tree neutral_op)
4261 {
4262   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4263   stmt_vec_info stmt_vinfo = stmts[0];
4264   vec_info *vinfo = stmt_vinfo->vinfo;
4265   unsigned HOST_WIDE_INT nunits;
4266   unsigned j, number_of_places_left_in_vector;
4267   tree vector_type;
4268   unsigned int group_size = stmts.length ();
4269   unsigned int i;
4270   class loop *loop;
4271 
4272   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4273 
4274   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4275 
4276   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4277   gcc_assert (loop);
4278   edge pe = loop_preheader_edge (loop);
4279 
4280   gcc_assert (!reduc_chain || neutral_op);
4281 
4282   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4283      created vectors. It is greater than 1 if unrolling is performed.
4284 
4285      For example, we have two scalar operands, s1 and s2 (e.g., group of
4286      strided accesses of size two), while NUNITS is four (i.e., four scalars
4287      of this type can be packed in a vector).  The output vector will contain
4288      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4289      will be 2).
4290 
4291      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4292      vectors containing the operands.
4293 
4294      For example, NUNITS is four as before, and the group size is 8
4295      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4296      {s5, s6, s7, s8}.  */
4297 
4298   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4299     nunits = group_size;
4300 
4301   number_of_places_left_in_vector = nunits;
4302   bool constant_p = true;
4303   tree_vector_builder elts (vector_type, nunits, 1);
4304   elts.quick_grow (nunits);
4305   gimple_seq ctor_seq = NULL;
4306   for (j = 0; j < nunits * number_of_vectors; ++j)
4307     {
4308       tree op;
4309       i = j % group_size;
4310       stmt_vinfo = stmts[i];
4311 
4312       /* Get the def before the loop.  In a reduction chain we have only
4313 	 one initial value.  Else we have one initial value per PHI in the group.  */
4314       if (reduc_chain)
4315 	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4316       else if (((vec_oprnds->length () + 1) * nunits
4317 		- number_of_places_left_in_vector >= group_size)
4318 	       && neutral_op)
4319 	op = neutral_op;
4320       else
4321 	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4322 
4323       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4324       number_of_places_left_in_vector--;
4325       elts[nunits - number_of_places_left_in_vector - 1] = op;
4326       if (!CONSTANT_CLASS_P (op))
4327 	constant_p = false;
4328 
4329       if (number_of_places_left_in_vector == 0)
4330 	{
4331 	  tree init;
4332 	  if (constant_p && !neutral_op
4333 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4334 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4335 	    /* Build the vector directly from ELTS.  */
4336 	    init = gimple_build_vector (&ctor_seq, &elts);
4337 	  else if (neutral_op)
4338 	    {
4339 	      /* Build a vector of the neutral value and shift the
4340 		 other elements into place.  */
4341 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4342 						   neutral_op);
4343 	      int k = nunits;
4344 	      while (k > 0 && elts[k - 1] == neutral_op)
4345 		k -= 1;
4346 	      while (k > 0)
4347 		{
4348 		  k -= 1;
4349 		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4350 				       vector_type, init, elts[k]);
4351 		}
4352 	    }
4353 	  else
4354 	    {
4355 	      /* First time round, duplicate ELTS to fill the
4356 		 required number of vectors.  */
4357 	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4358 					number_of_vectors, *vec_oprnds);
4359 	      break;
4360 	    }
4361 	  vec_oprnds->quick_push (init);
4362 
4363 	  number_of_places_left_in_vector = nunits;
4364 	  elts.new_vector (vector_type, nunits, 1);
4365 	  elts.quick_grow (nunits);
4366 	  constant_p = true;
4367 	}
4368     }
4369   if (ctor_seq != NULL)
4370     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4371 }
4372 
4373 /* For a statement STMT_INFO taking part in a reduction operation return
4374    the stmt_vec_info the meta information is stored on.  */
4375 
4376 stmt_vec_info
4377 info_for_reduction (stmt_vec_info stmt_info)
4378 {
4379   stmt_info = vect_orig_stmt (stmt_info);
4380   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4381   if (!is_a <gphi *> (stmt_info->stmt))
4382     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4383   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4384   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4385     {
4386       if (gimple_phi_num_args (phi) == 1)
4387 	stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4388     }
4389   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4390     {
4391       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4392       stmt_vec_info info
4393 	  = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4394       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4395 	stmt_info = info;
4396     }
4397   return stmt_info;
4398 }
4399 
4400 /* Function vect_create_epilog_for_reduction
4401 
4402    Create code at the loop-epilog to finalize the result of a reduction
4403    computation.
4404 
4405    STMT_INFO is the scalar reduction stmt that is being vectorized.
4406    SLP_NODE is an SLP node containing a group of reduction statements. The
4407      first one in this group is STMT_INFO.
4408    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4409    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4410      (counting from 0)
4411 
4412    This function:
4413    1. Completes the reduction def-use cycles.
4414    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4415       by calling the function specified by REDUC_FN if available, or by
4416       other means (whole-vector shifts or a scalar loop).
4417       The function also creates a new phi node at the loop exit to preserve
4418       loop-closed form, as illustrated below.
4419 
4420      The flow at the entry to this function:
4421 
4422         loop:
4423           vec_def = phi <vec_init, null>        # REDUCTION_PHI
4424           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4425           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4426         loop_exit:
4427           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4428           use <s_out0>
4429           use <s_out0>
4430 
4431      The above is transformed by this function into:
4432 
4433         loop:
4434           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4435           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4436           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4437         loop_exit:
4438           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4439           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4440           v_out2 = reduce <v_out1>
4441           s_out3 = extract_field <v_out2, 0>
4442           s_out4 = adjust_result <s_out3>
4443           use <s_out4>
4444           use <s_out4>
4445 */
4446 
4447 static void
4448 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4449 				  slp_tree slp_node,
4450 				  slp_instance slp_node_instance)
4451 {
4452   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4453   gcc_assert (reduc_info->is_reduc_info);
4454   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4455   /* For double reductions we need to get at the inner loop reduction
4456      stmt which has the meta info attached.  Our stmt_info is that of the
4457      loop-closed PHI of the inner loop which we remember as
4458      def for the reduction PHI generation.  */
4459   bool double_reduc = false;
4460   stmt_vec_info rdef_info = stmt_info;
4461   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4462     {
4463       gcc_assert (!slp_node);
4464       double_reduc = true;
4465       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4466 					    (stmt_info->stmt, 0));
4467       stmt_info = vect_stmt_to_vectorize (stmt_info);
4468     }
4469   gphi *reduc_def_stmt
4470     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4471   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4472   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4473   stmt_vec_info prev_phi_info;
4474   tree vectype;
4475   machine_mode mode;
4476   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4477   basic_block exit_bb;
4478   tree scalar_dest;
4479   tree scalar_type;
4480   gimple *new_phi = NULL, *phi;
4481   stmt_vec_info phi_info;
4482   gimple_stmt_iterator exit_gsi;
4483   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4484   gimple *epilog_stmt = NULL;
4485   gimple *exit_phi;
4486   tree bitsize;
4487   tree def;
4488   tree orig_name, scalar_result;
4489   imm_use_iterator imm_iter, phi_imm_iter;
4490   use_operand_p use_p, phi_use_p;
4491   gimple *use_stmt;
4492   bool nested_in_vect_loop = false;
4493   auto_vec<gimple *> new_phis;
4494   int j, i;
4495   auto_vec<tree> scalar_results;
4496   unsigned int group_size = 1, k;
4497   auto_vec<gimple *> phis;
4498   bool slp_reduc = false;
4499   bool direct_slp_reduc;
4500   tree new_phi_result;
4501   tree induction_index = NULL_TREE;
4502 
4503   if (slp_node)
4504     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4505 
4506   if (nested_in_vect_loop_p (loop, stmt_info))
4507     {
4508       outer_loop = loop;
4509       loop = loop->inner;
4510       nested_in_vect_loop = true;
4511       gcc_assert (!slp_node);
4512     }
4513   gcc_assert (!nested_in_vect_loop || double_reduc);
4514 
4515   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4516   gcc_assert (vectype);
4517   mode = TYPE_MODE (vectype);
4518 
4519   tree initial_def = NULL;
4520   tree induc_val = NULL_TREE;
4521   tree adjustment_def = NULL;
4522   if (slp_node)
4523     ;
4524   else
4525     {
4526       /* Get at the scalar def before the loop, that defines the initial value
4527 	 of the reduction variable.  */
4528       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4529 					   loop_preheader_edge (loop));
4530       /* Optimize: for induction condition reduction, if we can't use zero
4531          for induc_val, use initial_def.  */
4532       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4533 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4534       else if (double_reduc)
4535 	;
4536       else if (nested_in_vect_loop)
4537 	;
4538       else
4539 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4540     }
4541 
4542   unsigned vec_num;
4543   int ncopies;
4544   if (slp_node)
4545     {
4546       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4547       ncopies = 1;
4548     }
4549   else
4550     {
4551       vec_num = 1;
4552       ncopies = 0;
4553       phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4554       do
4555 	{
4556 	  ncopies++;
4557 	  phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4558 	}
4559       while (phi_info);
4560     }
4561 
4562   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4563      which is updated with the current index of the loop for every match of
4564      the original loop's cond_expr (VEC_STMT).  This results in a vector
4565      containing the last time the condition passed for that vector lane.
4566      The first match will be a 1 to allow 0 to be used for non-matching
4567      indexes.  If there are no matches at all then the vector will be all
4568      zeroes.
4569 
4570      PR92772: This algorithm is broken for architectures that support
4571      masked vectors, but do not provide fold_extract_last.  */
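  /* As an illustration only (lane values invented, not taken from a
     testcase): with four lanes, the index IV starts at {1,2,3,4} and is
     bumped by 4 each vector iteration.  If the first vector iteration
     matches in lanes 0 and 2 and the second matches only in lane 3, the
     index vector evolves as

       iteration 0:  cond = {1,0,1,0}  ->  index = {1,0,3,0}
       iteration 1:  cond = {0,0,0,1}  ->  index = {1,0,3,8}

     so each lane remembers the 1-based position of its last match and
     lanes that never matched stay 0.  */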
4572   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4573     {
4574       auto_vec<std::pair<tree, bool>, 2> ccompares;
4575       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4576       cond_info = vect_stmt_to_vectorize (cond_info);
4577       while (cond_info != reduc_info)
4578 	{
4579 	  if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4580 	    {
4581 	      gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4582 	      gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4583 	      ccompares.safe_push
4584 		(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4585 				 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4586 	    }
4587 	  cond_info
4588 	    = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4589 						 1 + STMT_VINFO_REDUC_IDX
4590 							(cond_info)));
4591 	  cond_info = vect_stmt_to_vectorize (cond_info);
4592 	}
4593       gcc_assert (ccompares.length () != 0);
4594 
4595       tree indx_before_incr, indx_after_incr;
4596       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4597       int scalar_precision
4598 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4599       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4600       tree cr_index_vector_type = get_related_vectype_for_scalar_type
4601 	(TYPE_MODE (vectype), cr_index_scalar_type,
4602 	 TYPE_VECTOR_SUBPARTS (vectype));
4603 
4604       /* First we create a simple vector induction variable which starts
4605 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4606 	 vector size (STEP).  */
4607 
4608       /* Create a {1,2,3,...} vector.  */
4609       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4610 
4611       /* Create a vector of the step value.  */
4612       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4613       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4614 
4615       /* Create an induction variable.  */
4616       gimple_stmt_iterator incr_gsi;
4617       bool insert_after;
4618       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4619       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4620 		 insert_after, &indx_before_incr, &indx_after_incr);
4621 
4622       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4623 	 filled with zeros (VEC_ZERO).  */
4624 
4625       /* Create a vector of 0s.  */
4626       tree zero = build_zero_cst (cr_index_scalar_type);
4627       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4628 
4629       /* Create a vector phi node.  */
4630       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4631       new_phi = create_phi_node (new_phi_tree, loop->header);
4632       loop_vinfo->add_stmt (new_phi);
4633       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4634 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4635 
4636       /* Now take the condition from the loop's original cond_exprs
4637 	 and produce a new cond_expr (INDEX_COND_EXPR) which for
4638 	 every match uses values from the induction variable
4639 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4640 	 (NEW_PHI_TREE).
4641 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4642 	 the new cond_expr (INDEX_COND_EXPR).  */
4643       gimple_seq stmts = NULL;
4644       for (int i = ccompares.length () - 1; i != -1; --i)
4645 	{
4646 	  tree ccompare = ccompares[i].first;
4647 	  if (ccompares[i].second)
4648 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4649 					 cr_index_vector_type,
4650 					 ccompare,
4651 					 indx_before_incr, new_phi_tree);
4652 	  else
4653 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4654 					 cr_index_vector_type,
4655 					 ccompare,
4656 					 new_phi_tree, indx_before_incr);
4657 	}
4658       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4659       stmt_vec_info index_vec_info
4660 	= loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4661       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4662 
4663       /* Update the phi with the vec cond.  */
4664       induction_index = new_phi_tree;
4665       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4666 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
4667     }
4668 
4669   /* 2. Create epilog code.
4670         The reduction epilog code operates across the elements of the vector
4671         of partial results computed by the vectorized loop.
4672         The reduction epilog code consists of:
4673 
4674         step 1: compute the scalar result in a vector (v_out2)
4675         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4676         step 3: adjust the scalar result (s_out3) if needed.
4677 
4678         Step 1 can be accomplished using one of the following three schemes:
4679           (scheme 1) using reduc_fn, if available.
4680           (scheme 2) using whole-vector shifts, if available.
4681           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4682                      combined.
4683 
4684           The overall epilog code looks like this:
4685 
4686           s_out0 = phi <s_loop>         # original EXIT_PHI
4687           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4688           v_out2 = reduce <v_out1>              # step 1
4689           s_out3 = extract_field <v_out2, 0>    # step 2
4690           s_out4 = adjust_result <s_out3>       # step 3
4691 
4692           (step 3 is optional, and steps 1 and 2 may be combined).
4693           Lastly, the uses of s_out0 are replaced by s_out4.  */
4694 
4695 
4696   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4697          v_out1 = phi <VECT_DEF>
4698          Store them in NEW_PHIS.  */
4699   if (double_reduc)
4700     loop = outer_loop;
4701   exit_bb = single_exit (loop)->dest;
4702   prev_phi_info = NULL;
4703   new_phis.create (slp_node ? vec_num : ncopies);
4704   for (unsigned i = 0; i < vec_num; i++)
4705     {
4706       if (slp_node)
4707 	def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4708       else
4709 	def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4710       for (j = 0; j < ncopies; j++)
4711         {
4712 	  tree new_def = copy_ssa_name (def);
4713           phi = create_phi_node (new_def, exit_bb);
4714 	  stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4715           if (j == 0)
4716             new_phis.quick_push (phi);
4717           else
4718 	    {
4719 	      def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4720 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4721 	    }
4722 
4723           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4724 	  prev_phi_info = phi_info;
4725         }
4726     }
4727 
4728   exit_gsi = gsi_after_labels (exit_bb);
4729 
4730   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4731          (i.e. when reduc_fn is not available) and in the final adjustment
4732 	 code (if needed).  Also get the original scalar reduction variable as
4733          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. it
4734          represents a reduction pattern), the tree-code and scalar-def are
4735          taken from the original stmt that the pattern-stmt (STMT) replaces.
4736          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4737          are taken from STMT.  */
4738 
4739   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4740   if (orig_stmt_info != stmt_info)
4741     {
4742       /* Reduction pattern  */
4743       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4744       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4745     }
4746 
4747   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4748   scalar_type = TREE_TYPE (scalar_dest);
4749   scalar_results.create (group_size);
4750   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4751   bitsize = TYPE_SIZE (scalar_type);
4752 
4753   /* SLP reduction without reduction chain, e.g.,
4754      # a1 = phi <a2, a0>
4755      # b1 = phi <b2, b0>
4756      a2 = operation (a1)
4757      b2 = operation (b1)  */
4758   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4759 
4760   /* True if we should implement SLP_REDUC using native reduction operations
4761      instead of scalar operations.  */
4762   direct_slp_reduc = (reduc_fn != IFN_LAST
4763 		      && slp_reduc
4764 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4765 
4766   /* In case of reduction chain, e.g.,
4767      # a1 = phi <a3, a0>
4768      a2 = operation (a1)
4769      a3 = operation (a2),
4770 
4771      we may end up with more than one vector result.  Here we reduce them to
4772      one vector.  */
4773   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4774     {
4775       gimple_seq stmts = NULL;
4776       tree first_vect = PHI_RESULT (new_phis[0]);
4777       first_vect = gimple_convert (&stmts, vectype, first_vect);
4778       for (k = 1; k < new_phis.length (); k++)
4779         {
4780 	  gimple *next_phi = new_phis[k];
4781           tree second_vect = PHI_RESULT (next_phi);
4782 	  second_vect = gimple_convert (&stmts, vectype, second_vect);
4783           first_vect = gimple_build (&stmts, code, vectype,
4784 				     first_vect, second_vect);
4785         }
4786       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4787 
4788       new_phi_result = first_vect;
4789       new_phis.truncate (0);
4790       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4791     }
4792   /* Likewise if we couldn't use a single defuse cycle.  */
4793   else if (ncopies > 1)
4794     {
4795       gcc_assert (new_phis.length () == 1);
4796       gimple_seq stmts = NULL;
4797       tree first_vect = PHI_RESULT (new_phis[0]);
4798       first_vect = gimple_convert (&stmts, vectype, first_vect);
4799       stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4800       for (int k = 1; k < ncopies; ++k)
4801 	{
4802 	  next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4803 	  tree second_vect = PHI_RESULT (next_phi_info->stmt);
4804 	  second_vect = gimple_convert (&stmts, vectype, second_vect);
4805 	  first_vect = gimple_build (&stmts, code, vectype,
4806 				     first_vect, second_vect);
4807 	}
4808       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4809       new_phi_result = first_vect;
4810       new_phis.truncate (0);
4811       new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4812     }
4813   else
4814     new_phi_result = PHI_RESULT (new_phis[0]);
4815 
4816   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4817       && reduc_fn != IFN_LAST)
4818     {
4819       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4820 	 various data values where the condition matched and another vector
4821 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
4822 	 need to extract the last matching index (which will be the index with
4823 	 highest value) and use this to index into the data vector.
4824 	 For the case where there were no matches, the data vector will contain
4825 	 all default values and the index vector will be all zeros.  */
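      /* Continuing the earlier illustration (values invented for
	 exposition): with NEW_PHI_RESULT = {d1,d2,d3,d4} and
	 INDUCTION_INDEX = {1,0,3,8}, the last match was in lane 3 (index 8),
	 so the wanted result is d4.  The code below computes
	 max (INDUCTION_INDEX) = 8, keeps only the lane whose index equals
	 that maximum and reduces the remaining data with IFN_REDUC_MAX.  */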
4826 
4827       /* Get various versions of the type of the vector of indexes.  */
4828       tree index_vec_type = TREE_TYPE (induction_index);
4829       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4830       tree index_scalar_type = TREE_TYPE (index_vec_type);
4831       tree index_vec_cmp_type = truth_type_for (index_vec_type);
4832 
4833       /* Get an unsigned integer version of the type of the data vector.  */
4834       int scalar_precision
4835 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4836       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4837       tree vectype_unsigned = build_vector_type
4838 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4839 
4840       /* First we need to create a vector (ZERO_VEC) of zeros and another
4841 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4842 	 can create using a MAX reduction and then expanding.
4843 	 In the case where the loop never made any matches, the max index will
4844 	 be zero.  */
4845 
4846       /* Vector of {0, 0, 0,...}.  */
4847       tree zero_vec = build_zero_cst (vectype);
4848 
4849       gimple_seq stmts = NULL;
4850       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4851       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4852 
4853       /* Find maximum value from the vector of found indexes.  */
4854       tree max_index = make_ssa_name (index_scalar_type);
4855       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4856 							  1, induction_index);
4857       gimple_call_set_lhs (max_index_stmt, max_index);
4858       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4859 
4860       /* Vector of {max_index, max_index, max_index,...}.  */
4861       tree max_index_vec = make_ssa_name (index_vec_type);
4862       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4863 						      max_index);
4864       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4865 							max_index_vec_rhs);
4866       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4867 
4868       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4869 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4870 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4871 	 otherwise.  Only one value should match, resulting in a vector
4872 	 (VEC_COND) with one data value and the rest zeros.
4873 	 In the case where the loop never made any matches, every index will
4874 	 match, resulting in a vector with all data values (which will all be
4875 	 the default value).  */
4876 
4877       /* Compare the max index vector to the vector of found indexes to find
4878 	 the position of the max value.  */
4879       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4880       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4881 						      induction_index,
4882 						      max_index_vec);
4883       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4884 
4885       /* Use the compare to choose either values from the data vector or
4886 	 zero.  */
4887       tree vec_cond = make_ssa_name (vectype);
4888       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4889 						   vec_compare, new_phi_result,
4890 						   zero_vec);
4891       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4892 
4893       /* Finally we need to extract the data value from the vector (VEC_COND)
4894 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4895 	 reduction, but because this doesn't exist, we can use a MAX reduction
4896 	 instead.  The data value might be signed or a float so we need to cast
4897 	 it first.
4898 	 In the case where the loop never made any matches, the data values are
4899 	 all identical, and so will reduce down correctly.  */
4900 
4901       /* Make the matched data values unsigned.  */
4902       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4903       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4904 				       vec_cond);
4905       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4906 							VIEW_CONVERT_EXPR,
4907 							vec_cond_cast_rhs);
4908       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4909 
4910       /* Reduce down to a scalar value.  */
4911       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4912       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4913 							   1, vec_cond_cast);
4914       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4915       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4916 
4917       /* Convert the reduced value back to the result type and set as the
4918 	 result.  */
4919       stmts = NULL;
4920       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4921 			       data_reduc);
4922       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4923       scalar_results.safe_push (new_temp);
4924     }
4925   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4926 	   && reduc_fn == IFN_LAST)
4927     {
4928       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4929 	 idx = 0;
4930          idx_val = induction_index[0];
4931 	 val = data_reduc[0];
4932          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4933 	   if (induction_index[i] > idx_val)
4934 	     val = data_reduc[i], idx_val = induction_index[i];
4935 	 return val;  */
4936 
4937       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4938       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4939       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4940       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4941       /* Enforced by vectorizable_reduction, which ensures we have target
4942 	 support before allowing a conditional reduction on variable-length
4943 	 vectors.  */
4944       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4945       tree idx_val = NULL_TREE, val = NULL_TREE;
4946       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4947 	{
4948 	  tree old_idx_val = idx_val;
4949 	  tree old_val = val;
4950 	  idx_val = make_ssa_name (idx_eltype);
4951 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4952 					     build3 (BIT_FIELD_REF, idx_eltype,
4953 						     induction_index,
4954 						     bitsize_int (el_size),
4955 						     bitsize_int (off)));
4956 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4957 	  val = make_ssa_name (data_eltype);
4958 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4959 					     build3 (BIT_FIELD_REF,
4960 						     data_eltype,
4961 						     new_phi_result,
4962 						     bitsize_int (el_size),
4963 						     bitsize_int (off)));
4964 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4965 	  if (off != 0)
4966 	    {
4967 	      tree new_idx_val = idx_val;
4968 	      if (off != v_size - el_size)
4969 		{
4970 		  new_idx_val = make_ssa_name (idx_eltype);
4971 		  epilog_stmt = gimple_build_assign (new_idx_val,
4972 						     MAX_EXPR, idx_val,
4973 						     old_idx_val);
4974 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975 		}
4976 	      tree new_val = make_ssa_name (data_eltype);
4977 	      epilog_stmt = gimple_build_assign (new_val,
4978 						 COND_EXPR,
4979 						 build2 (GT_EXPR,
4980 							 boolean_type_node,
4981 							 idx_val,
4982 							 old_idx_val),
4983 						 val, old_val);
4984 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4985 	      idx_val = new_idx_val;
4986 	      val = new_val;
4987 	    }
4988 	}
4989       /* Convert the reduced value back to the result type and set as the
4990 	 result.  */
4991       gimple_seq stmts = NULL;
4992       val = gimple_convert (&stmts, scalar_type, val);
4993       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4994       scalar_results.safe_push (val);
4995     }
4996 
4997   /* 2.3 Create the reduction code, using one of the three schemes described
4998          above. In SLP we simply need to extract all the elements from the
4999          vector (without reducing them), so we use scalar shifts.  */
5000   else if (reduc_fn != IFN_LAST && !slp_reduc)
5001     {
5002       tree tmp;
5003       tree vec_elem_type;
5004 
5005       /* Case 1:  Create:
5006          v_out2 = reduc_expr <v_out1>  */
5007 
5008       if (dump_enabled_p ())
5009         dump_printf_loc (MSG_NOTE, vect_location,
5010 			 "Reduce using direct vector reduction.\n");
5011 
5012       gimple_seq stmts = NULL;
5013       new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5014       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5015       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5016 			       vec_elem_type, new_phi_result);
5017       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5018       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5019 
5020       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5021 	  && induc_val)
5022 	{
5023 	  /* Earlier we set the initial value to be a vector of induc_val
5024 	     values.  Check the result and if it is induc_val then replace
5025 	     with the original initial value, unless induc_val is
5026 	     the same as initial_def already.  */
5027 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5028 				  induc_val);
5029 
5030 	  tmp = make_ssa_name (new_scalar_dest);
5031 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5032 					     initial_def, new_temp);
5033 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5034 	  new_temp = tmp;
5035 	}
5036 
5037       scalar_results.safe_push (new_temp);
5038     }
5039   else if (direct_slp_reduc)
5040     {
5041       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5042 	 with the elements for other SLP statements replaced with the
5043 	 neutral value.  We can then do a normal reduction on each vector.  */
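      /* A sketch (hypothetical lane values), for group size 2 and PLUS with
	 interleaved lanes {a0,b0,a1,b1}: the masking below produces
	   {a0,0,a1,0}  ->  REDUC_PLUS  ->  result for the first statement
	   {0,b0,0,b1}  ->  REDUC_PLUS  ->  result for the second statement
	 where the lane-to-statement mapping comes from
	 index & (group_size - 1).  */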
5044 
5045       /* Enforced by vectorizable_reduction.  */
5046       gcc_assert (new_phis.length () == 1);
5047       gcc_assert (pow2p_hwi (group_size));
5048 
5049       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5050       vec<stmt_vec_info> orig_phis
5051 	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5052       gimple_seq seq = NULL;
5053 
5054       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5055 	 and the same element size as VECTYPE.  */
5056       tree index = build_index_vector (vectype, 0, 1);
5057       tree index_type = TREE_TYPE (index);
5058       tree index_elt_type = TREE_TYPE (index_type);
5059       tree mask_type = truth_type_for (index_type);
5060 
5061       /* Create a vector that, for each element, identifies which of
5062 	 the REDUC_GROUP_SIZE results should use it.  */
5063       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5064       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5065 			    build_vector_from_val (index_type, index_mask));
5066 
5067       /* Get a neutral vector value.  This is simply a splat of the neutral
5068 	 scalar value if we have one, otherwise the initial scalar value
5069 	 is itself a neutral value.  */
5070       tree vector_identity = NULL_TREE;
5071       tree neutral_op = NULL_TREE;
5072       if (slp_node)
5073 	{
5074 	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5075 	  neutral_op
5076 	    = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5077 					    vectype, code, first != NULL);
5078 	}
5079       if (neutral_op)
5080 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5081 							neutral_op);
5082       for (unsigned int i = 0; i < group_size; ++i)
5083 	{
5084 	  /* If there's no universal neutral value, we can use the
5085 	     initial scalar value from the original PHI.  This is used
5086 	     for MIN and MAX reduction, for example.  */
5087 	  if (!neutral_op)
5088 	    {
5089 	      tree scalar_value
5090 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5091 					 loop_preheader_edge (loop));
5092 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5093 					     scalar_value);
5094 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5095 							      scalar_value);
5096 	    }
5097 
5098 	  /* Calculate the equivalent of:
5099 
5100 	     sel[j] = (index[j] == i);
5101 
5102 	     which selects the elements of NEW_PHI_RESULT that should
5103 	     be included in the result.  */
5104 	  tree compare_val = build_int_cst (index_elt_type, i);
5105 	  compare_val = build_vector_from_val (index_type, compare_val);
5106 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5107 				   index, compare_val);
5108 
5109 	  /* Calculate the equivalent of:
5110 
5111 	     vec = sel ? new_phi_result : vector_identity;
5112 
5113 	     VEC is now suitable for a full vector reduction.  */
5114 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5115 				   sel, new_phi_result, vector_identity);
5116 
5117 	  /* Do the reduction and convert it to the appropriate type.  */
5118 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5119 				      TREE_TYPE (vectype), vec);
5120 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5121 	  scalar_results.safe_push (scalar);
5122 	}
5123       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5124     }
5125   else
5126     {
5127       bool reduce_with_shift;
5128       tree vec_temp;
5129 
5130       gcc_assert (slp_reduc || new_phis.length () == 1);
5131 
5132       /* See if the target wants to do the final (shift) reduction
5133 	 in a vector mode of smaller size and first reduce upper/lower
5134 	 halves against each other.  */
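      /* E.g. (a target-dependent illustration): a V8SI accumulator might be
	 split into two V4SI halves that are combined once with CODE, after
	 which the shift-based or scalar epilogue below runs on the narrower
	 V4SI value.  */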
5135       enum machine_mode mode1 = mode;
5136       tree stype = TREE_TYPE (vectype);
5137       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5138       unsigned nunits1 = nunits;
5139       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5140 	  && new_phis.length () == 1)
5141 	{
5142 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5143 	  /* For SLP reductions we have to make sure lanes match up, but
5144 	     since we're doing an individual-element final reduction, reducing
5145 	     the vector width here is even more important.
5146 	     ???  We can also separate lanes with permutes; for the common
5147 	     case of a power-of-two group-size, odd/even extracts would work.  */
5148 	  if (slp_reduc && nunits != nunits1)
5149 	    {
5150 	      nunits1 = least_common_multiple (nunits1, group_size);
5151 	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5152 	    }
5153 	}
5154       if (!slp_reduc
5155 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5156 	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5157 
5158       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5159 							   stype, nunits1);
5160       reduce_with_shift = have_whole_vector_shift (mode1);
5161       if (!VECTOR_MODE_P (mode1))
5162 	reduce_with_shift = false;
5163       else
5164 	{
5165 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5166 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5167 	    reduce_with_shift = false;
5168 	}
5169 
5170       /* First reduce the vector to the desired size on which we should
5171 	 do the shift reduction, by combining upper and lower halves.  */
5172       new_temp = new_phi_result;
5173       while (nunits > nunits1)
5174 	{
5175 	  nunits /= 2;
5176 	  vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5177 							  stype, nunits);
5178 	  unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5179 
5180 	  /* The target has to make sure we support lowpart/highpart
5181 	     extraction, either via direct vector extract or through
5182 	     integer mode punning.  */
5183 	  tree dst1, dst2;
5184 	  if (convert_optab_handler (vec_extract_optab,
5185 				     TYPE_MODE (TREE_TYPE (new_temp)),
5186 				     TYPE_MODE (vectype1))
5187 	      != CODE_FOR_nothing)
5188 	    {
5189 	      /* Extract sub-vectors directly once vec_extract becomes
5190 		 a conversion optab.  */
5191 	      dst1 = make_ssa_name (vectype1);
5192 	      epilog_stmt
5193 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5194 					 build3 (BIT_FIELD_REF, vectype1,
5195 						 new_temp, TYPE_SIZE (vectype1),
5196 						 bitsize_int (0)));
5197 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5198 	      dst2 =  make_ssa_name (vectype1);
5199 	      epilog_stmt
5200 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5201 					 build3 (BIT_FIELD_REF, vectype1,
5202 						 new_temp, TYPE_SIZE (vectype1),
5203 						 bitsize_int (bitsize)));
5204 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5205 	    }
5206 	  else
5207 	    {
5208 	      /* Extract via punning to appropriately sized integer mode
5209 		 vector.  */
5210 	      tree eltype = build_nonstandard_integer_type (bitsize, 1);
5211 	      tree etype = build_vector_type (eltype, 2);
5212 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5213 						 TYPE_MODE (etype),
5214 						 TYPE_MODE (eltype))
5215 			  != CODE_FOR_nothing);
5216 	      tree tem = make_ssa_name (etype);
5217 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5218 						 build1 (VIEW_CONVERT_EXPR,
5219 							 etype, new_temp));
5220 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5221 	      new_temp = tem;
5222 	      tem = make_ssa_name (eltype);
5223 	      epilog_stmt
5224 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5225 					 build3 (BIT_FIELD_REF, eltype,
5226 						 new_temp, TYPE_SIZE (eltype),
5227 						 bitsize_int (0)));
5228 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5229 	      dst1 = make_ssa_name (vectype1);
5230 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5231 						 build1 (VIEW_CONVERT_EXPR,
5232 							 vectype1, tem));
5233 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 	      tem = make_ssa_name (eltype);
5235 	      epilog_stmt
5236 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5237 					 build3 (BIT_FIELD_REF, eltype,
5238 						 new_temp, TYPE_SIZE (eltype),
5239 						 bitsize_int (bitsize)));
5240 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5241 	      dst2 =  make_ssa_name (vectype1);
5242 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5243 						 build1 (VIEW_CONVERT_EXPR,
5244 							 vectype1, tem));
5245 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5246 	    }
5247 
5248 	  new_temp = make_ssa_name (vectype1);
5249 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5250 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 	  new_phis[0] = epilog_stmt;
5252 	}
5253 
5254       if (reduce_with_shift && !slp_reduc)
5255 	{
5256 	  int element_bitsize = tree_to_uhwi (bitsize);
5257 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5258 	     for variable-length vectors and also requires direct target support
5259 	     for loop reductions.  */
5260 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5261 	  int nelements = vec_size_in_bits / element_bitsize;
5262 	  vec_perm_builder sel;
5263 	  vec_perm_indices indices;
5264 
5265           int elt_offset;
5266 
5267           tree zero_vec = build_zero_cst (vectype1);
5268           /* Case 2: Create:
5269              for (offset = nelements/2; offset >= 1; offset/=2)
5270                 {
5271                   Create:  va' = vec_shift <va, offset>
5272                   Create:  va = vop <va, va'>
5273                 }  */
5274 
5275           tree rhs;
5276 
5277           if (dump_enabled_p ())
5278             dump_printf_loc (MSG_NOTE, vect_location,
5279 			     "Reduce using vector shifts\n");
5280 
5281 	  gimple_seq stmts = NULL;
5282 	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
5283           for (elt_offset = nelements / 2;
5284                elt_offset >= 1;
5285                elt_offset /= 2)
5286             {
5287 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5288 	      indices.new_vector (sel, 2, nelements);
5289 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5290 	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5291 				       new_temp, zero_vec, mask);
5292 	      new_temp = gimple_build (&stmts, code,
5293 				       vectype1, new_name, new_temp);
5294             }
5295 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5296 
5297 	  /* 2.4  Extract the final scalar result.  Create:
5298 	     s_out3 = extract_field <v_out2, bitpos>  */
5299 
5300 	  if (dump_enabled_p ())
5301 	    dump_printf_loc (MSG_NOTE, vect_location,
5302 			     "extract scalar result\n");
5303 
5304 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5305 			bitsize, bitsize_zero_node);
5306 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5307 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5308 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5309 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5310 	  scalar_results.safe_push (new_temp);
5311         }
5312       else
5313         {
5314           /* Case 3: Create:
5315              s = extract_field <v_out2, 0>
5316              for (offset = element_size;
5317                   offset < vector_size;
5318                   offset += element_size;)
5319                {
5320                  Create:  s' = extract_field <v_out2, offset>
5321                  Create:  s = op <s, s'>  // For non SLP cases
5322                }  */
5323 
5324           if (dump_enabled_p ())
5325             dump_printf_loc (MSG_NOTE, vect_location,
5326 			     "Reduce using scalar code.\n");
5327 
5328 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5329 	  int element_bitsize = tree_to_uhwi (bitsize);
5330 	  tree compute_type = TREE_TYPE (vectype);
5331 	  gimple_seq stmts = NULL;
5332           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5333             {
5334               int bit_offset;
5335               if (gimple_code (new_phi) == GIMPLE_PHI)
5336                 vec_temp = PHI_RESULT (new_phi);
5337               else
5338                 vec_temp = gimple_assign_lhs (new_phi);
5339 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5340 				       vec_temp, bitsize, bitsize_zero_node);
5341 
5342               /* In SLP we don't need to apply the reduction operation, so we
5343                  just collect the s' values in SCALAR_RESULTS.  */
5344               if (slp_reduc)
5345                 scalar_results.safe_push (new_temp);
5346 
5347               for (bit_offset = element_bitsize;
5348                    bit_offset < vec_size_in_bits;
5349                    bit_offset += element_bitsize)
5350                 {
5351                   tree bitpos = bitsize_int (bit_offset);
5352 		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
5353 					   compute_type, vec_temp,
5354 					   bitsize, bitpos);
5355                   if (slp_reduc)
5356                     {
5357                       /* In SLP we don't need to apply the reduction operation,
5358                          so we just collect the s' values in SCALAR_RESULTS.  */
5359                       new_temp = new_name;
5360                       scalar_results.safe_push (new_name);
5361                     }
5362                   else
5363 		    new_temp = gimple_build (&stmts, code, compute_type,
5364 					     new_name, new_temp);
5365                 }
5366             }
5367 
5368           /* The only case where we need to reduce scalar results in SLP is
5369              unrolling.  If the size of SCALAR_RESULTS is greater than
5370              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5371              REDUC_GROUP_SIZE.  */
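          /* For instance (hypothetical values), with REDUC_GROUP_SIZE == 2
             and extracted scalars {a0, b0, a1, b1}, the loop below folds a1
             into slot 0 and b1 into slot 1, leaving {a0 op a1, b0 op b1}.  */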
5372           if (slp_reduc)
5373             {
5374               tree res, first_res, new_res;
5375 
5376               /* Reduce multiple scalar results in case of SLP unrolling.  */
5377               for (j = group_size; scalar_results.iterate (j, &res);
5378                    j++)
5379                 {
5380                   first_res = scalar_results[j % group_size];
5381 		  new_res = gimple_build (&stmts, code, compute_type,
5382 					  first_res, res);
5383                   scalar_results[j % group_size] = new_res;
5384                 }
5385 	      for (k = 0; k < group_size; k++)
5386 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
5387 						    scalar_results[k]);
5388             }
5389           else
5390 	    {
5391 	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5392 	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5393 	      scalar_results.safe_push (new_temp);
5394 	    }
5395 
5396 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5397         }
5398 
5399       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5400 	  && induc_val)
5401 	{
5402 	  /* Earlier we set the initial value to be a vector of induc_val
5403 	     values.  Check the result and if it is induc_val then replace
5404 	     with the original initial value, unless induc_val is
5405 	     the same as initial_def already.  */
5406 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5407 				  induc_val);
5408 
5409 	  tree tmp = make_ssa_name (new_scalar_dest);
5410 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5411 					     initial_def, new_temp);
5412 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5413 	  scalar_results[0] = tmp;
5414 	}
5415     }
5416 
5417   /* 2.5 Adjust the final result by the initial value of the reduction
5418 	 variable. (When such adjustment is not needed, then
5419 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
5420 	 new_temp = loop_exit_def + adjustment_def  */
5421 
5422   if (adjustment_def)
5423     {
5424       gcc_assert (!slp_reduc);
5425       gimple_seq stmts = NULL;
5426       if (nested_in_vect_loop)
5427 	{
5428           new_phi = new_phis[0];
5429 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5430 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5431 	  new_temp = gimple_build (&stmts, code, vectype,
5432 				   PHI_RESULT (new_phi), adjustment_def);
5433 	}
5434       else
5435 	{
5436           new_temp = scalar_results[0];
5437 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5438 	  adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5439 	  new_temp = gimple_build (&stmts, code, scalar_type,
5440 				   new_temp, adjustment_def);
5441 	}
5442 
5443       epilog_stmt = gimple_seq_last_stmt (stmts);
5444       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5445       if (nested_in_vect_loop)
5446         {
5447 	  stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5448 	  STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5449 	    = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5450 
5451           if (!double_reduc)
5452             scalar_results.quick_push (new_temp);
5453           else
5454             scalar_results[0] = new_temp;
5455         }
5456       else
5457         scalar_results[0] = new_temp;
5458 
5459       new_phis[0] = epilog_stmt;
5460     }
5461 
5462   if (double_reduc)
5463     loop = loop->inner;
5464 
5465   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5466           phis with new adjusted scalar results, i.e., replace use <s_out0>
5467           with use <s_out4>.
5468 
5469      Transform:
5470         loop_exit:
5471           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5472           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5473           v_out2 = reduce <v_out1>
5474           s_out3 = extract_field <v_out2, 0>
5475           s_out4 = adjust_result <s_out3>
5476           use <s_out0>
5477           use <s_out0>
5478 
5479      into:
5480 
5481         loop_exit:
5482           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5483           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5484           v_out2 = reduce <v_out1>
5485           s_out3 = extract_field <v_out2, 0>
5486           s_out4 = adjust_result <s_out3>
5487           use <s_out4>
5488           use <s_out4> */
5489 
5490 
5491   /* In an SLP reduction chain we reduce vector results into one vector if
5492      necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5493      LHS of the last stmt in the reduction chain, since we are looking for
5494      the loop exit phi node.  */
5495   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5496     {
5497       stmt_vec_info dest_stmt_info
5498 	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5499       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5500       group_size = 1;
5501     }
5502 
5503   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5504      case REDUC_GROUP_SIZE is greater than the vectorization factor).
5505      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5506      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5507      correspond to the first vector stmt, etc.
5508      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5509   if (group_size > new_phis.length ())
5510     gcc_assert (!(group_size % new_phis.length ()));
5511 
5512   for (k = 0; k < group_size; k++)
5513     {
5514       if (slp_reduc)
5515         {
5516 	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5517 
5518 	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5519 	  /* SLP statements can't participate in patterns.  */
5520 	  gcc_assert (!orig_stmt_info);
5521 	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5522         }
5523 
5524       if (nested_in_vect_loop)
5525         {
5526           if (double_reduc)
5527             loop = outer_loop;
5528           else
5529 	    gcc_unreachable ();
5530         }
5531 
5532       phis.create (3);
5533       /* Find the loop-closed-use at the loop exit of the original scalar
5534          result.  (The reduction result is expected to have two immediate uses,
5535          one at the latch block, and one at the loop exit).  For double
5536          reductions we are looking for exit phis of the outer loop.  */
5537       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5538         {
5539           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5540 	    {
5541 	      if (!is_gimple_debug (USE_STMT (use_p)))
5542 		phis.safe_push (USE_STMT (use_p));
5543 	    }
5544           else
5545             {
5546               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5547                 {
5548                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5549 
5550                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5551                     {
5552                       if (!flow_bb_inside_loop_p (loop,
5553                                              gimple_bb (USE_STMT (phi_use_p)))
5554 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
5555                         phis.safe_push (USE_STMT (phi_use_p));
5556                     }
5557                 }
5558             }
5559         }
5560 
5561       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5562         {
5563           /* Replace the uses:  */
5564           orig_name = PHI_RESULT (exit_phi);
5565           scalar_result = scalar_results[k];
5566           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5567 	    {
5568 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5569 		SET_USE (use_p, scalar_result);
5570 	      update_stmt (use_stmt);
5571 	    }
5572         }
5573 
5574       phis.release ();
5575     }
5576 }
5577 
5578 /* Return a vector of type VECTYPE that is equal to the vector select
5579    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5580    before GSI.  */
5581 
5582 static tree
5583 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5584 		     tree vec, tree identity)
5585 {
5586   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5587   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5588 					  mask, vec, identity);
5589   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5590   return cond;
5591 }
5592 
5593 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5594    order, starting with LHS.  Insert the extraction statements before GSI and
5595    associate the new scalar SSA names with variable SCALAR_DEST.
5596    Return the SSA name for the result.  */
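/* For example (illustration only): with CODE == PLUS_EXPR, LHS == s0 and a
   four-element VECTOR_RHS v, the emitted statements are equivalent to

     s1 = s0 + v[0];
     s2 = s1 + v[1];
     s3 = s2 + v[2];
     s4 = s3 + v[3];

   and s4 is returned, preserving the strict left-to-right association an
   in-order reduction requires.  */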
5597 
5598 static tree
5599 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5600 		       tree_code code, tree lhs, tree vector_rhs)
5601 {
5602   tree vectype = TREE_TYPE (vector_rhs);
5603   tree scalar_type = TREE_TYPE (vectype);
5604   tree bitsize = TYPE_SIZE (scalar_type);
5605   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5606   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5607 
5608   for (unsigned HOST_WIDE_INT bit_offset = 0;
5609        bit_offset < vec_size_in_bits;
5610        bit_offset += element_bitsize)
5611     {
5612       tree bitpos = bitsize_int (bit_offset);
5613       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5614 			 bitsize, bitpos);
5615 
5616       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5617       rhs = make_ssa_name (scalar_dest, stmt);
5618       gimple_assign_set_lhs (stmt, rhs);
5619       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5620 
5621       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5622       tree new_name = make_ssa_name (scalar_dest, stmt);
5623       gimple_assign_set_lhs (stmt, new_name);
5624       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5625       lhs = new_name;
5626     }
5627   return lhs;
5628 }
5629 
5630 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
5631    type of the vector input.  */
5632 
5633 static internal_fn
5634 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5635 {
5636   internal_fn mask_reduc_fn;
5637 
5638   switch (reduc_fn)
5639     {
5640     case IFN_FOLD_LEFT_PLUS:
5641       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5642       break;
5643 
5644     default:
5645       return IFN_LAST;
5646     }
5647 
5648   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5649 				      OPTIMIZE_FOR_SPEED))
5650     return mask_reduc_fn;
5651   return IFN_LAST;
5652 }
5653 
5654 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5655    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5656    statement.  CODE is the operation performed by STMT_INFO and OPS are
5657    its scalar operands.  REDUC_INDEX is the index of the operand in
5658    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5659    implements in-order reduction, or IFN_LAST if we should open-code it.
5660    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5661    that should be used to control the operation in a fully-masked loop.  */
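/* A rough sketch (not a literal GIMPLE dump): with a single vector per
   iteration and REDUC_FN == IFN_FOLD_LEFT_PLUS, the transformed loop body
   contains

     reduc_1 = PHI <init, reduc_2>
     ...
     reduc_2 = .FOLD_LEFT_PLUS (reduc_1, vec_def);

   so the scalar accumulator is threaded through the vector statement and
   the additions are performed in the original (in-order) sequence.  */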
5662 
5663 static bool
5664 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5665 			       gimple_stmt_iterator *gsi,
5666 			       stmt_vec_info *vec_stmt, slp_tree slp_node,
5667 			       gimple *reduc_def_stmt,
5668 			       tree_code code, internal_fn reduc_fn,
5669 			       tree ops[3], tree vectype_in,
5670 			       int reduc_index, vec_loop_masks *masks)
5671 {
5672   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5673   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5674   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5675   stmt_vec_info new_stmt_info = NULL;
5676   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5677 
5678   int ncopies;
5679   if (slp_node)
5680     ncopies = 1;
5681   else
5682     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5683 
5684   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5685   gcc_assert (ncopies == 1);
5686   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5687 
5688   if (slp_node)
5689     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5690 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
5691 
5692   tree op0 = ops[1 - reduc_index];
5693 
5694   int group_size = 1;
5695   stmt_vec_info scalar_dest_def_info;
5696   auto_vec<tree> vec_oprnds0;
5697   if (slp_node)
5698     {
5699       auto_vec<vec<tree> > vec_defs (2);
5700       vect_get_slp_defs (slp_node, &vec_defs);
5701       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5702       vec_defs[0].release ();
5703       vec_defs[1].release ();
5704       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5705       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5706     }
5707   else
5708     {
5709       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5710       vec_oprnds0.create (1);
5711       vec_oprnds0.quick_push (loop_vec_def0);
5712       scalar_dest_def_info = stmt_info;
5713     }
5714 
5715   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5716   tree scalar_type = TREE_TYPE (scalar_dest);
5717   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5718 
5719   int vec_num = vec_oprnds0.length ();
5720   gcc_assert (vec_num == 1 || slp_node);
5721   tree vec_elem_type = TREE_TYPE (vectype_out);
5722   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5723 
5724   tree vector_identity = NULL_TREE;
5725   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5726     vector_identity = build_zero_cst (vectype_out);
5727 
5728   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5729   int i;
5730   tree def0;
5731   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5732     {
5733       gimple *new_stmt;
5734       tree mask = NULL_TREE;
5735       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5736 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5737 
5738       /* Handle MINUS by adding the negative.  */
5739       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5740 	{
5741 	  tree negated = make_ssa_name (vectype_out);
5742 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5743 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5744 	  def0 = negated;
5745 	}
5746 
5747       if (mask && mask_reduc_fn == IFN_LAST)
5748 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5749 				    vector_identity);
5750 
5751       /* On the first iteration the input is simply the scalar phi
5752 	 result, and for subsequent iterations it is the output of
5753 	 the preceding operation.  */
5754       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5755 	{
5756 	  if (mask && mask_reduc_fn != IFN_LAST)
5757 	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5758 						   def0, mask);
5759 	  else
5760 	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5761 						   def0);
5762 	  /* For chained SLP reductions the output of the previous reduction
5763 	     operation serves as the input of the next. For the final statement
5764 	     the output cannot be a temporary - we reuse the original
5765 	     scalar destination of the last statement.  */
5766 	  if (i != vec_num - 1)
5767 	    {
5768 	      gimple_set_lhs (new_stmt, scalar_dest_var);
5769 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5770 	      gimple_set_lhs (new_stmt, reduc_var);
5771 	    }
5772 	}
5773       else
5774 	{
5775 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5776 					     reduc_var, def0);
5777 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5778 	  /* Remove the statement, so that we can use the same code paths
5779 	     as for statements that we've just created.  */
5780 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5781 	  gsi_remove (&tmp_gsi, true);
5782 	}
5783 
5784       if (i == vec_num - 1)
5785 	{
5786 	  gimple_set_lhs (new_stmt, scalar_dest);
5787 	  new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5788 						    new_stmt);
5789 	}
5790       else
5791 	new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5792 						     new_stmt, gsi);
5793 
5794       if (slp_node)
5795 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5796     }
5797 
5798   if (!slp_node)
5799     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5800 
5801   return true;
5802 }
5803 
5804 /* Function is_nonwrapping_integer_induction.
5805 
5806    Check if STMT_VINFO (which is part of loop LOOP) both increments and
5807    does not cause overflow.  */
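/* Worked example (illustrative values): for a 16-bit unsigned induction
   with BASE 10 and STEP 3 in a loop executing at most 1000 times, the
   largest value reached is 10 + 3 * 1000 = 3010, which needs only 12 bits
   of precision and therefore fits in the 16-bit type, so the induction is
   known not to wrap.  */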
5808 
5809 static bool
5810 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5811 {
5812   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5813   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5814   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5815   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5816   widest_int ni, max_loop_value, lhs_max;
5817   wi::overflow_type overflow = wi::OVF_NONE;
5818 
5819   /* Make sure the loop is integer based.  */
5820   if (TREE_CODE (base) != INTEGER_CST
5821       || TREE_CODE (step) != INTEGER_CST)
5822     return false;
5823 
5824   /* Check that the maximum value reached by the induction will not wrap.  */
5825 
5826   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5827     return true;
5828 
5829   if (! max_stmt_executions (loop, &ni))
5830     return false;
5831 
5832   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5833 			    &overflow);
5834   if (overflow)
5835     return false;
5836 
5837   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5838 			    TYPE_SIGN (lhs_type), &overflow);
5839   if (overflow)
5840     return false;
5841 
5842   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5843 	  <= TYPE_PRECISION (lhs_type));
5844 }
5845 
5846 /* Check if masking can be supported by inserting a conditional expression.
5847    CODE is the code for the operation.  COND_FN is the conditional internal
5848    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
5849 static bool
5850 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5851 			 tree vectype_in)
5852 {
5853   if (cond_fn != IFN_LAST
5854       && direct_internal_fn_supported_p (cond_fn, vectype_in,
5855 					 OPTIMIZE_FOR_SPEED))
5856     return false;
5857 
5858   switch (code)
5859     {
5860     case DOT_PROD_EXPR:
5861     case SAD_EXPR:
5862       return true;
5863 
5864     default:
5865       return false;
5866     }
5867 }
5868 
5869 /* Insert a conditional expression to enable masked vectorization.  CODE is the
5870    code for the operation.  VOP is the array of operands.  MASK is the loop
5871    mask.  GSI is a statement iterator used to place the new conditional
5872    expression.  */
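/* Rough illustration of the effect for the two supported codes:

     DOT_PROD_EXPR:  vop[1] = VEC_COND_EXPR <mask, vop[1], {0, ..., 0}>;
     SAD_EXPR:       vop[1] = VEC_COND_EXPR <mask, vop[1], vop[0]>;

   so that inactive lanes contribute nothing to the accumulator (a zero
   product for DOT_PROD_EXPR, a zero absolute difference for SAD_EXPR).  */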
5873 static void
5874 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5875 		      gimple_stmt_iterator *gsi)
5876 {
5877   switch (code)
5878     {
5879     case DOT_PROD_EXPR:
5880       {
5881 	tree vectype = TREE_TYPE (vop[1]);
5882 	tree zero = build_zero_cst (vectype);
5883 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5884 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5885 					       mask, vop[1], zero);
5886 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
5887 	vop[1] = masked_op1;
5888 	break;
5889       }
5890 
5891     case SAD_EXPR:
5892       {
5893 	tree vectype = TREE_TYPE (vop[1]);
5894 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5895 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5896 					       mask, vop[1], vop[0]);
5897 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
5898 	vop[1] = masked_op1;
5899 	break;
5900       }
5901 
5902     default:
5903       gcc_unreachable ();
5904     }
5905 }
5906 
5907 /* Function vectorizable_reduction.
5908 
5909    Check if STMT_INFO performs a reduction operation that can be vectorized.
5910    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5911    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5912    Return true if STMT_INFO is vectorizable in this way.
5913 
5914    This function also handles reduction idioms (patterns) that have been
5915    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
5916    may be of this form:
5917      X = pattern_expr (arg0, arg1, ..., X)
5918    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5919    sequence that had been detected and replaced by the pattern-stmt
5920    (STMT_INFO).
5921 
5922    This function also handles reduction of condition expressions, for example:
5923      for (int i = 0; i < N; i++)
5924        if (a[i] < value)
5925 	 last = a[i];
5926    This is handled by vectorising the loop and creating an additional vector
5927    containing the loop indexes for which "a[i] < value" was true.  In the
5928    function epilogue this is reduced to a single max value and then used to
5929    index into the vector of results.
5930 
5931    In some cases of reduction patterns, the type of the reduction variable X is
5932    different than the type of the other arguments of STMT_INFO.
5933    In such cases, the vectype that is used when transforming STMT_INFO into
5934    a vector stmt is different than the vectype that is used to determine the
5935    vectorization factor, because it consists of a different number of elements
5936    than the actual number of elements that are being operated upon in parallel.
5937 
5938    For example, consider an accumulation of shorts into an int accumulator.
5939    On some targets it's possible to vectorize this pattern operating on 8
5940    shorts at a time (hence, the vectype for purposes of determining the
5941    vectorization factor should be V8HI); on the other hand, the vectype that
5942    is used to create the vector form is actually V4SI (the type of the result).
5943 
5944    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5945    indicates what is the actual level of parallelism (V8HI in the example), so
5946    that the right vectorization factor would be derived.  This vectype
5947    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5948    be used to create the vectorized stmt.  The right vectype for the vectorized
5949    stmt is obtained from the type of the result X:
5950       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5951 
5952    This means that, contrary to "regular" reductions (or "regular" stmts in
5953    general), the following equation:
5954       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5955    does *NOT* necessarily hold for reduction patterns.  */
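/* For example (illustrative only), the widened accumulation

	short a[N];  int sum = 0;
	for (int i = 0; i < N; i++)
	  sum += a[i];

   may be recognized as  sum = WIDEN_SUM_EXPR <a[i], sum>;  the vectype
   recorded in STMT_VINFO_VECTYPE is that of the shorts (e.g. V8HI) and
   drives the vectorization factor, whereas the vectorized statement itself
   produces values of the accumulator's vector type (e.g. V4SI).  */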
5956 
5957 bool
5958 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5959 			slp_instance slp_node_instance,
5960 			stmt_vector_for_cost *cost_vec)
5961 {
5962   tree scalar_dest;
5963   tree vectype_in = NULL_TREE;
5964   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5965   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5966   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5967   stmt_vec_info cond_stmt_vinfo = NULL;
5968   tree scalar_type;
5969   int i;
5970   int ncopies;
5971   bool single_defuse_cycle = false;
5972   bool nested_cycle = false;
5973   bool double_reduc = false;
5974   int vec_num;
5975   tree tem;
5976   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5977   tree cond_reduc_val = NULL_TREE;
5978 
5979   /* Make sure it was already recognized as a reduction computation.  */
5980   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5981       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5982       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5983     return false;
5984 
5985   /* The stmt we store reduction analysis meta on.  */
5986   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5987   reduc_info->is_reduc_info = true;
5988 
5989   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5990     {
5991       if (is_a <gphi *> (stmt_info->stmt))
5992 	/* Analysis for double-reduction is done on the outer
5993 	   loop PHI; nested cycles have no further restrictions.  */
5994 	STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5995       else
5996 	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5997       return true;
5998     }
5999 
6000   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6001   stmt_vec_info phi_info = stmt_info;
6002   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6003       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6004     {
6005       if (!is_a <gphi *> (stmt_info->stmt))
6006 	{
6007 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6008 	  return true;
6009 	}
6010       if (slp_node)
6011 	{
6012 	  slp_node_instance->reduc_phis = slp_node;
6013 	  /* ???  We're leaving slp_node to point to the PHIs; we only
6014 	     need it to get at the number of vector stmts, which wasn't
6015 	     yet initialized for the instance root.  */
6016 	}
6017       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6018 	stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6019       else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6020 	{
6021 	  use_operand_p use_p;
6022 	  gimple *use_stmt;
6023 	  bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6024 				     &use_p, &use_stmt);
6025 	  gcc_assert (res);
6026 	  phi_info = loop_vinfo->lookup_stmt (use_stmt);
6027 	  stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6028 	}
6029     }
6030 
6031   /* PHIs should not participate in patterns.  */
6032   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6033   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6034 
6035   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6036      and compute the reduction chain length.  */
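  /* E.g. for the (hypothetical) chain

	 s_1 = _a + s_0;
	 s_2 = _b + s_1;

     with PHI result s_0 and latch value s_2, following the REDUC_IDX
     operand from s_2 back to s_0 gives a reduction chain length of 2.  */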
6037   tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6038 					  loop_latch_edge (loop));
6039   unsigned reduc_chain_length = 0;
6040   bool only_slp_reduc_chain = true;
6041   stmt_info = NULL;
6042   while (reduc_def != PHI_RESULT (reduc_def_phi))
6043     {
6044       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6045       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6046       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6047 	{
6048 	  if (dump_enabled_p ())
6049 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050 			     "reduction chain broken by patterns.\n");
6051 	  return false;
6052 	}
6053       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6054 	only_slp_reduc_chain = false;
6055       /* ???  For epilogue generation live members of the chain need
6056          to point back to the PHI via their original stmt for
6057 	 info_for_reduction to work.  */
6058       if (STMT_VINFO_LIVE_P (vdef))
6059 	STMT_VINFO_REDUC_DEF (def) = phi_info;
6060       gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6061       if (!assign)
6062 	{
6063 	  if (dump_enabled_p ())
6064 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6065 			     "reduction chain includes calls.\n");
6066 	  return false;
6067 	}
6068       if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6069 	{
6070 	  if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6071 				      TREE_TYPE (gimple_assign_rhs1 (assign))))
6072 	    {
6073 	      if (dump_enabled_p ())
6074 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6075 				 "conversion in the reduction chain.\n");
6076 	      return false;
6077 	    }
6078 	}
6079       else if (!stmt_info)
6080 	/* First non-conversion stmt.  */
6081 	stmt_info = vdef;
6082       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6083       reduc_chain_length++;
6084     }
6085   /* PHIs should not participate in patterns.  */
6086   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6087 
6088   if (nested_in_vect_loop_p (loop, stmt_info))
6089     {
6090       loop = loop->inner;
6091       nested_cycle = true;
6092     }
6093 
6094   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6095      element.  */
6096   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6097     {
6098       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6099       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6100     }
6101   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6102     gcc_assert (slp_node
6103 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6104 
6105   /* 1. Is vectorizable reduction?  */
6106   /* Not supportable if the reduction variable is used in the loop, unless
6107      it's a reduction chain.  */
6108   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6109       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6110     return false;
6111 
6112   /* Reductions that are not used even in an enclosing outer-loop
6113      are expected to be "live" (used out of the loop).  */
6114   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6115       && !STMT_VINFO_LIVE_P (stmt_info))
6116     return false;
6117 
6118   /* 2. Has this been recognized as a reduction pattern?
6119 
6120      Check if STMT represents a pattern that has been recognized
6121      in earlier analysis stages.  For stmts that represent a pattern,
6122      the STMT_VINFO_RELATED_STMT field records the last stmt in
6123      the original sequence that constitutes the pattern.  */
6124 
6125   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6126   if (orig_stmt_info)
6127     {
6128       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6129       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6130     }
6131 
6132   /* 3. Check the operands of the operation.  The first operands are defined
6133         inside the loop body. The last operand is the reduction variable,
6134         which is defined by the loop-header-phi.  */
6135 
6136   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6137   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6138   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6139   enum tree_code code = gimple_assign_rhs_code (stmt);
6140   bool lane_reduc_code_p
6141     = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6142   int op_type = TREE_CODE_LENGTH (code);
6143 
6144   scalar_dest = gimple_assign_lhs (stmt);
6145   scalar_type = TREE_TYPE (scalar_dest);
6146   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6147       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6148     return false;
6149 
6150   /* Do not try to vectorize bit-precision reductions.  */
6151   if (!type_has_mode_precision_p (scalar_type))
6152     return false;
6153 
6154   /* For lane-reducing ops we're reducing the number of reduction PHIs,
6155      which means the only use of such a PHI may be in the lane-reducing operation.  */
6156   if (lane_reduc_code_p
6157       && reduc_chain_length != 1
6158       && !only_slp_reduc_chain)
6159     {
6160       if (dump_enabled_p ())
6161 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6162 			 "lane-reducing reduction with extra stmts.\n");
6163       return false;
6164     }
6165 
6166   /* All uses but the last are expected to be defined in the loop.
6167      The last use is the reduction variable.  In case of nested cycle this
6168      assumption is not true: we use reduc_index to record the index of the
6169      reduction variable.  */
6170   reduc_def = PHI_RESULT (reduc_def_phi);
6171   for (i = 0; i < op_type; i++)
6172     {
6173       tree op = gimple_op (stmt, i + 1);
6174       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6175       if (i == 0 && code == COND_EXPR)
6176         continue;
6177 
6178       stmt_vec_info def_stmt_info;
6179       enum vect_def_type dt;
6180       if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6181 			       &def_stmt_info))
6182 	{
6183 	  if (dump_enabled_p ())
6184 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6185 			     "use not simple.\n");
6186 	  return false;
6187 	}
6188       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6189 	continue;
6190 
6191       /* There should be only one cycle def in the stmt, the one
6192          leading to reduc_def.  */
6193       if (VECTORIZABLE_CYCLE_DEF (dt))
6194 	return false;
6195 
6196       /* To properly compute ncopies we are interested in the widest
6197 	 non-reduction input type in case we're looking at a widening
6198 	 accumulation that we later handle in vect_transform_reduction.  */
6199       if (lane_reduc_code_p
6200 	  && tem
6201 	  && (!vectype_in
6202 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6203 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6204 	vectype_in = tem;
6205 
6206       if (code == COND_EXPR)
6207 	{
6208 	  /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6209 	  if (dt == vect_constant_def)
6210 	    {
6211 	      cond_reduc_dt = dt;
6212 	      cond_reduc_val = op;
6213 	    }
6214 	  if (dt == vect_induction_def
6215 	      && def_stmt_info
6216 	      && is_nonwrapping_integer_induction (def_stmt_info, loop))
6217 	    {
6218 	      cond_reduc_dt = dt;
6219 	      cond_stmt_vinfo = def_stmt_info;
6220 	    }
6221 	}
6222     }
6223   if (!vectype_in)
6224     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6225   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6226 
6227   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6228   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6229   /* If we have a condition reduction, see if we can simplify it further.  */
6230   if (v_reduc_type == COND_REDUCTION)
6231     {
6232       if (slp_node)
6233 	return false;
6234 
6235       /* When the condition uses the reduction value in the condition, fail.  */
6236       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6237 	{
6238 	  if (dump_enabled_p ())
6239 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6240 			     "condition depends on previous iteration\n");
6241 	  return false;
6242 	}
6243 
6244       if (reduc_chain_length == 1
6245 	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6246 					     vectype_in, OPTIMIZE_FOR_SPEED))
6247 	{
6248 	  if (dump_enabled_p ())
6249 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6250 			     "optimizing condition reduction with"
6251 			     " FOLD_EXTRACT_LAST.\n");
6252 	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6253 	}
6254       else if (cond_reduc_dt == vect_induction_def)
6255 	{
6256 	  tree base
6257 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6258 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6259 
6260 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6261 		      && TREE_CODE (step) == INTEGER_CST);
6262 	  cond_reduc_val = NULL_TREE;
6263 	  enum tree_code cond_reduc_op_code = ERROR_MARK;
6264 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6265 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6266 	    ;
6267 	  /* Find a suitable value: below base for MAX_EXPR, above base for
6268 	     MIN_EXPR; for now punt if base is the minimum value of the type
6269 	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6270 	  else if (tree_int_cst_sgn (step) == -1)
6271 	    {
6272 	      cond_reduc_op_code = MIN_EXPR;
6273 	      if (tree_int_cst_sgn (base) == -1)
6274 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6275 	      else if (tree_int_cst_lt (base,
6276 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6277 		cond_reduc_val
6278 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6279 	    }
6280 	  else
6281 	    {
6282 	      cond_reduc_op_code = MAX_EXPR;
6283 	      if (tree_int_cst_sgn (base) == 1)
6284 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6285 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6286 					base))
6287 		cond_reduc_val
6288 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6289 	    }
6290 	  if (cond_reduc_val)
6291 	    {
6292 	      if (dump_enabled_p ())
6293 		dump_printf_loc (MSG_NOTE, vect_location,
6294 				 "condition expression based on "
6295 				 "integer induction.\n");
6296 	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6297 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6298 		= cond_reduc_val;
6299 	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6300 	    }
6301 	}
6302       else if (cond_reduc_dt == vect_constant_def)
6303 	{
6304 	  enum vect_def_type cond_initial_dt;
6305 	  tree cond_initial_val
6306 	    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6307 
6308 	  gcc_assert (cond_reduc_val != NULL_TREE);
6309 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6310 	  if (cond_initial_dt == vect_constant_def
6311 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6312 				     TREE_TYPE (cond_reduc_val)))
6313 	    {
6314 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6315 				    cond_initial_val, cond_reduc_val);
6316 	      if (e && (integer_onep (e) || integer_zerop (e)))
6317 		{
6318 		  if (dump_enabled_p ())
6319 		    dump_printf_loc (MSG_NOTE, vect_location,
6320 				     "condition expression based on "
6321 				     "compile time constant.\n");
6322 		  /* Record reduction code at analysis stage.  */
6323 		  STMT_VINFO_REDUC_CODE (reduc_info)
6324 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6325 		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6326 		}
6327 	    }
6328 	}
6329     }
6330 
6331   if (STMT_VINFO_LIVE_P (phi_info))
6332     return false;
6333 
6334   if (slp_node)
6335     ncopies = 1;
6336   else
6337     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6338 
6339   gcc_assert (ncopies >= 1);
6340 
6341   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6342 
6343   if (nested_cycle)
6344     {
6345       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6346 		  == vect_double_reduction_def);
6347       double_reduc = true;
6348     }
6349 
6350   /* 4.2. Check support for the epilog operation.
6351 
6352           If STMT represents a reduction pattern, then the type of the
6353           reduction variable may be different than the type of the rest
6354           of the arguments.  For example, consider the case of accumulation
6355          of shorts into an int accumulator; the original code:
6356                        S1: int_a = (int) short_a;
6357          orig_stmt->   S2: int_acc = plus <int_a, int_acc>;
6358 
6359           was replaced with:
6360                         STMT: int_acc = widen_sum <short_a, int_acc>
6361 
6362           This means that:
6363           1. The tree-code that is used to create the vector operation in the
6364              epilog code (that reduces the partial results) is not the
6365              tree-code of STMT, but is rather the tree-code of the original
6366              stmt from the pattern that STMT is replacing.  I.e, in the example
6367              above we want to use 'widen_sum' in the loop, but 'plus' in the
6368              epilog.
6369           2. The type (mode) we use to check available target support
6370              for the vector operation to be created in the *epilog*, is
6371              determined by the type of the reduction variable (in the example
6372              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6373              However the type (mode) we use to check available target support
6374              for the vector operation to be created *inside the loop*, is
6375              determined by the type of the other arguments to STMT (in the
6376              example we'd check this: optab_handler (widen_sum_optab,
6377 	     vect_short_mode)).
6378 
6379           This is contrary to "regular" reductions, in which the types of all
6380           the arguments are the same as the type of the reduction variable.
6381           For "regular" reductions we can therefore use the same vector type
6382           (and also the same tree-code) when generating the epilog code and
6383           when generating the code inside the loop.  */
6384 
6385   enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6386   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6387 
6388   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6389   if (reduction_type == TREE_CODE_REDUCTION)
6390     {
6391       /* Check whether it's ok to change the order of the computation.
6392 	 Generally, when vectorizing a reduction we change the order of the
6393 	 computation.  This may change the behavior of the program in some
6394 	 cases, so we need to check that this is ok.  One exception is when
6395 	 vectorizing an outer-loop: the inner-loop is executed sequentially,
6396 	 and therefore vectorizing reductions in the inner-loop during
6397 	 outer-loop vectorization is safe.  */
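      /* For example, a scalar accumulation  s += a[i]  over doubles is
	 summed in a different order once vectorized (per-lane partial sums
	 combined in the epilogue); since FP addition is not associative,
	 without -ffast-math/-fassociative-math such a reduction is instead
	 forced onto the in-order FOLD_LEFT_REDUCTION path below.  */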
6398       if (needs_fold_left_reduction_p (scalar_type, orig_code))
6399 	{
6400 	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
6401 	     is not directly used in stmt.  */
6402 	  if (!only_slp_reduc_chain
6403 	      && reduc_chain_length != 1)
6404 	    {
6405 	      if (dump_enabled_p ())
6406 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6407 				 "in-order reduction chain without SLP.\n");
6408 	      return false;
6409 	    }
6410 	  STMT_VINFO_REDUC_TYPE (reduc_info)
6411 	    = reduction_type = FOLD_LEFT_REDUCTION;
6412 	}
6413       else if (!commutative_tree_code (orig_code)
6414 	       || !associative_tree_code (orig_code))
6415 	{
6416 	  if (dump_enabled_p ())
6417 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6418 			    "reduction: not commutative/associative.\n");
6419 	  return false;
6420 	}
6421     }
6422 
6423   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6424       && ncopies > 1)
6425     {
6426       if (dump_enabled_p ())
6427 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 			 "multiple types in double reduction or condition "
6429 			 "reduction or fold-left reduction.\n");
6430       return false;
6431     }
6432 
6433   internal_fn reduc_fn = IFN_LAST;
6434   if (reduction_type == TREE_CODE_REDUCTION
6435       || reduction_type == FOLD_LEFT_REDUCTION
6436       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6437       || reduction_type == CONST_COND_REDUCTION)
6438     {
6439       if (reduction_type == FOLD_LEFT_REDUCTION
6440 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
6441 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6442 	{
6443 	  if (reduc_fn != IFN_LAST
6444 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6445 						  OPTIMIZE_FOR_SPEED))
6446 	    {
6447 	      if (dump_enabled_p ())
6448 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6449 				 "reduc op not supported by target.\n");
6450 
6451 	      reduc_fn = IFN_LAST;
6452 	    }
6453 	}
6454       else
6455 	{
6456 	  if (!nested_cycle || double_reduc)
6457 	    {
6458 	      if (dump_enabled_p ())
6459 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6460 				 "no reduc code for scalar code.\n");
6461 
6462 	      return false;
6463 	    }
6464 	}
6465     }
6466   else if (reduction_type == COND_REDUCTION)
6467     {
6468       int scalar_precision
6469 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6470       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6471       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6472 						nunits_out);
6473 
6474       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6475 					  OPTIMIZE_FOR_SPEED))
6476 	reduc_fn = IFN_REDUC_MAX;
6477     }
6478   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6479 
6480   if (reduction_type != EXTRACT_LAST_REDUCTION
6481       && (!nested_cycle || double_reduc)
6482       && reduc_fn == IFN_LAST
6483       && !nunits_out.is_constant ())
6484     {
6485       if (dump_enabled_p ())
6486 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6487 			 "missing target support for reduction on"
6488 			 " variable-length vectors.\n");
6489       return false;
6490     }
6491 
6492   /* For SLP reductions, see if there is a neutral value we can use.  */
6493   tree neutral_op = NULL_TREE;
6494   if (slp_node)
6495     neutral_op = neutral_op_for_slp_reduction
6496       (slp_node_instance->reduc_phis, vectype_out, orig_code,
6497        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6498 
6499   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6500     {
6501       /* We can't support in-order reductions of code such as this:
6502 
6503 	   for (int i = 0; i < n1; ++i)
6504 	     for (int j = 0; j < n2; ++j)
6505 	       l += a[j];
6506 
6507 	 since GCC effectively transforms the loop when vectorizing:
6508 
6509 	   for (int i = 0; i < n1 / VF; ++i)
6510 	     for (int j = 0; j < n2; ++j)
6511 	       for (int k = 0; k < VF; ++k)
6512 		 l += a[j];
6513 
6514 	 which is a reassociation of the original operation.  */
6515       if (dump_enabled_p ())
6516 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 			 "in-order double reduction not supported.\n");
6518 
6519       return false;
6520     }
6521 
6522   if (reduction_type == FOLD_LEFT_REDUCTION
6523       && slp_node
6524       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6525     {
6526       /* We cannot use in-order reductions in this case because there is
6527 	 an implicit reassociation of the operations involved.  */
6528       if (dump_enabled_p ())
6529 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6530 			 "in-order unchained SLP reductions not supported.\n");
6531       return false;
6532     }
6533 
6534   /* For double reductions, and for SLP reductions with a neutral value,
6535      we construct a variable-length initial vector by loading a vector
6536      full of the neutral value and then shift-and-inserting the start
6537      values into the low-numbered elements.  */
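  /* Rough sketch (hypothetical start values s0 and s1, neutral value n):
     starting from { n, n, ..., n }, applying IFN_VEC_SHL_INSERT with s1
     and then with s0 yields { s0, s1, n, ..., n }, i.e. the start values
     end up in the low-numbered elements.  */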
6538   if ((double_reduc || neutral_op)
6539       && !nunits_out.is_constant ()
6540       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6541 					  vectype_out, OPTIMIZE_FOR_SPEED))
6542     {
6543       if (dump_enabled_p ())
6544 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6545 			 "reduction on variable-length vectors requires"
6546 			 " target support for a vector-shift-and-insert"
6547 			 " operation.\n");
6548       return false;
6549     }
6550 
6551   /* Check extra constraints for variable-length unchained SLP reductions.  */
6552   if (STMT_SLP_TYPE (stmt_info)
6553       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6554       && !nunits_out.is_constant ())
6555     {
6556       /* We checked above that we could build the initial vector when
6557 	 there's a neutral element value.  Check here for the case in
6558 	 which each SLP statement has its own initial value and in which
6559 	 that value needs to be repeated for every instance of the
6560 	 statement within the initial vector.  */
6561       unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6562       if (!neutral_op
6563 	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6564 					      TREE_TYPE (vectype_out)))
6565 	{
6566 	  if (dump_enabled_p ())
6567 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6568 			     "unsupported form of SLP reduction for"
6569 			     " variable-length vectors: cannot build"
6570 			     " initial vector.\n");
6571 	  return false;
6572 	}
6573       /* The epilogue code relies on the number of elements being a multiple
6574 	 of the group size.  The duplicate-and-interleave approach to setting
6575 	 up the initial vector does too.  */
6576       if (!multiple_p (nunits_out, group_size))
6577 	{
6578 	  if (dump_enabled_p ())
6579 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6580 			     "unsupported form of SLP reduction for"
6581 			     " variable-length vectors: the vector size"
6582 			     " is not a multiple of the number of results.\n");
6583 	  return false;
6584 	}
6585     }
6586 
6587   if (reduction_type == COND_REDUCTION)
6588     {
6589       widest_int ni;
6590 
6591       if (! max_loop_iterations (loop, &ni))
6592 	{
6593 	  if (dump_enabled_p ())
6594 	    dump_printf_loc (MSG_NOTE, vect_location,
6595 			     "loop count not known, cannot create cond "
6596 			     "reduction.\n");
6597 	  return false;
6598 	}
6599       /* Convert backedges to iterations.  */
6600       ni += 1;
6601 
6602       /* The additional index will be the same type as the condition.  Check
6603 	 that the loop iteration count fits into that type less one (because
6604 	 we use up the zero slot for when there are no matches).  */
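      /* For instance (illustrative numbers), a 16-bit scalar_type gives a
	 16-bit unsigned index type with maximum value 65535, so the loop is
	 rejected unless its iteration count is known to be below 65535.  */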
6605       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6606       if (wi::geu_p (ni, wi::to_widest (max_index)))
6607 	{
6608 	  if (dump_enabled_p ())
6609 	    dump_printf_loc (MSG_NOTE, vect_location,
6610 			     "loop size is greater than data size.\n");
6611 	  return false;
6612 	}
6613     }
6614 
6615   /* In case the vectorization factor (VF) is bigger than the number
6616      of elements that we can fit in a vectype (nunits), we have to generate
6617      more than one vector stmt - i.e. - we need to "unroll" the
6618      vector stmt by a factor VF/nunits.  For more details see documentation
6619      in vectorizable_operation.  */
6620 
6621   /* If the reduction is used in an outer loop we need to generate
6622      VF intermediate results, like so (e.g. for ncopies=2):
6623 	r0 = phi (init, r0)
6624 	r1 = phi (init, r1)
6625 	r0 = x0 + r0;
6626         r1 = x1 + r1;
6627     (i.e. we generate VF results in 2 registers).
6628     In this case we have a separate def-use cycle for each copy, and therefore
6629     for each copy we get the vector def for the reduction variable from the
6630     respective phi node created for this copy.
6631 
6632     Otherwise (the reduction is unused in the loop nest), we can combine
6633     together intermediate results, like so (e.g. for ncopies=2):
6634 	r = phi (init, r)
6635 	r = x0 + r;
6636 	r = x1 + r;
6637    (i.e. we generate VF/2 results in a single register).
6638    In this case for each copy we get the vector def for the reduction variable
6639    from the vectorized reduction operation generated in the previous iteration.
6640 
6641    This only works when we see both the reduction PHI and its only consumer
6642    in vectorizable_reduction and there are no intermediate stmts
6643    participating.  */
6644   if (ncopies > 1
6645       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6646       && reduc_chain_length == 1)
6647     single_defuse_cycle = true;
6648 
6649   if (single_defuse_cycle || lane_reduc_code_p)
6650     {
6651       gcc_assert (code != COND_EXPR);
6652 
6653       /* 4. Supportable by target?  */
6654       bool ok = true;
6655 
6656       /* 4.1. check support for the operation in the loop  */
6657       optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6658       if (!optab)
6659 	{
6660 	  if (dump_enabled_p ())
6661 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6662 			     "no optab.\n");
6663 	  ok = false;
6664         }
6665 
6666       machine_mode vec_mode = TYPE_MODE (vectype_in);
6667       if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6668         {
6669           if (dump_enabled_p ())
6670             dump_printf (MSG_NOTE, "op not supported by target.\n");
6671 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6672 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6673 	    ok = false;
6674 	  else
6675 	    if (dump_enabled_p ())
6676 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6677         }
6678 
6679       /* Worthwhile without SIMD support?  */
6680       if (ok
6681 	  && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6682 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6683         {
6684           if (dump_enabled_p ())
6685 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6686 			     "not worthwhile without SIMD support.\n");
6687 	  ok = false;
6688         }
6689 
6690       /* lane-reducing operations have to go through vect_transform_reduction.
6691          For the other cases try without the single cycle optimization.  */
6692       if (!ok)
6693 	{
6694 	  if (lane_reduc_code_p)
6695 	    return false;
6696 	  else
6697 	    single_defuse_cycle = false;
6698 	}
6699     }
6700   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6701 
6702   /* If the reduction stmt is one of the patterns that have lane
6703      reduction embedded, we cannot handle the case of ! single_defuse_cycle.  */
6704   if ((ncopies > 1 && ! single_defuse_cycle)
6705       && lane_reduc_code_p)
6706     {
6707       if (dump_enabled_p ())
6708 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6709 			 "multi def-use cycle not possible for lane-reducing "
6710 			 "reduction operation\n");
6711       return false;
6712     }
6713 
6714   if (slp_node)
6715     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6716   else
6717     vec_num = 1;
6718 
6719   vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6720 			     cost_vec);
6721   if (dump_enabled_p ()
6722       && reduction_type == FOLD_LEFT_REDUCTION)
6723     dump_printf_loc (MSG_NOTE, vect_location,
6724 		     "using an in-order (fold-left) reduction.\n");
6725   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6726   /* All but single defuse-cycle optimized, lane-reducing and fold-left
6727      reductions go through their own vectorizable_* routines.  */
6728   if (!single_defuse_cycle
6729       && code != DOT_PROD_EXPR
6730       && code != WIDEN_SUM_EXPR
6731       && code != SAD_EXPR
6732       && reduction_type != FOLD_LEFT_REDUCTION)
6733     {
6734       stmt_vec_info tem
6735 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6736       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6737 	{
6738 	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6739 	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6740 	}
6741       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6742       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6743     }
6744   else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6745     {
6746       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6747       internal_fn cond_fn = get_conditional_internal_fn (code);
6748 
6749       if (reduction_type != FOLD_LEFT_REDUCTION
6750 	  && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6751 	  && (cond_fn == IFN_LAST
6752 	      || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6753 						  OPTIMIZE_FOR_SPEED)))
6754 	{
6755 	  if (dump_enabled_p ())
6756 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6757 			     "can't use a fully-masked loop because no"
6758 			     " conditional operation is available.\n");
6759 	  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6760 	}
6761       else if (reduction_type == FOLD_LEFT_REDUCTION
6762 	       && reduc_fn == IFN_LAST
6763 	       && !expand_vec_cond_expr_p (vectype_in,
6764 					   truth_type_for (vectype_in),
6765 					   SSA_NAME))
6766 	{
6767 	  if (dump_enabled_p ())
6768 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 			     "can't use a fully-masked loop because no"
6770 			     " conditional operation is available.\n");
6771 	  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6772 	}
6773       else
6774 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6775 			       vectype_in, NULL);
6776     }
6777   return true;
6778 }
6779 
6780 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6781    value.  */
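/* As a sketch (assuming a PLUS_EXPR reduction in a fully-masked loop and
   that the target provides IFN_COND_ADD), the body below emits per vector
   statement roughly

	vect_sum.1 = .COND_ADD (loop_mask, vect_sum.0, vect_x, vect_sum.0);

   so that lanes whose mask bit is clear keep their previous accumulator
   value.  */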
6782 
6783 bool
6784 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6785 			  stmt_vec_info *vec_stmt, slp_tree slp_node)
6786 {
6787   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6788   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6789   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6790   int i;
6791   int ncopies;
6792   int j;
6793   int vec_num;
6794 
6795   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6796   gcc_assert (reduc_info->is_reduc_info);
6797 
6798   if (nested_in_vect_loop_p (loop, stmt_info))
6799     {
6800       loop = loop->inner;
6801       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6802     }
6803 
6804   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6805   enum tree_code code = gimple_assign_rhs_code (stmt);
6806   int op_type = TREE_CODE_LENGTH (code);
6807 
6808   /* Flatten RHS.  */
6809   tree ops[3];
6810   switch (get_gimple_rhs_class (code))
6811     {
6812     case GIMPLE_TERNARY_RHS:
6813       ops[2] = gimple_assign_rhs3 (stmt);
6814       /* Fall thru.  */
6815     case GIMPLE_BINARY_RHS:
6816       ops[0] = gimple_assign_rhs1 (stmt);
6817       ops[1] = gimple_assign_rhs2 (stmt);
6818       break;
6819     default:
6820       gcc_unreachable ();
6821     }
6822 
6823   /* All uses but the last are expected to be defined in the loop.
6824      The last use is the reduction variable.  In case of nested cycle this
6825      assumption is not true: we use reduc_index to record the index of the
6826      reduction variable.  */
6827   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6828   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6829   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6830   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6831 
6832   if (slp_node)
6833     {
6834       ncopies = 1;
6835       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6836     }
6837   else
6838     {
6839       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6840       vec_num = 1;
6841     }
6842 
6843   internal_fn cond_fn = get_conditional_internal_fn (code);
6844   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6845   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6846 
6847   /* Transform.  */
6848   stmt_vec_info new_stmt_info = NULL;
6849   stmt_vec_info prev_stmt_info;
6850   tree new_temp = NULL_TREE;
6851   auto_vec<tree> vec_oprnds0;
6852   auto_vec<tree> vec_oprnds1;
6853   auto_vec<tree> vec_oprnds2;
6854   tree def0;
6855 
6856   if (dump_enabled_p ())
6857     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6858 
6859   /* FORNOW: Multiple types are not supported for condition.  */
6860   if (code == COND_EXPR)
6861     gcc_assert (ncopies == 1);
6862 
6863   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6864 
6865   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6866   if (reduction_type == FOLD_LEFT_REDUCTION)
6867     {
6868       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6869       return vectorize_fold_left_reduction
6870 	  (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6871 	   reduc_fn, ops, vectype_in, reduc_index, masks);
6872     }
6873 
6874   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6875   gcc_assert (single_defuse_cycle
6876 	      || code == DOT_PROD_EXPR
6877 	      || code == WIDEN_SUM_EXPR
6878 	      || code == SAD_EXPR);
6879 
6880   /* Create the destination vector  */
6881   tree scalar_dest = gimple_assign_lhs (stmt);
6882   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6883 
6884   prev_stmt_info = NULL;
6885   if (!slp_node)
6886     {
6887       vec_oprnds0.create (1);
6888       vec_oprnds1.create (1);
6889       if (op_type == ternary_op)
6890         vec_oprnds2.create (1);
6891     }
6892 
6893   for (j = 0; j < ncopies; j++)
6894     {
6895       /* Handle uses.  */
6896       if (j == 0)
6897         {
6898 	  if (slp_node)
6899 	    {
6900 	      /* Get vec defs for all the operands except the reduction index,
6901 		 ensuring the ordering of the ops in the vector is kept.  */
6902 	      auto_vec<vec<tree>, 3> vec_defs;
6903 	      vect_get_slp_defs (slp_node, &vec_defs);
6904 	      vec_oprnds0.safe_splice (vec_defs[0]);
6905 	      vec_defs[0].release ();
6906 	      vec_oprnds1.safe_splice (vec_defs[1]);
6907 	      vec_defs[1].release ();
6908 	      if (op_type == ternary_op)
6909 		{
6910 		  vec_oprnds2.safe_splice (vec_defs[2]);
6911 		  vec_defs[2].release ();
6912 		}
6913 	    }
6914           else
6915 	    {
6916               vec_oprnds0.quick_push
6917 		(vect_get_vec_def_for_operand (ops[0], stmt_info));
6918               vec_oprnds1.quick_push
6919 		(vect_get_vec_def_for_operand (ops[1], stmt_info));
6920               if (op_type == ternary_op)
6921 		vec_oprnds2.quick_push
6922 		  (vect_get_vec_def_for_operand (ops[2], stmt_info));
6923 	    }
6924         }
6925       else
6926         {
6927           if (!slp_node)
6928             {
6929 	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6930 
6931 	      if (single_defuse_cycle && reduc_index == 0)
6932 		vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6933 	      else
6934 		vec_oprnds0[0]
6935 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6936 						    vec_oprnds0[0]);
6937 	      if (single_defuse_cycle && reduc_index == 1)
6938 		vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6939 	      else
6940 		vec_oprnds1[0]
6941 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6942 						    vec_oprnds1[0]);
6943 	      if (op_type == ternary_op)
6944 		{
6945 		  if (single_defuse_cycle && reduc_index == 2)
6946 		    vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6947 		  else
6948 		    vec_oprnds2[0]
6949 		      = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6950 							vec_oprnds2[0]);
6951 		}
6952             }
6953         }
6954 
6955       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6956         {
6957 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6958 	  if (masked_loop_p && !mask_by_cond_expr)
6959 	    {
6960 	      /* Make sure that the reduction accumulator is vop[0].  */
6961 	      if (reduc_index == 1)
6962 		{
6963 		  gcc_assert (commutative_tree_code (code));
6964 		  std::swap (vop[0], vop[1]);
6965 		}
6966 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6967 					      vectype_in, i * ncopies + j);
6968 	      gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6969 							vop[0], vop[1],
6970 							vop[0]);
6971 	      new_temp = make_ssa_name (vec_dest, call);
6972 	      gimple_call_set_lhs (call, new_temp);
6973 	      gimple_call_set_nothrow (call, true);
6974 	      new_stmt_info
6975 		= vect_finish_stmt_generation (stmt_info, call, gsi);
6976 	    }
6977 	  else
6978 	    {
6979 	      if (op_type == ternary_op)
6980 		vop[2] = vec_oprnds2[i];
6981 
6982 	      if (masked_loop_p && mask_by_cond_expr)
6983 		{
6984 		  tree mask = vect_get_loop_mask (gsi, masks,
6985 						  vec_num * ncopies,
6986 						  vectype_in, i * ncopies + j);
6987 		  build_vect_cond_expr (code, vop, mask, gsi);
6988 		}
6989 
6990 	      gassign *new_stmt = gimple_build_assign (vec_dest, code,
6991 						       vop[0], vop[1], vop[2]);
6992 	      new_temp = make_ssa_name (vec_dest, new_stmt);
6993 	      gimple_assign_set_lhs (new_stmt, new_temp);
6994 	      new_stmt_info
6995 		= vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6996 	    }
6997 
6998           if (slp_node)
6999 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7000         }
7001 
7002       if (slp_node || single_defuse_cycle)
7003         continue;
7004 
7005       if (j == 0)
7006 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7007       else
7008 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7009 
7010       prev_stmt_info = new_stmt_info;
7011     }
7012 
7013   if (single_defuse_cycle && !slp_node)
7014     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7015 
7016   return true;
7017 }
7018 
7019 /* Transform phase of a cycle PHI.  */
7020 
7021 bool
7022 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7023 			  slp_tree slp_node, slp_instance slp_node_instance)
7024 {
7025   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7026   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7027   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7028   int i;
7029   int ncopies;
7030   stmt_vec_info prev_phi_info;
7031   int j;
7032   bool nested_cycle = false;
7033   int vec_num;
7034 
7035   if (nested_in_vect_loop_p (loop, stmt_info))
7036     {
7037       loop = loop->inner;
7038       nested_cycle = true;
7039     }
7040 
7041   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7042   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7043   stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7044   gcc_assert (reduc_info->is_reduc_info);
7045 
7046   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7047       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7048     /* Leave the scalar phi in place.  */
7049     return true;
7050 
7051   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7052   /* For a nested cycle we do not fill the above.  */
7053   if (!vectype_in)
7054     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7055   gcc_assert (vectype_in);
7056 
7057   if (slp_node)
7058     {
7059       /* The size vect_schedule_slp_instance computes is off for us.  */
7060       vec_num = vect_get_num_vectors
7061 	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7062 	   * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7063       ncopies = 1;
7064     }
7065   else
7066     {
7067       vec_num = 1;
7068       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7069     }
7070 
7071   /* Check whether we should use a single PHI node and accumulate
7072      vectors to one before the backedge.  */
7073   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7074     ncopies = 1;
7075 
7076   /* Create the destination vector  */
7077   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7078   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7079 					       vectype_out);
7080 
7081   /* Get the loop-entry arguments.  */
7082   tree vec_initial_def;
7083   auto_vec<tree> vec_initial_defs;
7084   if (slp_node)
7085     {
7086       vec_initial_defs.reserve (vec_num);
7087       gcc_assert (slp_node == slp_node_instance->reduc_phis);
7088       stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7089       tree neutral_op
7090 	= neutral_op_for_slp_reduction (slp_node, vectype_out,
7091 					STMT_VINFO_REDUC_CODE (reduc_info),
7092 					first != NULL);
7093       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7094 				      &vec_initial_defs, vec_num,
7095 				      first != NULL, neutral_op);
7096     }
7097   else
7098     {
7099       /* Get at the scalar def before the loop, that defines the initial
7100 	 value of the reduction variable.  */
7101       tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7102 						loop_preheader_edge (loop));
7103       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7104 	 and we can't use zero for induc_val, use initial_def.  Similarly
7105 	 for REDUC_MIN and initial_def larger than the base.  */
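      /* A worked instance of the above (hypothetical values, not taken from
	 a particular testcase): for a MAX reduction with induc_val 10 and a
	 constant initial_def of 5, 5 < 10, so we start from initial_def
	 directly and clear STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL to tell the
	 epilogue code that initial_def was used.  */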
7106       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7107 	{
7108 	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7109 	  if (TREE_CODE (initial_def) == INTEGER_CST
7110 	      && !integer_zerop (induc_val)
7111 	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7112 		   && tree_int_cst_lt (initial_def, induc_val))
7113 		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7114 		      && tree_int_cst_lt (induc_val, initial_def))))
7115 	    {
7116 	      induc_val = initial_def;
7117 	      /* Communicate to the epilogue generation that we used
7118 		 the initial_def.  */
7119 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7120 	    }
7121 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7122 	}
7123       else if (nested_cycle)
7124 	{
7125 	  /* Do not use an adjustment def as that case is not supported
7126 	     correctly if ncopies is not one.  */
7127 	  vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7128 							  reduc_stmt_info);
7129 	}
7130       else
7131 	{
7132 	  tree adjustment_def = NULL_TREE;
7133 	  tree *adjustment_defp = &adjustment_def;
7134 	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7135 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7136 	    adjustment_defp = NULL;
7137 	  vec_initial_def
7138 	    = get_initial_def_for_reduction (reduc_stmt_info, code,
7139 					     initial_def, adjustment_defp);
7140 	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7141 	}
7142       vec_initial_defs.create (1);
7143       vec_initial_defs.quick_push (vec_initial_def);
7144     }
7145 
7146   /* Generate the reduction PHIs upfront.  */
7147   prev_phi_info = NULL;
7148   for (i = 0; i < vec_num; i++)
7149     {
7150       tree vec_init_def = vec_initial_defs[i];
7151       for (j = 0; j < ncopies; j++)
7152 	{
7153 	  /* Create the reduction-phi that defines the reduction
7154 	     operand.  */
7155 	  gphi *new_phi = create_phi_node (vec_dest, loop->header);
7156 	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7157 
7158 	  /* Set the loop-entry arg of the reduction-phi.  */
7159 	  if (j != 0 && nested_cycle)
7160 	    vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7161 							   vec_init_def);
7162 	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7163 		       UNKNOWN_LOCATION);
7164 
7165 	  /* The loop-latch arg is set in epilogue processing.  */
7166 
7167 	  if (slp_node)
7168 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7169 	  else
7170 	    {
7171 	      if (j == 0)
7172 		STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7173 	      else
7174 		STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7175 	      prev_phi_info = new_phi_info;
7176 	    }
7177 	}
7178     }
7179 
7180   return true;
7181 }
7182 
7183 /* Vectorizes LC PHIs.  */
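/* A loop-closed (LC) PHI has a single argument, e.g.

       x_1 = PHI <x_2 (single predecessor)>

   and is vectorized (sketched here, not verbatim output) simply as

       vect_x_1 = PHI <vect_x_2 (single predecessor)>

   with one such PHI per vector copy or SLP vector statement.  */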
7184 
7185 bool
7186 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7187 		     slp_tree slp_node)
7188 {
7189   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7190   if (!loop_vinfo
7191       || !is_a <gphi *> (stmt_info->stmt)
7192       || gimple_phi_num_args (stmt_info->stmt) != 1)
7193     return false;
7194 
7195   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7196       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7197     return false;
7198 
7199   if (!vec_stmt) /* transformation not required.  */
7200     {
7201       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7202       return true;
7203     }
7204 
7205   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7206   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7207   basic_block bb = gimple_bb (stmt_info->stmt);
7208   edge e = single_pred_edge (bb);
7209   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7210   vec<tree> vec_oprnds = vNULL;
7211   vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7212 		     stmt_info, &vec_oprnds, NULL, slp_node);
7213   if (slp_node)
7214     {
7215       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7216       gcc_assert (vec_oprnds.length () == vec_num);
7217       for (unsigned i = 0; i < vec_num; i++)
7218 	{
7219 	  /* Create the vectorized LC PHI node.  */
7220 	  gphi *new_phi = create_phi_node (vec_dest, bb);
7221 	  add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7222 	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7223 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7224 	}
7225     }
7226   else
7227     {
7228       unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7229       stmt_vec_info prev_phi_info = NULL;
7230       for (unsigned i = 0; i < ncopies; i++)
7231 	{
7232 	  if (i != 0)
7233 	    vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7234 	  /* Create the vectorized LC PHI node.  */
7235 	  gphi *new_phi = create_phi_node (vec_dest, bb);
7236 	  add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7237 	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7238 	  if (i == 0)
7239 	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7240 	  else
7241 	    STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7242 	  prev_phi_info = new_phi_info;
7243 	}
7244     }
7245   vec_oprnds.release ();
7246 
7247   return true;
7248 }
7249 
7250 
7251 /* Function vect_min_worthwhile_factor.
7252 
7253    For a loop where we could vectorize the operation indicated by CODE,
7254    return the minimum vectorization factor that makes it worthwhile
7255    to use generic vectors.  */
7256 static unsigned int
7257 vect_min_worthwhile_factor (enum tree_code code)
7258 {
7259   switch (code)
7260     {
7261     case PLUS_EXPR:
7262     case MINUS_EXPR:
7263     case NEGATE_EXPR:
7264       return 4;
7265 
7266     case BIT_AND_EXPR:
7267     case BIT_IOR_EXPR:
7268     case BIT_XOR_EXPR:
7269     case BIT_NOT_EXPR:
7270       return 2;
7271 
7272     default:
7273       return INT_MAX;
7274     }
7275 }
7276 
7277 /* Return true if VINFO indicates we are doing loop vectorization and if
7278    it is worth decomposing CODE operations into scalar operations for
7279    that loop's vectorization factor.  */
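/* For example (a worked instance of the table in vect_min_worthwhile_factor):
   with a constant vectorization factor of 4 this returns true for PLUS_EXPR
   (threshold 4), while a factor of 2 already suffices for BIT_AND_EXPR.  */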
7280 
7281 bool
7282 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7283 {
7284   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7285   unsigned HOST_WIDE_INT value;
7286   return (loop_vinfo
7287 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7288 	  && value >= vect_min_worthwhile_factor (code));
7289 }
7290 
7291 /* Function vectorizable_induction
7292 
7293    Check if STMT_INFO performs an induction computation that can be vectorized.
7294    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7295    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7296    Return true if STMT_INFO is vectorizable in this way.  */
7297 
7298 bool
7299 vectorizable_induction (stmt_vec_info stmt_info,
7300 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7301 			stmt_vec_info *vec_stmt, slp_tree slp_node,
7302 			stmt_vector_for_cost *cost_vec)
7303 {
7304   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7305   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7306   unsigned ncopies;
7307   bool nested_in_vect_loop = false;
7308   class loop *iv_loop;
7309   tree vec_def;
7310   edge pe = loop_preheader_edge (loop);
7311   basic_block new_bb;
7312   tree new_vec, vec_init, vec_step, t;
7313   tree new_name;
7314   gimple *new_stmt;
7315   gphi *induction_phi;
7316   tree induc_def, vec_dest;
7317   tree init_expr, step_expr;
7318   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7319   unsigned i;
7320   tree expr;
7321   gimple_seq stmts;
7322   imm_use_iterator imm_iter;
7323   use_operand_p use_p;
7324   gimple *exit_phi;
7325   edge latch_e;
7326   tree loop_arg;
7327   gimple_stmt_iterator si;
7328 
7329   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7330   if (!phi)
7331     return false;
7332 
7333   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7334     return false;
7335 
7336   /* Make sure it was recognized as induction computation.  */
7337   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7338     return false;
7339 
7340   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7341   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7342 
7343   if (slp_node)
7344     ncopies = 1;
7345   else
7346     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7347   gcc_assert (ncopies >= 1);
7348 
7349   /* FORNOW. These restrictions should be relaxed.  */
7350   if (nested_in_vect_loop_p (loop, stmt_info))
7351     {
7352       imm_use_iterator imm_iter;
7353       use_operand_p use_p;
7354       gimple *exit_phi;
7355       edge latch_e;
7356       tree loop_arg;
7357 
7358       if (ncopies > 1)
7359 	{
7360 	  if (dump_enabled_p ())
7361 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7362 			     "multiple types in nested loop.\n");
7363 	  return false;
7364 	}
7365 
7366       /* FORNOW: outer loop induction with SLP not supported.  */
7367       if (STMT_SLP_TYPE (stmt_info))
7368 	return false;
7369 
7370       exit_phi = NULL;
7371       latch_e = loop_latch_edge (loop->inner);
7372       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7373       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7374 	{
7375 	  gimple *use_stmt = USE_STMT (use_p);
7376 	  if (is_gimple_debug (use_stmt))
7377 	    continue;
7378 
7379 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7380 	    {
7381 	      exit_phi = use_stmt;
7382 	      break;
7383 	    }
7384 	}
7385       if (exit_phi)
7386 	{
7387 	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7388 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7389 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7390 	    {
7391 	      if (dump_enabled_p ())
7392 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7393 				 "inner-loop induction only used outside "
7394 				 "of the outer vectorized loop.\n");
7395 	      return false;
7396 	    }
7397 	}
7398 
7399       nested_in_vect_loop = true;
7400       iv_loop = loop->inner;
7401     }
7402   else
7403     iv_loop = loop;
7404   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7405 
7406   if (slp_node && !nunits.is_constant ())
7407     {
7408       /* The current SLP code creates the initial value element-by-element.  */
7409       if (dump_enabled_p ())
7410 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7411 			 "SLP induction not supported for variable-length"
7412 			 " vectors.\n");
7413       return false;
7414     }
7415 
7416   if (!vec_stmt) /* transformation not required.  */
7417     {
7418       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7419       DUMP_VECT_SCOPE ("vectorizable_induction");
7420       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7421       return true;
7422     }
7423 
7424   /* Transform.  */
7425 
7426   /* Compute a vector variable, initialized with the first VF values of
7427      the induction variable.  E.g., for an iv with IV_PHI='X' and
7428      evolution S, for a vector of 4 units, we want to compute:
7429      [X, X + S, X + 2*S, X + 3*S].  */
7430 
7431   if (dump_enabled_p ())
7432     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7433 
7434   latch_e = loop_latch_edge (iv_loop);
7435   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7436 
7437   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7438   gcc_assert (step_expr != NULL_TREE);
7439   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7440 
7441   pe = loop_preheader_edge (iv_loop);
7442   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7443 				     loop_preheader_edge (iv_loop));
7444 
7445   stmts = NULL;
7446   if (!nested_in_vect_loop)
7447     {
7448       /* Convert the initial value to the IV update type.  */
7449       tree new_type = TREE_TYPE (step_expr);
7450       init_expr = gimple_convert (&stmts, new_type, init_expr);
7451 
7452       /* If we are using the loop mask to "peel" for alignment then we need
7453 	 to adjust the start value here.  */
7454       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7455       if (skip_niters != NULL_TREE)
7456 	{
7457 	  if (FLOAT_TYPE_P (vectype))
7458 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7459 					skip_niters);
7460 	  else
7461 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7462 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7463 					 skip_niters, step_expr);
7464 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7465 				    init_expr, skip_step);
7466 	}
7467     }
7468 
7469   if (stmts)
7470     {
7471       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7472       gcc_assert (!new_bb);
7473     }
7474 
7475   /* Find the first insertion point in the BB.  */
7476   basic_block bb = gimple_bb (phi);
7477   si = gsi_after_labels (bb);
7478 
7479   /* For SLP induction we have to generate several IVs as for example
7480      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7481      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7482      [VF*S, VF*S, VF*S, VF*S] for all.  */
7483   if (slp_node)
7484     {
7485       /* Enforced above.  */
7486       unsigned int const_nunits = nunits.to_constant ();
7487 
7488       /* Generate [VF*S, VF*S, ... ].  */
7489       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7490 	{
7491 	  expr = build_int_cst (integer_type_node, vf);
7492 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7493 	}
7494       else
7495 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7496       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7497 			      expr, step_expr);
7498       if (! CONSTANT_CLASS_P (new_name))
7499 	new_name = vect_init_vector (stmt_info, new_name,
7500 				     TREE_TYPE (step_expr), NULL);
7501       new_vec = build_vector_from_val (step_vectype, new_name);
7502       vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7503 
7504       /* Now generate the IVs.  */
7505       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7506       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7507       unsigned elts = const_nunits * nvects;
7508       unsigned nivs = least_common_multiple (group_size,
7509 					     const_nunits) / const_nunits;
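      /* E.g., for the group-size-3 example in the comment above with
	 const_nunits 4: nivs = lcm (3, 4) / 4 = 3, matching the three
	 distinct initial vectors shown there.  */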
7510       gcc_assert (elts % group_size == 0);
7511       tree elt = init_expr;
7512       unsigned ivn;
7513       for (ivn = 0; ivn < nivs; ++ivn)
7514 	{
7515 	  tree_vector_builder elts (step_vectype, const_nunits, 1);
7516 	  stmts = NULL;
7517 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7518 	    {
7519 	      if (ivn*const_nunits + eltn >= group_size
7520 		  && (ivn * const_nunits + eltn) % group_size == 0)
7521 		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7522 				    elt, step_expr);
7523 	      elts.quick_push (elt);
7524 	    }
7525 	  vec_init = gimple_build_vector (&stmts, &elts);
7526 	  vec_init = gimple_convert (&stmts, vectype, vec_init);
7527 	  if (stmts)
7528 	    {
7529 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7530 	      gcc_assert (!new_bb);
7531 	    }
7532 
7533 	  /* Create the induction-phi that defines the induction-operand.  */
7534 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7535 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7536 	  stmt_vec_info induction_phi_info
7537 	    = loop_vinfo->add_stmt (induction_phi);
7538 	  induc_def = PHI_RESULT (induction_phi);
7539 
7540 	  /* Create the iv update inside the loop  */
7541 	  gimple_seq stmts = NULL;
7542 	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7543 	  vec_def = gimple_build (&stmts,
7544 				  PLUS_EXPR, step_vectype, vec_def, vec_step);
7545 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
7546 	  loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7547 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7548 
7549 	  /* Set the arguments of the phi node:  */
7550 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7551 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7552 		       UNKNOWN_LOCATION);
7553 
7554 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7555 	}
7556 
7557       /* Re-use IVs when we can.  */
7558       if (ivn < nvects)
7559 	{
7560 	  unsigned vfp
7561 	    = least_common_multiple (group_size, const_nunits) / group_size;
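	  /* Continuing the group-size-3, const_nunits-4 example:
	     vfp = lcm (3, 4) / 3 = 4, so each further IV is the IV created
	     nivs vectors earlier with [4*S, 4*S, 4*S, 4*S] added to it.  */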
7562 	  /* Generate [VF'*S, VF'*S, ... ].  */
7563 	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7564 	    {
7565 	      expr = build_int_cst (integer_type_node, vfp);
7566 	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7567 	    }
7568 	  else
7569 	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7570 	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7571 				  expr, step_expr);
7572 	  if (! CONSTANT_CLASS_P (new_name))
7573 	    new_name = vect_init_vector (stmt_info, new_name,
7574 					 TREE_TYPE (step_expr), NULL);
7575 	  new_vec = build_vector_from_val (step_vectype, new_name);
7576 	  vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7577 	  for (; ivn < nvects; ++ivn)
7578 	    {
7579 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7580 	      tree def;
7581 	      if (gimple_code (iv) == GIMPLE_PHI)
7582 		def = gimple_phi_result (iv);
7583 	      else
7584 		def = gimple_assign_lhs (iv);
7585 	      gimple_seq stmts = NULL;
7586 	      def = gimple_convert (&stmts, step_vectype, def);
7587 	      def = gimple_build (&stmts,
7588 				  PLUS_EXPR, step_vectype, def, vec_step);
7589 	      def = gimple_convert (&stmts, vectype, def);
7590 	      if (gimple_code (iv) == GIMPLE_PHI)
7591 		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7592 	      else
7593 		{
7594 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7595 		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7596 		}
7597 	      SLP_TREE_VEC_STMTS (slp_node).quick_push
7598 		(loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7599 	    }
7600 	}
7601 
7602       return true;
7603     }
7604 
7605   /* Create the vector that holds the initial_value of the induction.  */
7606   if (nested_in_vect_loop)
7607     {
7608       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7609 	 been created during vectorization of previous stmts.  We obtain it
7610 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7611       vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7612       /* If the initial value is not of proper type, convert it.  */
7613       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7614 	{
7615 	  new_stmt
7616 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7617 							  vect_simple_var,
7618 							  "vec_iv_"),
7619 				   VIEW_CONVERT_EXPR,
7620 				   build1 (VIEW_CONVERT_EXPR, vectype,
7621 					   vec_init));
7622 	  vec_init = gimple_assign_lhs (new_stmt);
7623 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7624 						 new_stmt);
7625 	  gcc_assert (!new_bb);
7626 	  loop_vinfo->add_stmt (new_stmt);
7627 	}
7628     }
7629   else
7630     {
7631       /* iv_loop is the loop to be vectorized. Create:
7632 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7633       stmts = NULL;
7634       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7635 
7636       unsigned HOST_WIDE_INT const_nunits;
7637       if (nunits.is_constant (&const_nunits))
7638 	{
7639 	  tree_vector_builder elts (step_vectype, const_nunits, 1);
7640 	  elts.quick_push (new_name);
7641 	  for (i = 1; i < const_nunits; i++)
7642 	    {
7643 	      /* Create: new_name_i = new_name + step_expr  */
7644 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7645 				       new_name, step_expr);
7646 	      elts.quick_push (new_name);
7647 	    }
7648 	  /* Create a vector from [new_name_0, new_name_1, ...,
7649 	     new_name_nunits-1]  */
7650 	  vec_init = gimple_build_vector (&stmts, &elts);
7651 	}
7652       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7653 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7654 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7655 				 new_name, step_expr);
7656       else
7657 	{
7658 	  /* Build:
7659 	        [base, base, base, ...]
7660 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7661 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7662 	  gcc_assert (flag_associative_math);
7663 	  tree index = build_index_vector (step_vectype, 0, 1);
7664 	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7665 							new_name);
7666 	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7667 							step_expr);
7668 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7669 	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7670 				   vec_init, step_vec);
7671 	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7672 				   vec_init, base_vec);
7673 	}
7674       vec_init = gimple_convert (&stmts, vectype, vec_init);
7675 
7676       if (stmts)
7677 	{
7678 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7679 	  gcc_assert (!new_bb);
7680 	}
7681     }
7682 
7683 
7684   /* Create the vector that holds the step of the induction.  */
7685   if (nested_in_vect_loop)
7686     /* iv_loop is nested in the loop to be vectorized. Generate:
7687        vec_step = [S, S, S, S]  */
7688     new_name = step_expr;
7689   else
7690     {
7691       /* iv_loop is the loop to be vectorized. Generate:
7692 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7693       gimple_seq seq = NULL;
7694       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7695 	{
7696 	  expr = build_int_cst (integer_type_node, vf);
7697 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7698 	}
7699       else
7700 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7701       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7702 			       expr, step_expr);
7703       if (seq)
7704 	{
7705 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7706 	  gcc_assert (!new_bb);
7707 	}
7708     }
7709 
7710   t = unshare_expr (new_name);
7711   gcc_assert (CONSTANT_CLASS_P (new_name)
7712 	      || TREE_CODE (new_name) == SSA_NAME);
7713   new_vec = build_vector_from_val (step_vectype, t);
7714   vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7715 
7716 
7717   /* Create the following def-use cycle:
7718      loop prolog:
7719          vec_init = ...
7720 	 vec_step = ...
7721      loop:
7722          vec_iv = PHI <vec_init, vec_loop>
7723          ...
7724          STMT
7725          ...
7726          vec_loop = vec_iv + vec_step;  */
7727 
7728   /* Create the induction-phi that defines the induction-operand.  */
7729   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7730   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7731   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7732   induc_def = PHI_RESULT (induction_phi);
7733 
7734   /* Create the iv update inside the loop  */
7735   stmts = NULL;
7736   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7737   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7738   vec_def = gimple_convert (&stmts, vectype, vec_def);
7739   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7740   new_stmt = SSA_NAME_DEF_STMT (vec_def);
7741   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7742 
7743   /* Set the arguments of the phi node:  */
7744   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7745   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7746 	       UNKNOWN_LOCATION);
7747 
7748   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7749 
7750   /* If the vectorization factor (VF) is bigger than the number of
7751      elements that we can fit in a vectype (nunits), we have to generate
7752      more than one vector stmt, i.e. we need to "unroll" the vector stmt by
7753      a factor of VF/nunits.  For more details see the documentation in
7754      vectorizable_operation.  */
7755 
7756   if (ncopies > 1)
7757     {
7758       gimple_seq seq = NULL;
7759       stmt_vec_info prev_stmt_vinfo;
7760       /* FORNOW. This restriction should be relaxed.  */
7761       gcc_assert (!nested_in_vect_loop);
7762 
7763       /* Create the vector that holds the step of the induction.  */
7764       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7765 	{
7766 	  expr = build_int_cst (integer_type_node, nunits);
7767 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7768 	}
7769       else
7770 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7771       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7772 			       expr, step_expr);
7773       if (seq)
7774 	{
7775 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7776 	  gcc_assert (!new_bb);
7777 	}
7778 
7779       t = unshare_expr (new_name);
7780       gcc_assert (CONSTANT_CLASS_P (new_name)
7781 		  || TREE_CODE (new_name) == SSA_NAME);
7782       new_vec = build_vector_from_val (step_vectype, t);
7783       vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7784 
7785       vec_def = induc_def;
7786       prev_stmt_vinfo = induction_phi_info;
7787       for (i = 1; i < ncopies; i++)
7788 	{
7789 	  /* vec_i = vec_prev + vec_step  */
7790 	  gimple_seq stmts = NULL;
7791 	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7792 	  vec_def = gimple_build (&stmts,
7793 				  PLUS_EXPR, step_vectype, vec_def, vec_step);
7794 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
7795 
7796 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7797 	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
7798 	  new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7799 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7800 	  prev_stmt_vinfo = new_stmt_info;
7801 	}
7802     }
7803 
7804   if (nested_in_vect_loop)
7805     {
7806       /* Find the loop-closed exit-phi of the induction, and record
7807          the final vector of induction results:  */
7808       exit_phi = NULL;
7809       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7810         {
7811 	  gimple *use_stmt = USE_STMT (use_p);
7812 	  if (is_gimple_debug (use_stmt))
7813 	    continue;
7814 
7815 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7816 	    {
7817 	      exit_phi = use_stmt;
7818 	      break;
7819 	    }
7820         }
7821       if (exit_phi)
7822 	{
7823 	  stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7824 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
7825 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
7826 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7827 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
7828 
7829 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7830 	  if (dump_enabled_p ())
7831 	    dump_printf_loc (MSG_NOTE, vect_location,
7832 			     "vector of inductions after inner-loop:%G",
7833 			     new_stmt);
7834 	}
7835     }
7836 
7837 
7838   if (dump_enabled_p ())
7839     dump_printf_loc (MSG_NOTE, vect_location,
7840 		     "transform induction: created def-use cycle: %G%G",
7841 		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
7842 
7843   return true;
7844 }
7845 
7846 /* Function vectorizable_live_operation.
7847 
7848    STMT_INFO computes a value that is used outside the loop.  Check if
7849    it can be supported.  */
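/* For instance (one illustrative case, not the only one handled here):

       for (i = 0; i < n; i++)
	 last = a[i];
       ... = use (last);

   the value of 'last' after the loop is live.  In the vectorized loop it is
   obtained by extracting the final lane of the last vector copy, or via the
   EXTRACT_LAST internal function with the loop mask when the loop is fully
   masked.  */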
7850 
7851 bool
7852 vectorizable_live_operation (stmt_vec_info stmt_info,
7853 			     gimple_stmt_iterator *gsi,
7854 			     slp_tree slp_node, slp_instance slp_node_instance,
7855 			     int slp_index, bool vec_stmt_p,
7856 			     stmt_vector_for_cost *)
7857 {
7858   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7859   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7860   imm_use_iterator imm_iter;
7861   tree lhs, lhs_type, bitsize, vec_bitsize;
7862   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7863   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7864   int ncopies;
7865   gimple *use_stmt;
7866   auto_vec<tree> vec_oprnds;
7867   int vec_entry = 0;
7868   poly_uint64 vec_index = 0;
7869 
7870   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7871 
7872   /* If a stmt of a reduction is live, vectorize it via
7873      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
7874      validity so just trigger the transform here.  */
7875   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7876     {
7877       if (!vec_stmt_p)
7878 	return true;
7879       if (slp_node)
7880 	{
7881 	  /* For reduction chains the meta-info is attached to
7882 	     the group leader.  */
7883 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7884 	    stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7885 	  /* For SLP reductions we vectorize the epilogue for
7886 	     all involved stmts together.  */
7887 	  else if (slp_index != 0)
7888 	    return true;
7889 	}
7890       stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7891       gcc_assert (reduc_info->is_reduc_info);
7892       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7893 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7894 	return true;
7895       vect_create_epilog_for_reduction (stmt_info, slp_node,
7896 					slp_node_instance);
7897       return true;
7898     }
7899 
7900   /* FORNOW.  CHECKME.  */
7901   if (nested_in_vect_loop_p (loop, stmt_info))
7902     return false;
7903 
7904   /* If STMT is not relevant and it is a simple assignment and its inputs are
7905      invariant then it can remain in place, unvectorized.  The original last
7906      scalar value that it computes will be used.  */
7907   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7908     {
7909       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7910       if (dump_enabled_p ())
7911 	dump_printf_loc (MSG_NOTE, vect_location,
7912 			 "statement is simple and uses invariant.  Leaving in "
7913 			 "place.\n");
7914       return true;
7915     }
7916 
7917   if (slp_node)
7918     ncopies = 1;
7919   else
7920     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7921 
7922   if (slp_node)
7923     {
7924       gcc_assert (slp_index >= 0);
7925 
7926       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7927       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7928 
7929       /* Get the last occurrence of the scalar index from the concatenation of
7930 	 all the slp vectors. Calculate which slp vector it is and the index
7931 	 within.  */
7932       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7933 
7934       /* Calculate which vector contains the result, and which lane of
7935 	 that vector we need.  */
7936       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7937 	{
7938 	  if (dump_enabled_p ())
7939 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7940 			     "Cannot determine which vector holds the"
7941 			     " final result.\n");
7942 	  return false;
7943 	}
7944     }
7945 
7946   if (!vec_stmt_p)
7947     {
7948       /* No transformation required.  */
7949       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7950 	{
7951 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7952 					       OPTIMIZE_FOR_SPEED))
7953 	    {
7954 	      if (dump_enabled_p ())
7955 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7956 				 "can't use a fully-masked loop because "
7957 				 "the target doesn't support extract last "
7958 				 "reduction.\n");
7959 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7960 	    }
7961 	  else if (slp_node)
7962 	    {
7963 	      if (dump_enabled_p ())
7964 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 				 "can't use a fully-masked loop because an "
7966 				 "SLP statement is live after the loop.\n");
7967 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7968 	    }
7969 	  else if (ncopies > 1)
7970 	    {
7971 	      if (dump_enabled_p ())
7972 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7973 				 "can't use a fully-masked loop because"
7974 				 " ncopies is greater than 1.\n");
7975 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7976 	    }
7977 	  else
7978 	    {
7979 	      gcc_assert (ncopies == 1 && !slp_node);
7980 	      vect_record_loop_mask (loop_vinfo,
7981 				     &LOOP_VINFO_MASKS (loop_vinfo),
7982 				     1, vectype, NULL);
7983 	    }
7984 	}
7985       return true;
7986     }
7987 
7988   /* Use the lhs of the original scalar statement.  */
7989   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7990 
7991   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7992 	: gimple_get_lhs (stmt);
7993   lhs_type = TREE_TYPE (lhs);
7994 
7995   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7996 	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7997 	     : TYPE_SIZE (TREE_TYPE (vectype)));
7998   vec_bitsize = TYPE_SIZE (vectype);
7999 
8000   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8001   tree vec_lhs, bitstart;
8002   if (slp_node)
8003     {
8004       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8005 
8006       /* Get the correct slp vectorized stmt.  */
8007       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8008       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8009 	vec_lhs = gimple_phi_result (phi);
8010       else
8011 	vec_lhs = gimple_get_lhs (vec_stmt);
8012 
8013       /* Get entry to use.  */
8014       bitstart = bitsize_int (vec_index);
8015       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8016     }
8017   else
8018     {
8019       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8020       vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8021       gcc_checking_assert (ncopies == 1
8022 			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8023 
8024       /* For multiple copies, get the last copy.  */
8025       for (int i = 1; i < ncopies; ++i)
8026 	vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8027 
8028       /* Get the last lane in the vector.  */
8029       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8030     }
8031 
8032   /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8033      loop-closed PHI requirement, insert one PHI node for it.  It looks like:
8034 	 loop;
8035        BB:
8036 	 # lhs' = PHI <lhs>
8037      ==>
8038 	 loop;
8039        BB:
8040 	 # vec_lhs' = PHI <vec_lhs>
8041 	 new_tree = lane_extract <vec_lhs', ...>;
8042 	 lhs' = new_tree;  */
8043 
8044   basic_block exit_bb = single_exit (loop)->dest;
8045   gcc_assert (single_pred_p (exit_bb));
8046 
8047   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8048   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8049   SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8050 
8051   gimple_seq stmts = NULL;
8052   tree new_tree;
8053   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8054     {
8055       /* Emit:
8056 
8057 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8058 
8059 	 where VEC_LHS is the vectorized live-out result and MASK is
8060 	 the loop mask for the final iteration.  */
8061       gcc_assert (ncopies == 1 && !slp_node);
8062       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8063       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8064 				      vectype, 0);
8065       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8066 				      mask, vec_lhs_phi);
8067 
8068       /* Convert the extracted vector element to the required scalar type.  */
8069       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8070     }
8071   else
8072     {
8073       tree bftype = TREE_TYPE (vectype);
8074       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8075 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8076       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8077       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8078 				       &stmts, true, NULL_TREE);
8079     }
8080 
8081   if (stmts)
8082     {
8083       gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8084       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8085 
8086       /* Remove existing phi from lhs and create one copy from new_tree.  */
8087       tree lhs_phi = NULL_TREE;
8088       gimple_stmt_iterator gsi;
8089       for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8090 	{
8091 	  gimple *phi = gsi_stmt (gsi);
8092 	  if ((gimple_phi_arg_def (phi, 0) == lhs))
8093 	    {
8094 	      remove_phi_node (&gsi, false);
8095 	      lhs_phi = gimple_phi_result (phi);
8096 	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8097 	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8098 	      break;
8099 	    }
8100 	}
8101     }
8102 
8103   /* Replace uses of lhs with the newly computed result.  If the use stmt is
8104      a single-arg PHI, just replace all uses of the PHI result.  This is needed
8105      because the LC SSA PHI defining lhs may appear before the new stmt.  */
8106   use_operand_p use_p;
8107   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8108     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8109 	&& !is_gimple_debug (use_stmt))
8110     {
8111       if (gimple_code (use_stmt) == GIMPLE_PHI
8112 	  && gimple_phi_num_args (use_stmt) == 1)
8113 	{
8114 	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8115 	}
8116       else
8117 	{
8118 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8119 	    SET_USE (use_p, new_tree);
8120 	}
8121       update_stmt (use_stmt);
8122     }
8123 
8124   return true;
8125 }
8126 
8127 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8128 
8129 static void
8130 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8131 {
8132   ssa_op_iter op_iter;
8133   imm_use_iterator imm_iter;
8134   def_operand_p def_p;
8135   gimple *ustmt;
8136 
8137   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8138     {
8139       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8140 	{
8141 	  basic_block bb;
8142 
8143 	  if (!is_gimple_debug (ustmt))
8144 	    continue;
8145 
8146 	  bb = gimple_bb (ustmt);
8147 
8148 	  if (!flow_bb_inside_loop_p (loop, bb))
8149 	    {
8150 	      if (gimple_debug_bind_p (ustmt))
8151 		{
8152 		  if (dump_enabled_p ())
8153 		    dump_printf_loc (MSG_NOTE, vect_location,
8154                                      "killing debug use\n");
8155 
8156 		  gimple_debug_bind_reset_value (ustmt);
8157 		  update_stmt (ustmt);
8158 		}
8159 	      else
8160 		gcc_unreachable ();
8161 	    }
8162 	}
8163     }
8164 }
8165 
8166 /* Given loop represented by LOOP_VINFO, return true if computation of
8167    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8168    otherwise.  */
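/* For example, if NITERSM1 happens to be the maximum value of its type, then
   NITERSM1 + 1 wraps around to zero, the computation of NITERS overflows,
   and this function returns false.  */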
8169 
8170 static bool
8171 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8172 {
8173   /* Constant case.  */
8174   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8175     {
8176       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8177       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8178 
8179       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8180       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8181       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8182 	return true;
8183     }
8184 
8185   widest_int max;
8186   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8187   /* Check the upper bound of loop niters.  */
8188   if (get_max_loop_iterations (loop, &max))
8189     {
8190       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8191       signop sgn = TYPE_SIGN (type);
8192       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8193       if (max < type_max)
8194 	return true;
8195     }
8196   return false;
8197 }
8198 
8199 /* Return a mask type with half the number of elements as OLD_TYPE,
8200    given that it should have mode NEW_MODE.  */
8201 
8202 tree
8203 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8204 {
8205   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8206   return build_truth_vector_type_for_mode (nunits, new_mode);
8207 }
8208 
8209 /* Return a mask type with twice as many elements as OLD_TYPE,
8210    given that it should have mode NEW_MODE.  */
8211 
8212 tree
8213 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8214 {
8215   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8216   return build_truth_vector_type_for_mode (nunits, new_mode);
8217 }
8218 
8219 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8220    contain a sequence of NVECTORS masks that each control a vector of type
8221    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
8222    these vector masks with the vector version of SCALAR_MASK.  */
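/* A worked instance of the arithmetic below (hypothetical numbers): with
   LOOP_VINFO_VECT_FACTOR 8, NVECTORS 2 and a 4-element VECTYPE, the rgroup
   handles 2 * 4 / 8 = 1 scalar per iteration, so its max_nscalars_per_iter
   is at least 1.  */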
8223 
8224 void
8225 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8226 		       unsigned int nvectors, tree vectype, tree scalar_mask)
8227 {
8228   gcc_assert (nvectors != 0);
8229   if (masks->length () < nvectors)
8230     masks->safe_grow_cleared (nvectors);
8231   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8232   /* The number of scalars per iteration and the number of vectors are
8233      both compile-time constants.  */
8234   unsigned int nscalars_per_iter
8235     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8236 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8237 
8238   if (scalar_mask)
8239     {
8240       scalar_cond_masked_key cond (scalar_mask, nvectors);
8241       loop_vinfo->scalar_cond_masked_set.add (cond);
8242     }
8243 
8244   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8245     {
8246       rgm->max_nscalars_per_iter = nscalars_per_iter;
8247       rgm->mask_type = truth_type_for (vectype);
8248     }
8249 }
8250 
8251 /* Given a complete set of masks MASKS, extract mask number INDEX
8252    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8253    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8254 
8255    See the comment above vec_loop_masks for more details about the mask
8256    arrangement.  */
8257 
8258 tree
8259 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8260 		    unsigned int nvectors, tree vectype, unsigned int index)
8261 {
8262   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8263   tree mask_type = rgm->mask_type;
8264 
8265   /* Populate the rgroup's mask array, if this is the first time we've
8266      used it.  */
8267   if (rgm->masks.is_empty ())
8268     {
8269       rgm->masks.safe_grow_cleared (nvectors);
8270       for (unsigned int i = 0; i < nvectors; ++i)
8271 	{
8272 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8273 	  /* Provide a dummy definition until the real one is available.  */
8274 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8275 	  rgm->masks[i] = mask;
8276 	}
8277     }
8278 
8279   tree mask = rgm->masks[index];
8280   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8281 		TYPE_VECTOR_SUBPARTS (vectype)))
8282     {
8283       /* A loop mask for data type X can be reused for data type Y
8284 	 if X has N times more elements than Y and if Y's elements
8285 	 are N times bigger than X's.  In this case each sequence
8286 	 of N elements in the loop mask will be all-zero or all-one.
8287 	 We can then view-convert the mask so that each sequence of
8288 	 N elements is replaced by a single element.  */
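      /* E.g. (an illustrative pairing; the actual types are target
	 dependent), a mask computed for eight 16-bit elements can be reused
	 for four 32-bit elements: each pair of mask elements is known to be
	 all-zero or all-one and is view-converted into one wider element.  */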
8289       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8290 			      TYPE_VECTOR_SUBPARTS (vectype)));
8291       gimple_seq seq = NULL;
8292       mask_type = truth_type_for (vectype);
8293       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8294       if (seq)
8295 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8296     }
8297   return mask;
8298 }
8299 
8300 /* Scale profiling counters by estimation for LOOP which is vectorized
8301    by factor VF.  */
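/* For instance (ignoring rounding in niter_for_unrolled_loop), vectorizing
   by VF = 4 a loop whose profile estimates about 100 iterations leaves the
   vector loop with roughly 25 estimated iterations, and the body counts and
   exit probabilities are scaled accordingly below.  */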
8302 
8303 static void
8304 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8305 {
8306   edge preheader = loop_preheader_edge (loop);
8307   /* Reduce loop iterations by the vectorization factor.  */
8308   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8309   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8310 
8311   if (freq_h.nonzero_p ())
8312     {
8313       profile_probability p;
8314 
8315       /* Avoid dropping loop body profile counter to 0 because of zero count
8316 	 in loop's preheader.  */
8317       if (!(freq_e == profile_count::zero ()))
8318         freq_e = freq_e.force_nonzero ();
8319       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8320       scale_loop_frequencies (loop, p);
8321     }
8322 
8323   edge exit_e = single_exit (loop);
8324   exit_e->probability = profile_probability::always ()
8325 				 .apply_scale (1, new_est_niter + 1);
8326 
8327   edge exit_l = single_pred_edge (loop->latch);
8328   profile_probability prob = exit_l->probability;
8329   exit_l->probability = exit_e->probability.invert ();
8330   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8331     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8332 }
8333 
8334 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8335    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8336    stmt_vec_info.  */
8337 
8338 static void
8339 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8340 			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8341 {
8342   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8343   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8344 
8345   if (dump_enabled_p ())
8346     dump_printf_loc (MSG_NOTE, vect_location,
8347 		     "------>vectorizing statement: %G", stmt_info->stmt);
8348 
8349   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8350     vect_loop_kill_debug_uses (loop, stmt_info);
8351 
8352   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8353       && !STMT_VINFO_LIVE_P (stmt_info))
8354     return;
8355 
8356   if (STMT_VINFO_VECTYPE (stmt_info))
8357     {
8358       poly_uint64 nunits
8359 	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8360       if (!STMT_SLP_TYPE (stmt_info)
8361 	  && maybe_ne (nunits, vf)
8362 	  && dump_enabled_p ())
8363 	/* For SLP VF is set according to unrolling factor, and not
8364 	   to vector size, hence for SLP this print is not valid.  */
8365 	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8366     }
8367 
8368   /* Pure SLP statements have already been vectorized.  We still need
8369      to apply loop vectorization to hybrid SLP statements.  */
8370   if (PURE_SLP_STMT (stmt_info))
8371     return;
8372 
8373   if (dump_enabled_p ())
8374     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8375 
8376   if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8377     *seen_store = stmt_info;
8378 }
8379 
8380 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8381    in the hash_map with their corresponding values.  */
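/* For example (hypothetical SSA names), if a statement in the original loop
   defines _12 and its copy in the epilogue defines _45, the mapping contains
   _12 -> _45 and simplify_replace_tree, via this callback, rewrites uses of
   _12 into _45; names not found in the mapping are returned unchanged.  */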
8382 
8383 static tree
8384 find_in_mapping (tree t, void *context)
8385 {
8386   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8387 
8388   tree *value = mapping->get (t);
8389   return value ? *value : t;
8390 }
8391 
8392 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
8393    original loop that has now been vectorized.
8394 
8395    The inits of the data_references need to be advanced with the number of
8396    iterations of the main loop.  This has been computed in vect_do_peeling and
8397    is stored in parameter ADVANCE.  We first restore the data_references
8398    initial offset with the values recorded in ORIG_DRS_INIT.
8399 
8400    Since the loop_vec_info of this EPILOGUE was constructed for the original
8401    loop, its stmt_vec_infos all point to the original statements.  These need
8402    to be updated to point to their corresponding copies as well as the SSA_NAMES
8403    in their PATTERN_DEF_SEQs and RELATED_STMTs.
8404 
8405    The data_references' connections also need to be updated.  Their
8406    corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8407    stmt_vec_infos, and their statements need to point to their corresponding
8408    copies.  If they are gather loads or scatter stores, their references also
8409    need to be updated to point to the corresponding copies.  Finally we set
8410    'base_misaligned' to false, as we have already peeled for alignment in the
8411    prologue of the main loop.  */
8412 
8413 static void
8414 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8415 {
8416   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8417   auto_vec<gimple *> stmt_worklist;
8418   hash_map<tree,tree> mapping;
8419   gimple *orig_stmt, *new_stmt;
8420   gimple_stmt_iterator epilogue_gsi;
8421   gphi_iterator epilogue_phi_gsi;
8422   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8423   basic_block *epilogue_bbs = get_loop_body (epilogue);
8424   unsigned i;
8425 
8426   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8427 
8428   /* Advance data_reference's with the number of iterations of the previous
8429      loop and its prologue.  */
8430   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8431 
8432 
8433   /* The EPILOGUE loop is a copy of the original loop so they share the same
8434      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
8435      point to the copied statements.  We also create a mapping from each LHS in
8436      the original loop to the corresponding LHS in the EPILOGUE, and worklists
8437      to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
8438   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8439     {
8440       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8441 	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8442 	{
8443 	  new_stmt = epilogue_phi_gsi.phi ();
8444 
8445 	  gcc_assert (gimple_uid (new_stmt) > 0);
8446 	  stmt_vinfo
8447 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8448 
8449 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8450 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8451 
8452 	  mapping.put (gimple_phi_result (orig_stmt),
8453 		       gimple_phi_result (new_stmt));
8454 	  /* PHI nodes cannot have patterns or related statements.  */
8455 	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8456 		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8457 	}
8458 
8459       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8460 	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8461 	{
8462 	  new_stmt = gsi_stmt (epilogue_gsi);
8463 
8464 	  gcc_assert (gimple_uid (new_stmt) > 0);
8465 	  stmt_vinfo
8466 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8467 
8468 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8469 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8470 
8471 	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
8472 	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8473 
8474 	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8475 	    {
8476 	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8477 	      for (gimple_stmt_iterator gsi = gsi_start (seq);
8478 		   !gsi_end_p (gsi); gsi_next (&gsi))
8479 		stmt_worklist.safe_push (gsi_stmt (gsi));
8480 	    }
8481 
8482 	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8483 	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8484 	    {
8485 	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8486 	      stmt_worklist.safe_push (stmt);
8487 	      /* Set BB such that the assert in
8488 		'get_initial_def_for_reduction' is able to determine that
8489 		the BB of the related stmt is inside this loop.  */
8490 	      gimple_set_bb (stmt,
8491 			     gimple_bb (new_stmt));
8492 	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8493 	      gcc_assert (related_vinfo == NULL
8494 			  || related_vinfo == stmt_vinfo);
8495 	    }
8496 	}
8497     }
8498 
8499   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8500      using the original main loop and thus need to be updated to refer to the
8501      cloned variables used in the epilogue.  */
8502   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8503     {
8504       gimple *stmt = stmt_worklist[i];
8505       tree *new_op;
8506 
8507       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8508 	{
8509 	  tree op = gimple_op (stmt, j);
8510 	  if ((new_op = mapping.get(op)))
8511 	    gimple_set_op (stmt, j, *new_op);
8512 	  else
8513 	    {
8514 	      /* PR92429: The last argument of simplify_replace_tree disables
8515 		 folding when replacing arguments.  This is required as
8516 		 otherwise you might end up with different statements than the
8517 		 ones analyzed in vect_loop_analyze, leading to different
8518 		 vectorization.  */
8519 	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8520 					  &find_in_mapping, &mapping, false);
8521 	      gimple_set_op (stmt, j, op);
8522 	    }
8523 	}
8524     }

  struct data_reference *dr;
  vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure
	 the reference in each data reference points to the corresponding
	 copy of the original in the epilogue.  */
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
	  == VMAT_GATHER_SCATTER)
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
      /* The vector size of the epilogue is smaller than that of the main
	 loop, so its alignment requirement is the same or lower.  This means
	 the DR will by definition be aligned.  */
      STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
    }

  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}

/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.
   Returns the scalar epilogue loop if any.  */

class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Profitability threshold is %d loop iterations.\n",
			 th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.  */
  edge e = single_exit (loop);
  if (! single_pred_p (e->dest))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so the CFG structure is fine for both the scalar and the if-converted
     loop and slpeel_duplicate_current_defs_from_edges sees matching
     loop-closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
      if (! single_pred_p (e->dest))
	{
	  split_loop_exit_edge (e, true);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
	}
    }

  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  tree advance;
  drs_init_vec orig_drs_init;

  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
			      &step_vector, &niters_vector_mult_vf, th,
			      check_profitability, niters_no_overflow,
			      &advance);

  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));

  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	  && known_eq (lowest_vf, vf))
	{
	  niters_vector
	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
	  step_vector = build_one_cst (TREE_TYPE (niters));
	}
      else
	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
				     &step_vector, niters_no_overflow);
    }

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo);
    }

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed needs to
     be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
        {
	  gphi *phi = si.phi ();
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "------>vectorizing phi: %G", phi);
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
	    vect_loop_kill_debug_uses (loop, stmt_info);

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if (STMT_VINFO_VECTYPE (stmt_info)
	      && (maybe_ne
		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
	      vect_transform_stmt (stmt_info, NULL, NULL, NULL);
	    }
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si);)
	{
	  stmt = gsi_stmt (si);
	  /* During vectorization remove existing clobber stmts.  */
	  if (gimple_clobber_p (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&si, true);
	      release_defs (stmt);
	    }
	  else
	    {
	      stmt_info = loop_vinfo->lookup_stmt (stmt);

	      /* Vector stmts created in the outer loop during vectorization
		 of stmts in an inner loop may not have a stmt_info and do
		 not need to be vectorized.  */
	      stmt_vec_info seen_store = NULL;
	      if (stmt_info)
		{
		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
		    {
		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
			   !gsi_end_p (subsi); gsi_next (&subsi))
			{
			  stmt_vec_info pat_stmt_info
			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store);
			}
		      stmt_vec_info pat_stmt_info
			= STMT_VINFO_RELATED_STMT (stmt_info);
		      vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
						&seen_store);
		    }
		  vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
					    &seen_store);
		}
	      gsi_next (&si);
	      if (seen_store)
		{
		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
		    /* Interleaving.  The vectorization of the interleaving
		       chain was completed - free all the stores in the
		       chain.  */
		    vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
		  else
		    /* Free the attached stmt_vec_info and remove the stmt.  */
		    loop_vinfo->remove_stmt (stmt_info);
		}
	    }
	}

      /* Stub out scalar statements that must not survive vectorization.
	 Doing this here helps with grouped statements, or statements that
	 are involved in patterns.  */
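      /* A sketch of what the loop below does, with hypothetical operands:
	 a left-over call whose result is not a vector, such as

	   _1 = .MASK_LOAD (ptr_2, 32B, mask_3);

	 is replaced by

	   _1 = 0;

	 since its vectorized form has already been emitted; the stub is
	 expected to be removed by later cleanup passes.  */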
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
	  if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree zero = build_zero_cst (TREE_TYPE (lhs));
		  gimple *new_stmt = gimple_build_assign (lhs, zero);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	}
    }				/* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment
     of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
			   niters_vector_mult_vf, !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  /* +1 to convert latch counts to loop iteration counts,
     -min_epilogue_iters to remove iterations that cannot be performed
       by the vector code.  */
  int bias_for_lowest = 1 - min_epilogue_iters;
  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
	 iteration will have exactly alignment_npeels active elements.
	 In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
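  /* A worked example with hypothetical numbers: for a loop that is not
     fully masked and does not peel for gaps, min_epilogue_iters is 0 and
     bias_for_lowest is 1.  An upper bound of 17 latch iterations (18
     scalar iterations) with lowest_vf == 4 then gives
       udiv_floor (17 + 1, 4) - 1 == 3
     latch iterations, i.e. 4 iterations of the vector loop.  */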
  if (loop->any_upper_bound)
    loop->nb_iterations_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
			  lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
			   lowest_vf) - 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
			  + bias_for_lowest, lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
			   + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
			  assumed_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
			   assumed_vf) - 1);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance, true);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is invalid after vectorization:
     the vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}

/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero, neither the store
   nor, where possible, the producers of the stored values are executed.
   For example,
     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] + 2;
	}
   this transformation will produce the following semi-hammock:

   if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/

void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different from LOOP when a
	 two-level loop nest is vectorized and the mask_store belongs to
	 the inner one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM_3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 a volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
   return the value that the induction variable must be able to hold
   in order to ensure that the loop ends with an all-false mask.
   Return -1 otherwise.  */
widest_int
vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
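      /* For instance (hypothetical numbers): with a constant VF of 4,
	 MAX_VF == 4 and an IV_LIMIT of 17 this computes
	 (17 & -4) + 4 == 20.  */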
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}
