1 /* Loop Vectorization
2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 
58 #define vec_step vec_step_
59 
60 /* Loop Vectorization Pass.
61 
62    This pass tries to vectorize loops.
63 
64    For example, the vectorizer transforms the following simple loop:
65 
66         short a[N]; short b[N]; short c[N]; int i;
67 
68         for (i=0; i<N; i++){
69           a[i] = b[i] + c[i];
70         }
71 
72    as if it was manually vectorized by rewriting the source code into:
73 
74         typedef int __attribute__((mode(V8HI))) v8hi;
75         short a[N];  short b[N]; short c[N];   int i;
76         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77         v8hi va, vb, vc;
78 
79         for (i=0; i<N/8; i++){
80           vb = pb[i];
81           vc = pc[i];
82           va = vb + vc;
83           pa[i] = va;
84         }
85 
86         The main entry to this pass is vectorize_loops(), in which
87    the vectorizer applies a set of analyses on a given set of loops,
88    followed by the actual vectorization transformation for the loops that
89    had successfully passed the analysis phase.
90         Throughout this pass we make a distinction between two types of
91    data: scalars (which are represented by SSA_NAMES), and memory references
92    ("data-refs").  These two types of data require different handling both
93    during analysis and transformation. The types of data-refs that the
94    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96    accesses are required to have a simple (consecutive) access pattern.
97 
98    Analysis phase:
99    ===============
100         The driver for the analysis phase is vect_analyze_loop().
101    It applies a set of analyses, some of which rely on the scalar evolution
102    analyzer (scev) developed by Sebastian Pop.
103 
104         During the analysis phase the vectorizer records some information
105    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106    loop, as well as general information about the loop as a whole, which is
107    recorded in a "loop_vec_info" struct attached to each loop.
108 
109    Transformation phase:
110    =====================
111         The loop transformation phase scans all the stmts in the loop, and
112    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113    the loop that needs to be vectorized.  It inserts the vector code sequence
114    just before the scalar stmt S, and records a pointer to the vector code
115    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116    attached to S).  This pointer will be used for the vectorization of following
117    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118    otherwise, we rely on dead code elimination for removing it.
119 
120         For example, say stmt S1 was vectorized into stmt VS1:
121 
122    VS1: vb = px[i];
123    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124    S2:  a = b;
125 
126    To vectorize stmt S2, the vectorizer first finds the stmt that defines
127    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
129    resulting sequence would be:
130 
131    VS1: vb = px[i];
132    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133    VS2: va = vb;
134    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 
136         Operands that are not SSA_NAMEs are data-refs that appear in
137    load/store operations (like 'x[i]' in S1), and are handled differently.
138 
139    Target modeling:
140    =================
141         Currently the only target-specific information that is used is the
142    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
143    Targets that can support different sizes of vectors will, for now, need
144    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
145    flexibility will be added in the future.
146 
147         Since we only vectorize operations whose vector form can be
148    expressed using existing tree codes, to verify that an operation is
149    supported, the vectorizer checks the relevant optab at the relevant
150    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
151    the value found is CODE_FOR_nothing, then there's no target support, and
152    we can't vectorize the stmt.
153 
154    For additional information on this project see:
155    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
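/* Illustrative sketch only (not part of the pass itself; the real checks
   are done per statement by the vectorizable_* routines): asking whether
   the target can add two V8HI vectors amounts to

	optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

   and a CODE_FOR_nothing result means the statement cannot be vectorized
   with that vector mode.  */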
157 
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159 
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162    may already be set for general statements (not just data refs).  */
163 
164 static opt_result
165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
166 			      bool vectype_maybe_set_p,
167 			      poly_uint64 *vf,
168 			      vec<stmt_vec_info > *mask_producers)
169 {
170   gimple *stmt = stmt_info->stmt;
171 
172   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173        && !STMT_VINFO_LIVE_P (stmt_info))
174       || gimple_clobber_p (stmt))
175     {
176       if (dump_enabled_p ())
177 	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178       return opt_result::success ();
179     }
180 
181   tree stmt_vectype, nunits_vectype;
182   opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 						   &nunits_vectype);
184   if (!res)
185     return res;
186 
187   if (stmt_vectype)
188     {
189       if (STMT_VINFO_VECTYPE (stmt_info))
190 	/* The only case when a vectype has already been set is for stmts
191 	   that contain a data ref, or for "pattern-stmts" (stmts generated
192 	   by the vectorizer to represent/replace a certain idiom).  */
193 	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 		     || vectype_maybe_set_p)
195 		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196       else if (stmt_vectype == boolean_type_node)
197 	mask_producers->safe_push (stmt_info);
198       else
199 	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200     }
201 
202   if (nunits_vectype)
203     vect_update_max_nunits (vf, nunits_vectype);
204 
205   return opt_result::success ();
206 }
207 
208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
209    types of STMT_INFO and all attached pattern statements and update
210    the vectorization factor VF accordingly.  If some of the statements
211    produce a mask result whose vector type can only be calculated later,
212    add them to MASK_PRODUCERS.  Return true on success or false if
213    something prevented vectorization.  */
214 
215 static opt_result
216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
217 			    vec<stmt_vec_info > *mask_producers)
218 {
219   vec_info *vinfo = stmt_info->vinfo;
220   if (dump_enabled_p ())
221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 		     stmt_info->stmt);
223   opt_result res
224     = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
225   if (!res)
226     return res;
227 
228   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229       && STMT_VINFO_RELATED_STMT (stmt_info))
230     {
231       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 
234       /* If a pattern statement has def stmts, analyze them too.  */
235       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 	   !gsi_end_p (si); gsi_next (&si))
237 	{
238 	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 	  if (dump_enabled_p ())
240 	    dump_printf_loc (MSG_NOTE, vect_location,
241 			     "==> examining pattern def stmt: %G",
242 			     def_stmt_info->stmt);
245 	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
246 					      vf, mask_producers);
247 	  if (!res)
248 	    return res;
249 	}
250 
251       if (dump_enabled_p ())
252 	dump_printf_loc (MSG_NOTE, vect_location,
253 			 "==> examining pattern statement: %G",
254 			 stmt_info->stmt);
255       res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
256       if (!res)
257 	return res;
258     }
259 
260   return opt_result::success ();
261 }
262 
263 /* Function vect_determine_vectorization_factor
264 
265    Determine the vectorization factor (VF).  VF is the number of data elements
266    that are operated upon in parallel in a single iteration of the vectorized
267    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
268    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
269    elements can fit in a single vector register.
270 
271    We currently support vectorization of loops in which all types operated upon
272    are of the same size.  Therefore this function currently sets VF according to
273    the size of the types operated upon, and fails if there are multiple sizes
274    in the loop.
275 
276    VF is also the factor by which the loop iterations are strip-mined, e.g.:
277    original loop:
278         for (i=0; i<N; i++){
279           a[i] = b[i] + c[i];
280         }
281 
282    vectorized loop:
283         for (i=0; i<N; i+=VF){
284           a[i:VF] = b[i:VF] + c[i:VF];
285         }
286 */
287 
288 static opt_result
289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
290 {
291   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
292   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
293   unsigned nbbs = loop->num_nodes;
294   poly_uint64 vectorization_factor = 1;
295   tree scalar_type = NULL_TREE;
296   gphi *phi;
297   tree vectype;
298   stmt_vec_info stmt_info;
299   unsigned i;
300   auto_vec<stmt_vec_info> mask_producers;
301 
302   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
303 
304   for (i = 0; i < nbbs; i++)
305     {
306       basic_block bb = bbs[i];
307 
308       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
309 	   gsi_next (&si))
310 	{
311 	  phi = si.phi ();
312 	  stmt_info = loop_vinfo->lookup_stmt (phi);
313 	  if (dump_enabled_p ())
314 	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
315 			     phi);
316 
317 	  gcc_assert (stmt_info);
318 
319 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
320 	      || STMT_VINFO_LIVE_P (stmt_info))
321             {
322 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323               scalar_type = TREE_TYPE (PHI_RESULT (phi));
324 
325 	      if (dump_enabled_p ())
326 		dump_printf_loc (MSG_NOTE, vect_location,
327 				 "get vectype for scalar type:  %T\n",
328 				 scalar_type);
329 
330 	      vectype = get_vectype_for_scalar_type (scalar_type);
331 	      if (!vectype)
332 		return opt_result::failure_at (phi,
333 					       "not vectorized: unsupported "
334 					       "data-type %T\n",
335 					       scalar_type);
336 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
337 
338 	      if (dump_enabled_p ())
339 		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
340 				 vectype);
341 
342 	      if (dump_enabled_p ())
343 		{
344 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
345 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
346 		  dump_printf (MSG_NOTE, "\n");
347 		}
348 
349 	      vect_update_max_nunits (&vectorization_factor, vectype);
350 	    }
351 	}
352 
353       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
354 	   gsi_next (&si))
355 	{
356 	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
357 	  opt_result res
358 	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
359 					  &mask_producers);
360 	  if (!res)
361 	    return res;
362         }
363     }
364 
365   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
366   if (dump_enabled_p ())
367     {
368       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
369       dump_dec (MSG_NOTE, vectorization_factor);
370       dump_printf (MSG_NOTE, "\n");
371     }
372 
373   if (known_le (vectorization_factor, 1U))
374     return opt_result::failure_at (vect_location,
375 				   "not vectorized: unsupported data-type\n");
376   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
377 
378   for (i = 0; i < mask_producers.length (); i++)
379     {
380       stmt_info = mask_producers[i];
381       opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
382       if (!mask_type)
383 	return opt_result::propagate_failure (mask_type);
384       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385     }
386 
387   return opt_result::success ();
388 }
389 
390 
391 /* Function vect_is_simple_iv_evolution.
392 
393    FORNOW: A simple evolution of an induction variable in the loop is
394    considered a polynomial evolution.  */
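/* For example (informal illustration): for

	for (i = 0; i < n; i++)
	  a[i] = b[i];

   scev describes the IV i by the access function {0, +, 1}_1, so INIT is 0
   and STEP is 1.  A step that is itself a chrec (a polynomial of degree
   >= 2) is rejected by the tree_is_chrec check below.  */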
395 
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398                              tree * step)
399 {
400   tree init_expr;
401   tree step_expr;
402   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403   basic_block bb;
404 
405   /* When there is no evolution in this loop, the evolution function
406      is not "simple".  */
407   if (evolution_part == NULL_TREE)
408     return false;
409 
410   /* When the evolution is a polynomial of degree >= 2
411      the evolution function is not "simple".  */
412   if (tree_is_chrec (evolution_part))
413     return false;
414 
415   step_expr = evolution_part;
416   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
417 
418   if (dump_enabled_p ())
419     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
420 		     step_expr, init_expr);
421 
422   *init = init_expr;
423   *step = step_expr;
424 
425   if (TREE_CODE (step_expr) != INTEGER_CST
426       && (TREE_CODE (step_expr) != SSA_NAME
427 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
428 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
429 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
430 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
431 		  || !flag_associative_math)))
432       && (TREE_CODE (step_expr) != REAL_CST
433 	  || !flag_associative_math))
434     {
435       if (dump_enabled_p ())
436         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437                          "step unknown.\n");
438       return false;
439     }
440 
441   return true;
442 }
443 
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445    what we are assuming is a double reduction.  For example, given
446    a structure like this:
447 
448       outer1:
449 	x_1 = PHI <x_4(outer2), ...>;
450 	...
451 
452       inner:
453 	x_2 = PHI <x_1(outer1), ...>;
454 	...
455 	x_3 = ...;
456 	...
457 
458       outer2:
459 	x_4 = PHI <x_3(inner)>;
460 	...
461 
462    outer loop analysis would treat x_1 as a double reduction phi and
463    this function would then return true for x_2.  */
464 
465 static bool
466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
467 {
468   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
469   use_operand_p use_p;
470   ssa_op_iter op_iter;
471   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
472     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
473       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
474 	return true;
475   return false;
476 }
477 
478 /* Function vect_analyze_scalar_cycles_1.
479 
480    Examine the cross iteration def-use cycles of scalar variables
481    in LOOP.  LOOP_VINFO represents the loop that is now being
482    considered for vectorization (can be LOOP, or an outer-loop
483    enclosing LOOP).  */
484 
485 static void
486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
487 {
488   basic_block bb = loop->header;
489   tree init, step;
490   auto_vec<stmt_vec_info, 64> worklist;
491   gphi_iterator gsi;
492   bool double_reduc;
493 
494   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
495 
496   /* First - identify all inductions.  Reduction detection assumes that all the
497      inductions have been identified, therefore, this order must not be
498      changed.  */
499   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
500     {
501       gphi *phi = gsi.phi ();
502       tree access_fn = NULL;
503       tree def = PHI_RESULT (phi);
504       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
505 
506       if (dump_enabled_p ())
507 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
508 
509       /* Skip virtual PHIs.  The data dependences that are associated with
510          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
511       if (virtual_operand_p (def))
512 	continue;
513 
514       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
515 
516       /* Analyze the evolution function.  */
517       access_fn = analyze_scalar_evolution (loop, def);
518       if (access_fn)
519 	{
520 	  STRIP_NOPS (access_fn);
521 	  if (dump_enabled_p ())
522 	    dump_printf_loc (MSG_NOTE, vect_location,
523 			     "Access function of PHI: %T\n", access_fn);
524 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
525 	    = initial_condition_in_loop_num (access_fn, loop->num);
526 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
527 	    = evolution_part_in_loop_num (access_fn, loop->num);
528 	}
529 
530       if (!access_fn
531 	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
532 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
533 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
534 	      && TREE_CODE (step) != INTEGER_CST))
535 	{
536 	  worklist.safe_push (stmt_vinfo);
537 	  continue;
538 	}
539 
540       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
541 		  != NULL_TREE);
542       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
543 
544       if (dump_enabled_p ())
545 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
546       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
547     }
548 
549 
550   /* Second - identify all reductions and nested cycles.  */
551   while (worklist.length () > 0)
552     {
553       stmt_vec_info stmt_vinfo = worklist.pop ();
554       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
555       tree def = PHI_RESULT (phi);
556 
557       if (dump_enabled_p ())
558 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
559 
560       gcc_assert (!virtual_operand_p (def)
561 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
562 
563       stmt_vec_info reduc_stmt_info
564 	= vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
565 				       &double_reduc, false);
566       if (reduc_stmt_info)
567         {
568           if (double_reduc)
569             {
570               if (dump_enabled_p ())
571                 dump_printf_loc (MSG_NOTE, vect_location,
572 				 "Detected double reduction.\n");
573 
574               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
575 	      STMT_VINFO_DEF_TYPE (reduc_stmt_info)
576 		= vect_double_reduction_def;
577             }
578           else
579             {
580               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581                 {
582                   if (dump_enabled_p ())
583                     dump_printf_loc (MSG_NOTE, vect_location,
584 				     "Detected vectorizable nested cycle.\n");
585 
586                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
587 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
588                 }
589               else
590                 {
591                   if (dump_enabled_p ())
592                     dump_printf_loc (MSG_NOTE, vect_location,
593 				     "Detected reduction.\n");
594 
595                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
597                   /* Store the reduction cycles for possible vectorization in
598                      loop-aware SLP if it was not detected as reduction
599 		     chain.  */
600 		  if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
601 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
602 		      (reduc_stmt_info);
603                 }
604             }
605         }
606       else
607         if (dump_enabled_p ())
608           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 			   "Unknown def-use cycle pattern.\n");
610     }
611 }
612 
613 
614 /* Function vect_analyze_scalar_cycles.
615 
616    Examine the cross iteration def-use cycles of scalar variables, by
617    analyzing the loop-header PHIs of scalar variables.  Classify each
618    cycle as one of the following: invariant, induction, reduction, unknown.
619    We do that for the loop represented by LOOP_VINFO, and also for its
620    inner-loop, if it exists.
621    Examples for scalar cycles:
622 
623    Example1: reduction:
624 
625               loop1:
626               for (i=0; i<N; i++)
627                  sum += a[i];
628 
629    Example2: induction:
630 
631               loop2:
632               for (i=0; i<N; i++)
633                  a[i] = i;  */
634 
635 static void
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
637 {
638   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
639 
640   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
641 
642   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
643      Reductions in such inner-loop therefore have different properties than
644      the reductions in the nest that gets vectorized:
645      1. When vectorized, they are executed in the same order as in the original
646         scalar loop, so we can't change the order of computation when
647         vectorizing them.
648      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649         current checks are too strict.  */
650 
651   if (loop->inner)
652     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 }
654 
655 /* Transfer group and reduction information from STMT_INFO to its
656    pattern stmt.  */
657 
658 static void
659 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
660 {
661   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
662   stmt_vec_info stmtp;
663   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
664 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
665   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666   do
667     {
668       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
669       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
670       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
671       if (stmt_info)
672 	REDUC_GROUP_NEXT_ELEMENT (stmtp)
673 	  = STMT_VINFO_RELATED_STMT (stmt_info);
674     }
675   while (stmt_info);
676   STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 }
678 
679 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
680 
681 static void
682 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
683 {
684   stmt_vec_info first;
685   unsigned i;
686 
687   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
688     if (STMT_VINFO_IN_PATTERN_P (first))
689       {
690 	stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
691 	while (next)
692 	  {
693 	    if (! STMT_VINFO_IN_PATTERN_P (next))
694 	      break;
695 	    next = REDUC_GROUP_NEXT_ELEMENT (next);
696 	  }
697 	/* If not all stmts in the chain are patterns, try to handle
698 	   the chain without patterns.  */
699 	if (! next)
700 	  {
701 	    vect_fixup_reduc_chain (first);
702 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
703 	      = STMT_VINFO_RELATED_STMT (first);
704 	  }
705       }
706 }
707 
708 /* Function vect_get_loop_niters.
709 
710    Determine the number of iterations the loop executes and place it
711    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
712    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
713    niter information holds in ASSUMPTIONS.
714 
715    Return the loop exit condition.  */
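/* Informal example, assuming the do-while shape required by
   vect_analyze_loop_form_1 and an N known to be at least 1:

	i = 0;
	do { ...; i++; } while (i < N);

   The latch executes N - 1 times, so NUMBER_OF_ITERATIONSM1 is N - 1 and
   NUMBER_OF_ITERATIONS (the number of times the loop header executes) is N.
   ASSUMPTIONS collects the conditions, if any, under which that niter
   expression is valid.  */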
716 
717 
718 static gcond *
719 vect_get_loop_niters (struct loop *loop, tree *assumptions,
720 		      tree *number_of_iterations, tree *number_of_iterationsm1)
721 {
722   edge exit = single_exit (loop);
723   struct tree_niter_desc niter_desc;
724   tree niter_assumptions, niter, may_be_zero;
725   gcond *cond = get_loop_exit_condition (loop);
726 
727   *assumptions = boolean_true_node;
728   *number_of_iterationsm1 = chrec_dont_know;
729   *number_of_iterations = chrec_dont_know;
730   DUMP_VECT_SCOPE ("get_loop_niters");
731 
732   if (!exit)
733     return cond;
734 
735   niter = chrec_dont_know;
736   may_be_zero = NULL_TREE;
737   niter_assumptions = boolean_true_node;
738   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
739       || chrec_contains_undetermined (niter_desc.niter))
740     return cond;
741 
742   niter_assumptions = niter_desc.assumptions;
743   may_be_zero = niter_desc.may_be_zero;
744   niter = niter_desc.niter;
745 
746   if (may_be_zero && integer_zerop (may_be_zero))
747     may_be_zero = NULL_TREE;
748 
749   if (may_be_zero)
750     {
751       if (COMPARISON_CLASS_P (may_be_zero))
752 	{
753 	  /* Try to combine may_be_zero with assumptions, this can simplify
754 	     computation of niter expression.  */
755 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
756 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
757 					     niter_assumptions,
758 					     fold_build1 (TRUTH_NOT_EXPR,
759 							  boolean_type_node,
760 							  may_be_zero));
761 	  else
762 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
763 				 build_int_cst (TREE_TYPE (niter), 0),
764 				 rewrite_to_non_trapping_overflow (niter));
765 
766 	  may_be_zero = NULL_TREE;
767 	}
768       else if (integer_nonzerop (may_be_zero))
769 	{
770 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
771 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
772 	  return cond;
773 	}
774       else
775 	return cond;
776     }
777 
778   *assumptions = niter_assumptions;
779   *number_of_iterationsm1 = niter;
780 
781   /* We want the number of loop header executions which is the number
782      of latch executions plus one.
783      ???  For UINT_MAX latch executions this number overflows to zero
784      for loops like do { n++; } while (n != 0);  */
785   if (niter && !chrec_contains_undetermined (niter))
786     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
787 			  build_int_cst (TREE_TYPE (niter), 1));
788   *number_of_iterations = niter;
789 
790   return cond;
791 }
792 
793 /* Function bb_in_loop_p
794 
795    Used as predicate for dfs order traversal of the loop bbs.  */
796 
797 static bool
798 bb_in_loop_p (const_basic_block bb, const void *data)
799 {
800   const struct loop *const loop = (const struct loop *)data;
801   if (flow_bb_inside_loop_p (loop, bb))
802     return true;
803   return false;
804 }
805 
806 
807 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
808    stmt_vec_info structs for all the stmts in LOOP_IN.  */
809 
810 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
811   : vec_info (vec_info::loop, init_cost (loop_in), shared),
812     loop (loop_in),
813     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
814     num_itersm1 (NULL_TREE),
815     num_iters (NULL_TREE),
816     num_iters_unchanged (NULL_TREE),
817     num_iters_assumptions (NULL_TREE),
818     th (0),
819     versioning_threshold (0),
820     vectorization_factor (0),
821     max_vectorization_factor (0),
822     mask_skip_niters (NULL_TREE),
823     mask_compare_type (NULL_TREE),
824     simd_if_cond (NULL_TREE),
825     unaligned_dr (NULL),
826     peeling_for_alignment (0),
827     ptr_mask (0),
828     ivexpr_map (NULL),
829     slp_unrolling_factor (1),
830     single_scalar_iteration_cost (0),
831     vectorizable (false),
832     can_fully_mask_p (true),
833     fully_masked_p (false),
834     peeling_for_gaps (false),
835     peeling_for_niter (false),
836     operands_swapped (false),
837     no_data_dependencies (false),
838     has_mask_store (false),
839     scalar_loop (NULL),
840     orig_loop_info (NULL)
841 {
842   /* CHECKME: We want to visit all BBs before their successors (except for
843      latch blocks, for which this assertion wouldn't hold).  In the simple
844      case of the loop forms we allow, a dfs order of the BBs would be the same
845      as reversed postorder traversal, so we are safe.  */
846 
847   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
848 					  bbs, loop->num_nodes, loop);
849   gcc_assert (nbbs == loop->num_nodes);
850 
851   for (unsigned int i = 0; i < nbbs; i++)
852     {
853       basic_block bb = bbs[i];
854       gimple_stmt_iterator si;
855 
856       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
857 	{
858 	  gimple *phi = gsi_stmt (si);
859 	  gimple_set_uid (phi, 0);
860 	  add_stmt (phi);
861 	}
862 
863       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
864 	{
865 	  gimple *stmt = gsi_stmt (si);
866 	  gimple_set_uid (stmt, 0);
867 	  add_stmt (stmt);
868 	  /* If the .GOMP_SIMD_LANE call for the current loop has 2 arguments,
869 	     the second one is the #pragma omp simd if (x) condition: when 0,
870 	     don't vectorize the loop; when a non-zero constant, vectorize it
871 	     normally; otherwise version it and use the vectorized copy only
872 	     if the condition is non-zero at runtime.  */
873 	  if (loop_in->simduid
874 	      && is_gimple_call (stmt)
875 	      && gimple_call_internal_p (stmt)
876 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
877 	      && gimple_call_num_args (stmt) >= 2
878 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
879 	      && (loop_in->simduid
880 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
881 	    {
882 	      tree arg = gimple_call_arg (stmt, 1);
883 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
884 		simd_if_cond = arg;
885 	      else
886 		gcc_assert (integer_nonzerop (arg));
887 	    }
888 	}
889     }
890 }
891 
892 /* Free all levels of MASKS.  */
893 
894 void
895 release_vec_loop_masks (vec_loop_masks *masks)
896 {
897   rgroup_masks *rgm;
898   unsigned int i;
899   FOR_EACH_VEC_ELT (*masks, i, rgm)
900     rgm->masks.release ();
901   masks->release ();
902 }
903 
904 /* Free all memory used by the _loop_vec_info, as well as all the
905    stmt_vec_info structs of all the stmts in the loop.  */
906 
907 _loop_vec_info::~_loop_vec_info ()
908 {
909   int nbbs;
910   gimple_stmt_iterator si;
911   int j;
912 
913   nbbs = loop->num_nodes;
914   for (j = 0; j < nbbs; j++)
915     {
916       basic_block bb = bbs[j];
917       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
918         {
919 	  gimple *stmt = gsi_stmt (si);
920 
921 	  /* We may have broken canonical form by moving a constant
922 	     into RHS1 of a commutative op.  Fix such occurrences.  */
923 	  if (operands_swapped && is_gimple_assign (stmt))
924 	    {
925 	      enum tree_code code = gimple_assign_rhs_code (stmt);
926 
927 	      if ((code == PLUS_EXPR
928 		   || code == POINTER_PLUS_EXPR
929 		   || code == MULT_EXPR)
930 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
931 		swap_ssa_operands (stmt,
932 				   gimple_assign_rhs1_ptr (stmt),
933 				   gimple_assign_rhs2_ptr (stmt));
934 	      else if (code == COND_EXPR
935 		       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
936 		{
937 		  tree cond_expr = gimple_assign_rhs1 (stmt);
938 		  enum tree_code cond_code = TREE_CODE (cond_expr);
939 
940 		  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
941 		    {
942 		      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
943 								  0));
944 		      cond_code = invert_tree_comparison (cond_code,
945 							  honor_nans);
946 		      if (cond_code != ERROR_MARK)
947 			{
948 			  TREE_SET_CODE (cond_expr, cond_code);
949 			  swap_ssa_operands (stmt,
950 					     gimple_assign_rhs2_ptr (stmt),
951 					     gimple_assign_rhs3_ptr (stmt));
952 			}
953 		    }
954 		}
955 	    }
956           gsi_next (&si);
957         }
958     }
959 
960   free (bbs);
961 
962   release_vec_loop_masks (&masks);
963   delete ivexpr_map;
964 
965   loop->aux = NULL;
966 }
967 
968 /* Return an invariant or register for EXPR and emit necessary
969    computations in the LOOP_VINFO loop preheader.  */
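/* Usage sketch (the variable names are only for illustration): a caller
   that needs a loop-invariant step expression as an SSA name inside the
   loop body could do

	tree step_reg = cse_and_gimplify_to_preheader (loop_vinfo, step_expr);

   Any statements needed to compute the value are emitted on the preheader
   edge, and repeated calls with an equal expression reuse the cached result
   instead of emitting the computation again.  */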
970 
971 tree
972 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
973 {
974   if (is_gimple_reg (expr)
975       || is_gimple_min_invariant (expr))
976     return expr;
977 
978   if (! loop_vinfo->ivexpr_map)
979     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
980   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
981   if (! cached)
982     {
983       gimple_seq stmts = NULL;
984       cached = force_gimple_operand (unshare_expr (expr),
985 				     &stmts, true, NULL_TREE);
986       if (stmts)
987 	{
988 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
989 	  gsi_insert_seq_on_edge_immediate (e, stmts);
990 	}
991     }
992   return cached;
993 }
994 
995 /* Return true if we can use CMP_TYPE as the comparison type to produce
996    all masks required to mask LOOP_VINFO.  */
997 
998 static bool
999 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1000 {
1001   rgroup_masks *rgm;
1002   unsigned int i;
1003   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1004     if (rgm->mask_type != NULL_TREE
1005 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1006 					    cmp_type, rgm->mask_type,
1007 					    OPTIMIZE_FOR_SPEED))
1008       return false;
1009   return true;
1010 }
1011 
1012 /* Calculate the maximum number of scalars per iteration for every
1013    rgroup in LOOP_VINFO.  */
1014 
1015 static unsigned int
1016 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1017 {
1018   unsigned int res = 1;
1019   unsigned int i;
1020   rgroup_masks *rgm;
1021   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1022     res = MAX (res, rgm->max_nscalars_per_iter);
1023   return res;
1024 }
1025 
1026 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1027    whether we can actually generate the masks required.  Return true if so,
1028    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
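/* Worked example with made-up numbers: if the loop is known to run for at
   most 1000 latch iterations and the largest rgroup needs 2 scalars per
   iteration, then max_ni is 1001 * 2 = 2002 and min_ni_width is 11 bits,
   so any target-supported integer mode of at least 11 bits for which
   IFN_WHILE_ULT can produce every required mask is a candidate comparison
   type (the search below prefers to keep going up to Pmode).  */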
1029 
1030 static bool
1031 vect_verify_full_masking (loop_vec_info loop_vinfo)
1032 {
1033   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1034   unsigned int min_ni_width;
1035 
1036   /* Use a normal loop if there are no statements that need masking.
1037      This only happens in rare degenerate cases: it means that the loop
1038      has no loads, no stores, and no live-out values.  */
1039   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1040     return false;
1041 
1042   /* Get the maximum number of iterations that is representable
1043      in the counter type.  */
1044   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1045   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1046 
1047   /* Get a more refined estimate for the number of iterations.  */
1048   widest_int max_back_edges;
1049   if (max_loop_iterations (loop, &max_back_edges))
1050     max_ni = wi::smin (max_ni, max_back_edges + 1);
1051 
1052   /* Account for rgroup masks, in which each bit is replicated N times.  */
1053   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1054 
1055   /* Work out how many bits we need to represent the limit.  */
1056   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1057 
1058   /* Find a scalar mode for which WHILE_ULT is supported.  */
1059   opt_scalar_int_mode cmp_mode_iter;
1060   tree cmp_type = NULL_TREE;
1061   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1062     {
1063       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1064       if (cmp_bits >= min_ni_width
1065 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1066 	{
1067 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1068 	  if (this_type
1069 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1070 	    {
1071 	      /* Although we could stop as soon as we find a valid mode,
1072 		 it's often better to continue until we hit Pmode, since the
1073 		 operands to the WHILE are more likely to be reusable in
1074 		 address calculations.  */
1075 	      cmp_type = this_type;
1076 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1077 		break;
1078 	    }
1079 	}
1080     }
1081 
1082   if (!cmp_type)
1083     return false;
1084 
1085   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1086   return true;
1087 }
1088 
1089 /* Calculate the cost of one scalar iteration of the loop.  */
1090 static void
1091 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1092 {
1093   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1094   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1095   int nbbs = loop->num_nodes, factor;
1096   int innerloop_iters, i;
1097 
1098   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1099 
1100   /* Gather costs for statements in the scalar loop.  */
1101 
1102   /* FORNOW.  */
1103   innerloop_iters = 1;
1104   if (loop->inner)
1105     innerloop_iters = 50; /* FIXME */
1106 
1107   for (i = 0; i < nbbs; i++)
1108     {
1109       gimple_stmt_iterator si;
1110       basic_block bb = bbs[i];
1111 
1112       if (bb->loop_father == loop->inner)
1113         factor = innerloop_iters;
1114       else
1115         factor = 1;
1116 
1117       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1118         {
1119 	  gimple *stmt = gsi_stmt (si);
1120 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1121 
1122           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1123             continue;
1124 
1125           /* Skip stmts that are not vectorized inside the loop.  */
1126 	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1127           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1128               && (!STMT_VINFO_LIVE_P (vstmt_info)
1129                   || !VECTORIZABLE_CYCLE_DEF
1130 			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1131             continue;
1132 
1133 	  vect_cost_for_stmt kind;
1134           if (STMT_VINFO_DATA_REF (stmt_info))
1135             {
1136               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1137                kind = scalar_load;
1138              else
1139                kind = scalar_store;
1140             }
1141           else
1142             kind = scalar_stmt;
1143 
1144 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1145 			    factor, kind, stmt_info, 0, vect_prologue);
1146         }
1147     }
1148 
1149   /* Now accumulate cost.  */
1150   void *target_cost_data = init_cost (loop);
1151   stmt_info_for_cost *si;
1152   int j;
1153   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1154 		    j, si)
1155     (void) add_stmt_cost (target_cost_data, si->count,
1156 			  si->kind, si->stmt_info, si->misalign,
1157 			  vect_body);
1158   unsigned dummy, body_cost = 0;
1159   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1160   destroy_cost_data (target_cost_data);
1161   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1162 }
1163 
1164 
1165 /* Function vect_analyze_loop_form_1.
1166 
1167    Verify that certain CFG restrictions hold, including:
1168    - the loop has a pre-header
1169    - the loop has a single entry and exit
1170    - the loop exit condition is simple enough
1171    - the number of iterations can be analyzed, i.e., a countable loop.  The
1172      niter could be analyzed under some assumptions.  */
1173 
1174 opt_result
1175 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1176 			  tree *assumptions, tree *number_of_iterationsm1,
1177 			  tree *number_of_iterations, gcond **inner_loop_cond)
1178 {
1179   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1180 
1181   /* Different restrictions apply when we are considering an inner-most loop,
1182      vs. an outer (nested) loop.
1183      (FORNOW. May want to relax some of these restrictions in the future).  */
1184 
1185   if (!loop->inner)
1186     {
1187       /* Inner-most loop.  We currently require that the number of BBs is
1188 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1189 	 look like this:
1190 
1191                         (pre-header)
1192                            |
1193                           header <--------+
1194                            | |            |
1195                            | +--> latch --+
1196                            |
1197                         (exit-bb)  */
1198 
1199       if (loop->num_nodes != 2)
1200 	return opt_result::failure_at (vect_location,
1201 				       "not vectorized:"
1202 				       " control flow in loop.\n");
1203 
1204       if (empty_block_p (loop->header))
1205 	return opt_result::failure_at (vect_location,
1206 				       "not vectorized: empty loop.\n");
1207     }
1208   else
1209     {
1210       struct loop *innerloop = loop->inner;
1211       edge entryedge;
1212 
1213       /* Nested loop. We currently require that the loop is doubly-nested,
1214 	 contains a single inner loop, and the number of BBs is exactly 5.
1215 	 Vectorizable outer-loops look like this:
1216 
1217 			(pre-header)
1218 			   |
1219 			  header <---+
1220 			   |         |
1221 		          inner-loop |
1222 			   |         |
1223 			  tail ------+
1224 			   |
1225 		        (exit-bb)
1226 
1227 	 The inner-loop has the properties expected of inner-most loops
1228 	 as described above.  */
1229 
1230       if ((loop->inner)->inner || (loop->inner)->next)
1231 	return opt_result::failure_at (vect_location,
1232 				       "not vectorized:"
1233 				       " multiple nested loops.\n");
1234 
1235       if (loop->num_nodes != 5)
1236 	return opt_result::failure_at (vect_location,
1237 				       "not vectorized:"
1238 				       " control flow in loop.\n");
1239 
1240       entryedge = loop_preheader_edge (innerloop);
1241       if (entryedge->src != loop->header
1242 	  || !single_exit (innerloop)
1243 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1244 	return opt_result::failure_at (vect_location,
1245 				       "not vectorized:"
1246 				       " unsupported outerloop form.\n");
1247 
1248       /* Analyze the inner-loop.  */
1249       tree inner_niterm1, inner_niter, inner_assumptions;
1250       opt_result res
1251 	= vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1252 				    &inner_assumptions, &inner_niterm1,
1253 				    &inner_niter, NULL);
1254       if (!res)
1255 	{
1256 	  if (dump_enabled_p ())
1257 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 			     "not vectorized: Bad inner loop.\n");
1259 	  return res;
1260 	}
1261 
1262       /* Don't support analyzing niter under assumptions for inner
1263 	 loop.  */
1264       if (!integer_onep (inner_assumptions))
1265 	return opt_result::failure_at (vect_location,
1266 				       "not vectorized: Bad inner loop.\n");
1267 
1268       if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 	return opt_result::failure_at (vect_location,
1270 				       "not vectorized: inner-loop count not"
1271 				       " invariant.\n");
1272 
1273       if (dump_enabled_p ())
1274         dump_printf_loc (MSG_NOTE, vect_location,
1275 			 "Considering outer-loop vectorization.\n");
1276     }
1277 
1278   if (!single_exit (loop))
1279     return opt_result::failure_at (vect_location,
1280 				   "not vectorized: multiple exits.\n");
1281   if (EDGE_COUNT (loop->header->preds) != 2)
1282     return opt_result::failure_at (vect_location,
1283 				   "not vectorized:"
1284 				   " too many incoming edges.\n");
1285 
1286   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1287      that the loop is represented as a do-while (with a proper if-guard
1288      before the loop if needed), where the loop header contains all the
1289      executable statements, and the latch is empty.  */
1290   if (!empty_block_p (loop->latch)
1291       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1292     return opt_result::failure_at (vect_location,
1293 				   "not vectorized: latch block not empty.\n");
1294 
1295   /* Make sure the exit is not abnormal.  */
1296   edge e = single_exit (loop);
1297   if (e->flags & EDGE_ABNORMAL)
1298     return opt_result::failure_at (vect_location,
1299 				   "not vectorized:"
1300 				   " abnormal loop exit edge.\n");
1301 
1302   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1303 				     number_of_iterationsm1);
1304   if (!*loop_cond)
1305     return opt_result::failure_at
1306       (vect_location,
1307        "not vectorized: complicated exit condition.\n");
1308 
1309   if (integer_zerop (*assumptions)
1310       || !*number_of_iterations
1311       || chrec_contains_undetermined (*number_of_iterations))
1312     return opt_result::failure_at
1313       (*loop_cond,
1314        "not vectorized: number of iterations cannot be computed.\n");
1315 
1316   if (integer_zerop (*number_of_iterations))
1317     return opt_result::failure_at
1318       (*loop_cond,
1319        "not vectorized: number of iterations = 0.\n");
1320 
1321   return opt_result::success ();
1322 }
1323 
1324 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1325 
1326 opt_loop_vec_info
1327 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1328 {
1329   tree assumptions, number_of_iterations, number_of_iterationsm1;
1330   gcond *loop_cond, *inner_loop_cond = NULL;
1331 
1332   opt_result res
1333     = vect_analyze_loop_form_1 (loop, &loop_cond,
1334 				&assumptions, &number_of_iterationsm1,
1335 				&number_of_iterations, &inner_loop_cond);
1336   if (!res)
1337     return opt_loop_vec_info::propagate_failure (res);
1338 
1339   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1340   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1341   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1342   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1343   if (!integer_onep (assumptions))
1344     {
1345       /* We consider to vectorize this loop by versioning it under
1346 	 some assumptions.  In order to do this, we need to clear
1347 	 existing information computed by scev and niter analyzer.  */
1348       scev_reset_htab ();
1349       free_numbers_of_iterations_estimates (loop);
1350       /* Also set flag for this loop so that following scev and niter
1351 	 analysis are done under the assumptions.  */
1352       loop_constraint_set (loop, LOOP_C_FINITE);
1353       /* Also record the assumptions for versioning.  */
1354       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1355     }
1356 
1357   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1358     {
1359       if (dump_enabled_p ())
1360         {
1361           dump_printf_loc (MSG_NOTE, vect_location,
1362 			   "Symbolic number of iterations is ");
1363 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1364           dump_printf (MSG_NOTE, "\n");
1365         }
1366     }
1367 
1368   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1369   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1370   if (inner_loop_cond)
1371     {
1372       stmt_vec_info inner_loop_cond_info
1373 	= loop_vinfo->lookup_stmt (inner_loop_cond);
1374       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1375     }
1376 
1377   gcc_assert (!loop->aux);
1378   loop->aux = loop_vinfo;
1379   return opt_loop_vec_info::success (loop_vinfo);
1380 }
1381 
1382 
1383 
1384 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1385    statements, update the vectorization factor.  */
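/* Illustrative example: if the loop-based analysis settled on a
   vectorization factor of 4 but the SLP instances require an unrolling
   factor of 8, force_common_multiple combines the two and the final
   vectorization factor becomes 8.  Both quantities have the form
   current_vector_size * X, so a common multiple always exists.  */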
1386 
1387 static void
1388 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1389 {
1390   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1392   int nbbs = loop->num_nodes;
1393   poly_uint64 vectorization_factor;
1394   int i;
1395 
1396   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1397 
1398   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1399   gcc_assert (known_ne (vectorization_factor, 0U));
1400 
1401   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1402      the vectorization factor of the loop is the unrolling factor required
1403      by the SLP instances.  If that unrolling factor is 1, we say that we
1404      perform pure SLP on the loop - cross-iteration parallelism is not
1405      exploited.  */
1406   bool only_slp_in_loop = true;
1407   for (i = 0; i < nbbs; i++)
1408     {
1409       basic_block bb = bbs[i];
1410       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1411 	   gsi_next (&si))
1412 	{
1413 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1414 	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1415 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1416 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1417 	      && !PURE_SLP_STMT (stmt_info))
1418 	    /* STMT needs both SLP and loop-based vectorization.  */
1419 	    only_slp_in_loop = false;
1420 	}
1421     }
1422 
1423   if (only_slp_in_loop)
1424     {
1425       if (dump_enabled_p ())
1426 	dump_printf_loc (MSG_NOTE, vect_location,
1427 			 "Loop contains only SLP stmts\n");
1428       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1429     }
1430   else
1431     {
1432       if (dump_enabled_p ())
1433 	dump_printf_loc (MSG_NOTE, vect_location,
1434 			 "Loop contains SLP and non-SLP stmts\n");
1435       /* Both the vectorization factor and unroll factor have the form
1436 	 current_vector_size * X for some rational X, so they must have
1437 	 a common multiple.  */
1438       vectorization_factor
1439 	= force_common_multiple (vectorization_factor,
1440 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1441     }
1442 
1443   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1444   if (dump_enabled_p ())
1445     {
1446       dump_printf_loc (MSG_NOTE, vect_location,
1447 		       "Updating vectorization factor to ");
1448       dump_dec (MSG_NOTE, vectorization_factor);
1449       dump_printf (MSG_NOTE, ".\n");
1450     }
1451 }
1452 
1453 /* Return true if STMT_INFO describes a double reduction phi and if
1454    the other phi in the reduction is also relevant for vectorization.
1455    This rejects cases such as:
1456 
1457       outer1:
1458 	x_1 = PHI <x_3(outer2), ...>;
1459 	...
1460 
1461       inner:
1462 	x_2 = ...;
1463 	...
1464 
1465       outer2:
1466 	x_3 = PHI <x_2(inner)>;
1467 
1468    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1469 
1470 static bool
1471 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1472 {
1473   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1474     return false;
1475 
1476   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1477 }
1478 
1479 /* Function vect_analyze_loop_operations.
1480 
1481    Scan the loop stmts and make sure they are all vectorizable.  */
1482 
1483 static opt_result
1484 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1485 {
1486   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1487   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1488   int nbbs = loop->num_nodes;
1489   int i;
1490   stmt_vec_info stmt_info;
1491   bool need_to_vectorize = false;
1492   bool ok;
1493 
1494   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1495 
1496   auto_vec<stmt_info_for_cost> cost_vec;
1497 
1498   for (i = 0; i < nbbs; i++)
1499     {
1500       basic_block bb = bbs[i];
1501 
1502       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1503 	   gsi_next (&si))
1504         {
1505           gphi *phi = si.phi ();
1506           ok = true;
1507 
1508 	  stmt_info = loop_vinfo->lookup_stmt (phi);
1509           if (dump_enabled_p ())
1510 	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1511 	  if (virtual_operand_p (gimple_phi_result (phi)))
1512 	    continue;
1513 
1514           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1515              (i.e., a phi in the tail of the outer-loop).  */
1516           if (! is_loop_header_bb_p (bb))
1517             {
1518               /* FORNOW: we currently don't support the case that these phis
1519                  are not used in the outer loop (unless it is a double
1520                  reduction, i.e., this phi is a vect_double_reduction_def),
1521                  because that case would require actually doing something here.  */
1522               if (STMT_VINFO_LIVE_P (stmt_info)
1523 		  && !vect_active_double_reduction_p (stmt_info))
1524 		return opt_result::failure_at (phi,
1525 					       "Unsupported loop-closed phi"
1526 					       " in outer-loop.\n");
1527 
1528               /* If PHI is used in the outer loop, we check that its operand
1529                  is defined in the inner loop.  */
1530               if (STMT_VINFO_RELEVANT_P (stmt_info))
1531                 {
1532                   tree phi_op;
1533 
1534                   if (gimple_phi_num_args (phi) != 1)
1535                     return opt_result::failure_at (phi, "unsupported phi");
1536 
1537                   phi_op = PHI_ARG_DEF (phi, 0);
1538 		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1539 		  if (!op_def_info)
1540 		    return opt_result::failure_at (phi, "unsupported phi");
1541 
1542 		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1543 		      && (STMT_VINFO_RELEVANT (op_def_info)
1544 			  != vect_used_in_outer_by_reduction))
1545 		    return opt_result::failure_at (phi, "unsupported phi");
1546                 }
1547 
1548               continue;
1549             }
1550 
1551           gcc_assert (stmt_info);
1552 
1553           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1554                || STMT_VINFO_LIVE_P (stmt_info))
1555               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1556 	    /* A scalar-dependence cycle that we don't support.  */
1557 	    return opt_result::failure_at (phi,
1558 					   "not vectorized:"
1559 					   " scalar dependence cycle.\n");
1560 
1561           if (STMT_VINFO_RELEVANT_P (stmt_info))
1562             {
1563               need_to_vectorize = true;
1564               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1565 		  && ! PURE_SLP_STMT (stmt_info))
1566 		ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1567 					     &cost_vec);
1568 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1569 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1570 		       && ! PURE_SLP_STMT (stmt_info))
1571 		ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1572 					     &cost_vec);
1573             }
1574 
1575 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1576 	  if (ok
1577 	      && STMT_VINFO_LIVE_P (stmt_info)
1578 	      && !PURE_SLP_STMT (stmt_info))
1579 	    ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1580 					      &cost_vec);
1581 
1582           if (!ok)
1583 	    return opt_result::failure_at (phi,
1584 					   "not vectorized: relevant phi not "
1585 					   "supported: %G",
1586 					   static_cast <gimple *> (phi));
1587         }
1588 
1589       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1590 	   gsi_next (&si))
1591         {
1592 	  gimple *stmt = gsi_stmt (si);
1593 	  if (!gimple_clobber_p (stmt))
1594 	    {
1595 	      opt_result res
1596 		= vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1597 				     &need_to_vectorize,
1598 				     NULL, NULL, &cost_vec);
1599 	      if (!res)
1600 		return res;
1601 	    }
1602         }
1603     } /* bbs */
1604 
1605   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1606 
1607   /* All operations in the loop are either irrelevant (deal with loop
1608      control, or dead), or only used outside the loop and can be moved
1609      out of the loop (e.g. invariants, inductions).  The loop can be
1610      optimized away by scalar optimizations.  We're better off not
1611      touching this loop.  */
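  /* For example (illustrative), in

       for (i = 0; i < n; i++)
	 last = c;

     where C is loop-invariant and LAST is only read after the loop,
     every statement is either irrelevant or computable outside the loop,
     so NEED_TO_VECTORIZE stays false.  */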
1612   if (!need_to_vectorize)
1613     {
1614       if (dump_enabled_p ())
1615         dump_printf_loc (MSG_NOTE, vect_location,
1616 			 "All the computation can be taken out of the loop.\n");
1617       return opt_result::failure_at
1618 	(vect_location,
1619 	 "not vectorized: redundant loop. no profit to vectorize.\n");
1620     }
1621 
1622   return opt_result::success ();
1623 }
1624 
1625 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1626    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1627    definitely no, or -1 if it's worth retrying.  */
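/* For example (illustrative), a loop known to execute only 3 iterations
   cannot profitably use a vectorization factor of 8 unless it is fully
   masked, so it is rejected outright (0) rather than retried with a
   different vector size (-1).  */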
1628 
1629 static int
1630 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1631 {
1632   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1633   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1634 
1635   /* Only fully-masked loops can have iteration counts less than the
1636      vectorization factor.  */
1637   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1638     {
1639       HOST_WIDE_INT max_niter;
1640 
1641       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1642 	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1643       else
1644 	max_niter = max_stmt_executions_int (loop);
1645 
1646       if (max_niter != -1
1647 	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1648 	{
1649 	  if (dump_enabled_p ())
1650 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1651 			     "not vectorized: iteration count smaller than "
1652 			     "vectorization factor.\n");
1653 	  return 0;
1654 	}
1655     }
1656 
1657   int min_profitable_iters, min_profitable_estimate;
1658   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1659 				      &min_profitable_estimate);
1660 
1661   if (min_profitable_iters < 0)
1662     {
1663       if (dump_enabled_p ())
1664 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 			 "not vectorized: vectorization not profitable.\n");
1666       if (dump_enabled_p ())
1667 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1668 			 "not vectorized: vector version will never be "
1669 			 "profitable.\n");
1670       return -1;
1671     }
1672 
1673   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1674 			       * assumed_vf);
1675 
1676   /* Use the cost model only if it is more conservative than user specified
1677      threshold.  */
1678   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1679 				    min_profitable_iters);
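  /* For example (illustrative), "--param min-vect-loop-bound=2" with an
     assumed VF of 4 gives MIN_SCALAR_LOOP_BOUND = 8; if the cost model
     computed MIN_PROFITABLE_ITERS = 11, the resulting threshold is 11,
     the more conservative of the two.  */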
1680 
1681   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1682 
1683   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1684       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1685     {
1686       if (dump_enabled_p ())
1687 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1688 			 "not vectorized: vectorization not profitable.\n");
1689       if (dump_enabled_p ())
1690 	dump_printf_loc (MSG_NOTE, vect_location,
1691 			 "not vectorized: iteration count smaller than user "
1692 			 "specified loop bound parameter or minimum profitable "
1693 			 "iterations (whichever is more conservative).\n");
1694       return 0;
1695     }
1696 
1697   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1698   if (estimated_niter == -1)
1699     estimated_niter = likely_max_stmt_executions_int (loop);
1700   if (estimated_niter != -1
1701       && ((unsigned HOST_WIDE_INT) estimated_niter
1702 	  < MAX (th, (unsigned) min_profitable_estimate)))
1703     {
1704       if (dump_enabled_p ())
1705 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1706 			 "not vectorized: estimated iteration count too "
1707 			 "small.\n");
1708       if (dump_enabled_p ())
1709 	dump_printf_loc (MSG_NOTE, vect_location,
1710 			 "not vectorized: estimated iteration count smaller "
1711 			 "than specified loop bound parameter or minimum "
1712 			 "profitable iterations (whichever is more "
1713 			 "conservative).\n");
1714       return -1;
1715     }
1716 
1717   return 1;
1718 }
1719 
1720 static opt_result
1721 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1722 			   vec<data_reference_p> *datarefs,
1723 			   unsigned int *n_stmts)
1724 {
1725   *n_stmts = 0;
1726   for (unsigned i = 0; i < loop->num_nodes; i++)
1727     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1728 	 !gsi_end_p (gsi); gsi_next (&gsi))
1729       {
1730 	gimple *stmt = gsi_stmt (gsi);
1731 	if (is_gimple_debug (stmt))
1732 	  continue;
1733 	++(*n_stmts);
1734 	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1735 	if (!res)
1736 	  {
1737 	    if (is_gimple_call (stmt) && loop->safelen)
1738 	      {
1739 		tree fndecl = gimple_call_fndecl (stmt), op;
1740 		if (fndecl != NULL_TREE)
1741 		  {
1742 		    cgraph_node *node = cgraph_node::get (fndecl);
1743 		    if (node != NULL && node->simd_clones != NULL)
1744 		      {
1745 			unsigned int j, n = gimple_call_num_args (stmt);
1746 			for (j = 0; j < n; j++)
1747 			  {
1748 			    op = gimple_call_arg (stmt, j);
1749 			    if (DECL_P (op)
1750 				|| (REFERENCE_CLASS_P (op)
1751 				    && get_base_address (op)))
1752 			      break;
1753 			  }
1754 			op = gimple_call_lhs (stmt);
1755 			/* Ignore #pragma omp declare simd functions
1756 			   if they don't have data references in the
1757 			   call stmt itself.  */
1758 			if (j == n
1759 			    && !(op
1760 				 && (DECL_P (op)
1761 				     || (REFERENCE_CLASS_P (op)
1762 					 && get_base_address (op)))))
1763 			  continue;
1764 		      }
1765 		  }
1766 	      }
1767 	    return res;
1768 	  }
1769 	/* If dependence analysis would give up due to the limit on the
1770 	   number of datarefs, stop here and fail fatally.  */
1771 	if (datarefs->length ()
1772 	    > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1773 	  return opt_result::failure_at (stmt, "exceeded param "
1774 					 "loop-max-datarefs-for-datadeps\n");
1775       }
1776   return opt_result::success ();
1777 }
1778 
1779 /* Function vect_analyze_loop_2.
1780 
1781    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1782    for it.  The different analyses will record information in the
1783    loop_vec_info struct.  */
1784 static opt_result
1785 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1786 {
1787   opt_result ok = opt_result::success ();
1788   int res;
1789   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1790   poly_uint64 min_vf = 2;
1791 
1792   /* The first group of checks is independent of the vector size.  */
1793   fatal = true;
1794 
1795   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1796       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1797     return opt_result::failure_at (vect_location,
1798 				   "not vectorized: simd if(0)\n");
1799 
1800   /* Find all data references in the loop (which correspond to vdefs/vuses)
1801      and analyze their evolution in the loop.  */
1802 
1803   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1804 
1805   /* Gather the data references and count stmts in the loop.  */
1806   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1807     {
1808       opt_result res
1809 	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1810 				     &LOOP_VINFO_DATAREFS (loop_vinfo),
1811 				     n_stmts);
1812       if (!res)
1813 	{
1814 	  if (dump_enabled_p ())
1815 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1816 			     "not vectorized: loop contains function "
1817 			     "calls or data references that cannot "
1818 			     "be analyzed\n");
1819 	  return res;
1820 	}
1821       loop_vinfo->shared->save_datarefs ();
1822     }
1823   else
1824     loop_vinfo->shared->check_datarefs ();
1825 
1826   /* Analyze the data references and also adjust the minimal
1827      vectorization factor according to the loads and stores.  */
1828 
1829   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1830   if (!ok)
1831     {
1832       if (dump_enabled_p ())
1833 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834 			 "bad data references.\n");
1835       return ok;
1836     }
1837 
1838   /* Classify all cross-iteration scalar data-flow cycles.
1839      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1840   vect_analyze_scalar_cycles (loop_vinfo);
1841 
1842   vect_pattern_recog (loop_vinfo);
1843 
1844   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1845 
1846   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1847      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
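  /* For example (illustrative), in

       for (i = 0; i < n; i++)
	 sum += a[i];

     the load from A[I] is a simple consecutive access, whereas A[2*I]
     or A[B[I]] would not be.  */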
1848 
1849   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1850   if (!ok)
1851     {
1852       if (dump_enabled_p ())
1853 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 			 "bad data access.\n");
1855       return ok;
1856     }
1857 
1858   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1859 
1860   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1861   if (!ok)
1862     {
1863       if (dump_enabled_p ())
1864 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 			 "unexpected pattern.\n");
1866       return ok;
1867     }
1868 
1869   /* The rest of the analysis below depends on the vector size in some
	 way, so failures from here on are not necessarily fatal.  */
1870   fatal = false;
1871 
1872   /* Analyze data dependences between the data-refs in the loop
1873      and adjust the maximum vectorization factor according to
1874      the dependences.
1875      FORNOW: fail at the first data dependence that we encounter.  */
1876 
1877   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1878   if (!ok)
1879     {
1880       if (dump_enabled_p ())
1881 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1882 			 "bad data dependence.\n");
1883       return ok;
1884     }
1885   if (max_vf != MAX_VECTORIZATION_FACTOR
1886       && maybe_lt (max_vf, min_vf))
1887     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1888   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1889 
1890   ok = vect_determine_vectorization_factor (loop_vinfo);
1891   if (!ok)
1892     {
1893       if (dump_enabled_p ())
1894 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 			 "can't determine vectorization factor.\n");
1896       return ok;
1897     }
1898   if (max_vf != MAX_VECTORIZATION_FACTOR
1899       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1900     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1901 
1902   /* Compute the scalar iteration cost.  */
1903   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1904 
1905   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1906   unsigned th;
1907 
1908   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1909   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1910   if (!ok)
1911     return ok;
1912 
1913   /* If there are any SLP instances mark them as pure_slp.  */
1914   bool slp = vect_make_slp_decision (loop_vinfo);
1915   if (slp)
1916     {
1917       /* Find stmts that need to be both vectorized and SLPed.  */
1918       vect_detect_hybrid_slp (loop_vinfo);
1919 
1920       /* Update the vectorization factor based on the SLP decision.  */
1921       vect_update_vf_for_slp (loop_vinfo);
1922     }
1923 
1924   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1925 
1926   /* We don't expect to have to roll back to anything other than an empty
1927      set of rgroups.  */
1928   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1929 
1930   /* This is the point where we can re-start analysis with SLP forced off.  */
1931 start_over:
1932 
1933   /* Now the vectorization factor is final.  */
1934   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1935   gcc_assert (known_ne (vectorization_factor, 0U));
1936 
1937   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1938     {
1939       dump_printf_loc (MSG_NOTE, vect_location,
1940 		       "vectorization_factor = ");
1941       dump_dec (MSG_NOTE, vectorization_factor);
1942       dump_printf (MSG_NOTE, ", niters = %wd\n",
1943 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
1944     }
1945 
1946   HOST_WIDE_INT max_niter
1947     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1948 
1949   /* Analyze the alignment of the data-refs in the loop.
1950      Fail if a data reference is found that cannot be vectorized.  */
1951 
1952   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1953   if (!ok)
1954     {
1955       if (dump_enabled_p ())
1956 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1957 			 "bad data alignment.\n");
1958       return ok;
1959     }
1960 
1961   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1962      It is important to call pruning after vect_analyze_data_ref_accesses,
1963      since we use grouping information gathered by interleaving analysis.  */
1964   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1965   if (!ok)
1966     return ok;
1967 
1968   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1969      vectorization, since we do not want to add extra peeling or
1970      add versioning for alignment.  */
1971   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1972     /* This pass will decide on using loop versioning and/or loop peeling in
1973        order to enhance the alignment of data references in the loop.  */
1974     ok = vect_enhance_data_refs_alignment (loop_vinfo);
1975   else
1976     ok = vect_verify_datarefs_alignment (loop_vinfo);
1977   if (!ok)
1978     return ok;
1979 
1980   if (slp)
1981     {
1982       /* Analyze operations in the SLP instances.  Note this may
1983 	 remove unsupported SLP instances which makes the above
1984 	 SLP kind detection invalid.  */
1985       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1986       vect_slp_analyze_operations (loop_vinfo);
1987       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1988 	{
1989 	  ok = opt_result::failure_at (vect_location,
1990 				       "unsupported SLP instances\n");
1991 	  goto again;
1992 	}
1993     }
1994 
1995   /* Scan all the remaining operations in the loop that are not subject
1996      to SLP and make sure they are vectorizable.  */
1997   ok = vect_analyze_loop_operations (loop_vinfo);
1998   if (!ok)
1999     {
2000       if (dump_enabled_p ())
2001 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2002 			 "bad operation or unsupported loop bound.\n");
2003       return ok;
2004     }
2005 
2006   /* Decide whether to use a fully-masked loop for this vectorization
2007      factor.  */
2008   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2009     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2010        && vect_verify_full_masking (loop_vinfo));
2011   if (dump_enabled_p ())
2012     {
2013       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2014 	dump_printf_loc (MSG_NOTE, vect_location,
2015 			 "using a fully-masked loop.\n");
2016       else
2017 	dump_printf_loc (MSG_NOTE, vect_location,
2018 			 "not using a fully-masked loop.\n");
2019     }
2020 
2021   /* If epilog loop is required because of data accesses with gaps,
2022      one additional iteration needs to be peeled.  Check if there are
2023      enough iterations for vectorization.  */
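  /* For example (illustrative), a loop with exactly 4 scalar iterations
     and a vectorization factor of 4 that needs peeling for gaps would
     have no iterations left for the vector loop once the final iteration
     is peeled, so it is rejected below.  */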
2024   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2025       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2026       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027     {
2028       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2029       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2030 
2031       if (known_lt (wi::to_widest (scalar_niters), vf))
2032 	return opt_result::failure_at (vect_location,
2033 				       "loop does not have enough iterations to"
2034 				       " support peeling for gaps.\n");
2035     }
2036 
2037   /* Check the costings of the loop make vectorizing worthwhile.  */
2038   res = vect_analyze_loop_costing (loop_vinfo);
2039   if (res < 0)
2040     {
2041       ok = opt_result::failure_at (vect_location,
2042 				   "Loop costings may not be worthwhile.\n");
2043       goto again;
2044     }
2045   if (!res)
2046     return opt_result::failure_at (vect_location,
2047 				   "Loop costings not worthwhile.\n");
2048 
2049   /* Decide whether we need to create an epilogue loop to handle
2050      remaining scalar iterations.  */
2051   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2052 
2053   unsigned HOST_WIDE_INT const_vf;
2054   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2055     /* The main loop handles all iterations.  */
2056     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2057   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2058 	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2059     {
2060       /* Work out the (constant) number of iterations that need to be
2061 	 peeled for reasons other than niters.  */
2062       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2063       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2064 	peel_niter += 1;
2065       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2066 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2067 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2068     }
2069   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2070 	   /* ??? When peeling for gaps but not alignment, we could
2071 	      try to check whether the (variable) niters is known to be
2072 	      VF * N + 1.  That's something of a niche case though.  */
2073 	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2074 	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2075 	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2076 		< (unsigned) exact_log2 (const_vf))
2077 	       /* In case of versioning, check if the maximum number of
2078 		  iterations is greater than th.  If they are identical,
2079 		  the epilogue is unnecessary.  */
2080 	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2081 		   || ((unsigned HOST_WIDE_INT) max_niter
2082 		       > (th / const_vf) * const_vf))))
2083     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2084 
2085   /* If an epilogue loop is required make sure we can create one.  */
2086   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2087       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2088     {
2089       if (dump_enabled_p ())
2090         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2091       if (!vect_can_advance_ivs_p (loop_vinfo)
2092 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2093 					   single_exit (LOOP_VINFO_LOOP
2094 							 (loop_vinfo))))
2095         {
2096 	  ok = opt_result::failure_at (vect_location,
2097 				       "not vectorized: can't create required "
2098 				       "epilog loop\n");
2099           goto again;
2100         }
2101     }
2102 
2103   /* During peeling, we need to check if number of loop iterations is
2104      enough for both peeled prolog loop and vector loop.  This check
2105      can be merged along with threshold check of loop versioning, so
2106      increase threshold for this case if necessary.  */
2107   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2108     {
2109       poly_uint64 niters_th = 0;
2110 
2111       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2112 	{
2113 	  /* Niters for peeled prolog loop.  */
2114 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2115 	    {
2116 	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2117 	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2118 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2119 	    }
2120 	  else
2121 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2122 	}
2123 
2124       /* Niters for at least one iteration of vectorized loop.  */
2125       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2126 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2127       /* One additional iteration because of peeling for gap.  */
2128       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2129 	niters_th += 1;
2130       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
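      /* For example (illustrative), a known prolog peel of 3 iterations
	 plus a vectorization factor of 8 plus one extra iteration peeled
	 for gaps gives a versioning threshold of 3 + 8 + 1 = 12.  */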
2131     }
2132 
2133   gcc_assert (known_eq (vectorization_factor,
2134 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2135 
2136   /* Ok to vectorize!  */
2137   return opt_result::success ();
2138 
2139 again:
2140   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2141   gcc_assert (!ok);
2142 
2143   /* Try again with SLP forced off but if we didn't do any SLP there is
2144      no point in re-trying.  */
2145   if (!slp)
2146     return ok;
2147 
2148   /* If there are reduction chains re-trying will fail anyway.  */
2149   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2150     return ok;
2151 
2152   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2153      via interleaving or lane instructions.  */
2154   slp_instance instance;
2155   slp_tree node;
2156   unsigned i, j;
2157   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2158     {
2159       stmt_vec_info vinfo;
2160       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2161       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2162 	continue;
2163       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2164       unsigned int size = DR_GROUP_SIZE (vinfo);
2165       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2166       if (! vect_store_lanes_supported (vectype, size, false)
2167 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2168 	 && ! vect_grouped_store_supported (vectype, size))
2169 	return opt_result::failure_at (vinfo->stmt,
2170 				       "unsupported grouped store\n");
2171       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2172 	{
2173 	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2174 	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2175 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2176 	  size = DR_GROUP_SIZE (vinfo);
2177 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2178 	  if (! vect_load_lanes_supported (vectype, size, false)
2179 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2180 						size))
2181 	    return opt_result::failure_at (vinfo->stmt,
2182 					   "unsupported grouped load\n");
2183 	}
2184     }
2185 
2186   if (dump_enabled_p ())
2187     dump_printf_loc (MSG_NOTE, vect_location,
2188 		     "re-trying with SLP disabled\n");
2189 
2190   /* Roll back state appropriately.  No SLP this time.  */
2191   slp = false;
2192   /* Restore vectorization factor as it were without SLP.  */
2193   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2194   /* Free the SLP instances.  */
2195   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2196     vect_free_slp_instance (instance, false);
2197   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2198   /* Reset SLP type to loop_vect on all stmts.  */
2199   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2200     {
2201       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2202       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2203 	   !gsi_end_p (si); gsi_next (&si))
2204 	{
2205 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2206 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2207 	}
2208       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2209 	   !gsi_end_p (si); gsi_next (&si))
2210 	{
2211 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2212 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2213 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2214 	    {
2215 	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2216 	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2217 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2218 	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2219 		   !gsi_end_p (pi); gsi_next (&pi))
2220 		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2221 		  = loop_vect;
2222 	    }
2223 	}
2224     }
2225   /* Free optimized alias test DDRS.  */
2226   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2227   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2228   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2229   /* Reset target cost data.  */
2230   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2231   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2232     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2233   /* Reset accumulated rgroup information.  */
2234   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2235   /* Reset assorted flags.  */
2236   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2237   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2238   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2239   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2240   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2241 
2242   goto start_over;
2243 }
2244 
2245 /* Function vect_analyze_loop.
2246 
2247    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2248    for it.  The different analyses will record information in the
2249    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL, LOOP is the
2250    epilogue of the loop described by ORIG_LOOP_VINFO and is to be
	vectorized as well.  */
2251 opt_loop_vec_info
2252 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2253 		   vec_info_shared *shared)
2254 {
2255   auto_vector_sizes vector_sizes;
2256 
2257   /* Autodetect first vector size we try.  */
2258   current_vector_size = 0;
2259   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2260   unsigned int next_size = 0;
2261 
2262   DUMP_VECT_SCOPE ("analyze_loop_nest");
2263 
2264   if (loop_outer (loop)
2265       && loop_vec_info_for_loop (loop_outer (loop))
2266       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2267     return opt_loop_vec_info::failure_at (vect_location,
2268 					  "outer-loop already vectorized.\n");
2269 
2270   if (!find_loop_nest (loop, &shared->loop_nest))
2271     return opt_loop_vec_info::failure_at
2272       (vect_location,
2273        "not vectorized: loop nest containing two or more consecutive inner"
2274        " loops cannot be vectorized\n");
2275 
2276   unsigned n_stmts = 0;
2277   poly_uint64 autodetected_vector_size = 0;
2278   while (1)
2279     {
2280       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2281       opt_loop_vec_info loop_vinfo
2282 	= vect_analyze_loop_form (loop, shared);
2283       if (!loop_vinfo)
2284 	{
2285 	  if (dump_enabled_p ())
2286 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 			     "bad loop form.\n");
2288 	  return loop_vinfo;
2289 	}
2290 
2291       bool fatal = false;
2292 
2293       if (orig_loop_vinfo)
2294 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2295 
2296       opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2297       if (res)
2298 	{
2299 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2300 
2301 	  return loop_vinfo;
2302 	}
2303 
2304       delete loop_vinfo;
2305 
2306       if (next_size == 0)
2307 	autodetected_vector_size = current_vector_size;
2308 
2309       if (next_size < vector_sizes.length ()
2310 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
2311 	next_size += 1;
2312 
2313       if (fatal
2314 	  || next_size == vector_sizes.length ()
2315 	  || known_eq (current_vector_size, 0U))
2316 	return opt_loop_vec_info::propagate_failure (res);
2317 
2318       /* Try the next biggest vector size.  */
2319       current_vector_size = vector_sizes[next_size++];
2320       if (dump_enabled_p ())
2321 	{
2322 	  dump_printf_loc (MSG_NOTE, vect_location,
2323 			   "***** Re-trying analysis with "
2324 			   "vector size ");
2325 	  dump_dec (MSG_NOTE, current_vector_size);
2326 	  dump_printf (MSG_NOTE, "\n");
2327 	}
2328     }
2329 }
2330 
2331 /* Return true if there is an in-order reduction function for CODE, storing
2332    it in *REDUC_FN if so.  */
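/* For example (illustrative), without -ffast-math the accumulation

     float s = 0.0f;
     for (i = 0; i < n; i++)
       s += a[i];

   must preserve the original left-to-right evaluation order, so if it is
   vectorized at all it uses the in-order IFN_FOLD_LEFT_PLUS rather than
   a reassociating reduction.  */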
2333 
2334 static bool
2335 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2336 {
2337   switch (code)
2338     {
2339     case PLUS_EXPR:
2340       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2341       return true;
2342 
2343     default:
2344       return false;
2345     }
2346 }
2347 
2348 /* Function reduction_fn_for_scalar_code
2349 
2350    Input:
2351    CODE - tree_code of a reduction operation.
2352 
2353    Output:
2354    REDUC_FN - the corresponding internal function to be used to reduce the
2355       vector of partial results into a single scalar result, or IFN_LAST
2356       if the operation is a supported reduction operation, but does not have
2357       such an internal function.
2358 
2359    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
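/* For example (illustrative), for CODE == MAX_EXPR *REDUC_FN becomes
   IFN_REDUC_MAX, which collapses the vector of partial maxima into a
   single scalar in the loop epilogue; for CODE == MULT_EXPR the
   reduction is supported but has no such internal function, so *REDUC_FN
   is IFN_LAST and the epilogue typically falls back to a sequence of
   vector extracts or shifts.  */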
2360 
2361 static bool
2362 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2363 {
2364   switch (code)
2365     {
2366       case MAX_EXPR:
2367         *reduc_fn = IFN_REDUC_MAX;
2368         return true;
2369 
2370       case MIN_EXPR:
2371         *reduc_fn = IFN_REDUC_MIN;
2372         return true;
2373 
2374       case PLUS_EXPR:
2375         *reduc_fn = IFN_REDUC_PLUS;
2376         return true;
2377 
2378       case BIT_AND_EXPR:
2379 	*reduc_fn = IFN_REDUC_AND;
2380 	return true;
2381 
2382       case BIT_IOR_EXPR:
2383 	*reduc_fn = IFN_REDUC_IOR;
2384 	return true;
2385 
2386       case BIT_XOR_EXPR:
2387 	*reduc_fn = IFN_REDUC_XOR;
2388 	return true;
2389 
2390       case MULT_EXPR:
2391       case MINUS_EXPR:
2392         *reduc_fn = IFN_LAST;
2393         return true;
2394 
2395       default:
2396        return false;
2397     }
2398 }
2399 
2400 /* If there is a neutral value X such that SLP reduction NODE would not
2401    be affected by the introduction of additional X elements, return that X,
2402    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2403    is true if the SLP statements perform a single reduction, false if each
2404    statement performs an independent reduction.  */
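/* For example (illustrative), an SLP PLUS_EXPR reduction can be padded
   with the neutral value 0: summing { a, b, c, 0 } gives the same result
   as summing { a, b, c }, which is what allows a group of three scalars
   to fill a four-element vector.  */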
2405 
2406 static tree
2407 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2408 			      bool reduc_chain)
2409 {
2410   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2411   stmt_vec_info stmt_vinfo = stmts[0];
2412   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2413   tree scalar_type = TREE_TYPE (vector_type);
2414   struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2415   gcc_assert (loop);
2416 
2417   switch (code)
2418     {
2419     case WIDEN_SUM_EXPR:
2420     case DOT_PROD_EXPR:
2421     case SAD_EXPR:
2422     case PLUS_EXPR:
2423     case MINUS_EXPR:
2424     case BIT_IOR_EXPR:
2425     case BIT_XOR_EXPR:
2426       return build_zero_cst (scalar_type);
2427 
2428     case MULT_EXPR:
2429       return build_one_cst (scalar_type);
2430 
2431     case BIT_AND_EXPR:
2432       return build_all_ones_cst (scalar_type);
2433 
2434     case MAX_EXPR:
2435     case MIN_EXPR:
2436       /* For MIN/MAX the initial values are neutral.  A reduction chain
2437 	 has only a single initial value, so that value is neutral for
2438 	 all statements.  */
2439       if (reduc_chain)
2440 	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2441 				      loop_preheader_edge (loop));
2442       return NULL_TREE;
2443 
2444     default:
2445       return NULL_TREE;
2446     }
2447 }
2448 
2449 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2450    STMT is printed with a message MSG. */
2451 
2452 static void
2453 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2454 {
2455   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2456 }
2457 
2458 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2459    operation.  Return true if the results of DEF_STMT_INFO are something
2460    that can be accumulated by such a reduction.  */
2461 
2462 static bool
2463 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2464 {
2465   return (is_gimple_assign (def_stmt_info->stmt)
2466 	  || is_gimple_call (def_stmt_info->stmt)
2467 	  || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2468 	  || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2469 	      && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2470 	      && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2471 }
2472 
2473 /* Detect SLP reduction of the form:
2474 
2475    #a1 = phi <a5, a0>
2476    a2 = operation (a1)
2477    a3 = operation (a2)
2478    a4 = operation (a3)
2479    a5 = operation (a4)
2480 
2481    #a = phi <a5>
2482 
2483    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2484    FIRST_STMT is the first reduction stmt in the chain
2485    (a2 = operation (a1)).
2486 
2487    Return TRUE if a reduction chain was detected.  */
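/* For example (illustrative), such a chain typically comes from a
   manually unrolled accumulation:

     for (i = 0; i < n; i += 4)
       {
	 s += a[i];
	 s += a[i + 1];
	 s += a[i + 2];
	 s += a[i + 3];
       }

   where each statement feeds the next through the single accumulator S.  */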
2488 
2489 static bool
2490 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2491 		       gimple *first_stmt)
2492 {
2493   struct loop *loop = (gimple_bb (phi))->loop_father;
2494   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2495   enum tree_code code;
2496   gimple *loop_use_stmt = NULL;
2497   stmt_vec_info use_stmt_info;
2498   tree lhs;
2499   imm_use_iterator imm_iter;
2500   use_operand_p use_p;
2501   int nloop_uses, size = 0, n_out_of_loop_uses;
2502   bool found = false;
2503 
2504   if (loop != vect_loop)
2505     return false;
2506 
2507   auto_vec<stmt_vec_info, 8> reduc_chain;
2508   lhs = PHI_RESULT (phi);
2509   code = gimple_assign_rhs_code (first_stmt);
2510   while (1)
2511     {
2512       nloop_uses = 0;
2513       n_out_of_loop_uses = 0;
2514       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2515         {
2516 	  gimple *use_stmt = USE_STMT (use_p);
2517 	  if (is_gimple_debug (use_stmt))
2518 	    continue;
2519 
2520           /* Check if we got back to the reduction phi.  */
2521 	  if (use_stmt == phi)
2522             {
2523 	      loop_use_stmt = use_stmt;
2524               found = true;
2525               break;
2526             }
2527 
2528           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2529             {
2530 	      loop_use_stmt = use_stmt;
2531 	      nloop_uses++;
2532             }
2533            else
2534              n_out_of_loop_uses++;
2535 
2536            /* There can be either a single use in the loop or two uses in
2537               phi nodes.  */
2538            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2539              return false;
2540         }
2541 
2542       if (found)
2543         break;
2544 
2545       /* We reached a statement with no loop uses.  */
2546       if (nloop_uses == 0)
2547 	return false;
2548 
2549       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2550       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2551         return false;
2552 
2553       if (!is_gimple_assign (loop_use_stmt)
2554 	  || code != gimple_assign_rhs_code (loop_use_stmt)
2555 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2556         return false;
2557 
2558       /* Insert USE_STMT into reduction chain.  */
2559       use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2560       reduc_chain.safe_push (use_stmt_info);
2561 
2562       lhs = gimple_assign_lhs (loop_use_stmt);
2563       size++;
2564    }
2565 
2566   if (!found || loop_use_stmt != phi || size < 2)
2567     return false;
2568 
2569   /* Swap the operands, if needed, to make the reduction operand be the second
2570      operand.  */
2571   lhs = PHI_RESULT (phi);
2572   for (unsigned i = 0; i < reduc_chain.length (); ++i)
2573     {
2574       gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2575       if (gimple_assign_rhs2 (next_stmt) == lhs)
2576 	{
2577 	  tree op = gimple_assign_rhs1 (next_stmt);
2578 	  stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2579 
2580 	  /* Check that the other def is either defined in the loop
2581 	     ("vect_internal_def"), or it's an induction (defined by a
2582 	     loop-header phi-node).  */
2583 	  if (def_stmt_info
2584 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2585 	      && vect_valid_reduction_input_p (def_stmt_info))
2586 	    {
2587 	      lhs = gimple_assign_lhs (next_stmt);
2588  	      continue;
2589 	    }
2590 
2591 	  return false;
2592 	}
2593       else
2594 	{
2595           tree op = gimple_assign_rhs2 (next_stmt);
2596 	  stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2597 
2598           /* Check that the other def is either defined in the loop
2599             ("vect_internal_def"), or it's an induction (defined by a
2600             loop-header phi-node).  */
2601 	  if (def_stmt_info
2602 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2603 	      && vect_valid_reduction_input_p (def_stmt_info))
2604   	    {
2605 	      if (dump_enabled_p ())
2606 		dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2607 				 next_stmt);
2608 
2609 	      swap_ssa_operands (next_stmt,
2610 	 		         gimple_assign_rhs1_ptr (next_stmt),
2611                                  gimple_assign_rhs2_ptr (next_stmt));
2612 	      update_stmt (next_stmt);
2613 
2614 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2615 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2616 	    }
2617 	  else
2618 	    return false;
2619         }
2620 
2621       lhs = gimple_assign_lhs (next_stmt);
2622     }
2623 
2624   /* Build up the actual chain.  */
2625   for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2626     {
2627       REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2628       REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2629     }
2630   REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2631   REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2632 
2633   /* Save the chain for further analysis in SLP detection.  */
2634   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2635   REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2636 
2637   return true;
2638 }
2639 
2640 /* Return true if we need an in-order reduction for operation CODE
2641    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2642    overflow must wrap.  */
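/* For example (illustrative), a signed integer sum compiled with -ftrapv
   must not be reassociated, because a reordered partial sum could
   overflow (and trap) where the original evaluation order would not, so
   it needs an in-order reduction.  */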
2643 
2644 static bool
2645 needs_fold_left_reduction_p (tree type, tree_code code,
2646 			     bool need_wrapping_integral_overflow)
2647 {
2648   /* CHECKME: check for !flag_finite_math_only too?  */
2649   if (SCALAR_FLOAT_TYPE_P (type))
2650     switch (code)
2651       {
2652       case MIN_EXPR:
2653       case MAX_EXPR:
2654 	return false;
2655 
2656       default:
2657 	return !flag_associative_math;
2658       }
2659 
2660   if (INTEGRAL_TYPE_P (type))
2661     {
2662       if (!operation_no_trapping_overflow (type, code))
2663 	return true;
2664       if (need_wrapping_integral_overflow
2665 	  && !TYPE_OVERFLOW_WRAPS (type)
2666 	  && operation_can_overflow (code))
2667 	return true;
2668       return false;
2669     }
2670 
2671   if (SAT_FIXED_POINT_TYPE_P (type))
2672     return true;
2673 
2674   return false;
2675 }
2676 
2677 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2678    reduction operation CODE has a handled computation expression.  */
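/* For example (illustrative), for a PLUS_EXPR reduction the path

     s_1 = PHI <s_0, s_3>
     s_2 = s_1 + a[i];
     s_3 = s_2 - b[i];

   is accepted: the MINUS_EXPR only subtracts a loop value from the
   running total and never negates the accumulator itself.  */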
2679 
2680 bool
2681 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2682 		      tree loop_arg, enum tree_code code)
2683 {
2684   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2685   auto_bitmap visited;
2686   tree lookfor = PHI_RESULT (phi);
2687   ssa_op_iter curri;
2688   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2689   while (USE_FROM_PTR (curr) != loop_arg)
2690     curr = op_iter_next_use (&curri);
2691   curri.i = curri.numops;
2692   do
2693     {
2694       path.safe_push (std::make_pair (curri, curr));
2695       tree use = USE_FROM_PTR (curr);
2696       if (use == lookfor)
2697 	break;
2698       gimple *def = SSA_NAME_DEF_STMT (use);
2699       if (gimple_nop_p (def)
2700 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2701 	{
2702 pop:
2703 	  do
2704 	    {
2705 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2706 	      curri = x.first;
2707 	      curr = x.second;
2708 	      do
2709 		curr = op_iter_next_use (&curri);
2710 	      /* Skip already visited or non-SSA operands (from iterating
2711 	         over PHI args).  */
2712 	      while (curr != NULL_USE_OPERAND_P
2713 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2714 			 || ! bitmap_set_bit (visited,
2715 					      SSA_NAME_VERSION
2716 					        (USE_FROM_PTR (curr)))));
2717 	    }
2718 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2719 	  if (curr == NULL_USE_OPERAND_P)
2720 	    break;
2721 	}
2722       else
2723 	{
2724 	  if (gimple_code (def) == GIMPLE_PHI)
2725 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2726 	  else
2727 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2728 	  while (curr != NULL_USE_OPERAND_P
2729 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2730 		     || ! bitmap_set_bit (visited,
2731 					  SSA_NAME_VERSION
2732 					    (USE_FROM_PTR (curr)))))
2733 	    curr = op_iter_next_use (&curri);
2734 	  if (curr == NULL_USE_OPERAND_P)
2735 	    goto pop;
2736 	}
2737     }
2738   while (1);
2739   if (dump_file && (dump_flags & TDF_DETAILS))
2740     {
2741       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2742       unsigned i;
2743       std::pair<ssa_op_iter, use_operand_p> *x;
2744       FOR_EACH_VEC_ELT (path, i, x)
2745 	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2746       dump_printf (MSG_NOTE, "\n");
2747     }
2748 
2749   /* Check whether the reduction path detected is valid.  */
2750   bool fail = path.length () == 0;
2751   bool neg = false;
2752   for (unsigned i = 1; i < path.length (); ++i)
2753     {
2754       gimple *use_stmt = USE_STMT (path[i].second);
2755       tree op = USE_FROM_PTR (path[i].second);
2756       if (! has_single_use (op)
2757 	  || ! is_gimple_assign (use_stmt))
2758 	{
2759 	  fail = true;
2760 	  break;
2761 	}
2762       if (gimple_assign_rhs_code (use_stmt) != code)
2763 	{
2764 	  if (code == PLUS_EXPR
2765 	      && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2766 	    {
2767 	      /* Track whether we negate the reduction value each iteration.  */
2768 	      if (gimple_assign_rhs2 (use_stmt) == op)
2769 		neg = ! neg;
2770 	    }
2771 	  else
2772 	    {
2773 	      fail = true;
2774 	      break;
2775 	    }
2776 	}
2777     }
2778   return ! fail && ! neg;
2779 }
2780 
2781 
2782 /* Function vect_is_simple_reduction
2783 
2784    (1) Detect a cross-iteration def-use cycle that represents a simple
2785    reduction computation.  We look for the following pattern:
2786 
2787    loop_header:
2788      a1 = phi < a0, a2 >
2789      a3 = ...
2790      a2 = operation (a3, a1)
2791 
2792    or
2793 
2794    a3 = ...
2795    loop_header:
2796      a1 = phi < a0, a2 >
2797      a2 = operation (a3, a1)
2798 
2799    such that:
2800    1. operation is commutative and associative and it is safe to
2801       change the order of the computation
2802    2. no uses for a2 in the loop (a2 is used out of the loop)
2803    3. no uses of a1 in the loop besides the reduction operation
2804    4. no uses of a1 outside the loop.
2805 
2806    Conditions 1,4 are tested here.
2807    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2808 
2809    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2810    nested cycles.
2811 
2812    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2813    reductions:
2814 
2815      a1 = phi < a0, a2 >
2816      inner loop (def of a3)
2817      a2 = phi < a3 >
2818 
2819    (4) Detect condition expressions, i.e.:
2820      for (int i = 0; i < N; i++)
2821        if (a[i] < val)
2822 	ret_val = a[i];
2823 
2824 */
2825 
2826 static stmt_vec_info
2827 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2828 			  bool *double_reduc,
2829 			  bool need_wrapping_integral_overflow,
2830 			  enum vect_reduction_type *v_reduc_type)
2831 {
2832   gphi *phi = as_a <gphi *> (phi_info->stmt);
2833   struct loop *loop = (gimple_bb (phi))->loop_father;
2834   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2835   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2836   gimple *phi_use_stmt = NULL;
2837   enum tree_code orig_code, code;
2838   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2839   tree type;
2840   tree name;
2841   imm_use_iterator imm_iter;
2842   use_operand_p use_p;
2843   bool phi_def;
2844 
2845   *double_reduc = false;
2846   *v_reduc_type = TREE_CODE_REDUCTION;
2847 
2848   tree phi_name = PHI_RESULT (phi);
2849   /* ???  If there are no uses of the PHI result the inner loop reduction
2850      won't be detected as possibly double-reduction by vectorizable_reduction
2851      because that tries to walk the PHI arg from the preheader edge which
2852      can be constant.  See PR60382.  */
2853   if (has_zero_uses (phi_name))
2854     return NULL;
2855   unsigned nphi_def_loop_uses = 0;
2856   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2857     {
2858       gimple *use_stmt = USE_STMT (use_p);
2859       if (is_gimple_debug (use_stmt))
2860 	continue;
2861 
2862       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2863         {
2864           if (dump_enabled_p ())
2865 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2866 			     "intermediate value used outside loop.\n");
2867 
2868           return NULL;
2869         }
2870 
2871       nphi_def_loop_uses++;
2872       phi_use_stmt = use_stmt;
2873     }
2874 
2875   edge latch_e = loop_latch_edge (loop);
2876   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2877   if (TREE_CODE (loop_arg) != SSA_NAME)
2878     {
2879       if (dump_enabled_p ())
2880 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881 			 "reduction: not ssa_name: %T\n", loop_arg);
2882       return NULL;
2883     }
2884 
2885   stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2886   if (!def_stmt_info
2887       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2888     return NULL;
2889 
2890   if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2891     {
2892       name = gimple_assign_lhs (def_stmt);
2893       phi_def = false;
2894     }
2895   else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2896     {
2897       name = PHI_RESULT (def_stmt);
2898       phi_def = true;
2899     }
2900   else
2901     {
2902       if (dump_enabled_p ())
2903 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904 			 "reduction: unhandled reduction operation: %G",
2905 			 def_stmt_info->stmt);
2906       return NULL;
2907     }
2908 
2909   unsigned nlatch_def_loop_uses = 0;
2910   auto_vec<gphi *, 3> lcphis;
2911   bool inner_loop_of_double_reduc = false;
2912   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2913     {
2914       gimple *use_stmt = USE_STMT (use_p);
2915       if (is_gimple_debug (use_stmt))
2916 	continue;
2917       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2918 	nlatch_def_loop_uses++;
2919       else
2920 	{
2921 	  /* We can have more than one loop-closed PHI.  */
2922 	  lcphis.safe_push (as_a <gphi *> (use_stmt));
2923 	  if (nested_in_vect_loop
2924 	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2925 		  == vect_double_reduction_def))
2926 	    inner_loop_of_double_reduc = true;
2927 	}
2928     }
2929 
2930   /* If this isn't a nested cycle or if the nested cycle reduction value
2931      is used outside of the inner loop, we cannot handle uses of the reduction
2932      value.  */
2933   if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2934       && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2935     {
2936       if (dump_enabled_p ())
2937 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2938 			 "reduction used in loop.\n");
2939       return NULL;
2940     }
2941 
2942   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2943      defined in the inner loop.  */
2944   if (phi_def)
2945     {
2946       gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2947       op1 = PHI_ARG_DEF (def_stmt, 0);
2948 
2949       if (gimple_phi_num_args (def_stmt) != 1
2950           || TREE_CODE (op1) != SSA_NAME)
2951         {
2952           if (dump_enabled_p ())
2953 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954 			     "unsupported phi node definition.\n");
2955 
2956           return NULL;
2957         }
2958 
2959       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2960       if (gimple_bb (def1)
2961 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2962           && loop->inner
2963           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2964           && is_gimple_assign (def1)
2965 	  && is_a <gphi *> (phi_use_stmt)
2966 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2967         {
2968           if (dump_enabled_p ())
2969             report_vect_op (MSG_NOTE, def_stmt,
2970 			    "detected double reduction: ");
2971 
2972           *double_reduc = true;
2973 	  return def_stmt_info;
2974         }
2975 
2976       return NULL;
2977     }
2978 
2979   /* If we are vectorizing an inner (nested) reduction, it is executed
2980      in its original order, so reassociation only needs to be checked
2981      when the value takes part in a double reduction.  */
2982   bool check_reduction = true;
2983   if (flow_loop_nested_p (vect_loop, loop))
2984     {
2985       gphi *lcphi;
2986       unsigned i;
2987       check_reduction = false;
2988       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2989 	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2990 	  {
2991 	    gimple *use_stmt = USE_STMT (use_p);
2992 	    if (is_gimple_debug (use_stmt))
2993 	      continue;
2994 	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2995 	      check_reduction = true;
2996 	  }
2997     }
2998 
2999   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3000   code = orig_code = gimple_assign_rhs_code (def_stmt);
3001 
3002   if (nested_in_vect_loop && !check_reduction)
3003     {
3004       /* FIXME: Even for non-reductions code generation is funneled
3005 	 through vectorizable_reduction for the stmt defining the
3006 	 PHI latch value.  So we have to artificially restrict ourselves
3007 	 to the supported operations.  */
3008       switch (get_gimple_rhs_class (code))
3009 	{
3010 	case GIMPLE_BINARY_RHS:
3011 	case GIMPLE_TERNARY_RHS:
3012 	  break;
3013 	default:
3014 	  /* Not supported by vectorizable_reduction.  */
3015 	  if (dump_enabled_p ())
3016 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3017 			    "nested cycle: not handled operation: ");
3018 	  return NULL;
3019 	}
3020       if (dump_enabled_p ())
3021 	report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3022       return def_stmt_info;
3023     }
3024 
3025   /* We can handle "res -= x[i]", which is non-associative by
3026      simply rewriting this into "res += -x[i]".  Avoid changing
3027      gimple instruction for the first simple tests and only do this
3028      if we're allowed to change code at all.  */
3029   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3030     code = PLUS_EXPR;
3031 
3032   if (code == COND_EXPR)
3033     {
3034       if (! nested_in_vect_loop)
3035 	*v_reduc_type = COND_REDUCTION;
3036 
3037       op3 = gimple_assign_rhs1 (def_stmt);
3038       if (COMPARISON_CLASS_P (op3))
3039         {
3040           op4 = TREE_OPERAND (op3, 1);
3041           op3 = TREE_OPERAND (op3, 0);
3042         }
3043       if (op3 == phi_name || op4 == phi_name)
3044 	{
3045 	  if (dump_enabled_p ())
3046 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3047 			    "reduction: condition depends on previous"
3048 			    " iteration: ");
3049 	  return NULL;
3050 	}
3051 
3052       op1 = gimple_assign_rhs2 (def_stmt);
3053       op2 = gimple_assign_rhs3 (def_stmt);
3054     }
3055   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3056     {
3057       if (dump_enabled_p ())
3058 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3059 			"reduction: not commutative/associative: ");
3060       return NULL;
3061     }
3062   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3063     {
3064       op1 = gimple_assign_rhs1 (def_stmt);
3065       op2 = gimple_assign_rhs2 (def_stmt);
3066     }
3067   else
3068     {
3069       if (dump_enabled_p ())
3070 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3071 			"reduction: not handled operation: ");
3072       return NULL;
3073     }
3074 
3075   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3076     {
3077       if (dump_enabled_p ())
3078 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3079 			"reduction: both uses not ssa_names: ");
3080 
3081       return NULL;
3082     }
3083 
3084   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3085   if ((TREE_CODE (op1) == SSA_NAME
3086        && !types_compatible_p (type,TREE_TYPE (op1)))
3087       || (TREE_CODE (op2) == SSA_NAME
3088           && !types_compatible_p (type, TREE_TYPE (op2)))
3089       || (op3 && TREE_CODE (op3) == SSA_NAME
3090           && !types_compatible_p (type, TREE_TYPE (op3)))
3091       || (op4 && TREE_CODE (op4) == SSA_NAME
3092           && !types_compatible_p (type, TREE_TYPE (op4))))
3093     {
3094       if (dump_enabled_p ())
3095         {
3096           dump_printf_loc (MSG_NOTE, vect_location,
3097 			   "reduction: multiple types: operation type: "
3098 			   "%T, operands types: %T,%T",
3099 			   type,  TREE_TYPE (op1), TREE_TYPE (op2));
3100           if (op3)
3101 	    dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3102 
3103           if (op4)
3104 	    dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3105           dump_printf (MSG_NOTE, "\n");
3106         }
3107 
3108       return NULL;
3109     }
3110 
3111   /* Check whether it's ok to change the order of the computation.
3112      Generally, when vectorizing a reduction we change the order of the
3113      computation.  This may change the behavior of the program in some
3114      cases, so we need to check that this is ok.  One exception is when
3115      vectorizing an outer-loop: the inner-loop is executed sequentially,
3116      and therefore vectorizing reductions in the inner-loop during
3117      outer-loop vectorization is safe.  */
3118   if (check_reduction
3119       && *v_reduc_type == TREE_CODE_REDUCTION
3120       && needs_fold_left_reduction_p (type, code,
3121 				      need_wrapping_integral_overflow))
3122     *v_reduc_type = FOLD_LEFT_REDUCTION;
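  /* For example (illustration only), for a floating-point sum

	for (i = 0; i < n; i++)
	  res += a[i];

     without reassociation (e.g. without -ffast-math) the additions must be
     performed in the original order, i.e. as the fold-left reduction
     ((res + a[0]) + a[1]) + ..., whereas when reassociation is allowed we
     can keep one partial sum per vector lane and combine the lanes after
     the loop.  Likewise, signed integer sums under -ftrapv must not be
     reassociated.  */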
3123 
3124   /* Reduction is safe. We're dealing with one of the following:
3125      1) integer arithmetic and no trapv
3126      2) floating point arithmetic, and special flags permit this optimization
3127      3) nested cycle (i.e., outer loop vectorization).  */
3128   stmt_vec_info def1_info = loop_info->lookup_def (op1);
3129   stmt_vec_info def2_info = loop_info->lookup_def (op2);
3130   if (code != COND_EXPR && !def1_info && !def2_info)
3131     {
3132       if (dump_enabled_p ())
3133 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3134       return NULL;
3135     }
3136 
3137   /* Check that one def is the reduction def, defined by PHI,
3138      the other def is either defined in the loop ("vect_internal_def"),
3139      or it's an induction (defined by a loop-header phi-node).  */
3140 
3141   if (def2_info
3142       && def2_info->stmt == phi
3143       && (code == COND_EXPR
3144 	  || !def1_info
3145 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3146 	  || vect_valid_reduction_input_p (def1_info)))
3147     {
3148       if (dump_enabled_p ())
3149 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3150       return def_stmt_info;
3151     }
3152 
3153   if (def1_info
3154       && def1_info->stmt == phi
3155       && (code == COND_EXPR
3156 	  || !def2_info
3157 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3158 	  || vect_valid_reduction_input_p (def2_info)))
3159     {
3160       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3161 	{
3162 	  /* Check if we can swap operands (just for simplicity - so that
3163 	     the rest of the code can assume that the reduction variable
3164 	     is always the last (second) argument).  */
3165 	  if (code == COND_EXPR)
3166 	    {
3167 	      /* Swap cond_expr by inverting the condition.  */
3168 	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
3169 	      enum tree_code invert_code = ERROR_MARK;
3170 	      enum tree_code cond_code = TREE_CODE (cond_expr);
3171 
3172 	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3173 		{
3174 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3175 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
3176 		}
3177 	      if (invert_code != ERROR_MARK)
3178 		{
3179 		  TREE_SET_CODE (cond_expr, invert_code);
3180 		  swap_ssa_operands (def_stmt,
3181 				     gimple_assign_rhs2_ptr (def_stmt),
3182 				     gimple_assign_rhs3_ptr (def_stmt));
3183 		}
3184 	      else
3185 		{
3186 		  if (dump_enabled_p ())
3187 		    report_vect_op (MSG_NOTE, def_stmt,
3188 				    "detected reduction: cannot swap operands "
3189 				    "for cond_expr");
3190 		  return NULL;
3191 		}
3192 	    }
3193 	  else
3194 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3195 			       gimple_assign_rhs2_ptr (def_stmt));
3196 
3197 	  if (dump_enabled_p ())
3198 	    report_vect_op (MSG_NOTE, def_stmt,
3199 			    "detected reduction: need to swap operands: ");
3200 
3201 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3202 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3203         }
3204       else
3205         {
3206           if (dump_enabled_p ())
3207             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3208         }
3209 
3210       return def_stmt_info;
3211     }
3212 
3213   /* Try to find SLP reduction chain.  */
3214   if (! nested_in_vect_loop
3215       && code != COND_EXPR
3216       && orig_code != MINUS_EXPR
3217       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3218     {
3219       if (dump_enabled_p ())
3220         report_vect_op (MSG_NOTE, def_stmt,
3221 			"reduction: detected reduction chain: ");
3222 
3223       return def_stmt_info;
3224     }
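  /* As an illustration (sketch only), a reduction chain detected above has
     the form

	for (i = 0; i < n; i++)
	  {
	    sum += a[2*i];
	    sum += a[2*i + 1];
	  }

     i.e. the reduction value flows through a chain of statements before it
     reaches the loop-latch argument of the reduction PHI.  */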
3225 
3226   /* Look for the expression computing loop_arg from loop PHI result.  */
3227   if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3228     return def_stmt_info;
3229 
3230   if (dump_enabled_p ())
3231     {
3232       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3233 		      "reduction: unknown pattern: ");
3234     }
3235 
3236   return NULL;
3237 }
3238 
3239 /* Wrapper around vect_is_simple_reduction, which will modify code
3240    in-place if it enables detection of more reductions.  Arguments
3241    as there.  */
3242 
3243 stmt_vec_info
3244 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3245 			     bool *double_reduc,
3246 			     bool need_wrapping_integral_overflow)
3247 {
3248   enum vect_reduction_type v_reduc_type;
3249   stmt_vec_info def_info
3250     = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3251 				need_wrapping_integral_overflow,
3252 				&v_reduc_type);
3253   if (def_info)
3254     {
3255       STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3256       STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3257       STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3258       STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3259     }
3260   return def_info;
3261 }
3262 
3263 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3264 int
3265 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3266                              int *peel_iters_epilogue,
3267                              stmt_vector_for_cost *scalar_cost_vec,
3268 			     stmt_vector_for_cost *prologue_cost_vec,
3269 			     stmt_vector_for_cost *epilogue_cost_vec)
3270 {
3271   int retval = 0;
3272   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3273 
3274   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3275     {
3276       *peel_iters_epilogue = assumed_vf / 2;
3277       if (dump_enabled_p ())
3278         dump_printf_loc (MSG_NOTE, vect_location,
3279 			 "cost model: epilogue peel iters set to vf/2 "
3280 			 "because loop iterations are unknown.\n");
3281 
3282       /* If peeled iterations are known but the number of scalar loop
3283          iterations is unknown, count a taken branch per peeled loop.  */
3284       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3285 				 NULL, 0, vect_prologue);
3286       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3287 				 NULL, 0, vect_epilogue);
3288     }
3289   else
3290     {
3291       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3292       peel_iters_prologue = niters < peel_iters_prologue ?
3293                             niters : peel_iters_prologue;
3294       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3295       /* If we need to peel for gaps, but no peeling is required, we have to
3296 	 peel VF iterations.  */
3297       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3298 	*peel_iters_epilogue = assumed_vf;
3299     }
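  /* For example (numbers for illustration only): with NITERS = 100,
     ASSUMED_VF = 8 and PEEL_ITERS_PROLOGUE = 3 the epilogue above peels
     (100 - 3) % 8 = 1 iteration; had peeling for gaps been required with a
     zero remainder, a full 8 iterations would have been peeled instead.  */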
3300 
3301   stmt_info_for_cost *si;
3302   int j;
3303   if (peel_iters_prologue)
3304     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3305       retval += record_stmt_cost (prologue_cost_vec,
3306 				  si->count * peel_iters_prologue,
3307 				  si->kind, si->stmt_info, si->misalign,
3308 				  vect_prologue);
3309   if (*peel_iters_epilogue)
3310     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3311       retval += record_stmt_cost (epilogue_cost_vec,
3312 				  si->count * *peel_iters_epilogue,
3313 				  si->kind, si->stmt_info, si->misalign,
3314 				  vect_epilogue);
3315 
3316   return retval;
3317 }
3318 
3319 /* Function vect_estimate_min_profitable_iters
3320 
3321    Return the number of iterations required for the vector version of the
3322    loop to be profitable relative to the cost of the scalar version of the
3323    loop.
3324 
3325    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3326    of iterations for vectorization.  A value of -1 means loop vectorization
3327    is not profitable.  This returned value may be used for a dynamic
3328    profitability check.
3329 
3330    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3331    for static check against estimated number of iterations.  */
3332 
3333 static void
3334 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3335 				    int *ret_min_profitable_niters,
3336 				    int *ret_min_profitable_estimate)
3337 {
3338   int min_profitable_iters;
3339   int min_profitable_estimate;
3340   int peel_iters_prologue;
3341   int peel_iters_epilogue;
3342   unsigned vec_inside_cost = 0;
3343   int vec_outside_cost = 0;
3344   unsigned vec_prologue_cost = 0;
3345   unsigned vec_epilogue_cost = 0;
3346   int scalar_single_iter_cost = 0;
3347   int scalar_outside_cost = 0;
3348   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3349   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3350   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3351 
3352   /* Cost model disabled.  */
3353   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3354     {
3355       if (dump_enabled_p ())
3356 	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3357       *ret_min_profitable_niters = 0;
3358       *ret_min_profitable_estimate = 0;
3359       return;
3360     }
3361 
3362   /* Requires loop versioning tests to handle misalignment.  */
3363   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3364     {
3365       /*  FIXME: Make cost depend on complexity of individual check.  */
3366       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3367       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3368 			    vect_prologue);
3369       if (dump_enabled_p ())
3370 	dump_printf (MSG_NOTE,
3371 		     "cost model: Adding cost of checks for loop "
3372 		     "versioning to treat misalignment.\n");
3373     }
3374 
3375   /* Requires loop versioning with alias checks.  */
3376   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3377     {
3378       /*  FIXME: Make cost depend on complexity of individual check.  */
3379       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3380       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3381 			    vect_prologue);
3382       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3383       if (len)
3384 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3385 	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3386 			      NULL, 0, vect_prologue);
3387       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3388       if (len)
3389 	{
3390 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3391 	  unsigned int nstmts = len * 2 - 1;
3392 	  /* +1 for each bias that needs adding.  */
3393 	  for (unsigned int i = 0; i < len; ++i)
3394 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3395 	      nstmts += 1;
3396 	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3397 				NULL, 0, vect_prologue);
3398 	}
3399       if (dump_enabled_p ())
3400 	dump_printf (MSG_NOTE,
3401 		     "cost model: Adding cost of checks for loop "
3402 		     "versioning aliasing.\n");
3403     }
3404 
3405   /* Requires loop versioning with niter checks.  */
3406   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3407     {
3408       /*  FIXME: Make cost depend on complexity of individual check.  */
3409       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3410 			    vect_prologue);
3411       if (dump_enabled_p ())
3412 	dump_printf (MSG_NOTE,
3413 		     "cost model: Adding cost of checks for loop "
3414 		     "versioning niters.\n");
3415     }
3416 
3417   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3418     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3419 			  vect_prologue);
3420 
3421   /* Count statements in scalar loop.  Using this as scalar cost for a single
3422      iteration for now.
3423 
3424      TODO: Add outer loop support.
3425 
3426      TODO: Consider assigning different costs to different scalar
3427      statements.  */
3428 
3429   scalar_single_iter_cost
3430     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3431 
3432   /* Add additional cost for the peeled instructions in prologue and epilogue
3433      loop.  (For fully-masked loops there will be no peeling.)
3434 
3435      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3436      at compile time, we assume it's vf/2 (the worst case would be vf-1).
3437 
3438      TODO: Build an expression that represents peel_iters for prologue and
3439      epilogue to be used in a run-time test.  */
3440 
3441   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3442     {
3443       peel_iters_prologue = 0;
3444       peel_iters_epilogue = 0;
3445 
3446       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3447 	{
3448 	  /* We need to peel exactly one iteration.  */
3449 	  peel_iters_epilogue += 1;
3450 	  stmt_info_for_cost *si;
3451 	  int j;
3452 	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3453 			    j, si)
3454 	    (void) add_stmt_cost (target_cost_data, si->count,
3455 				  si->kind, si->stmt_info, si->misalign,
3456 				  vect_epilogue);
3457 	}
3458     }
3459   else if (npeel < 0)
3460     {
3461       peel_iters_prologue = assumed_vf / 2;
3462       if (dump_enabled_p ())
3463 	dump_printf (MSG_NOTE, "cost model: "
3464 		     "prologue peel iters set to vf/2.\n");
3465 
3466       /* If the amount of peeling for alignment is unknown, the loop bound
3467          of the main loop becomes unknown.  */
3468       peel_iters_epilogue = assumed_vf / 2;
3469       if (dump_enabled_p ())
3470 	dump_printf (MSG_NOTE, "cost model: "
3471 		     "epilogue peel iters set to vf/2 because "
3472 		     "peeling for alignment is unknown.\n");
3473 
3474       /* If peeled iterations are unknown, count a taken branch and a not taken
3475          branch per peeled loop. Even if scalar loop iterations are known,
3476          vector iterations are not known since peeled prologue iterations are
3477          not known. Hence guards remain the same.  */
3478       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3479 			    NULL, 0, vect_prologue);
3480       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3481 			    NULL, 0, vect_prologue);
3482       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3483 			    NULL, 0, vect_epilogue);
3484       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3485 			    NULL, 0, vect_epilogue);
3486       stmt_info_for_cost *si;
3487       int j;
3488       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3489 	{
3490 	  (void) add_stmt_cost (target_cost_data,
3491 				si->count * peel_iters_prologue,
3492 				si->kind, si->stmt_info, si->misalign,
3493 				vect_prologue);
3494 	  (void) add_stmt_cost (target_cost_data,
3495 				si->count * peel_iters_epilogue,
3496 				si->kind, si->stmt_info, si->misalign,
3497 				vect_epilogue);
3498 	}
3499     }
3500   else
3501     {
3502       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3503       stmt_info_for_cost *si;
3504       int j;
3505       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3506 
3507       prologue_cost_vec.create (2);
3508       epilogue_cost_vec.create (2);
3509       peel_iters_prologue = npeel;
3510 
3511       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3512 					  &peel_iters_epilogue,
3513 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3514 					    (loop_vinfo),
3515 					  &prologue_cost_vec,
3516 					  &epilogue_cost_vec);
3517 
3518       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3519 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3520 			      si->misalign, vect_prologue);
3521 
3522       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3523 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3524 			      si->misalign, vect_epilogue);
3525 
3526       prologue_cost_vec.release ();
3527       epilogue_cost_vec.release ();
3528     }
3529 
3530   /* FORNOW: The scalar outside cost is incremented in one of the
3531      following ways:
3532 
3533      1. The vectorizer checks for alignment and aliasing and generates
3534      a condition that allows dynamic vectorization.  A cost model
3535      check is ANDED with the versioning condition.  Hence scalar code
3536      path now has the added cost of the versioning check.
3537 
3538        if (cost > th & versioning_check)
3539          jmp to vector code
3540 
3541      Hence run-time scalar is incremented by not-taken branch cost.
3542 
3543      2. The vectorizer then checks if a prologue is required.  If the
3544      cost model check was not done before during versioning, it has to
3545      be done before the prologue check.
3546 
3547        if (cost <= th)
3548          prologue = scalar_iters
3549        if (prologue == 0)
3550          jmp to vector code
3551        else
3552          execute prologue
3553        if (prologue == num_iters)
3554 	 go to exit
3555 
3556      Hence the run-time scalar cost is incremented by a taken branch,
3557      plus a not-taken branch, plus a taken branch cost.
3558 
3559      3. The vectorizer then checks if an epilogue is required.  If the
3560      cost model check was not done before during prologue check, it
3561      has to be done with the epilogue check.
3562 
3563        if (prologue == 0)
3564          jmp to vector code
3565        else
3566          execute prologue
3567        if (prologue == num_iters)
3568 	 go to exit
3569        vector code:
3570          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3571            jmp to epilogue
3572 
3573      Hence the run-time scalar cost should be incremented by 2 taken
3574      branches.
3575 
3576      TODO: The back end may reorder the BBS's differently and reverse
3577      conditions/branch directions.  Change the estimates below to
3578      something more reasonable.  */
3579 
3580   /* If the number of iterations is known and we do not do versioning, we can
3581      decide whether to vectorize at compile time.  Hence the scalar version
3582      does not carry cost model guard costs.  */
3583   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3584       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3585     {
3586       /* Cost model check occurs at versioning.  */
3587       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3588 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3589       else
3590 	{
3591 	  /* Cost model check occurs at prologue generation.  */
3592 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3593 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3594 	      + vect_get_stmt_cost (cond_branch_not_taken);
3595 	  /* Cost model check occurs at epilogue generation.  */
3596 	  else
3597 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3598 	}
3599     }
3600 
3601   /* Complete the target-specific cost calculations.  */
3602   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3603 	       &vec_inside_cost, &vec_epilogue_cost);
3604 
3605   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3606 
3607   if (dump_enabled_p ())
3608     {
3609       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3610       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3611                    vec_inside_cost);
3612       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3613                    vec_prologue_cost);
3614       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3615                    vec_epilogue_cost);
3616       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3617                    scalar_single_iter_cost);
3618       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3619                    scalar_outside_cost);
3620       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3621                    vec_outside_cost);
3622       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3623                    peel_iters_prologue);
3624       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3625                    peel_iters_epilogue);
3626     }
3627 
3628   /* Calculate number of iterations required to make the vector version
3629      profitable, relative to the loop bodies only.  The following condition
3630      must hold true:
3631      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3632      where
3633      SIC = scalar iteration cost, VIC = vector iteration cost,
3634      VOC = vector outside cost, VF = vectorization factor,
3635      NPEEL = prologue iterations + epilogue iterations,
3636      SOC = scalar outside cost for run time cost model check.  */
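  /* A worked example for the non-fully-masked case below (numbers are made
     up): with SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 0 and NPEEL = 0,
     the saving per vector iteration is SIC * VF - VIC = 10 and the
     threshold comes out as 9 scalar iterations: 4 * 8 = 32 does not beat
     6 * (8 / 4) + 20 = 32, while 4 * 9 = 36 does.  */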
3637 
3638   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3639 			  - vec_inside_cost);
3640   if (saving_per_viter <= 0)
3641     {
3642       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3643 	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3644 		    "vectorization did not happen for a simd loop");
3645 
3646       if (dump_enabled_p ())
3647         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3648 			 "cost model: the vector iteration cost = %d "
3649 			 "divided by the scalar iteration cost = %d "
3650 			 "is greater than or equal to the vectorization factor = %d"
3651                          ".\n",
3652 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3653       *ret_min_profitable_niters = -1;
3654       *ret_min_profitable_estimate = -1;
3655       return;
3656     }
3657 
3658   /* ??? The "if" arm is written to handle all cases; see below for what
3659      we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
3660   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3661     {
3662       /* Rewriting the condition above in terms of the number of
3663 	 vector iterations (vniters) rather than the number of
3664 	 scalar iterations (niters) gives:
3665 
3666 	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3667 
3668 	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3669 
3670 	 For integer N, X and Y when X > 0:
3671 
3672 	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
3673       int outside_overhead = (vec_outside_cost
3674 			      - scalar_single_iter_cost * peel_iters_prologue
3675 			      - scalar_single_iter_cost * peel_iters_epilogue
3676 			      - scalar_outside_cost);
3677       /* We're only interested in cases that require at least one
3678 	 vector iteration.  */
3679       int min_vec_niters = 1;
3680       if (outside_overhead > 0)
3681 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3682 
3683       if (dump_enabled_p ())
3684 	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
3685 		     min_vec_niters);
3686 
3687       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3688 	{
3689 	  /* Now that we know the minimum number of vector iterations,
3690 	     find the minimum niters for which the scalar cost is larger:
3691 
3692 	     SIC * niters > VIC * vniters + VOC - SOC
3693 
3694 	     We know that the minimum niters is no more than
3695 	     vniters * VF + NPEEL, but it might be (and often is) less
3696 	     than that if a partial vector iteration is cheaper than the
3697 	     equivalent scalar code.  */
3698 	  int threshold = (vec_inside_cost * min_vec_niters
3699 			   + vec_outside_cost
3700 			   - scalar_outside_cost);
3701 	  if (threshold <= 0)
3702 	    min_profitable_iters = 1;
3703 	  else
3704 	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3705 	}
3706       else
3707 	/* Convert the number of vector iterations into a number of
3708 	   scalar iterations.  */
3709 	min_profitable_iters = (min_vec_niters * assumed_vf
3710 				+ peel_iters_prologue
3711 				+ peel_iters_epilogue);
3712     }
3713   else
3714     {
3715       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3716 			      * assumed_vf
3717 			      - vec_inside_cost * peel_iters_prologue
3718 			      - vec_inside_cost * peel_iters_epilogue);
3719       if (min_profitable_iters <= 0)
3720         min_profitable_iters = 0;
3721       else
3722 	{
3723 	  min_profitable_iters /= saving_per_viter;
3724 
3725 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3726 	      <= (((int) vec_inside_cost * min_profitable_iters)
3727 		  + (((int) vec_outside_cost - scalar_outside_cost)
3728 		     * assumed_vf)))
3729 	    min_profitable_iters++;
3730 	}
3731     }
3732 
3733   if (dump_enabled_p ())
3734     dump_printf (MSG_NOTE,
3735 		 "  Calculated minimum iters for profitability: %d\n",
3736 		 min_profitable_iters);
3737 
3738   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3739       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3740     /* We want the vectorized loop to execute at least once.  */
3741     min_profitable_iters = assumed_vf + peel_iters_prologue;
3742 
3743   if (dump_enabled_p ())
3744     dump_printf_loc (MSG_NOTE, vect_location,
3745                      "  Runtime profitability threshold = %d\n",
3746                      min_profitable_iters);
3747 
3748   *ret_min_profitable_niters = min_profitable_iters;
3749 
3750   /* Calculate number of iterations required to make the vector version
3751      profitable, relative to the loop bodies only.
3752 
3753      The non-vectorized variant costs SIC * niters and it must win over the
3754      vector variant for the expected loop trip count.  The following condition must hold:
3755      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3756 
3757   if (vec_outside_cost <= 0)
3758     min_profitable_estimate = 0;
3759   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3760     {
3761       /* This is a repeat of the code above, but with + SOC rather
3762 	 than - SOC.  */
3763       int outside_overhead = (vec_outside_cost
3764 			      - scalar_single_iter_cost * peel_iters_prologue
3765 			      - scalar_single_iter_cost * peel_iters_epilogue
3766 			      + scalar_outside_cost);
3767       int min_vec_niters = 1;
3768       if (outside_overhead > 0)
3769 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3770 
3771       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3772 	{
3773 	  int threshold = (vec_inside_cost * min_vec_niters
3774 			   + vec_outside_cost
3775 			   + scalar_outside_cost);
3776 	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3777 	}
3778       else
3779 	min_profitable_estimate = (min_vec_niters * assumed_vf
3780 				   + peel_iters_prologue
3781 				   + peel_iters_epilogue);
3782     }
3783   else
3784     {
3785       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3786 				 * assumed_vf
3787 				 - vec_inside_cost * peel_iters_prologue
3788 				 - vec_inside_cost * peel_iters_epilogue)
3789 				 / ((scalar_single_iter_cost * assumed_vf)
3790 				   - vec_inside_cost);
3791     }
3792   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3793   if (dump_enabled_p ())
3794     dump_printf_loc (MSG_NOTE, vect_location,
3795 		     "  Static estimate profitability threshold = %d\n",
3796 		     min_profitable_estimate);
3797 
3798   *ret_min_profitable_estimate = min_profitable_estimate;
3799 }
3800 
3801 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3802    vector elements (not bits) for a vector with NELT elements.  */
3803 static void
3804 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3805 			      vec_perm_builder *sel)
3806 {
3807   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3808      by vec_perm_indices.  */
3809   sel->new_vector (nelt, 1, 3);
3810   for (unsigned int i = 0; i < 3; i++)
3811     sel->quick_push (i + offset);
3812 }
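/* For example (illustration only): with OFFSET = 2 the stepped encoding
   {2, 3, 4} built above expands, for an 8-element vector, to the selector
   {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a shift down by two elements with the last
   two elements taken from the second input of the permutation.  */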
3813 
3814 /* Checks whether the target supports whole-vector shifts for vectors of mode
3815    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3816    it supports vec_perm_const with masks for all necessary shift amounts.  */
3817 static bool
3818 have_whole_vector_shift (machine_mode mode)
3819 {
3820   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3821     return true;
3822 
3823   /* Variable-length vectors should be handled via the optab.  */
3824   unsigned int nelt;
3825   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3826     return false;
3827 
3828   vec_perm_builder sel;
3829   vec_perm_indices indices;
3830   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3831     {
3832       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3833       indices.new_vector (sel, 2, nelt);
3834       if (!can_vec_perm_const_p (mode, indices, false))
3835 	return false;
3836     }
3837   return true;
3838 }
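/* For example (illustration only): for an 8-element vector mode the loop
   above checks the element shifts by 4, 2 and 1 that a log2-style final
   reduction of a vector of partial results would need.  */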
3839 
3840 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3841    functions. Design better to avoid maintenance issues.  */
3842 
3843 /* Function vect_model_reduction_cost.
3844 
3845    Models cost for a reduction operation, including the vector ops
3846    generated within the strip-mine loop, the initial definition before
3847    the loop, and the epilogue code that must be generated.  */
3848 
3849 static void
3850 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3851 			   int ncopies, stmt_vector_for_cost *cost_vec)
3852 {
3853   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3854   enum tree_code code;
3855   optab optab;
3856   tree vectype;
3857   machine_mode mode;
3858   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3859   struct loop *loop = NULL;
3860 
3861   if (loop_vinfo)
3862     loop = LOOP_VINFO_LOOP (loop_vinfo);
3863 
3864   /* Condition reductions generate two reductions in the loop.  */
3865   vect_reduction_type reduction_type
3866     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3867   if (reduction_type == COND_REDUCTION)
3868     ncopies *= 2;
3869 
3870   vectype = STMT_VINFO_VECTYPE (stmt_info);
3871   mode = TYPE_MODE (vectype);
3872   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3873 
3874   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3875 
3876   if (reduction_type == EXTRACT_LAST_REDUCTION
3877       || reduction_type == FOLD_LEFT_REDUCTION)
3878     {
3879       /* No extra instructions needed in the prologue.  */
3880       prologue_cost = 0;
3881 
3882       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3883 	/* Count one reduction-like operation per vector.  */
3884 	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3885 					stmt_info, 0, vect_body);
3886       else
3887 	{
3888 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3889 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3890 	  inside_cost = record_stmt_cost (cost_vec, nelements,
3891 					  vec_to_scalar, stmt_info, 0,
3892 					  vect_body);
3893 	  inside_cost += record_stmt_cost (cost_vec, nelements,
3894 					   scalar_stmt, stmt_info, 0,
3895 					   vect_body);
3896 	}
3897     }
3898   else
3899     {
3900       /* Add in cost for initial definition.
3901 	 For cond reduction we have four vectors: initial index, step,
3902 	 initial result of the data reduction, initial value of the index
3903 	 reduction.  */
3904       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3905       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3906 					 scalar_to_vec, stmt_info, 0,
3907 					 vect_prologue);
3908 
3909       /* Cost of reduction op inside loop.  */
3910       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3911 				      stmt_info, 0, vect_body);
3912     }
3913 
3914   /* Determine cost of epilogue code.
3915 
3916      We have a reduction operator that will reduce the vector in one statement.
3917      Also requires scalar extract.  */
3918 
3919   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3920     {
3921       if (reduc_fn != IFN_LAST)
3922 	{
3923 	  if (reduction_type == COND_REDUCTION)
3924 	    {
3925 	      /* An EQ stmt and a COND_EXPR stmt.  */
3926 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
3927 						 vector_stmt, stmt_info, 0,
3928 						 vect_epilogue);
3929 	      /* Reduction of the max index and a reduction of the found
3930 		 values.  */
3931 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
3932 						 vec_to_scalar, stmt_info, 0,
3933 						 vect_epilogue);
3934 	      /* A broadcast of the max value.  */
3935 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
3936 						 scalar_to_vec, stmt_info, 0,
3937 						 vect_epilogue);
3938 	    }
3939 	  else
3940 	    {
3941 	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3942 						 stmt_info, 0, vect_epilogue);
3943 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
3944 						 vec_to_scalar, stmt_info, 0,
3945 						 vect_epilogue);
3946 	    }
3947 	}
3948       else if (reduction_type == COND_REDUCTION)
3949 	{
3950 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3951 	  /* Extraction of scalar elements.  */
3952 	  epilogue_cost += record_stmt_cost (cost_vec,
3953 					     2 * estimated_nunits,
3954 					     vec_to_scalar, stmt_info, 0,
3955 					     vect_epilogue);
3956 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3957 	  epilogue_cost += record_stmt_cost (cost_vec,
3958 					     2 * estimated_nunits - 3,
3959 					     scalar_stmt, stmt_info, 0,
3960 					     vect_epilogue);
3961 	}
3962       else if (reduction_type == EXTRACT_LAST_REDUCTION
3963 	       || reduction_type == FOLD_LEFT_REDUCTION)
3964 	/* No extra instructions needed in the epilogue.  */
3965 	;
3966       else
3967 	{
3968 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3969 	  tree bitsize =
3970 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3971 	  int element_bitsize = tree_to_uhwi (bitsize);
3972 	  int nelements = vec_size_in_bits / element_bitsize;
3973 
3974 	  if (code == COND_EXPR)
3975 	    code = MAX_EXPR;
3976 
3977 	  optab = optab_for_tree_code (code, vectype, optab_default);
3978 
3979 	  /* We have a whole vector shift available.  */
3980 	  if (optab != unknown_optab
3981 	      && VECTOR_MODE_P (mode)
3982 	      && optab_handler (optab, mode) != CODE_FOR_nothing
3983 	      && have_whole_vector_shift (mode))
3984 	    {
3985 	      /* Final reduction via vector shifts and the reduction operator.
3986 		 Also requires scalar extract.  */
3987 	      epilogue_cost += record_stmt_cost (cost_vec,
3988 						 exact_log2 (nelements) * 2,
3989 						 vector_stmt, stmt_info, 0,
3990 						 vect_epilogue);
3991 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
3992 						 vec_to_scalar, stmt_info, 0,
3993 						 vect_epilogue);
3994 	    }
3995 	  else
3996 	    /* Use extracts and reduction op for final reduction.  For N
3997 	       elements, we have N extracts and N-1 reduction ops.  */
3998 	    epilogue_cost += record_stmt_cost (cost_vec,
3999 					       nelements + nelements - 1,
4000 					       vector_stmt, stmt_info, 0,
4001 					       vect_epilogue);
4002 	}
4003     }
4004 
4005   if (dump_enabled_p ())
4006     dump_printf (MSG_NOTE,
4007                  "vect_model_reduction_cost: inside_cost = %d, "
4008                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4009                  prologue_cost, epilogue_cost);
4010 }
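/* For example (illustration only): a plain four-element integer sum
   reduction with NCOPIES = 1 and a target-supported REDUC_FN is costed
   above as one scalar_to_vec in the prologue, one vector_stmt in the loop
   body, and one vector_stmt plus one vec_to_scalar in the epilogue.  */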
4011 
4012 
4013 /* Function vect_model_induction_cost.
4014 
4015    Models cost for induction operations.  */
4016 
4017 static void
4018 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4019 			   stmt_vector_for_cost *cost_vec)
4020 {
4021   unsigned inside_cost, prologue_cost;
4022 
4023   if (PURE_SLP_STMT (stmt_info))
4024     return;
4025 
4026   /* loop cost for vec_loop.  */
4027   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4028 				  stmt_info, 0, vect_body);
4029 
4030   /* prologue cost for vec_init and vec_step.  */
4031   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4032 				    stmt_info, 0, vect_prologue);
4033 
4034   if (dump_enabled_p ())
4035     dump_printf_loc (MSG_NOTE, vect_location,
4036                      "vect_model_induction_cost: inside_cost = %d, "
4037                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4038 }
4039 
4040 
4041 
4042 /* Function get_initial_def_for_reduction
4043 
4044    Input:
4045    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4046    INIT_VAL - the initial value of the reduction variable
4047 
4048    Output:
4049    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4050         of the reduction (used for adjusting the epilog - see below).
4051    Return a vector variable, initialized according to the operation that
4052 	STMT_VINFO performs. This vector will be used as the initial value
4053 	of the vector of partial results.
4054 
4055    Option1 (adjust in epilog): Initialize the vector as follows:
4056      add/bit or/xor:    [0,0,...,0,0]
4057      mult/bit and:      [1,1,...,1,1]
4058      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4059    and when necessary (e.g. add/mult case) let the caller know
4060    that it needs to adjust the result by init_val.
4061 
4062    Option2: Initialize the vector as follows:
4063      add/bit or/xor:    [init_val,0,0,...,0]
4064      mult/bit and:      [init_val,1,1,...,1]
4065      min/max/cond_expr: [init_val,init_val,...,init_val]
4066    and no adjustments are needed.
4067 
4068    For example, for the following code:
4069 
4070    s = init_val;
4071    for (i=0;i<n;i++)
4072      s = s + a[i];
4073 
4074    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4075    For a vector of 4 units, we want to return either [0,0,0,init_val],
4076    or [0,0,0,0] and let the caller know that it needs to adjust
4077    the result at the end by 'init_val'.
4078 
4079    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4080    is not NULL, because this way the initialization vector is simpler (the
4081    same element in all entries), and Option2 otherwise.
4082 
4083    A cost model should help decide between these two schemes.  */
4084 
4085 tree
4086 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4087                                tree *adjustment_def)
4088 {
4089   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4090   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4091   tree scalar_type = TREE_TYPE (init_val);
4092   tree vectype = get_vectype_for_scalar_type (scalar_type);
4093   enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4094   tree def_for_init;
4095   tree init_def;
4096   REAL_VALUE_TYPE real_init_val = dconst0;
4097   int int_init_val = 0;
4098   gimple_seq stmts = NULL;
4099 
4100   gcc_assert (vectype);
4101 
4102   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4103 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4104 
4105   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4106 	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4107 
4108   vect_reduction_type reduction_type
4109     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4110 
4111   switch (code)
4112     {
4113     case WIDEN_SUM_EXPR:
4114     case DOT_PROD_EXPR:
4115     case SAD_EXPR:
4116     case PLUS_EXPR:
4117     case MINUS_EXPR:
4118     case BIT_IOR_EXPR:
4119     case BIT_XOR_EXPR:
4120     case MULT_EXPR:
4121     case BIT_AND_EXPR:
4122       {
4123         /* ADJUSTMENT_DEF is NULL when called from
4124            vect_create_epilog_for_reduction to vectorize double reduction.  */
4125         if (adjustment_def)
4126 	  *adjustment_def = init_val;
4127 
4128         if (code == MULT_EXPR)
4129           {
4130             real_init_val = dconst1;
4131             int_init_val = 1;
4132           }
4133 
4134         if (code == BIT_AND_EXPR)
4135           int_init_val = -1;
4136 
4137         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4138           def_for_init = build_real (scalar_type, real_init_val);
4139         else
4140           def_for_init = build_int_cst (scalar_type, int_init_val);
4141 
4142 	if (adjustment_def)
4143 	  /* Option1: the first element is '0' or '1' as well.  */
4144 	  init_def = gimple_build_vector_from_val (&stmts, vectype,
4145 						   def_for_init);
4146 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4147 	  {
4148 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4149 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4150 						     def_for_init);
4151 	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4152 				     vectype, init_def, init_val);
4153 	  }
4154 	else
4155 	  {
4156 	    /* Option2: the first element is INIT_VAL.  */
4157 	    tree_vector_builder elts (vectype, 1, 2);
4158 	    elts.quick_push (init_val);
4159 	    elts.quick_push (def_for_init);
4160 	    init_def = gimple_build_vector (&stmts, &elts);
4161 	  }
4162       }
4163       break;
4164 
4165     case MIN_EXPR:
4166     case MAX_EXPR:
4167     case COND_EXPR:
4168       {
4169 	if (adjustment_def)
4170           {
4171 	    *adjustment_def = NULL_TREE;
4172 	    if (reduction_type != COND_REDUCTION
4173 		&& reduction_type != EXTRACT_LAST_REDUCTION)
4174 	      {
4175 		init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4176 		break;
4177 	      }
4178 	  }
4179 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4180 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4181       }
4182       break;
4183 
4184     default:
4185       gcc_unreachable ();
4186     }
4187 
4188   if (stmts)
4189     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4190   return init_def;
4191 }
4192 
4193 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4194    NUMBER_OF_VECTORS is the number of vector defs to create.
4195    If NEUTRAL_OP is nonnull, introducing extra elements of that
4196    value will not change the result.  */
4197 
4198 static void
4199 get_initial_defs_for_reduction (slp_tree slp_node,
4200 				vec<tree> *vec_oprnds,
4201 				unsigned int number_of_vectors,
4202 				bool reduc_chain, tree neutral_op)
4203 {
4204   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4205   stmt_vec_info stmt_vinfo = stmts[0];
4206   unsigned HOST_WIDE_INT nunits;
4207   unsigned j, number_of_places_left_in_vector;
4208   tree vector_type;
4209   unsigned int group_size = stmts.length ();
4210   unsigned int i;
4211   struct loop *loop;
4212 
4213   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4214 
4215   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4216 
4217   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4218   gcc_assert (loop);
4219   edge pe = loop_preheader_edge (loop);
4220 
4221   gcc_assert (!reduc_chain || neutral_op);
4222 
4223   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4224      created vectors. It is greater than 1 if unrolling is performed.
4225 
4226      For example, we have two scalar operands, s1 and s2 (e.g., group of
4227      strided accesses of size two), while NUNITS is four (i.e., four scalars
4228      of this type can be packed in a vector).  The output vector will contain
4229      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4230      will be 2).
4231 
4232      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4233      vectors containing the operands.
4234 
4235      For example, NUNITS is four as before, and the group size is 8
4236      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4237      {s5, s6, s7, s8}.  */
4238 
4239   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4240     nunits = group_size;
4241 
4242   number_of_places_left_in_vector = nunits;
4243   bool constant_p = true;
4244   tree_vector_builder elts (vector_type, nunits, 1);
4245   elts.quick_grow (nunits);
4246   gimple_seq ctor_seq = NULL;
4247   for (j = 0; j < nunits * number_of_vectors; ++j)
4248     {
4249       tree op;
4250       i = j % group_size;
4251       stmt_vinfo = stmts[i];
4252 
4253       /* Get the def before the loop.  In a reduction chain we have only
4254 	 one initial value.  Otherwise we have as many as there are PHIs in the group.  */
4255       if (reduc_chain)
4256 	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4257       else if (((vec_oprnds->length () + 1) * nunits
4258 		- number_of_places_left_in_vector >= group_size)
4259 	       && neutral_op)
4260 	op = neutral_op;
4261       else
4262 	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4263 
4264       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4265       number_of_places_left_in_vector--;
4266       elts[nunits - number_of_places_left_in_vector - 1] = op;
4267       if (!CONSTANT_CLASS_P (op))
4268 	constant_p = false;
4269 
4270       if (number_of_places_left_in_vector == 0)
4271 	{
4272 	  tree init;
4273 	  if (constant_p && !neutral_op
4274 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4275 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4276 	    /* Build the vector directly from ELTS.  */
4277 	    init = gimple_build_vector (&ctor_seq, &elts);
4278 	  else if (neutral_op)
4279 	    {
4280 	      /* Build a vector of the neutral value and shift the
4281 		 other elements into place.  */
4282 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4283 						   neutral_op);
4284 	      int k = nunits;
4285 	      while (k > 0 && elts[k - 1] == neutral_op)
4286 		k -= 1;
4287 	      while (k > 0)
4288 		{
4289 		  k -= 1;
4290 		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4291 				       vector_type, init, elts[k]);
4292 		}
4293 	    }
4294 	  else
4295 	    {
4296 	      /* First time round, duplicate ELTS to fill the
4297 		 required number of vectors.  */
4298 	      duplicate_and_interleave (&ctor_seq, vector_type, elts,
4299 					number_of_vectors, *vec_oprnds);
4300 	      break;
4301 	    }
4302 	  vec_oprnds->quick_push (init);
4303 
4304 	  number_of_places_left_in_vector = nunits;
4305 	  elts.new_vector (vector_type, nunits, 1);
4306 	  elts.quick_grow (nunits);
4307 	  constant_p = true;
4308 	}
4309     }
4310   if (ctor_seq != NULL)
4311     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4312 }
4313 
4314 
4315 /* Function vect_create_epilog_for_reduction
4316 
4317    Create code at the loop-epilog to finalize the result of a reduction
4318    computation.
4319 
4320    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4321      reduction statements.
4322    STMT_INFO is the scalar reduction stmt that is being vectorized.
4323    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4324      number of elements that we can fit in a vectype (nunits).  In this case
4325      we have to generate more than one vector stmt - i.e - we need to "unroll"
4326      the vector stmt by a factor VF/nunits.  For more details see documentation
4327      in vectorizable_operation.
4328    REDUC_FN is the internal function for the epilog reduction.
4329    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4330      computation.
4331    REDUC_INDEX is the index of the operand in the right hand side of the
4332      statement that is defined by REDUCTION_PHI.
4333    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4334    SLP_NODE is an SLP node containing a group of reduction statements. The
4335      first one in this group is STMT_INFO.
4336    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4337      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4338      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4339      any value of the IV in the loop.
4340    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4341    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4342      null if this is not an SLP reduction.
4343 
4344    This function:
4345    1. Creates the reduction def-use cycles: sets the arguments for
4346       REDUCTION_PHIS:
4347       The loop-entry argument is the vectorized initial-value of the reduction.
4348       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4349       sums.
4350    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4351       by calling the function specified by REDUC_FN if available, or by
4352       other means (whole-vector shifts or a scalar loop).
4353       The function also creates a new phi node at the loop exit to preserve
4354       loop-closed form, as illustrated below.
4355 
4356      The flow at the entry to this function:
4357 
4358         loop:
4359           vec_def = phi <null, null>            # REDUCTION_PHI
4360           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4361           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4362         loop_exit:
4363           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4364           use <s_out0>
4365           use <s_out0>
4366 
4367      The above is transformed by this function into:
4368 
4369         loop:
4370           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4371           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4372           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4373         loop_exit:
4374           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4375           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4376           v_out2 = reduce <v_out1>
4377           s_out3 = extract_field <v_out2, 0>
4378           s_out4 = adjust_result <s_out3>
4379           use <s_out4>
4380           use <s_out4>
4381 */
4382 
4383 static void
4384 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4385 				  stmt_vec_info stmt_info,
4386 				  gimple *reduc_def_stmt,
4387 				  int ncopies, internal_fn reduc_fn,
4388 				  vec<stmt_vec_info> reduction_phis,
4389                                   bool double_reduc,
4390 				  slp_tree slp_node,
4391 				  slp_instance slp_node_instance,
4392 				  tree induc_val, enum tree_code induc_code,
4393 				  tree neutral_op)
4394 {
4395   stmt_vec_info prev_phi_info;
4396   tree vectype;
4397   machine_mode mode;
4398   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4399   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4400   basic_block exit_bb;
4401   tree scalar_dest;
4402   tree scalar_type;
4403   gimple *new_phi = NULL, *phi;
4404   stmt_vec_info phi_info;
4405   gimple_stmt_iterator exit_gsi;
4406   tree vec_dest;
4407   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4408   gimple *epilog_stmt = NULL;
4409   enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4410   gimple *exit_phi;
4411   tree bitsize;
4412   tree adjustment_def = NULL;
4413   tree vec_initial_def = NULL;
4414   tree expr, def, initial_def = NULL;
4415   tree orig_name, scalar_result;
4416   imm_use_iterator imm_iter, phi_imm_iter;
4417   use_operand_p use_p, phi_use_p;
4418   gimple *use_stmt;
4419   stmt_vec_info reduction_phi_info = NULL;
4420   bool nested_in_vect_loop = false;
4421   auto_vec<gimple *> new_phis;
4422   auto_vec<stmt_vec_info> inner_phis;
4423   int j, i;
4424   auto_vec<tree> scalar_results;
4425   unsigned int group_size = 1, k, ratio;
4426   auto_vec<tree> vec_initial_defs;
4427   auto_vec<gimple *> phis;
4428   bool slp_reduc = false;
4429   bool direct_slp_reduc;
4430   tree new_phi_result;
4431   stmt_vec_info inner_phi = NULL;
4432   tree induction_index = NULL_TREE;
4433 
4434   if (slp_node)
4435     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4436 
4437   if (nested_in_vect_loop_p (loop, stmt_info))
4438     {
4439       outer_loop = loop;
4440       loop = loop->inner;
4441       nested_in_vect_loop = true;
4442       gcc_assert (!slp_node);
4443     }
4444 
4445   vectype = STMT_VINFO_VECTYPE (stmt_info);
4446   gcc_assert (vectype);
4447   mode = TYPE_MODE (vectype);
4448 
4449   /* 1. Create the reduction def-use cycle:
4450      Set the arguments of REDUCTION_PHIS, i.e., transform
4451 
4452         loop:
4453           vec_def = phi <null, null>            # REDUCTION_PHI
4454           VECT_DEF = vector_stmt                # vectorized form of STMT
4455           ...
4456 
4457      into:
4458 
4459         loop:
4460           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4461           VECT_DEF = vector_stmt                # vectorized form of STMT
4462           ...
4463 
4464      (in case of SLP, do it for all the phis). */
4465 
4466   /* Get the loop-entry arguments.  */
4467   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4468   if (slp_node)
4469     {
4470       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4471       vec_initial_defs.reserve (vec_num);
4472       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4473 				      &vec_initial_defs, vec_num,
4474 				      REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4475 				      neutral_op);
4476     }
4477   else
4478     {
4479       /* Get at the scalar def before the loop, that defines the initial value
4480 	 of the reduction variable.  */
4481       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4482 					   loop_preheader_edge (loop));
4483       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4484 	 and we can't use zero for induc_val, use initial_def.  Similarly
4485 	 for REDUC_MIN and initial_def larger than the base.  */
4486       if (TREE_CODE (initial_def) == INTEGER_CST
4487 	  && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4488 	      == INTEGER_INDUC_COND_REDUCTION)
4489 	  && !integer_zerop (induc_val)
4490 	  && ((induc_code == MAX_EXPR
4491 	       && tree_int_cst_lt (initial_def, induc_val))
4492 	      || (induc_code == MIN_EXPR
4493 		  && tree_int_cst_lt (induc_val, initial_def))))
4494 	induc_val = initial_def;
4495 
4496       if (double_reduc)
4497 	/* In case of double reduction we only create a vector variable
4498 	   to be put in the reduction phi node.  The actual statement
4499 	   creation is done later in this function.  */
4500 	vec_initial_def = vect_create_destination_var (initial_def, vectype);
4501       else if (nested_in_vect_loop)
4502 	{
4503 	  /* Do not use an adjustment def as that case is not supported
4504 	     correctly if ncopies is not one.  */
4505 	  vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4506 	  vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4507 							  stmt_info);
4508 	}
4509       else
4510 	vec_initial_def
4511 	  = get_initial_def_for_reduction (stmt_info, initial_def,
4512 					   &adjustment_def);
4513       vec_initial_defs.create (1);
4514       vec_initial_defs.quick_push (vec_initial_def);
4515     }
4516 
4517   /* Set phi nodes arguments.  */
4518   FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4519     {
4520       tree vec_init_def = vec_initial_defs[i];
4521       tree def = vect_defs[i];
4522       for (j = 0; j < ncopies; j++)
4523         {
4524 	  if (j != 0)
4525 	    {
4526 	      phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4527 	      if (nested_in_vect_loop)
4528 		vec_init_def
4529 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4530 	    }
4531 
4532 	  /* Set the loop-entry arg of the reduction-phi.  */
4533 
4534 	  gphi *phi = as_a <gphi *> (phi_info->stmt);
4535 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4536 	      == INTEGER_INDUC_COND_REDUCTION)
4537 	    {
4538 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4539 		 initial values from interfering with the reduction op.  */
4540 	      gcc_assert (ncopies == 1);
4541 	      gcc_assert (i == 0);
4542 
4543 	      tree vec_init_def_type = TREE_TYPE (vec_init_def);
4544 	      tree induc_val_vec
4545 		= build_vector_from_val (vec_init_def_type, induc_val);
4546 
4547 	      add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4548 			   UNKNOWN_LOCATION);
4549 	    }
4550 	  else
4551 	    add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4552 			 UNKNOWN_LOCATION);
4553 
4554           /* Set the loop-latch arg for the reduction-phi.  */
4555           if (j > 0)
4556 	    def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4557 
4558 	  add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4559 
4560           if (dump_enabled_p ())
4561 	    dump_printf_loc (MSG_NOTE, vect_location,
4562 			     "transform reduction: created def-use cycle: %G%G",
4563 			     phi, SSA_NAME_DEF_STMT (def));
4564         }
4565     }
4566 
4567   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4568      which is updated with the current index of the loop for every match of
4569      the original loop's cond_expr (VEC_STMT).  This results in a vector
4570      containing the last time the condition passed for that vector lane.
4571      The first match will be a 1 to allow 0 to be used for non-matching
4572      indexes.  If there are no matches at all then the vector will be all
4573      zeroes.  */
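  /* For example (a sketch assuming a 4-lane vector and two vector
     iterations): the induction index takes the values {1,2,3,4} and then
     {5,6,7,8}.  If lane 2 matches only in the first vector iteration and
     lane 0 only in the second, the final vector is {5, 0, 3, 0}: each lane
     holds the index of its last match, or 0 if it never matched.  */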
4574   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4575     {
4576       tree indx_before_incr, indx_after_incr;
4577       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4578 
4579       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4580       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4581 
4582       int scalar_precision
4583 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4584       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4585       tree cr_index_vector_type = build_vector_type
4586 	(cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4587 
4588       /* First we create a simple vector induction variable which starts
4589 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4590 	 vector size (STEP).  */
4591 
4592       /* Create a {1,2,3,...} vector.  */
4593       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4594 
4595       /* Create a vector of the step value.  */
4596       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4597       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4598 
4599       /* Create an induction variable.  */
4600       gimple_stmt_iterator incr_gsi;
4601       bool insert_after;
4602       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4603       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4604 		 insert_after, &indx_before_incr, &indx_after_incr);
4605 
4606       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4607 	 filled with zeros (VEC_ZERO).  */
4608 
4609       /* Create a vector of 0s.  */
4610       tree zero = build_zero_cst (cr_index_scalar_type);
4611       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4612 
4613       /* Create a vector phi node.  */
4614       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4615       new_phi = create_phi_node (new_phi_tree, loop->header);
4616       loop_vinfo->add_stmt (new_phi);
4617       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4618 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4619 
4620       /* Now take the condition from the loop's original cond_expr
4621 	 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4622 	 every match uses values from the induction variable
4623 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4624 	 (NEW_PHI_TREE).
4625 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4626 	 the new cond_expr (INDEX_COND_EXPR).  */
4627 
4628       /* Duplicate the condition from vec_stmt.  */
4629       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4630 
4631       /* Create a conditional, where the condition is taken from vec_stmt
4632 	 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4633 	 and the "else" value is the phi (NEW_PHI_TREE).  */
4634       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4635 				     ccompare, indx_before_incr,
4636 				     new_phi_tree);
4637       induction_index = make_ssa_name (cr_index_vector_type);
4638       gimple *index_condition = gimple_build_assign (induction_index,
4639 						     index_cond_expr);
4640       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4641       stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4642       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4643 
4644       /* Update the phi with the vec cond.  */
4645       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4646 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
4647     }
4648 
4649   /* 2. Create epilog code.
4650         The reduction epilog code operates across the elements of the vector
4651         of partial results computed by the vectorized loop.
4652         The reduction epilog code consists of:
4653 
4654         step 1: compute the scalar result in a vector (v_out2)
4655         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4656         step 3: adjust the scalar result (s_out3) if needed.
4657 
4658         Step 1 can be accomplished using one of the following three schemes:
4659           (scheme 1) using reduc_fn, if available.
4660           (scheme 2) using whole-vector shifts, if available.
4661           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4662                      combined.
4663 
4664           The overall epilog code looks like this:
4665 
4666           s_out0 = phi <s_loop>         # original EXIT_PHI
4667           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4668           v_out2 = reduce <v_out1>              # step 1
4669           s_out3 = extract_field <v_out2, 0>    # step 2
4670           s_out4 = adjust_result <s_out3>       # step 3
4671 
4672           (step 3 is optional, and steps 1 and 2 may be combined).
4673           Lastly, the uses of s_out0 are replaced by s_out4.  */
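  /* As an illustration (assuming a 4-lane integer add reduction whose vector
     of partial results is v_out1 = {1,2,3,4}):
       scheme 1:  s = reduc_fn <v_out1>                      ->  s = 10
       scheme 2:  t = v_out1 + vec_shift <v_out1, 2>         ->  {4,6,3,4}
                  t = t + vec_shift <t, 1>                   ->  {10,9,7,4}
                  s = extract_field <t, 0>                   ->  s = 10
       scheme 3:  s = v_out1[0] + v_out1[1] + v_out1[2] + v_out1[3]
     The exact statements generated depend on the target.  */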
4674 
4675 
4676   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4677          v_out1 = phi <VECT_DEF>
4678          Store them in NEW_PHIS.  */
4679 
4680   exit_bb = single_exit (loop)->dest;
4681   prev_phi_info = NULL;
4682   new_phis.create (vect_defs.length ());
4683   FOR_EACH_VEC_ELT (vect_defs, i, def)
4684     {
4685       for (j = 0; j < ncopies; j++)
4686         {
4687 	  tree new_def = copy_ssa_name (def);
4688           phi = create_phi_node (new_def, exit_bb);
4689 	  stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4690           if (j == 0)
4691             new_phis.quick_push (phi);
4692           else
4693 	    {
4694 	      def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4695 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4696 	    }
4697 
4698           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4699 	  prev_phi_info = phi_info;
4700         }
4701     }
4702 
4703   /* The epilogue is created for the outer-loop, i.e., for the loop being
4704      vectorized.  Create exit phis for the outer loop.  */
4705   if (double_reduc)
4706     {
4707       loop = outer_loop;
4708       exit_bb = single_exit (loop)->dest;
4709       inner_phis.create (vect_defs.length ());
4710       FOR_EACH_VEC_ELT (new_phis, i, phi)
4711 	{
4712 	  stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4713 	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
4714 	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
4715 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4716 			   PHI_RESULT (phi));
4717 	  prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4718 	  inner_phis.quick_push (phi_info);
4719 	  new_phis[i] = outer_phi;
4720 	  while (STMT_VINFO_RELATED_STMT (phi_info))
4721             {
4722 	      phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4723 	      new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4724 	      outer_phi = create_phi_node (new_result, exit_bb);
4725 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4726 			       PHI_RESULT (phi_info->stmt));
4727 	      stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4728 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4729 	      prev_phi_info = outer_phi_info;
4730 	    }
4731 	}
4732     }
4733 
4734   exit_gsi = gsi_after_labels (exit_bb);
4735 
4736   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4737          (i.e. when reduc_fn is not available) and in the final adjustment
4738 	 code (if needed).  Also get the original scalar reduction variable as
4739          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4740          represents a reduction pattern), the tree-code and scalar-def are
4741          taken from the original stmt that the pattern-stmt (STMT) replaces.
4742          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4743          are taken from STMT.  */
4744 
4745   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4746   if (orig_stmt_info != stmt_info)
4747     {
4748       /* Reduction pattern  */
4749       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4750       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4751     }
4752 
4753   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4754   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4755      partial results are added and not subtracted.  */
4756   if (code == MINUS_EXPR)
4757     code = PLUS_EXPR;
4758 
4759   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4760   scalar_type = TREE_TYPE (scalar_dest);
4761   scalar_results.create (group_size);
4762   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4763   bitsize = TYPE_SIZE (scalar_type);
4764 
4765   /* In case this is a reduction in an inner-loop while vectorizing an outer
4766      loop - we don't need to extract a single scalar result at the end of the
4767      inner-loop (unless it is double reduction, i.e., the use of reduction is
4768      outside the outer-loop).  The final vector of partial results will be used
4769      in the vectorized outer-loop, or reduced to a scalar result at the end of
4770      the outer-loop.  */
4771   if (nested_in_vect_loop && !double_reduc)
4772     goto vect_finalize_reduction;
4773 
4774   /* SLP reduction without reduction chain, e.g.,
4775      # a1 = phi <a2, a0>
4776      # b1 = phi <b2, b0>
4777      a2 = operation (a1)
4778      b2 = operation (b1)  */
4779   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4780 
4781   /* True if we should implement SLP_REDUC using native reduction operations
4782      instead of scalar operations.  */
4783   direct_slp_reduc = (reduc_fn != IFN_LAST
4784 		      && slp_reduc
4785 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4786 
4787   /* In case of reduction chain, e.g.,
4788      # a1 = phi <a3, a0>
4789      a2 = operation (a1)
4790      a3 = operation (a2),
4791 
4792      we may end up with more than one vector result.  Here we reduce them to
4793      one vector.  */
4794   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4795     {
4796       tree first_vect = PHI_RESULT (new_phis[0]);
4797       gassign *new_vec_stmt = NULL;
4798       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4799       for (k = 1; k < new_phis.length (); k++)
4800         {
4801 	  gimple *next_phi = new_phis[k];
4802           tree second_vect = PHI_RESULT (next_phi);
4803           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4804           new_vec_stmt = gimple_build_assign (tem, code,
4805 					      first_vect, second_vect);
4806           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4807 	  first_vect = tem;
4808         }
4809 
4810       new_phi_result = first_vect;
4811       if (new_vec_stmt)
4812         {
4813           new_phis.truncate (0);
4814           new_phis.safe_push (new_vec_stmt);
4815         }
4816     }
4817   /* Likewise if we couldn't use a single def-use cycle.  */
4818   else if (ncopies > 1)
4819     {
4820       gcc_assert (new_phis.length () == 1);
4821       tree first_vect = PHI_RESULT (new_phis[0]);
4822       gassign *new_vec_stmt = NULL;
4823       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4824       stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4825       for (int k = 1; k < ncopies; ++k)
4826 	{
4827 	  next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4828 	  tree second_vect = PHI_RESULT (next_phi_info->stmt);
4829           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4830           new_vec_stmt = gimple_build_assign (tem, code,
4831 					      first_vect, second_vect);
4832           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4833 	  first_vect = tem;
4834 	}
4835       new_phi_result = first_vect;
4836       new_phis.truncate (0);
4837       new_phis.safe_push (new_vec_stmt);
4838     }
4839   else
4840     new_phi_result = PHI_RESULT (new_phis[0]);
4841 
4842   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4843       && reduc_fn != IFN_LAST)
4844     {
4845       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4846 	 various data values where the condition matched and another vector
4847 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
4848 	 need to extract the last matching index (which will be the index with
4849 	 highest value) and use this to index into the data vector.
4850 	 For the case where there were no matches, the data vector will contain
4851 	 all default values and the index vector will be all zeros.  */
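      /* For example (hypothetical lane values): if INDUCTION_INDEX is
	 {0, 3, 0, 2} and NEW_PHI_RESULT is {d0, d1, d2, d3}, the code below
	 computes max_index = 3, selects {0, d1, 0, 0} and reduces that with
	 an unsigned MAX to obtain d1, the data value of the last match.  */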
4852 
4853       /* Get various versions of the type of the vector of indexes.  */
4854       tree index_vec_type = TREE_TYPE (induction_index);
4855       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4856       tree index_scalar_type = TREE_TYPE (index_vec_type);
4857       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4858 	(index_vec_type);
4859 
4860       /* Get an unsigned integer version of the type of the data vector.  */
4861       int scalar_precision
4862 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4863       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4864       tree vectype_unsigned = build_vector_type
4865 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4866 
4867       /* First we need to create a vector (ZERO_VEC) of zeros and another
4868 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4869 	 can create using a MAX reduction and then expanding.
4870 	 In the case where the loop never made any matches, the max index will
4871 	 be zero.  */
4872 
4873       /* Vector of {0, 0, 0,...}.  */
4874       tree zero_vec = make_ssa_name (vectype);
4875       tree zero_vec_rhs = build_zero_cst (vectype);
4876       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4877       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4878 
4879       /* Find maximum value from the vector of found indexes.  */
4880       tree max_index = make_ssa_name (index_scalar_type);
4881       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4882 							  1, induction_index);
4883       gimple_call_set_lhs (max_index_stmt, max_index);
4884       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4885 
4886       /* Vector of {max_index, max_index, max_index,...}.  */
4887       tree max_index_vec = make_ssa_name (index_vec_type);
4888       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4889 						      max_index);
4890       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4891 							max_index_vec_rhs);
4892       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4893 
4894       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4895 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4896 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4897 	 otherwise.  Only one value should match, resulting in a vector
4898 	 (VEC_COND) with one data value and the rest zeros.
4899 	 In the case where the loop never made any matches, every index will
4900 	 match, resulting in a vector with all data values (which will all be
4901 	 the default value).  */
4902 
4903       /* Compare the max index vector to the vector of found indexes to find
4904 	 the position of the max value.  */
4905       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4906       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4907 						      induction_index,
4908 						      max_index_vec);
4909       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4910 
4911       /* Use the compare to choose either values from the data vector or
4912 	 zero.  */
4913       tree vec_cond = make_ssa_name (vectype);
4914       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4915 						   vec_compare, new_phi_result,
4916 						   zero_vec);
4917       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4918 
4919       /* Finally we need to extract the data value from the vector (VEC_COND)
4920 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4921 	 reduction, but because this doesn't exist, we can use a MAX reduction
4922 	 instead.  The data value might be signed or a float so we need to cast
4923 	 it first.
4924 	 In the case where the loop never made any matches, the data values are
4925 	 all identical, and so will reduce down correctly.  */
4926 
4927       /* Make the matched data values unsigned.  */
4928       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4929       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4930 				       vec_cond);
4931       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4932 							VIEW_CONVERT_EXPR,
4933 							vec_cond_cast_rhs);
4934       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4935 
4936       /* Reduce down to a scalar value.  */
4937       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4938       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4939 							   1, vec_cond_cast);
4940       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4941       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4942 
4943       /* Convert the reduced value back to the result type and set as the
4944 	 result.  */
4945       gimple_seq stmts = NULL;
4946       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4947 			       data_reduc);
4948       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4949       scalar_results.safe_push (new_temp);
4950     }
4951   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4952 	   && reduc_fn == IFN_LAST)
4953     {
4954       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4955 	 idx = 0;
4956          idx_val = induction_index[0];
4957 	 val = data_reduc[0];
4958          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4959 	   if (induction_index[i] > idx_val)
4960 	     val = data_reduc[i], idx_val = induction_index[i];
4961 	 return val;  */
4962 
4963       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4964       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4965       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4966       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4967       /* Enforced by vectorizable_reduction, which ensures we have target
4968 	 support before allowing a conditional reduction on variable-length
4969 	 vectors.  */
4970       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4971       tree idx_val = NULL_TREE, val = NULL_TREE;
4972       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4973 	{
4974 	  tree old_idx_val = idx_val;
4975 	  tree old_val = val;
4976 	  idx_val = make_ssa_name (idx_eltype);
4977 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4978 					     build3 (BIT_FIELD_REF, idx_eltype,
4979 						     induction_index,
4980 						     bitsize_int (el_size),
4981 						     bitsize_int (off)));
4982 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4983 	  val = make_ssa_name (data_eltype);
4984 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4985 					     build3 (BIT_FIELD_REF,
4986 						     data_eltype,
4987 						     new_phi_result,
4988 						     bitsize_int (el_size),
4989 						     bitsize_int (off)));
4990 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4991 	  if (off != 0)
4992 	    {
4993 	      tree new_idx_val = idx_val;
4994 	      tree new_val = val;
4995 	      if (off != v_size - el_size)
4996 		{
4997 		  new_idx_val = make_ssa_name (idx_eltype);
4998 		  epilog_stmt = gimple_build_assign (new_idx_val,
4999 						     MAX_EXPR, idx_val,
5000 						     old_idx_val);
5001 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5002 		}
5003 	      new_val = make_ssa_name (data_eltype);
5004 	      epilog_stmt = gimple_build_assign (new_val,
5005 						 COND_EXPR,
5006 						 build2 (GT_EXPR,
5007 							 boolean_type_node,
5008 							 idx_val,
5009 							 old_idx_val),
5010 						 val, old_val);
5011 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5012 	      idx_val = new_idx_val;
5013 	      val = new_val;
5014 	    }
5015 	}
5016       /* Convert the reduced value back to the result type and set as the
5017 	 result.  */
5018       gimple_seq stmts = NULL;
5019       val = gimple_convert (&stmts, scalar_type, val);
5020       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5021       scalar_results.safe_push (val);
5022     }
5023 
5024   /* 2.3 Create the reduction code, using one of the three schemes described
5025          above. In SLP we simply need to extract all the elements from the
5026          vector (without reducing them), so we use scalar extraction.  */
5027   else if (reduc_fn != IFN_LAST && !slp_reduc)
5028     {
5029       tree tmp;
5030       tree vec_elem_type;
5031 
5032       /* Case 1:  Create:
5033          v_out2 = reduc_expr <v_out1>  */
5034 
5035       if (dump_enabled_p ())
5036         dump_printf_loc (MSG_NOTE, vect_location,
5037 			 "Reduce using direct vector reduction.\n");
5038 
5039       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5040       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5041 	{
5042 	  tree tmp_dest
5043 	    = vect_create_destination_var (scalar_dest, vec_elem_type);
5044 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5045 						    new_phi_result);
5046 	  gimple_set_lhs (epilog_stmt, tmp_dest);
5047 	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5048 	  gimple_set_lhs (epilog_stmt, new_temp);
5049 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5050 
5051 	  epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5052 					     new_temp);
5053 	}
5054       else
5055 	{
5056 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5057 						    new_phi_result);
5058 	  gimple_set_lhs (epilog_stmt, new_scalar_dest);
5059 	}
5060 
5061       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5062       gimple_set_lhs (epilog_stmt, new_temp);
5063       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5064 
5065       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5066 	   == INTEGER_INDUC_COND_REDUCTION)
5067 	  && !operand_equal_p (initial_def, induc_val, 0))
5068 	{
5069 	  /* Earlier we set the initial value to be a vector of induc_val
5070 	     values.  Check the result and if it is induc_val then replace it
5071 	     with the original initial value, unless induc_val is
5072 	     the same as initial_def already.  */
5073 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5074 				  induc_val);
5075 
5076 	  tmp = make_ssa_name (new_scalar_dest);
5077 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5078 					     initial_def, new_temp);
5079 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5080 	  new_temp = tmp;
5081 	}
5082 
5083       scalar_results.safe_push (new_temp);
5084     }
5085   else if (direct_slp_reduc)
5086     {
5087       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5088 	 with the elements for other SLP statements replaced with the
5089 	 neutral value.  We can then do a normal reduction on each vector.  */
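      /* For example (sketched with a 4-lane vector, although this path is
	 only used for variable-length vectors): for an add reduction with
	 REDUC_GROUP_SIZE == 2 and NEW_PHI_RESULT == {a0, b0, a1, b1}, the
	 code below builds {a0, 0, a1, 0} for the first result and
	 {0, b0, 0, b1} for the second, and reduces each with REDUC_FN.  */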
5090 
5091       /* Enforced by vectorizable_reduction.  */
5092       gcc_assert (new_phis.length () == 1);
5093       gcc_assert (pow2p_hwi (group_size));
5094 
5095       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5096       vec<stmt_vec_info> orig_phis
5097 	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5098       gimple_seq seq = NULL;
5099 
5100       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5101 	 and the same element size as VECTYPE.  */
5102       tree index = build_index_vector (vectype, 0, 1);
5103       tree index_type = TREE_TYPE (index);
5104       tree index_elt_type = TREE_TYPE (index_type);
5105       tree mask_type = build_same_sized_truth_vector_type (index_type);
5106 
5107       /* Create a vector that, for each element, identifies which of
5108 	 the REDUC_GROUP_SIZE results should use it.  */
5109       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5110       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5111 			    build_vector_from_val (index_type, index_mask));
5112 
5113       /* Get a neutral vector value.  This is simply a splat of the neutral
5114 	 scalar value if we have one, otherwise the initial scalar value
5115 	 is itself a neutral value.  */
5116       tree vector_identity = NULL_TREE;
5117       if (neutral_op)
5118 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5119 							neutral_op);
5120       for (unsigned int i = 0; i < group_size; ++i)
5121 	{
5122 	  /* If there's no universal neutral value, we can use the
5123 	     initial scalar value from the original PHI.  This is used
5124 	     for MIN and MAX reduction, for example.  */
5125 	  if (!neutral_op)
5126 	    {
5127 	      tree scalar_value
5128 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5129 					 loop_preheader_edge (loop));
5130 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5131 							      scalar_value);
5132 	    }
5133 
5134 	  /* Calculate the equivalent of:
5135 
5136 	     sel[j] = (index[j] == i);
5137 
5138 	     which selects the elements of NEW_PHI_RESULT that should
5139 	     be included in the result.  */
5140 	  tree compare_val = build_int_cst (index_elt_type, i);
5141 	  compare_val = build_vector_from_val (index_type, compare_val);
5142 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5143 				   index, compare_val);
5144 
5145 	  /* Calculate the equivalent of:
5146 
5147 	     vec = sel ? new_phi_result : vector_identity;
5148 
5149 	     VEC is now suitable for a full vector reduction.  */
5150 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5151 				   sel, new_phi_result, vector_identity);
5152 
5153 	  /* Do the reduction and convert it to the appropriate type.  */
5154 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5155 				      TREE_TYPE (vectype), vec);
5156 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5157 	  scalar_results.safe_push (scalar);
5158 	}
5159       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5160     }
5161   else
5162     {
5163       bool reduce_with_shift;
5164       tree vec_temp;
5165 
5166       /* COND reductions all do the final reduction with MAX_EXPR
5167 	 or MIN_EXPR.  */
5168       if (code == COND_EXPR)
5169 	{
5170 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5171 	      == INTEGER_INDUC_COND_REDUCTION)
5172 	    code = induc_code;
5173 	  else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5174 		   == CONST_COND_REDUCTION)
5175 	    code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5176 	  else
5177 	    code = MAX_EXPR;
5178 	}
5179 
5180       /* See if the target wants to do the final (shift) reduction
5181 	 in a vector mode of smaller size and first reduce upper/lower
5182 	 halves against each other.  */
5183       enum machine_mode mode1 = mode;
5184       tree vectype1 = vectype;
5185       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5186       unsigned sz1 = sz;
5187       if (!slp_reduc
5188 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5189 	sz1 = GET_MODE_SIZE (mode1).to_constant ();
5190 
5191       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5192       reduce_with_shift = have_whole_vector_shift (mode1);
5193       if (!VECTOR_MODE_P (mode1))
5194 	reduce_with_shift = false;
5195       else
5196 	{
5197 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5198 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5199 	    reduce_with_shift = false;
5200 	}
5201 
5202       /* First reduce the vector to the desired vector size on which we
5203 	 should do the shift reduction, by combining upper and lower halves.  */
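      /* For example, for an add reduction this step turns an 8-lane
	 {1,2,3,4,5,6,7,8} into the 4-lane {1+5, 2+6, 3+7, 4+8}
	 = {6, 8, 10, 12} (assuming the target's split_reduction hook
	 requested the smaller mode).  */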
5204       new_temp = new_phi_result;
5205       while (sz > sz1)
5206 	{
5207 	  gcc_assert (!slp_reduc);
5208 	  sz /= 2;
5209 	  vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5210 
5211 	  /* The target has to make sure we support lowpart/highpart
5212 	     extraction, either via direct vector extract or through
5213 	     an integer mode punning.  */
5214 	  tree dst1, dst2;
5215 	  if (convert_optab_handler (vec_extract_optab,
5216 				     TYPE_MODE (TREE_TYPE (new_temp)),
5217 				     TYPE_MODE (vectype1))
5218 	      != CODE_FOR_nothing)
5219 	    {
5220 	      /* Extract sub-vectors directly once vec_extract becomes
5221 		 a conversion optab.  */
5222 	      dst1 = make_ssa_name (vectype1);
5223 	      epilog_stmt
5224 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5225 					 build3 (BIT_FIELD_REF, vectype1,
5226 						 new_temp, TYPE_SIZE (vectype1),
5227 						 bitsize_int (0)));
5228 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5229 	      dst2 = make_ssa_name (vectype1);
5230 	      epilog_stmt
5231 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5232 					 build3 (BIT_FIELD_REF, vectype1,
5233 						 new_temp, TYPE_SIZE (vectype1),
5234 						 bitsize_int (sz * BITS_PER_UNIT)));
5235 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5236 	    }
5237 	  else
5238 	    {
5239 	      /* Extract via punning to appropriately sized integer mode
5240 		 vector.  */
5241 	      tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5242 							    1);
5243 	      tree etype = build_vector_type (eltype, 2);
5244 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5245 						 TYPE_MODE (etype),
5246 						 TYPE_MODE (eltype))
5247 			  != CODE_FOR_nothing);
5248 	      tree tem = make_ssa_name (etype);
5249 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5250 						 build1 (VIEW_CONVERT_EXPR,
5251 							 etype, new_temp));
5252 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5253 	      new_temp = tem;
5254 	      tem = make_ssa_name (eltype);
5255 	      epilog_stmt
5256 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5257 					 build3 (BIT_FIELD_REF, eltype,
5258 						 new_temp, TYPE_SIZE (eltype),
5259 						 bitsize_int (0)));
5260 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5261 	      dst1 = make_ssa_name (vectype1);
5262 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5263 						 build1 (VIEW_CONVERT_EXPR,
5264 							 vectype1, tem));
5265 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5266 	      tem = make_ssa_name (eltype);
5267 	      epilog_stmt
5268 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5269 					 build3 (BIT_FIELD_REF, eltype,
5270 						 new_temp, TYPE_SIZE (eltype),
5271 						 bitsize_int (sz * BITS_PER_UNIT)));
5272 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5273 	      dst2 = make_ssa_name (vectype1);
5274 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5275 						 build1 (VIEW_CONVERT_EXPR,
5276 							 vectype1, tem));
5277 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278 	    }
5279 
5280 	  new_temp = make_ssa_name (vectype1);
5281 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5282 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 	}
5284 
5285       if (reduce_with_shift && !slp_reduc)
5286 	{
5287 	  int element_bitsize = tree_to_uhwi (bitsize);
5288 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5289 	     for variable-length vectors and also requires direct target support
5290 	     for loop reductions.  */
5291 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5292 	  int nelements = vec_size_in_bits / element_bitsize;
5293 	  vec_perm_builder sel;
5294 	  vec_perm_indices indices;
5295 
5296           int elt_offset;
5297 
5298           tree zero_vec = build_zero_cst (vectype1);
5299           /* Case 2: Create:
5300              for (offset = nelements/2; offset >= 1; offset/=2)
5301                 {
5302                   Create:  va' = vec_shift <va, offset>
5303                   Create:  va = vop <va, va'>
5304                 }  */
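          /* For example, for an add reduction of va = {1, 2, 3, 4}:
               offset 2:  va' = {3, 4, 0, 0},  va = {4, 6, 3, 4}
               offset 1:  va' = {6, 3, 4, 0},  va = {10, 9, 7, 4}
             and the scalar result (10) is then extracted from element 0.
             (Illustrative values only.)  */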
5305 
5306           tree rhs;
5307 
5308           if (dump_enabled_p ())
5309             dump_printf_loc (MSG_NOTE, vect_location,
5310 			     "Reduce using vector shifts\n");
5311 
5312 	  mode1 = TYPE_MODE (vectype1);
5313           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5314           for (elt_offset = nelements / 2;
5315                elt_offset >= 1;
5316                elt_offset /= 2)
5317             {
5318 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5319 	      indices.new_vector (sel, 2, nelements);
5320 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5321 	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5322 						 new_temp, zero_vec, mask);
5323               new_name = make_ssa_name (vec_dest, epilog_stmt);
5324               gimple_assign_set_lhs (epilog_stmt, new_name);
5325               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5326 
5327 	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5328 						 new_temp);
5329               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5330               gimple_assign_set_lhs (epilog_stmt, new_temp);
5331               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5332             }
5333 
5334 	  /* 2.4  Extract the final scalar result.  Create:
5335 	     s_out3 = extract_field <v_out2, bitpos>  */
5336 
5337 	  if (dump_enabled_p ())
5338 	    dump_printf_loc (MSG_NOTE, vect_location,
5339 			     "extract scalar result\n");
5340 
5341 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5342 			bitsize, bitsize_zero_node);
5343 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5344 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5345 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5346 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347 	  scalar_results.safe_push (new_temp);
5348         }
5349       else
5350         {
5351           /* Case 3: Create:
5352              s = extract_field <v_out2, 0>
5353              for (offset = element_size;
5354                   offset < vector_size;
5355                   offset += element_size;)
5356                {
5357                  Create:  s' = extract_field <v_out2, offset>
5358                  Create:  s = op <s, s'>  // For non SLP cases
5359                }  */
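          /* For example, for an add reduction of v_out2 = {1, 2, 3, 4} this
             generates s = 1; s = s + 2; s = s + 3; s = s + 4, whereas in the
             SLP case the extracted values are simply collected in
             SCALAR_RESULTS.  (Illustrative values only.)  */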
5360 
5361           if (dump_enabled_p ())
5362             dump_printf_loc (MSG_NOTE, vect_location,
5363 			     "Reduce using scalar code.\n");
5364 
5365 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5366 	  int element_bitsize = tree_to_uhwi (bitsize);
5367           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5368             {
5369               int bit_offset;
5370               if (gimple_code (new_phi) == GIMPLE_PHI)
5371                 vec_temp = PHI_RESULT (new_phi);
5372               else
5373                 vec_temp = gimple_assign_lhs (new_phi);
5374               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5375 				 bitsize_zero_node);
5376               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5377               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5378               gimple_assign_set_lhs (epilog_stmt, new_temp);
5379               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5380 
5381               /* In SLP we don't need to apply the reduction operation, so we just
5382                  collect s' values in SCALAR_RESULTS.  */
5383               if (slp_reduc)
5384                 scalar_results.safe_push (new_temp);
5385 
5386               for (bit_offset = element_bitsize;
5387                    bit_offset < vec_size_in_bits;
5388                    bit_offset += element_bitsize)
5389                 {
5390                   tree bitpos = bitsize_int (bit_offset);
5391                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5392                                      bitsize, bitpos);
5393 
5394                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5395                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5396                   gimple_assign_set_lhs (epilog_stmt, new_name);
5397                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398 
5399                   if (slp_reduc)
5400                     {
5401                       /* In SLP we don't need to apply the reduction operation, so
5402                          we just collect s' values in SCALAR_RESULTS.  */
5403                       new_temp = new_name;
5404                       scalar_results.safe_push (new_name);
5405                     }
5406                   else
5407                     {
5408 		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5409 							 new_name, new_temp);
5410                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5411                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5412                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5413                     }
5414                 }
5415             }
5416 
5417           /* The only case where we need to reduce scalar results in SLP is
5418              unrolling.  If the size of SCALAR_RESULTS is greater than
5419              REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5420              REDUC_GROUP_SIZE.  */
5421           if (slp_reduc)
5422             {
5423               tree res, first_res, new_res;
5424 	      gimple *new_stmt;
5425 
5426               /* Reduce multiple scalar results in case of SLP unrolling.  */
5427               for (j = group_size; scalar_results.iterate (j, &res);
5428                    j++)
5429                 {
5430                   first_res = scalar_results[j % group_size];
5431 		  new_stmt = gimple_build_assign (new_scalar_dest, code,
5432 						  first_res, res);
5433                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5434                   gimple_assign_set_lhs (new_stmt, new_res);
5435                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5436                   scalar_results[j % group_size] = new_res;
5437                 }
5438             }
5439           else
5440             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5441             scalar_results.safe_push (new_temp);
5442         }
5443 
5444       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5445 	   == INTEGER_INDUC_COND_REDUCTION)
5446 	  && !operand_equal_p (initial_def, induc_val, 0))
5447 	{
5448 	  /* Earlier we set the initial value to be a vector of induc_val
5449 	     values.  Check the result and if it is induc_val then replace it
5450 	     with the original initial value, unless induc_val is
5451 	     the same as initial_def already.  */
5452 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5453 				  induc_val);
5454 
5455 	  tree tmp = make_ssa_name (new_scalar_dest);
5456 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5457 					     initial_def, new_temp);
5458 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5459 	  scalar_results[0] = tmp;
5460 	}
5461     }
5462 
5463 vect_finalize_reduction:
5464 
5465   if (double_reduc)
5466     loop = loop->inner;
5467 
5468   /* 2.5 Adjust the final result by the initial value of the reduction
5469 	 variable. (When such adjustment is not needed, then
5470 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
5471 	 new_temp = loop_exit_def + adjustment_def  */
5472 
5473   if (adjustment_def)
5474     {
5475       gcc_assert (!slp_reduc);
5476       if (nested_in_vect_loop)
5477 	{
5478           new_phi = new_phis[0];
5479 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5480 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5481 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
5482 	}
5483       else
5484 	{
5485           new_temp = scalar_results[0];
5486 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5487 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
5488 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5489 	}
5490 
5491       epilog_stmt = gimple_build_assign (new_dest, expr);
5492       new_temp = make_ssa_name (new_dest, epilog_stmt);
5493       gimple_assign_set_lhs (epilog_stmt, new_temp);
5494       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5495       if (nested_in_vect_loop)
5496         {
5497 	  stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5498 	  STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5499 	    = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5500 
5501           if (!double_reduc)
5502             scalar_results.quick_push (new_temp);
5503           else
5504             scalar_results[0] = new_temp;
5505         }
5506       else
5507         scalar_results[0] = new_temp;
5508 
5509       new_phis[0] = epilog_stmt;
5510     }
5511 
5512   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5513           phis with new adjusted scalar results, i.e., replace use <s_out0>
5514           with use <s_out4>.
5515 
5516      Transform:
5517         loop_exit:
5518           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5519           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5520           v_out2 = reduce <v_out1>
5521           s_out3 = extract_field <v_out2, 0>
5522           s_out4 = adjust_result <s_out3>
5523           use <s_out0>
5524           use <s_out0>
5525 
5526      into:
5527 
5528         loop_exit:
5529           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5530           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5531           v_out2 = reduce <v_out1>
5532           s_out3 = extract_field <v_out2, 0>
5533           s_out4 = adjust_result <s_out3>
5534           use <s_out4>
5535           use <s_out4> */
5536 
5537 
5538   /* In an SLP reduction chain we reduce the vector results into one vector
5539      if necessary; hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5540      LHS of the last stmt in the reduction chain, since we are looking for
5541      the loop exit phi node.  */
5542   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5543     {
5544       stmt_vec_info dest_stmt_info
5545 	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5546       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5547       group_size = 1;
5548     }
5549 
5550   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5551      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5552      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5553      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5554      correspond to the first vector stmt, etc.
5555      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
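  /* For example (hypothetical sizes): with REDUC_GROUP_SIZE == 4 and two new
     vector stmts, RATIO is 2, so scalar results 0 and 1 are matched with the
     first vector stmt and scalar results 2 and 3 with the second.  */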
5556   if (group_size > new_phis.length ())
5557     {
5558       ratio = group_size / new_phis.length ();
5559       gcc_assert (!(group_size % new_phis.length ()));
5560     }
5561   else
5562     ratio = 1;
5563 
5564   stmt_vec_info epilog_stmt_info = NULL;
5565   for (k = 0; k < group_size; k++)
5566     {
5567       if (k % ratio == 0)
5568         {
5569 	  epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5570 	  reduction_phi_info = reduction_phis[k / ratio];
5571 	  if (double_reduc)
5572 	    inner_phi = inner_phis[k / ratio];
5573         }
5574 
5575       if (slp_reduc)
5576         {
5577 	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5578 
5579 	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5580 	  /* SLP statements can't participate in patterns.  */
5581 	  gcc_assert (!orig_stmt_info);
5582 	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5583         }
5584 
5585       phis.create (3);
5586       /* Find the loop-closed-use at the loop exit of the original scalar
5587          result.  (The reduction result is expected to have two immediate uses -
5588          one at the latch block, and one at the loop exit).  */
5589       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5590         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5591 	    && !is_gimple_debug (USE_STMT (use_p)))
5592           phis.safe_push (USE_STMT (use_p));
5593 
5594       /* While we expect to have found an exit_phi because of loop-closed-ssa
5595          form, we can end up without one if the scalar cycle is dead.  */
5596 
5597       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5598         {
5599           if (outer_loop)
5600             {
5601 	      stmt_vec_info exit_phi_vinfo
5602 		= loop_vinfo->lookup_stmt (exit_phi);
5603               gphi *vect_phi;
5604 
5605 	      if (double_reduc)
5606 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5607 	      else
5608 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5609               if (!double_reduc
5610                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5611                       != vect_double_reduction_def)
5612                 continue;
5613 
5614               /* Handle double reduction:
5615 
5616                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5617                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5618                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5619                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5620 
5621                  At that point the regular reduction (stmt2 and stmt3) is
5622                  already vectorized, as well as the exit phi node, stmt4.
5623                  Here we vectorize the phi node of double reduction, stmt1, and
5624                  update all relevant statements.  */
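              /* A typical source pattern for such a double reduction (shown
                 only as an illustration) is a nested sum such as

                     s = init;
                     for (i = 0; i < N; i++)
                       for (j = 0; j < M; j++)
                         s += a[i][j];
                     use (s);

                 where the outer (i) loop is the one being vectorized and s
                 is only used after it.  */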
5625 
5626               /* Go through all the uses of s2 to find double reduction phi
5627                  node, i.e., stmt1 above.  */
5628               orig_name = PHI_RESULT (exit_phi);
5629               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5630                 {
5631                   stmt_vec_info use_stmt_vinfo;
5632                   tree vect_phi_init, preheader_arg, vect_phi_res;
5633                   basic_block bb = gimple_bb (use_stmt);
5634 
5635                   /* Check that USE_STMT is really a double reduction phi
5636                      node.  */
5637                   if (gimple_code (use_stmt) != GIMPLE_PHI
5638                       || gimple_phi_num_args (use_stmt) != 2
5639                       || bb->loop_father != outer_loop)
5640                     continue;
5641 		  use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5642                   if (!use_stmt_vinfo
5643                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5644                           != vect_double_reduction_def)
5645 		    continue;
5646 
5647                   /* Create vector phi node for double reduction:
5648                      vs1 = phi <vs0, vs2>
5649                      vs1 was created previously in this function by a call to
5650                        vect_get_vec_def_for_operand and is stored in
5651                        vec_initial_def;
5652                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5653                      vs0 is created here.  */
5654 
5655                   /* Create vector phi node.  */
5656                   vect_phi = create_phi_node (vec_initial_def, bb);
5657 		  loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5658 
5659                   /* Create vs0 - initial def of the double reduction phi.  */
5660                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5661                                              loop_preheader_edge (outer_loop));
5662                   vect_phi_init = get_initial_def_for_reduction
5663 		    (stmt_info, preheader_arg, NULL);
5664 
5665                   /* Update phi node arguments with vs0 and vs2.  */
5666                   add_phi_arg (vect_phi, vect_phi_init,
5667                                loop_preheader_edge (outer_loop),
5668                                UNKNOWN_LOCATION);
5669 		  add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5670 			       loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5671                   if (dump_enabled_p ())
5672 		    dump_printf_loc (MSG_NOTE, vect_location,
5673 				     "created double reduction phi node: %G",
5674 				     vect_phi);
5675 
5676                   vect_phi_res = PHI_RESULT (vect_phi);
5677 
5678                   /* Replace the use, i.e., set the correct vs1 in the regular
5679                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5680                      loop is redundant.  */
5681 		  stmt_vec_info use_info = reduction_phi_info;
5682 		  for (j = 0; j < ncopies; j++)
5683 		    {
5684 		      edge pr_edge = loop_preheader_edge (loop);
5685 		      SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5686 				       pr_edge->dest_idx, vect_phi_res);
5687 		      use_info = STMT_VINFO_RELATED_STMT (use_info);
5688 		    }
5689                 }
5690             }
5691         }
5692 
5693       phis.release ();
5694       if (nested_in_vect_loop)
5695         {
5696           if (double_reduc)
5697             loop = outer_loop;
5698           else
5699             continue;
5700         }
5701 
5702       phis.create (3);
5703       /* Find the loop-closed-use at the loop exit of the original scalar
5704          result.  (The reduction result is expected to have two immediate uses,
5705          one at the latch block, and one at the loop exit).  For double
5706          reductions we are looking for exit phis of the outer loop.  */
5707       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5708         {
5709           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5710 	    {
5711 	      if (!is_gimple_debug (USE_STMT (use_p)))
5712 		phis.safe_push (USE_STMT (use_p));
5713 	    }
5714           else
5715             {
5716               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5717                 {
5718                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5719 
5720                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5721                     {
5722                       if (!flow_bb_inside_loop_p (loop,
5723                                              gimple_bb (USE_STMT (phi_use_p)))
5724 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
5725                         phis.safe_push (USE_STMT (phi_use_p));
5726                     }
5727                 }
5728             }
5729         }
5730 
5731       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5732         {
5733           /* Replace the uses:  */
5734           orig_name = PHI_RESULT (exit_phi);
5735           scalar_result = scalar_results[k];
5736           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5737             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5738               SET_USE (use_p, scalar_result);
5739         }
5740 
5741       phis.release ();
5742     }
5743 }
5744 
5745 /* Return a vector of type VECTYPE that is equal to the vector select
5746    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5747    before GSI.  */
5748 
5749 static tree
5750 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5751 		     tree vec, tree identity)
5752 {
5753   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5754   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5755 					  mask, vec, identity);
5756   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5757   return cond;
5758 }
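
/* Illustrative sketch (not part of the original sources): in a fully-masked
   loop a PLUS reduction input is filtered through merge_with_identity so
   that inactive lanes contribute the additive identity, roughly:

	cond_N = VEC_COND_EXPR <loop_mask_M, vect_input_K, { 0, ... }>;

   before the result is folded into the running accumulator.  The names
   loop_mask_M and vect_input_K are placeholders for illustration only.  */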
5759 
5760 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5761    order, starting with LHS.  Insert the extraction statements before GSI and
5762    associate the new scalar SSA names with variable SCALAR_DEST.
5763    Return the SSA name for the result.  */
5764 
5765 static tree
5766 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5767 		       tree_code code, tree lhs, tree vector_rhs)
5768 {
5769   tree vectype = TREE_TYPE (vector_rhs);
5770   tree scalar_type = TREE_TYPE (vectype);
5771   tree bitsize = TYPE_SIZE (scalar_type);
5772   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5773   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5774 
5775   for (unsigned HOST_WIDE_INT bit_offset = 0;
5776        bit_offset < vec_size_in_bits;
5777        bit_offset += element_bitsize)
5778     {
5779       tree bitpos = bitsize_int (bit_offset);
5780       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5781 			 bitsize, bitpos);
5782 
5783       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5784       rhs = make_ssa_name (scalar_dest, stmt);
5785       gimple_assign_set_lhs (stmt, rhs);
5786       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5787 
5788       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5789       tree new_name = make_ssa_name (scalar_dest, stmt);
5790       gimple_assign_set_lhs (stmt, new_name);
5791       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5792       lhs = new_name;
5793     }
5794   return lhs;
5795 }
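
/* A sketch of the expansion above, assuming a four-element vector of 32-bit
   elements and CODE == PLUS_EXPR (the SSA names are illustrative only):

	s_0 = BIT_FIELD_REF <vector_rhs, 32, 0>;
	lhs_1 = lhs + s_0;
	s_1 = BIT_FIELD_REF <vector_rhs, 32, 32>;
	lhs_2 = lhs_1 + s_1;
	s_2 = BIT_FIELD_REF <vector_rhs, 32, 64>;
	lhs_3 = lhs_2 + s_2;
	s_3 = BIT_FIELD_REF <vector_rhs, 32, 96>;
	lhs_4 = lhs_3 + s_3;

   i.e. the elements are folded in strictly left-to-right order, which is
   what an in-order (fold-left) reduction requires.  */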
5796 
5797 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5798    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5799    statement.  CODE is the operation performed by STMT_INFO and OPS are
5800    its scalar operands.  REDUC_INDEX is the index of the operand in
5801    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5802    implements in-order reduction, or IFN_LAST if we should open-code it.
5803    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5804    that should be used to control the operation in a fully-masked loop.  */
5805 
5806 static bool
5807 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5808 			       gimple_stmt_iterator *gsi,
5809 			       stmt_vec_info *vec_stmt, slp_tree slp_node,
5810 			       gimple *reduc_def_stmt,
5811 			       tree_code code, internal_fn reduc_fn,
5812 			       tree ops[3], tree vectype_in,
5813 			       int reduc_index, vec_loop_masks *masks)
5814 {
5815   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5816   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5817   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5818   stmt_vec_info new_stmt_info = NULL;
5819 
5820   int ncopies;
5821   if (slp_node)
5822     ncopies = 1;
5823   else
5824     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5825 
5826   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5827   gcc_assert (ncopies == 1);
5828   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5829   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5830   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5831 	      == FOLD_LEFT_REDUCTION);
5832 
5833   if (slp_node)
5834     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5835 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
5836 
5837   tree op0 = ops[1 - reduc_index];
5838 
5839   int group_size = 1;
5840   stmt_vec_info scalar_dest_def_info;
5841   auto_vec<tree> vec_oprnds0;
5842   if (slp_node)
5843     {
5844       auto_vec<vec<tree> > vec_defs (2);
5845       auto_vec<tree> sops(2);
5846       sops.quick_push (ops[0]);
5847       sops.quick_push (ops[1]);
5848       vect_get_slp_defs (sops, slp_node, &vec_defs);
5849       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5850       vec_defs[0].release ();
5851       vec_defs[1].release ();
5852       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5853       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5854     }
5855   else
5856     {
5857       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5858       vec_oprnds0.create (1);
5859       vec_oprnds0.quick_push (loop_vec_def0);
5860       scalar_dest_def_info = stmt_info;
5861     }
5862 
5863   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5864   tree scalar_type = TREE_TYPE (scalar_dest);
5865   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5866 
5867   int vec_num = vec_oprnds0.length ();
5868   gcc_assert (vec_num == 1 || slp_node);
5869   tree vec_elem_type = TREE_TYPE (vectype_out);
5870   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5871 
5872   tree vector_identity = NULL_TREE;
5873   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5874     vector_identity = build_zero_cst (vectype_out);
5875 
5876   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5877   int i;
5878   tree def0;
5879   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5880     {
5881       gimple *new_stmt;
5882       tree mask = NULL_TREE;
5883       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5884 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5885 
5886       /* Handle MINUS by adding the negative.  */
5887       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5888 	{
5889 	  tree negated = make_ssa_name (vectype_out);
5890 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5891 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5892 	  def0 = negated;
5893 	}
5894 
5895       if (mask)
5896 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5897 				    vector_identity);
5898 
5899       /* On the first iteration the input is simply the scalar phi
5900 	 result, and for subsequent iterations it is the output of
5901 	 the preceding operation.  */
5902       if (reduc_fn != IFN_LAST)
5903 	{
5904 	  new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5905 	  /* For chained SLP reductions the output of the previous reduction
5906 	     operation serves as the input of the next. For the final statement
5907 	     the output cannot be a temporary - we reuse the original
5908 	     scalar destination of the last statement.  */
5909 	  if (i != vec_num - 1)
5910 	    {
5911 	      gimple_set_lhs (new_stmt, scalar_dest_var);
5912 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5913 	      gimple_set_lhs (new_stmt, reduc_var);
5914 	    }
5915 	}
5916       else
5917 	{
5918 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5919 					     reduc_var, def0);
5920 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5921 	  /* Remove the statement, so that we can use the same code paths
5922 	     as for statements that we've just created.  */
5923 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5924 	  gsi_remove (&tmp_gsi, true);
5925 	}
5926 
5927       if (i == vec_num - 1)
5928 	{
5929 	  gimple_set_lhs (new_stmt, scalar_dest);
5930 	  new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5931 						    new_stmt);
5932 	}
5933       else
5934 	new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5935 						     new_stmt, gsi);
5936 
5937       if (slp_node)
5938 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5939     }
5940 
5941   if (!slp_node)
5942     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5943 
5944   return true;
5945 }
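
/* For reference, an in-order reduction corresponds to scalar source such as
   (illustrative only):

	double s = init;
	for (int i = 0; i < n; i++)
	  s += a[i];

   compiled without permission to reassociate the additions.  When the target
   provides a suitable internal function the vectorized body becomes a call
   such as s_next = IFN_FOLD_LEFT_PLUS (s, vect_a); otherwise
   vect_expand_fold_left open-codes the element-by-element chain.  In a
   fully-masked loop the vector input is first passed through
   merge_with_identity so that inactive lanes contribute the identity.  */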
5946 
5947 /* Function is_nonwrapping_integer_induction.
5948 
5949    Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5950    that both increments and does not cause overflow.  */
5951 
5952 static bool
5953 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5954 {
5955   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5956   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5957   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5958   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5959   widest_int ni, max_loop_value, lhs_max;
5960   wi::overflow_type overflow = wi::OVF_NONE;
5961 
5962   /* Make sure the loop is integer based.  */
5963   if (TREE_CODE (base) != INTEGER_CST
5964       || TREE_CODE (step) != INTEGER_CST)
5965     return false;
5966 
5967   /* Check that the max size of the loop will not wrap.  */
5968 
5969   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5970     return true;
5971 
5972   if (! max_stmt_executions (loop, &ni))
5973     return false;
5974 
5975   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5976 			    &overflow);
5977   if (overflow)
5978     return false;
5979 
5980   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5981 			    TYPE_SIGN (lhs_type), &overflow);
5982   if (overflow)
5983     return false;
5984 
5985   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5986 	  <= TYPE_PRECISION (lhs_type));
5987 }
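
/* Worked example (values chosen purely for illustration): for an unsigned
   8-bit induction with base 0 and step 4 in a loop that executes at most 100
   times, the largest value reached is 0 + 4 * 100 = 400, which needs 9 bits
   of precision and therefore wraps in the 8-bit type, so the function
   returns false.  With step 1 the maximum is 100, which fits in 8 bits, and
   the function returns true.  */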
5988 
5989 /* Function vectorizable_reduction.
5990 
5991    Check if STMT_INFO performs a reduction operation that can be vectorized.
5992    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5993    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5994    Return true if STMT_INFO is vectorizable in this way.
5995 
5996    This function also handles reduction idioms (patterns) that have been
5997    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
5998    may be of this form:
5999      X = pattern_expr (arg0, arg1, ..., X)
6000    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6001    sequence that had been detected and replaced by the pattern-stmt
6002    (STMT_INFO).
6003 
6004    This function also handles reduction of condition expressions, for example:
6005      for (int i = 0; i < N; i++)
6006        if (a[i] < value)
6007 	 last = a[i];
6008    This is handled by vectorising the loop and creating an additional vector
6009    containing the loop indexes for which "a[i] < value" was true.  In the
6010    function epilogue this is reduced to a single max value and then used to
6011    index into the vector of results.
6012 
6013    In some cases of reduction patterns, the type of the reduction variable X is
6014    different than the type of the other arguments of STMT_INFO.
6015    In such cases, the vectype that is used when transforming STMT_INFO into
6016    a vector stmt is different than the vectype that is used to determine the
6017    vectorization factor, because it consists of a different number of elements
6018    than the actual number of elements that are being operated upon in parallel.
6019 
6020    For example, consider an accumulation of shorts into an int accumulator.
6021    On some targets it's possible to vectorize this pattern operating on 8
6022    shorts at a time (hence, the vectype for purposes of determining the
6023    vectorization factor should be V8HI); on the other hand, the vectype that
6024    is used to create the vector form is actually V4SI (the type of the result).
6025 
6026    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6027    indicates what is the actual level of parallelism (V8HI in the example), so
6028    that the right vectorization factor would be derived.  This vectype
6029    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6030    be used to create the vectorized stmt.  The right vectype for the vectorized
6031    stmt is obtained from the type of the result X:
6032         get_vectype_for_scalar_type (TREE_TYPE (X))
6033 
6034    This means that, contrary to "regular" reductions (or "regular" stmts in
6035    general), the following equation:
6036       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6037    does *NOT* necessarily hold for reduction patterns.  */
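
/* As a concrete sketch of the widening example above (illustrative only),
   scalar source of the form

	short a[N]; int sum = 0;
	for (int i = 0; i < N; i++)
	  sum += a[i];

   may be recognized as a widen-sum pattern; STMT_VINFO_VECTYPE is then V8HI
   (eight shorts determine the vectorization factor), while the vectorized
   statement itself produces a V4SI accumulator whose type comes from
   get_vectype_for_scalar_type (TREE_TYPE (sum)).  */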
6038 
6039 bool
6040 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6041 			stmt_vec_info *vec_stmt, slp_tree slp_node,
6042 			slp_instance slp_node_instance,
6043 			stmt_vector_for_cost *cost_vec)
6044 {
6045   tree vec_dest;
6046   tree scalar_dest;
6047   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6048   tree vectype_in = NULL_TREE;
6049   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6050   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6051   enum tree_code code, orig_code;
6052   internal_fn reduc_fn;
6053   machine_mode vec_mode;
6054   int op_type;
6055   optab optab;
6056   tree new_temp = NULL_TREE;
6057   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6058   stmt_vec_info cond_stmt_vinfo = NULL;
6059   enum tree_code cond_reduc_op_code = ERROR_MARK;
6060   tree scalar_type;
6061   bool is_simple_use;
6062   int i;
6063   int ncopies;
6064   int epilog_copies;
6065   stmt_vec_info prev_stmt_info, prev_phi_info;
6066   bool single_defuse_cycle = false;
6067   stmt_vec_info new_stmt_info = NULL;
6068   int j;
6069   tree ops[3];
6070   enum vect_def_type dts[3];
6071   bool nested_cycle = false, found_nested_cycle_def = false;
6072   bool double_reduc = false;
6073   basic_block def_bb;
6074   struct loop * def_stmt_loop;
6075   tree def_arg;
6076   auto_vec<tree> vec_oprnds0;
6077   auto_vec<tree> vec_oprnds1;
6078   auto_vec<tree> vec_oprnds2;
6079   auto_vec<tree> vect_defs;
6080   auto_vec<stmt_vec_info> phis;
6081   int vec_num;
6082   tree def0, tem;
6083   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6084   tree cond_reduc_val = NULL_TREE;
6085 
6086   /* Make sure it was already recognized as a reduction computation.  */
6087   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6088       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6089     return false;
6090 
6091   if (nested_in_vect_loop_p (loop, stmt_info))
6092     {
6093       loop = loop->inner;
6094       nested_cycle = true;
6095     }
6096 
6097   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6098     gcc_assert (slp_node
6099 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6100 
6101   if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6102     {
6103       tree phi_result = gimple_phi_result (phi);
6104       /* Analysis is fully done on the reduction stmt invocation.  */
6105       if (! vec_stmt)
6106 	{
6107 	  if (slp_node)
6108 	    slp_node_instance->reduc_phis = slp_node;
6109 
6110 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6111 	  return true;
6112 	}
6113 
6114       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6115 	/* Leave the scalar phi in place.  Note that checking
6116 	   STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6117 	   for reductions involving a single statement.  */
6118 	return true;
6119 
6120       stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6121       reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6122 
6123       if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6124 	  == EXTRACT_LAST_REDUCTION)
6125 	/* Leave the scalar phi in place.  */
6126 	return true;
6127 
6128       gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6129       code = gimple_assign_rhs_code (reduc_stmt);
6130       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6131 	{
6132 	  tree op = gimple_op (reduc_stmt, k);
6133 	  if (op == phi_result)
6134 	    continue;
6135 	  if (k == 1 && code == COND_EXPR)
6136 	    continue;
6137 	  bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6138 	  gcc_assert (is_simple_use);
6139 	  if (dt == vect_constant_def || dt == vect_external_def)
6140 	    continue;
6141 	  if (!vectype_in
6142 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6143 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6144 	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6145 	  break;
6146 	}
6147       /* For a nested cycle we might end up with an operation like
6148          phi_result * phi_result.  */
6149       if (!vectype_in)
6150 	vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6151       gcc_assert (vectype_in);
6152 
6153       if (slp_node)
6154 	ncopies = 1;
6155       else
6156 	ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6157 
6158       stmt_vec_info use_stmt_info;
6159       if (ncopies > 1
6160 	  && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6161 	  && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6162 	  && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6163 	single_defuse_cycle = true;
6164 
6165       /* Create the destination vector  */
6166       scalar_dest = gimple_assign_lhs (reduc_stmt);
6167       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6168 
6169       if (slp_node)
6170 	/* The size vect_schedule_slp_instance computes is off for us.  */
6171 	vec_num = vect_get_num_vectors
6172 	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6173 	   * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6174 	   vectype_in);
6175       else
6176 	vec_num = 1;
6177 
6178       /* Generate the reduction PHIs upfront.  */
6179       prev_phi_info = NULL;
6180       for (j = 0; j < ncopies; j++)
6181 	{
6182 	  if (j == 0 || !single_defuse_cycle)
6183 	    {
6184 	      for (i = 0; i < vec_num; i++)
6185 		{
6186 		  /* Create the reduction-phi that defines the reduction
6187 		     operand.  */
6188 		  gimple *new_phi = create_phi_node (vec_dest, loop->header);
6189 		  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6190 
6191 		  if (slp_node)
6192 		    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6193 		  else
6194 		    {
6195 		      if (j == 0)
6196 			STMT_VINFO_VEC_STMT (stmt_info)
6197 			  = *vec_stmt = new_phi_info;
6198 		      else
6199 			STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6200 		      prev_phi_info = new_phi_info;
6201 		    }
6202 		}
6203 	    }
6204 	}
6205 
6206       return true;
6207     }
6208 
6209   /* 1. Is vectorizable reduction?  */
6210   /* Not supportable if the reduction variable is used in the loop, unless
6211      it's a reduction chain.  */
6212   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6213       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6214     return false;
6215 
6216   /* Reductions that are not used even in an enclosing outer-loop,
6217      are expected to be "live" (used out of the loop).  */
6218   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6219       && !STMT_VINFO_LIVE_P (stmt_info))
6220     return false;
6221 
6222   /* 2. Has this been recognized as a reduction pattern?
6223 
6224      Check if STMT represents a pattern that has been recognized
6225      in earlier analysis stages.  For stmts that represent a pattern,
6226      the STMT_VINFO_RELATED_STMT field records the last stmt in
6227      the original sequence that constitutes the pattern.  */
6228 
6229   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6230   if (orig_stmt_info)
6231     {
6232       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6233       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6234     }
6235 
6236   /* 3. Check the operands of the operation.  The first operands are defined
6237         inside the loop body. The last operand is the reduction variable,
6238         which is defined by the loop-header-phi.  */
6239 
6240   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6241 
6242   /* Flatten RHS.  */
6243   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6244     {
6245     case GIMPLE_BINARY_RHS:
6246       code = gimple_assign_rhs_code (stmt);
6247       op_type = TREE_CODE_LENGTH (code);
6248       gcc_assert (op_type == binary_op);
6249       ops[0] = gimple_assign_rhs1 (stmt);
6250       ops[1] = gimple_assign_rhs2 (stmt);
6251       break;
6252 
6253     case GIMPLE_TERNARY_RHS:
6254       code = gimple_assign_rhs_code (stmt);
6255       op_type = TREE_CODE_LENGTH (code);
6256       gcc_assert (op_type == ternary_op);
6257       ops[0] = gimple_assign_rhs1 (stmt);
6258       ops[1] = gimple_assign_rhs2 (stmt);
6259       ops[2] = gimple_assign_rhs3 (stmt);
6260       break;
6261 
6262     case GIMPLE_UNARY_RHS:
6263       return false;
6264 
6265     default:
6266       gcc_unreachable ();
6267     }
6268 
6269   if (code == COND_EXPR && slp_node)
6270     return false;
6271 
6272   scalar_dest = gimple_assign_lhs (stmt);
6273   scalar_type = TREE_TYPE (scalar_dest);
6274   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6275       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6276     return false;
6277 
6278   /* Do not try to vectorize bit-precision reductions.  */
6279   if (!type_has_mode_precision_p (scalar_type))
6280     return false;
6281 
6282   /* All uses but the last are expected to be defined in the loop.
6283      The last use is the reduction variable.  In case of nested cycle this
6284      assumption is not true: we use reduc_index to record the index of the
6285      reduction variable.  */
6286   stmt_vec_info reduc_def_info;
6287   if (orig_stmt_info)
6288     reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6289   else
6290     reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6291   gcc_assert (reduc_def_info);
6292   gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6293   tree reduc_def = PHI_RESULT (reduc_def_phi);
6294   int reduc_index = -1;
6295   for (i = 0; i < op_type; i++)
6296     {
6297       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6298       if (i == 0 && code == COND_EXPR)
6299         continue;
6300 
6301       stmt_vec_info def_stmt_info;
6302       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6303 					  &def_stmt_info);
6304       dt = dts[i];
6305       gcc_assert (is_simple_use);
6306       if (dt == vect_reduction_def
6307 	  && ops[i] == reduc_def)
6308 	{
6309 	  reduc_index = i;
6310 	  continue;
6311 	}
6312       else if (tem)
6313 	{
6314 	  /* To properly compute ncopies we are interested in the widest
6315 	     input type in case we're looking at a widening accumulation.  */
6316 	  if (!vectype_in
6317 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6318 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6319 	    vectype_in = tem;
6320 	}
6321 
6322       if (dt != vect_internal_def
6323 	  && dt != vect_external_def
6324 	  && dt != vect_constant_def
6325 	  && dt != vect_induction_def
6326           && !(dt == vect_nested_cycle && nested_cycle))
6327 	return false;
6328 
6329       if (dt == vect_nested_cycle
6330 	  && ops[i] == reduc_def)
6331 	{
6332 	  found_nested_cycle_def = true;
6333 	  reduc_index = i;
6334 	}
6335 
6336       if (i == 1 && code == COND_EXPR)
6337 	{
6338 	  /* Record how value of COND_EXPR is defined.  */
6339 	  if (dt == vect_constant_def)
6340 	    {
6341 	      cond_reduc_dt = dt;
6342 	      cond_reduc_val = ops[i];
6343 	    }
6344 	  if (dt == vect_induction_def
6345 	      && def_stmt_info
6346 	      && is_nonwrapping_integer_induction (def_stmt_info, loop))
6347 	    {
6348 	      cond_reduc_dt = dt;
6349 	      cond_stmt_vinfo = def_stmt_info;
6350 	    }
6351 	}
6352     }
6353 
6354   if (!vectype_in)
6355     vectype_in = vectype_out;
6356 
6357   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6358      directly used in stmt.  */
6359   if (reduc_index == -1)
6360     {
6361       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6362 	{
6363 	  if (dump_enabled_p ())
6364 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6365 			     "in-order reduction chain without SLP.\n");
6366 	  return false;
6367 	}
6368     }
6369 
6370   if (!(reduc_index == -1
6371 	|| dts[reduc_index] == vect_reduction_def
6372 	|| dts[reduc_index] == vect_nested_cycle
6373 	|| ((dts[reduc_index] == vect_internal_def
6374 	     || dts[reduc_index] == vect_external_def
6375 	     || dts[reduc_index] == vect_constant_def
6376 	     || dts[reduc_index] == vect_induction_def)
6377 	    && nested_cycle && found_nested_cycle_def)))
6378     {
6379       /* For pattern recognized stmts, orig_stmt might be a reduction,
6380 	 but some helper statements for the pattern might not, or
6381 	 might be COND_EXPRs with reduction uses in the condition.  */
6382       gcc_assert (orig_stmt_info);
6383       return false;
6384     }
6385 
6386   /* PHIs should not participate in patterns.  */
6387   gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6388   enum vect_reduction_type v_reduc_type
6389     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6390   stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6391 
6392   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6393   /* If we have a condition reduction, see if we can simplify it further.  */
6394   if (v_reduc_type == COND_REDUCTION)
6395     {
6396       /* TODO: We can't yet handle reduction chains, since we need to treat
6397 	 each COND_EXPR in the chain specially, not just the last one.
6398 	 E.g. for:
6399 
6400 	    x_1 = PHI <x_3, ...>
6401 	    x_2 = a_2 ? ... : x_1;
6402 	    x_3 = a_3 ? ... : x_2;
6403 
6404 	 we're interested in the last element in x_3 for which a_2 || a_3
6405 	 is true, whereas the current reduction chain handling would
6406 	 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6407 	 as a reduction operation.  */
6408       if (reduc_index == -1)
6409 	{
6410 	  if (dump_enabled_p ())
6411 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6412 			     "conditional reduction chains not supported\n");
6413 	  return false;
6414 	}
6415 
6416       /* vect_is_simple_reduction ensured that operand 2 is the
6417 	 loop-carried operand.  */
6418       gcc_assert (reduc_index == 2);
6419 
6420       /* Loop peeling modifies initial value of reduction PHI, which
6421	 makes the reduction stmt that is transformed differ from the
6422	 original stmt that was analyzed.  We need to record the reduction
6423	 code for a CONST_COND_REDUCTION type reduction at the analysis
6424	 stage, so that it can be used directly at the transform stage.  */
6425       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6426 	  || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6427 	{
6428 	  /* Also set the reduction type to CONST_COND_REDUCTION.  */
6429 	  gcc_assert (cond_reduc_dt == vect_constant_def);
6430 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6431 	}
6432       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6433 					       vectype_in, OPTIMIZE_FOR_SPEED))
6434 	{
6435 	  if (dump_enabled_p ())
6436 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6437 			     "optimizing condition reduction with"
6438 			     " FOLD_EXTRACT_LAST.\n");
6439 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6440 	}
6441       else if (cond_reduc_dt == vect_induction_def)
6442 	{
6443 	  tree base
6444 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6445 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6446 
6447 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6448 		      && TREE_CODE (step) == INTEGER_CST);
6449 	  cond_reduc_val = NULL_TREE;
6450 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6451 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6452 	    ;
6453 	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6454 	     above base; punt if base is the minimum value of the type for
6455 	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6456 	  else if (tree_int_cst_sgn (step) == -1)
6457 	    {
6458 	      cond_reduc_op_code = MIN_EXPR;
6459 	      if (tree_int_cst_sgn (base) == -1)
6460 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6461 	      else if (tree_int_cst_lt (base,
6462 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6463 		cond_reduc_val
6464 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6465 	    }
6466 	  else
6467 	    {
6468 	      cond_reduc_op_code = MAX_EXPR;
6469 	      if (tree_int_cst_sgn (base) == 1)
6470 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6471 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6472 					base))
6473 		cond_reduc_val
6474 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6475 	    }
6476 	  if (cond_reduc_val)
6477 	    {
6478 	      if (dump_enabled_p ())
6479 		dump_printf_loc (MSG_NOTE, vect_location,
6480 				 "condition expression based on "
6481 				 "integer induction.\n");
6482 	      STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6483 		= INTEGER_INDUC_COND_REDUCTION;
6484 	    }
6485 	}
6486       else if (cond_reduc_dt == vect_constant_def)
6487 	{
6488 	  enum vect_def_type cond_initial_dt;
6489 	  gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6490 	  tree cond_initial_val
6491 	    = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6492 
6493 	  gcc_assert (cond_reduc_val != NULL_TREE);
6494 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6495 	  if (cond_initial_dt == vect_constant_def
6496 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6497 				     TREE_TYPE (cond_reduc_val)))
6498 	    {
6499 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6500 				    cond_initial_val, cond_reduc_val);
6501 	      if (e && (integer_onep (e) || integer_zerop (e)))
6502 		{
6503 		  if (dump_enabled_p ())
6504 		    dump_printf_loc (MSG_NOTE, vect_location,
6505 				     "condition expression based on "
6506 				     "compile time constant.\n");
6507 		  /* Record reduction code at analysis stage.  */
6508 		  STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6509 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6510 		  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6511 		    = CONST_COND_REDUCTION;
6512 		}
6513 	    }
6514 	}
6515     }
6516 
6517   if (orig_stmt_info)
6518     gcc_assert (tmp == orig_stmt_info
6519 		|| REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6520   else
6521     /* We changed STMT to be the first stmt in reduction chain, hence we
6522        check that in this case the first element in the chain is STMT.  */
6523     gcc_assert (tmp == stmt_info
6524 		|| REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6525 
6526   if (STMT_VINFO_LIVE_P (reduc_def_info))
6527     return false;
6528 
6529   if (slp_node)
6530     ncopies = 1;
6531   else
6532     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6533 
6534   gcc_assert (ncopies >= 1);
6535 
6536   vec_mode = TYPE_MODE (vectype_in);
6537   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6538 
6539   if (nested_cycle)
6540     {
6541       def_bb = gimple_bb (reduc_def_phi);
6542       def_stmt_loop = def_bb->loop_father;
6543       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6544                                        loop_preheader_edge (def_stmt_loop));
6545       stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6546       if (def_arg_stmt_info
6547 	  && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6548 	      == vect_double_reduction_def))
6549         double_reduc = true;
6550     }
6551 
6552   vect_reduction_type reduction_type
6553     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6554   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6555       && ncopies > 1)
6556     {
6557       if (dump_enabled_p ())
6558 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6559 			 "multiple types in double reduction or condition "
6560 			 "reduction.\n");
6561       return false;
6562     }
6563 
6564   if (code == COND_EXPR)
6565     {
6566       /* Only call during the analysis stage, otherwise we'll lose
6567 	 STMT_VINFO_TYPE.  */
6568       if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6569 						true, NULL, cost_vec))
6570         {
6571           if (dump_enabled_p ())
6572 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 			     "unsupported condition in reduction\n");
6574 	  return false;
6575         }
6576     }
6577   else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6578 	   || code == LROTATE_EXPR || code == RROTATE_EXPR)
6579     {
6580       /* Only call during the analysis stage, otherwise we'll lose
6581 	 STMT_VINFO_TYPE.  We only support this for nested cycles
6582 	 without double reductions at the moment.  */
6583       if (!nested_cycle
6584 	  || double_reduc
6585 	  || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6586 						NULL, cost_vec)))
6587 	{
6588           if (dump_enabled_p ())
6589 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6590 			     "unsupported shift or rotation in reduction\n");
6591 	  return false;
6592 	}
6593     }
6594   else
6595     {
6596       /* 4. Supportable by target?  */
6597 
6598       /* 4.1. check support for the operation in the loop  */
6599       optab = optab_for_tree_code (code, vectype_in, optab_default);
6600       if (!optab)
6601         {
6602           if (dump_enabled_p ())
6603 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6604 			     "no optab.\n");
6605 
6606           return false;
6607         }
6608 
6609       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6610         {
6611           if (dump_enabled_p ())
6612             dump_printf (MSG_NOTE, "op not supported by target.\n");
6613 
6614 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6615 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6616             return false;
6617 
6618           if (dump_enabled_p ())
6619   	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6620         }
6621 
6622       /* Worthwhile without SIMD support?  */
6623       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6624 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6625         {
6626           if (dump_enabled_p ())
6627 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6628 			     "not worthwhile without SIMD support.\n");
6629 
6630           return false;
6631         }
6632     }
6633 
6634   /* 4.2. Check support for the epilog operation.
6635 
6636           If STMT represents a reduction pattern, then the type of the
6637           reduction variable may be different than the type of the rest
6638           of the arguments.  For example, consider the case of accumulation
6639           of shorts into an int accumulator; the original code:
6640                         S1: int_a = (int) short_a;
6641           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6642 
6643           was replaced with:
6644                         STMT: int_acc = widen_sum <short_a, int_acc>
6645 
6646           This means that:
6647           1. The tree-code that is used to create the vector operation in the
6648              epilog code (that reduces the partial results) is not the
6649              tree-code of STMT, but is rather the tree-code of the original
6650              stmt from the pattern that STMT is replacing.  I.e, in the example
6651              above we want to use 'widen_sum' in the loop, but 'plus' in the
6652              epilog.
6653           2. The type (mode) we use to check available target support
6654              for the vector operation to be created in the *epilog*, is
6655              determined by the type of the reduction variable (in the example
6656              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6657              However the type (mode) we use to check available target support
6658              for the vector operation to be created *inside the loop*, is
6659              determined by the type of the other arguments to STMT (in the
6660              example we'd check this: optab_handler (widen_sum_optab,
6661 	     vect_short_mode)).
6662 
6663           This is contrary to "regular" reductions, in which the types of all
6664           the arguments are the same as the type of the reduction variable.
6665           For "regular" reductions we can therefore use the same vector type
6666           (and also the same tree-code) when generating the epilog code and
6667           when generating the code inside the loop.  */
6668 
6669   if (orig_stmt_info
6670       && (reduction_type == TREE_CODE_REDUCTION
6671 	  || reduction_type == FOLD_LEFT_REDUCTION))
6672     {
6673       /* This is a reduction pattern: get the vectype from the type of the
6674          reduction variable, and get the tree-code from orig_stmt.  */
6675       orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6676       gcc_assert (vectype_out);
6677       vec_mode = TYPE_MODE (vectype_out);
6678     }
6679   else
6680     {
6681       /* Regular reduction: the same vectype and tree-code used for the
6682          vector code inside the loop can also be used for the epilog code.  */
6683       orig_code = code;
6684 
6685       if (code == MINUS_EXPR)
6686 	orig_code = PLUS_EXPR;
6687 
6688       /* For simple condition reductions, replace with the actual expression
6689 	 we want to base our reduction around.  */
6690       if (reduction_type == CONST_COND_REDUCTION)
6691 	{
6692 	  orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6693 	  gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6694 	}
6695       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6696 	orig_code = cond_reduc_op_code;
6697     }
6698 
6699   reduc_fn = IFN_LAST;
6700 
6701   if (reduction_type == TREE_CODE_REDUCTION
6702       || reduction_type == FOLD_LEFT_REDUCTION
6703       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6704       || reduction_type == CONST_COND_REDUCTION)
6705     {
6706       if (reduction_type == FOLD_LEFT_REDUCTION
6707 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
6708 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6709 	{
6710 	  if (reduc_fn != IFN_LAST
6711 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6712 						  OPTIMIZE_FOR_SPEED))
6713 	    {
6714 	      if (dump_enabled_p ())
6715 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6716 				 "reduc op not supported by target.\n");
6717 
6718 	      reduc_fn = IFN_LAST;
6719 	    }
6720 	}
6721       else
6722 	{
6723 	  if (!nested_cycle || double_reduc)
6724 	    {
6725 	      if (dump_enabled_p ())
6726 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 				 "no reduc code for scalar code.\n");
6728 
6729 	      return false;
6730 	    }
6731 	}
6732     }
6733   else if (reduction_type == COND_REDUCTION)
6734     {
6735       int scalar_precision
6736 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6737       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6738       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6739 						nunits_out);
6740 
6741       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6742 					  OPTIMIZE_FOR_SPEED))
6743 	reduc_fn = IFN_REDUC_MAX;
6744     }
6745 
6746   if (reduction_type != EXTRACT_LAST_REDUCTION
6747       && (!nested_cycle || double_reduc)
6748       && reduc_fn == IFN_LAST
6749       && !nunits_out.is_constant ())
6750     {
6751       if (dump_enabled_p ())
6752 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753 			 "missing target support for reduction on"
6754 			 " variable-length vectors.\n");
6755       return false;
6756     }
6757 
6758   /* For SLP reductions, see if there is a neutral value we can use.  */
6759   tree neutral_op = NULL_TREE;
6760   if (slp_node)
6761     neutral_op = neutral_op_for_slp_reduction
6762       (slp_node_instance->reduc_phis, code,
6763        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6764 
6765   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6766     {
6767       /* We can't support in-order reductions of code such as this:
6768 
6769 	   for (int i = 0; i < n1; ++i)
6770 	     for (int j = 0; j < n2; ++j)
6771 	       l += a[j];
6772 
6773 	 since GCC effectively transforms the loop when vectorizing:
6774 
6775 	   for (int i = 0; i < n1 / VF; ++i)
6776 	     for (int j = 0; j < n2; ++j)
6777 	       for (int k = 0; k < VF; ++k)
6778 		 l += a[j];
6779 
6780 	 which is a reassociation of the original operation.  */
6781       if (dump_enabled_p ())
6782 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 			 "in-order double reduction not supported.\n");
6784 
6785       return false;
6786     }
6787 
6788   if (reduction_type == FOLD_LEFT_REDUCTION
6789       && slp_node
6790       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6791     {
6792       /* We cannot use in-order reductions in this case because there is
6793 	 an implicit reassociation of the operations involved.  */
6794       if (dump_enabled_p ())
6795 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6796 			 "in-order unchained SLP reductions not supported.\n");
6797       return false;
6798     }
6799 
6800   /* For double reductions, and for SLP reductions with a neutral value,
6801      we construct a variable-length initial vector by loading a vector
6802      full of the neutral value and then shift-and-inserting the start
6803      values into the low-numbered elements.  */
6804   if ((double_reduc || neutral_op)
6805       && !nunits_out.is_constant ()
6806       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6807 					  vectype_out, OPTIMIZE_FOR_SPEED))
6808     {
6809       if (dump_enabled_p ())
6810 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 			 "reduction on variable-length vectors requires"
6812 			 " target support for a vector-shift-and-insert"
6813 			 " operation.\n");
6814       return false;
6815     }
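
  /* For instance (a sketch, not tied to any particular target): building the
     initial vector for a variable-length sum reduction with start value S
     loads the additive identity into every lane and then shifts S into
     element 0:

	 init = { 0, 0, ..., 0 };
	 init = .VEC_SHL_INSERT (init, S);   becomes { S, 0, ..., 0 }

     which is why IFN_VEC_SHL_INSERT support is required above.  */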
6816 
6817   /* Check extra constraints for variable-length unchained SLP reductions.  */
6818   if (STMT_SLP_TYPE (stmt_info)
6819       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6820       && !nunits_out.is_constant ())
6821     {
6822       /* We checked above that we could build the initial vector when
6823 	 there's a neutral element value.  Check here for the case in
6824 	 which each SLP statement has its own initial value and in which
6825 	 that value needs to be repeated for every instance of the
6826 	 statement within the initial vector.  */
6827       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6828       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6829       if (!neutral_op
6830 	  && !can_duplicate_and_interleave_p (group_size, elt_mode))
6831 	{
6832 	  if (dump_enabled_p ())
6833 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6834 			     "unsupported form of SLP reduction for"
6835 			     " variable-length vectors: cannot build"
6836 			     " initial vector.\n");
6837 	  return false;
6838 	}
6839       /* The epilogue code relies on the number of elements being a multiple
6840 	 of the group size.  The duplicate-and-interleave approach to setting
6841	 up the initial vector does too.  */
6842       if (!multiple_p (nunits_out, group_size))
6843 	{
6844 	  if (dump_enabled_p ())
6845 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6846 			     "unsupported form of SLP reduction for"
6847 			     " variable-length vectors: the vector size"
6848 			     " is not a multiple of the number of results.\n");
6849 	  return false;
6850 	}
6851     }
6852 
6853   /* In case of widening multiplication by a constant, we update the type
6854      of the constant to be the type of the other operand.  We check that the
6855      constant fits the type in the pattern recognition pass.  */
6856   if (code == DOT_PROD_EXPR
6857       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6858     {
6859       if (TREE_CODE (ops[0]) == INTEGER_CST)
6860         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6861       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6862         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6863       else
6864         {
6865           if (dump_enabled_p ())
6866 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 			     "invalid types in dot-prod\n");
6868 
6869           return false;
6870         }
6871     }
6872 
6873   if (reduction_type == COND_REDUCTION)
6874     {
6875       widest_int ni;
6876 
6877       if (! max_loop_iterations (loop, &ni))
6878 	{
6879 	  if (dump_enabled_p ())
6880 	    dump_printf_loc (MSG_NOTE, vect_location,
6881 			     "loop count not known, cannot create cond "
6882 			     "reduction.\n");
6883 	  return false;
6884 	}
6885       /* Convert backedges to iterations.  */
6886       ni += 1;
6887 
6888       /* The additional index will be the same type as the condition.  Check
6889 	 that the loop can fit into this less one (because we'll use up the
6890 	 zero slot for when there are no matches).  */
6891       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6892       if (wi::geu_p (ni, wi::to_widest (max_index)))
6893 	{
6894 	  if (dump_enabled_p ())
6895 	    dump_printf_loc (MSG_NOTE, vect_location,
6896 			     "loop size is greater than data size.\n");
6897 	  return false;
6898 	}
6899     }
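
  /* E.g. (an illustration, not an additional check): if the condition's
     scalar type is 16 bits wide, cr_index_scalar_type can hold indexes
     0..65535; index 0 is reserved for "no match", so loops that may run for
     65535 or more iterations are rejected by the test above.  */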
6900 
6901   /* In case the vectorization factor (VF) is bigger than the number
6902      of elements that we can fit in a vectype (nunits), we have to generate
6903      more than one vector stmt - i.e - we need to "unroll" the
6904      vector stmt by a factor VF/nunits.  For more details see documentation
6905      in vectorizable_operation.  */
6906 
6907   /* If the reduction is used in an outer loop we need to generate
6908      VF intermediate results, like so (e.g. for ncopies=2):
6909 	r0 = phi (init, r0)
6910 	r1 = phi (init, r1)
6911 	r0 = x0 + r0;
6912         r1 = x1 + r1;
6913     (i.e. we generate VF results in 2 registers).
6914     In this case we have a separate def-use cycle for each copy, and therefore
6915     for each copy we get the vector def for the reduction variable from the
6916     respective phi node created for this copy.
6917 
6918     Otherwise (the reduction is unused in the loop nest), we can combine
6919     together intermediate results, like so (e.g. for ncopies=2):
6920 	r = phi (init, r)
6921 	r = x0 + r;
6922 	r = x1 + r;
6923    (i.e. we generate VF/2 results in a single register).
6924    In this case for each copy we get the vector def for the reduction variable
6925    from the vectorized reduction operation generated in the previous iteration.
6926 
6927    This only works when we see both the reduction PHI and its only consumer
6928    in vectorizable_reduction and there are no intermediate stmts
6929    participating.  */
6930   stmt_vec_info use_stmt_info;
6931   tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6932   if (ncopies > 1
6933       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6934       && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6935       && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6936     {
6937       single_defuse_cycle = true;
6938       epilog_copies = 1;
6939     }
6940   else
6941     epilog_copies = ncopies;
6942 
6943   /* If the reduction stmt is one of the patterns that have lane
6944      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6945   if ((ncopies > 1
6946        && ! single_defuse_cycle)
6947       && (code == DOT_PROD_EXPR
6948 	  || code == WIDEN_SUM_EXPR
6949 	  || code == SAD_EXPR))
6950     {
6951       if (dump_enabled_p ())
6952 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6953 			 "multi def-use cycle not possible for lane-reducing "
6954 			 "reduction operation\n");
6955       return false;
6956     }
6957 
6958   if (slp_node)
6959     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6960   else
6961     vec_num = 1;
6962 
6963   internal_fn cond_fn = get_conditional_internal_fn (code);
6964   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6965 
6966   if (!vec_stmt) /* transformation not required.  */
6967     {
6968       vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6969       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6970 	{
6971 	  if (reduction_type != FOLD_LEFT_REDUCTION
6972 	      && (cond_fn == IFN_LAST
6973 		  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6974 						      OPTIMIZE_FOR_SPEED)))
6975 	    {
6976 	      if (dump_enabled_p ())
6977 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6978 				 "can't use a fully-masked loop because no"
6979 				 " conditional operation is available.\n");
6980 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6981 	    }
6982 	  else if (reduc_index == -1)
6983 	    {
6984 	      if (dump_enabled_p ())
6985 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6986 				 "can't use a fully-masked loop for chained"
6987 				 " reductions.\n");
6988 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6989 	    }
6990 	  else
6991 	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6992 				   vectype_in);
6993 	}
6994       if (dump_enabled_p ()
6995 	  && reduction_type == FOLD_LEFT_REDUCTION)
6996 	dump_printf_loc (MSG_NOTE, vect_location,
6997 			 "using an in-order (fold-left) reduction.\n");
6998       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6999       return true;
7000     }
7001 
7002   /* Transform.  */
7003 
7004   if (dump_enabled_p ())
7005     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7006 
7007   /* FORNOW: Multiple types are not supported for condition.  */
7008   if (code == COND_EXPR)
7009     gcc_assert (ncopies == 1);
7010 
7011   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7012 
7013   if (reduction_type == FOLD_LEFT_REDUCTION)
7014     return vectorize_fold_left_reduction
7015       (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7016        reduc_fn, ops, vectype_in, reduc_index, masks);
7017 
7018   if (reduction_type == EXTRACT_LAST_REDUCTION)
7019     {
7020       gcc_assert (!slp_node);
7021       return vectorizable_condition (stmt_info, gsi, vec_stmt,
7022 				     true, NULL, NULL);
7023     }
7024 
7025   /* Create the destination vector  */
7026   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7027 
7028   prev_stmt_info = NULL;
7029   prev_phi_info = NULL;
7030   if (!slp_node)
7031     {
7032       vec_oprnds0.create (1);
7033       vec_oprnds1.create (1);
7034       if (op_type == ternary_op)
7035         vec_oprnds2.create (1);
7036     }
7037 
7038   phis.create (vec_num);
7039   vect_defs.create (vec_num);
7040   if (!slp_node)
7041     vect_defs.quick_push (NULL_TREE);
7042 
7043   if (slp_node)
7044     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7045   else
7046     phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7047 
7048   for (j = 0; j < ncopies; j++)
7049     {
7050       if (code == COND_EXPR)
7051         {
7052           gcc_assert (!slp_node);
7053 	  vectorizable_condition (stmt_info, gsi, vec_stmt,
7054 				  true, NULL, NULL);
7055           break;
7056         }
7057       if (code == LSHIFT_EXPR
7058 	  || code == RSHIFT_EXPR)
7059 	{
7060 	  vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7061 	  break;
7062 	}
7063 
7064       /* Handle uses.  */
7065       if (j == 0)
7066         {
7067 	  if (slp_node)
7068 	    {
7069 	      /* Get vec defs for all the operands except the reduction index,
7070 		 ensuring the ordering of the ops in the vector is kept.  */
7071 	      auto_vec<tree, 3> slp_ops;
7072 	      auto_vec<vec<tree>, 3> vec_defs;
7073 
7074 	      slp_ops.quick_push (ops[0]);
7075 	      slp_ops.quick_push (ops[1]);
7076 	      if (op_type == ternary_op)
7077 		slp_ops.quick_push (ops[2]);
7078 
7079 	      vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7080 
7081 	      vec_oprnds0.safe_splice (vec_defs[0]);
7082 	      vec_defs[0].release ();
7083 	      vec_oprnds1.safe_splice (vec_defs[1]);
7084 	      vec_defs[1].release ();
7085 	      if (op_type == ternary_op)
7086 		{
7087 		  vec_oprnds2.safe_splice (vec_defs[2]);
7088 		  vec_defs[2].release ();
7089 		}
7090 	    }
7091           else
7092 	    {
7093               vec_oprnds0.quick_push
7094 		(vect_get_vec_def_for_operand (ops[0], stmt_info));
7095               vec_oprnds1.quick_push
7096 		(vect_get_vec_def_for_operand (ops[1], stmt_info));
7097               if (op_type == ternary_op)
7098 		vec_oprnds2.quick_push
7099 		  (vect_get_vec_def_for_operand (ops[2], stmt_info));
7100 	    }
7101         }
7102       else
7103         {
7104           if (!slp_node)
7105             {
7106 	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7107 
7108 	      if (single_defuse_cycle && reduc_index == 0)
7109 		vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7110 	      else
7111 		vec_oprnds0[0]
7112 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7113 						    vec_oprnds0[0]);
7114 	      if (single_defuse_cycle && reduc_index == 1)
7115 		vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7116 	      else
7117 		vec_oprnds1[0]
7118 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7119 						    vec_oprnds1[0]);
7120 	      if (op_type == ternary_op)
7121 		{
7122 		  if (single_defuse_cycle && reduc_index == 2)
7123 		    vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7124 		  else
7125 		    vec_oprnds2[0]
7126 		      = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7127 							vec_oprnds2[0]);
7128 		}
7129             }
7130         }
7131 
7132       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7133         {
7134 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7135 	  if (masked_loop_p)
7136 	    {
7137 	      /* Make sure that the reduction accumulator is vop[0].  */
7138 	      if (reduc_index == 1)
7139 		{
7140 		  gcc_assert (commutative_tree_code (code));
7141 		  std::swap (vop[0], vop[1]);
7142 		}
7143 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7144 					      vectype_in, i * ncopies + j);
7145 	      gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7146 							vop[0], vop[1],
7147 							vop[0]);
7148 	      new_temp = make_ssa_name (vec_dest, call);
7149 	      gimple_call_set_lhs (call, new_temp);
7150 	      gimple_call_set_nothrow (call, true);
7151 	      new_stmt_info
7152 		= vect_finish_stmt_generation (stmt_info, call, gsi);
7153 	    }
7154 	  else
7155 	    {
7156 	      if (op_type == ternary_op)
7157 		vop[2] = vec_oprnds2[i];
7158 
7159 	      gassign *new_stmt = gimple_build_assign (vec_dest, code,
7160 						       vop[0], vop[1], vop[2]);
7161 	      new_temp = make_ssa_name (vec_dest, new_stmt);
7162 	      gimple_assign_set_lhs (new_stmt, new_temp);
7163 	      new_stmt_info
7164 		= vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7165 	    }
7166 
7167           if (slp_node)
7168             {
7169 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7170               vect_defs.quick_push (new_temp);
7171             }
7172           else
7173             vect_defs[0] = new_temp;
7174         }
7175 
7176       if (slp_node)
7177         continue;
7178 
7179       if (j == 0)
7180 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7181       else
7182 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7183 
7184       prev_stmt_info = new_stmt_info;
7185     }
7186 
7187   /* Finalize the reduction-phi (set its arguments) and create the
7188      epilog reduction code.  */
7189   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7190     vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7191 
7192   vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7193 				    epilog_copies, reduc_fn, phis,
7194 				    double_reduc, slp_node, slp_node_instance,
7195 				    cond_reduc_val, cond_reduc_op_code,
7196 				    neutral_op);
7197 
7198   return true;
7199 }
7200 
7201 /* Function vect_min_worthwhile_factor.
7202 
7203    For a loop where we could vectorize the operation indicated by CODE,
7204    return the minimum vectorization factor that makes it worthwhile
7205    to use generic vectors.  */
7206 static unsigned int
7207 vect_min_worthwhile_factor (enum tree_code code)
7208 {
7209   switch (code)
7210     {
7211     case PLUS_EXPR:
7212     case MINUS_EXPR:
7213     case NEGATE_EXPR:
7214       return 4;
7215 
7216     case BIT_AND_EXPR:
7217     case BIT_IOR_EXPR:
7218     case BIT_XOR_EXPR:
7219     case BIT_NOT_EXPR:
7220       return 2;
7221 
7222     default:
7223       return INT_MAX;
7224     }
7225 }
7226 
7227 /* Return true if VINFO indicates we are doing loop vectorization and if
7228    it is worth decomposing CODE operations into scalar operations for
7229    that loop's vectorization factor.  */
7230 
7231 bool
7232 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7233 {
7234   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7235   unsigned HOST_WIDE_INT value;
7236   return (loop_vinfo
7237 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7238 	  && value >= vect_min_worthwhile_factor (code));
7239 }
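/* Illustrative example (added commentary, not from the original sources):
   assuming the loop's vectorization factor is the compile-time constant 4,

       vect_worthwhile_without_simd_p (vinfo, PLUS_EXPR)    -> true  (4 >= 4)
       vect_worthwhile_without_simd_p (vinfo, BIT_AND_EXPR) -> true  (4 >= 2)
       vect_worthwhile_without_simd_p (vinfo, MULT_EXPR)    -> false (INT_MAX)

   whereas with a factor of 2 only the bitwise codes would be considered
   worthwhile to emulate with generic vectors.  */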
7240 
7241 /* Function vectorizable_induction
7242 
7243    Check if STMT_INFO performs an induction computation that can be vectorized.
7244    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7245    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7246    Return true if STMT_INFO is vectorizable in this way.  */
7247 
7248 bool
7249 vectorizable_induction (stmt_vec_info stmt_info,
7250 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7251 			stmt_vec_info *vec_stmt, slp_tree slp_node,
7252 			stmt_vector_for_cost *cost_vec)
7253 {
7254   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7255   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7256   unsigned ncopies;
7257   bool nested_in_vect_loop = false;
7258   struct loop *iv_loop;
7259   tree vec_def;
7260   edge pe = loop_preheader_edge (loop);
7261   basic_block new_bb;
7262   tree new_vec, vec_init, vec_step, t;
7263   tree new_name;
7264   gimple *new_stmt;
7265   gphi *induction_phi;
7266   tree induc_def, vec_dest;
7267   tree init_expr, step_expr;
7268   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7269   unsigned i;
7270   tree expr;
7271   gimple_seq stmts;
7272   imm_use_iterator imm_iter;
7273   use_operand_p use_p;
7274   gimple *exit_phi;
7275   edge latch_e;
7276   tree loop_arg;
7277   gimple_stmt_iterator si;
7278 
7279   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7280   if (!phi)
7281     return false;
7282 
7283   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7284     return false;
7285 
7286   /* Make sure it was recognized as induction computation.  */
7287   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7288     return false;
7289 
7290   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7291   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7292 
7293   if (slp_node)
7294     ncopies = 1;
7295   else
7296     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7297   gcc_assert (ncopies >= 1);
7298 
7299   /* FORNOW. These restrictions should be relaxed.  */
7300   if (nested_in_vect_loop_p (loop, stmt_info))
7301     {
7302       imm_use_iterator imm_iter;
7303       use_operand_p use_p;
7304       gimple *exit_phi;
7305       edge latch_e;
7306       tree loop_arg;
7307 
7308       if (ncopies > 1)
7309 	{
7310 	  if (dump_enabled_p ())
7311 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 			     "multiple types in nested loop.\n");
7313 	  return false;
7314 	}
7315 
7316       /* FORNOW: outer loop induction with SLP not supported.  */
7317       if (STMT_SLP_TYPE (stmt_info))
7318 	return false;
7319 
7320       exit_phi = NULL;
7321       latch_e = loop_latch_edge (loop->inner);
7322       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7323       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7324 	{
7325 	  gimple *use_stmt = USE_STMT (use_p);
7326 	  if (is_gimple_debug (use_stmt))
7327 	    continue;
7328 
7329 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7330 	    {
7331 	      exit_phi = use_stmt;
7332 	      break;
7333 	    }
7334 	}
7335       if (exit_phi)
7336 	{
7337 	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7338 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7339 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7340 	    {
7341 	      if (dump_enabled_p ())
7342 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7343 				 "inner-loop induction only used outside "
7344 				 "of the outer vectorized loop.\n");
7345 	      return false;
7346 	    }
7347 	}
7348 
7349       nested_in_vect_loop = true;
7350       iv_loop = loop->inner;
7351     }
7352   else
7353     iv_loop = loop;
7354   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7355 
7356   if (slp_node && !nunits.is_constant ())
7357     {
7358       /* The current SLP code creates the initial value element-by-element.  */
7359       if (dump_enabled_p ())
7360 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7361 			 "SLP induction not supported for variable-length"
7362 			 " vectors.\n");
7363       return false;
7364     }
7365 
7366   if (!vec_stmt) /* transformation not required.  */
7367     {
7368       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7369       DUMP_VECT_SCOPE ("vectorizable_induction");
7370       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7371       return true;
7372     }
7373 
7374   /* Transform.  */
7375 
7376   /* Compute a vector variable, initialized with the first VF values of
7377      the induction variable.  E.g., for an iv with IV_PHI='X' and
7378      evolution S, for a vector of 4 units, we want to compute:
7379      [X, X + S, X + 2*S, X + 3*S].  */
7380 
7381   if (dump_enabled_p ())
7382     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7383 
7384   latch_e = loop_latch_edge (iv_loop);
7385   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7386 
7387   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7388   gcc_assert (step_expr != NULL_TREE);
7389 
7390   pe = loop_preheader_edge (iv_loop);
7391   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7392 				     loop_preheader_edge (iv_loop));
7393 
7394   stmts = NULL;
7395   if (!nested_in_vect_loop)
7396     {
7397       /* Convert the initial value to the desired type.  */
7398       tree new_type = TREE_TYPE (vectype);
7399       init_expr = gimple_convert (&stmts, new_type, init_expr);
7400 
7401       /* If we are using the loop mask to "peel" for alignment then we need
7402 	 to adjust the start value here.  */
7403       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7404       if (skip_niters != NULL_TREE)
7405 	{
7406 	  if (FLOAT_TYPE_P (vectype))
7407 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7408 					skip_niters);
7409 	  else
7410 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7411 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7412 					 skip_niters, step_expr);
7413 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7414 				    init_expr, skip_step);
7415 	}
7416     }
7417 
7418   /* Convert the step to the desired type.  */
7419   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7420 
7421   if (stmts)
7422     {
7423       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7424       gcc_assert (!new_bb);
7425     }
7426 
7427   /* Find the first insertion point in the BB.  */
7428   basic_block bb = gimple_bb (phi);
7429   si = gsi_after_labels (bb);
7430 
7431   /* For SLP induction we have to generate several IVs as for example
7432      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7433      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7434      [VF*S, VF*S, VF*S, VF*S] for all.  */
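  /* Concrete instance of the scheme above (added commentary): with group
     size 3, 4-element vectors, initial value i = 0 and step S = 1, the three
     IVs and the shared step vector would be

	 IV0 = [0, 0, 0, 1]   IV1 = [1, 1, 2, 2]   IV2 = [2, 3, 3, 3]
	 step = [4, 4, 4, 4]

     so that after one vector iteration the IVs hold [4, 4, 4, 5],
     [5, 5, 6, 6] and [6, 7, 7, 7], continuing the per-group sequence.  */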
7435   if (slp_node)
7436     {
7437       /* Enforced above.  */
7438       unsigned int const_nunits = nunits.to_constant ();
7439 
7440       /* Generate [VF*S, VF*S, ... ].  */
7441       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7442 	{
7443 	  expr = build_int_cst (integer_type_node, vf);
7444 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7445 	}
7446       else
7447 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7448       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7449 			      expr, step_expr);
7450       if (! CONSTANT_CLASS_P (new_name))
7451 	new_name = vect_init_vector (stmt_info, new_name,
7452 				     TREE_TYPE (step_expr), NULL);
7453       new_vec = build_vector_from_val (vectype, new_name);
7454       vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7455 
7456       /* Now generate the IVs.  */
7457       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7458       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7459       unsigned elts = const_nunits * nvects;
7460       unsigned nivs = least_common_multiple (group_size,
7461 					     const_nunits) / const_nunits;
7462       gcc_assert (elts % group_size == 0);
7463       tree elt = init_expr;
7464       unsigned ivn;
7465       for (ivn = 0; ivn < nivs; ++ivn)
7466 	{
7467 	  tree_vector_builder elts (vectype, const_nunits, 1);
7468 	  stmts = NULL;
7469 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7470 	    {
7471 	      if (ivn*const_nunits + eltn >= group_size
7472 		  && (ivn * const_nunits + eltn) % group_size == 0)
7473 		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7474 				    elt, step_expr);
7475 	      elts.quick_push (elt);
7476 	    }
7477 	  vec_init = gimple_build_vector (&stmts, &elts);
7478 	  if (stmts)
7479 	    {
7480 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7481 	      gcc_assert (!new_bb);
7482 	    }
7483 
7484 	  /* Create the induction-phi that defines the induction-operand.  */
7485 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7486 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7487 	  stmt_vec_info induction_phi_info
7488 	    = loop_vinfo->add_stmt (induction_phi);
7489 	  induc_def = PHI_RESULT (induction_phi);
7490 
7491 	  /* Create the iv update inside the loop  */
7492 	  vec_def = make_ssa_name (vec_dest);
7493 	  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7494 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7495 	  loop_vinfo->add_stmt (new_stmt);
7496 
7497 	  /* Set the arguments of the phi node:  */
7498 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7499 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7500 		       UNKNOWN_LOCATION);
7501 
7502 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7503 	}
7504 
7505       /* Re-use IVs when we can.  */
7506       if (ivn < nvects)
7507 	{
7508 	  unsigned vfp
7509 	    = least_common_multiple (group_size, const_nunits) / group_size;
7510 	  /* Generate [VF'*S, VF'*S, ... ].  */
7511 	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7512 	    {
7513 	      expr = build_int_cst (integer_type_node, vfp);
7514 	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7515 	    }
7516 	  else
7517 	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7518 	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7519 				  expr, step_expr);
7520 	  if (! CONSTANT_CLASS_P (new_name))
7521 	    new_name = vect_init_vector (stmt_info, new_name,
7522 					 TREE_TYPE (step_expr), NULL);
7523 	  new_vec = build_vector_from_val (vectype, new_name);
7524 	  vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7525 	  for (; ivn < nvects; ++ivn)
7526 	    {
7527 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7528 	      tree def;
7529 	      if (gimple_code (iv) == GIMPLE_PHI)
7530 		def = gimple_phi_result (iv);
7531 	      else
7532 		def = gimple_assign_lhs (iv);
7533 	      new_stmt = gimple_build_assign (make_ssa_name (vectype),
7534 					      PLUS_EXPR,
7535 					      def, vec_step);
7536 	      if (gimple_code (iv) == GIMPLE_PHI)
7537 		gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7538 	      else
7539 		{
7540 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7541 		  gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7542 		}
7543 	      SLP_TREE_VEC_STMTS (slp_node).quick_push
7544 		(loop_vinfo->add_stmt (new_stmt));
7545 	    }
7546 	}
7547 
7548       return true;
7549     }
7550 
7551   /* Create the vector that holds the initial_value of the induction.  */
7552   if (nested_in_vect_loop)
7553     {
7554       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7555 	 been created during vectorization of previous stmts.  We obtain it
7556 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7557       vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7558       /* If the initial value is not of proper type, convert it.  */
7559       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7560 	{
7561 	  new_stmt
7562 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7563 							  vect_simple_var,
7564 							  "vec_iv_"),
7565 				   VIEW_CONVERT_EXPR,
7566 				   build1 (VIEW_CONVERT_EXPR, vectype,
7567 					   vec_init));
7568 	  vec_init = gimple_assign_lhs (new_stmt);
7569 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7570 						 new_stmt);
7571 	  gcc_assert (!new_bb);
7572 	  loop_vinfo->add_stmt (new_stmt);
7573 	}
7574     }
7575   else
7576     {
7577       /* iv_loop is the loop to be vectorized. Create:
7578 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7579       stmts = NULL;
7580       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7581 
7582       unsigned HOST_WIDE_INT const_nunits;
7583       if (nunits.is_constant (&const_nunits))
7584 	{
7585 	  tree_vector_builder elts (vectype, const_nunits, 1);
7586 	  elts.quick_push (new_name);
7587 	  for (i = 1; i < const_nunits; i++)
7588 	    {
7589 	      /* Create: new_name_i = new_name + step_expr  */
7590 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7591 				       new_name, step_expr);
7592 	      elts.quick_push (new_name);
7593 	    }
7594 	  /* Create a vector from [new_name_0, new_name_1, ...,
7595 	     new_name_nunits-1]  */
7596 	  vec_init = gimple_build_vector (&stmts, &elts);
7597 	}
7598       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7599 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7600 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7601 				 new_name, step_expr);
7602       else
7603 	{
7604 	  /* Build:
7605 	        [base, base, base, ...]
7606 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7607 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7608 	  gcc_assert (flag_associative_math);
7609 	  tree index = build_index_vector (vectype, 0, 1);
7610 	  tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7611 							new_name);
7612 	  tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7613 							step_expr);
7614 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7615 	  vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7616 				   vec_init, step_vec);
7617 	  vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7618 				   vec_init, base_vec);
7619 	}
7620 
7621       if (stmts)
7622 	{
7623 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7624 	  gcc_assert (!new_bb);
7625 	}
7626     }
7627 
7628 
7629   /* Create the vector that holds the step of the induction.  */
7630   if (nested_in_vect_loop)
7631     /* iv_loop is nested in the loop to be vectorized. Generate:
7632        vec_step = [S, S, S, S]  */
7633     new_name = step_expr;
7634   else
7635     {
7636       /* iv_loop is the loop to be vectorized. Generate:
7637 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7638       gimple_seq seq = NULL;
7639       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7640 	{
7641 	  expr = build_int_cst (integer_type_node, vf);
7642 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7643 	}
7644       else
7645 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7646       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7647 			       expr, step_expr);
7648       if (seq)
7649 	{
7650 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7651 	  gcc_assert (!new_bb);
7652 	}
7653     }
7654 
7655   t = unshare_expr (new_name);
7656   gcc_assert (CONSTANT_CLASS_P (new_name)
7657 	      || TREE_CODE (new_name) == SSA_NAME);
7658   new_vec = build_vector_from_val (vectype, t);
7659   vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7660 
7661 
7662   /* Create the following def-use cycle:
7663      loop prolog:
7664          vec_init = ...
7665 	 vec_step = ...
7666      loop:
7667          vec_iv = PHI <vec_init, vec_loop>
7668          ...
7669          STMT
7670          ...
7671          vec_loop = vec_iv + vec_step;  */
7672 
7673   /* Create the induction-phi that defines the induction-operand.  */
7674   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7675   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7676   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7677   induc_def = PHI_RESULT (induction_phi);
7678 
7679   /* Create the iv update inside the loop  */
7680   vec_def = make_ssa_name (vec_dest);
7681   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7682   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7683   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7684 
7685   /* Set the arguments of the phi node:  */
7686   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7687   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7688 	       UNKNOWN_LOCATION);
7689 
7690   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7691 
7692   /* In case the vectorization factor (VF) is bigger than the number
7693      of elements that we can fit in a vectype (nunits), we have to generate
7694      more than one vector stmt - i.e., we need to "unroll" the
7695      vector stmt by a factor of VF/nunits.  For more details see the
7696      documentation in vectorizable_operation.  */
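  /* Worked example (added commentary): for a 4-element vectype and VF = 8,
     ncopies is 2.  The first vector IV holds [X, X + S, X + 2*S, X + 3*S];
     the copy generated below adds [nunits*S, ...] = [4*S, 4*S, 4*S, 4*S],
     yielding [X + 4*S, X + 5*S, X + 6*S, X + 7*S].  */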
7697 
7698   if (ncopies > 1)
7699     {
7700       gimple_seq seq = NULL;
7701       stmt_vec_info prev_stmt_vinfo;
7702       /* FORNOW. This restriction should be relaxed.  */
7703       gcc_assert (!nested_in_vect_loop);
7704 
7705       /* Create the vector that holds the step of the induction.  */
7706       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7707 	{
7708 	  expr = build_int_cst (integer_type_node, nunits);
7709 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7710 	}
7711       else
7712 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7713       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7714 			       expr, step_expr);
7715       if (seq)
7716 	{
7717 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7718 	  gcc_assert (!new_bb);
7719 	}
7720 
7721       t = unshare_expr (new_name);
7722       gcc_assert (CONSTANT_CLASS_P (new_name)
7723 		  || TREE_CODE (new_name) == SSA_NAME);
7724       new_vec = build_vector_from_val (vectype, t);
7725       vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7726 
7727       vec_def = induc_def;
7728       prev_stmt_vinfo = induction_phi_info;
7729       for (i = 1; i < ncopies; i++)
7730 	{
7731 	  /* vec_i = vec_prev + vec_step  */
7732 	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7733 					  vec_def, vec_step);
7734 	  vec_def = make_ssa_name (vec_dest, new_stmt);
7735 	  gimple_assign_set_lhs (new_stmt, vec_def);
7736 
7737 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7738 	  new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7739 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7740 	  prev_stmt_vinfo = new_stmt_info;
7741 	}
7742     }
7743 
7744   if (nested_in_vect_loop)
7745     {
7746       /* Find the loop-closed exit-phi of the induction, and record
7747          the final vector of induction results:  */
7748       exit_phi = NULL;
7749       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7750         {
7751 	  gimple *use_stmt = USE_STMT (use_p);
7752 	  if (is_gimple_debug (use_stmt))
7753 	    continue;
7754 
7755 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7756 	    {
7757 	      exit_phi = use_stmt;
7758 	      break;
7759 	    }
7760         }
7761       if (exit_phi)
7762 	{
7763 	  stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7764 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
7765 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
7766 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7767 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
7768 
7769 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7770 	  if (dump_enabled_p ())
7771 	    dump_printf_loc (MSG_NOTE, vect_location,
7772 			     "vector of inductions after inner-loop:%G",
7773 			     new_stmt);
7774 	}
7775     }
7776 
7777 
7778   if (dump_enabled_p ())
7779     dump_printf_loc (MSG_NOTE, vect_location,
7780 		     "transform induction: created def-use cycle: %G%G",
7781 		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
7782 
7783   return true;
7784 }
7785 
7786 /* Function vectorizable_live_operation.
7787 
7788    STMT_INFO computes a value that is used outside the loop.  Check if
7789    it can be supported.  */
7790 
7791 bool
7792 vectorizable_live_operation (stmt_vec_info stmt_info,
7793 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7794 			     slp_tree slp_node, int slp_index,
7795 			     stmt_vec_info *vec_stmt,
7796 			     stmt_vector_for_cost *)
7797 {
7798   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7799   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7800   imm_use_iterator imm_iter;
7801   tree lhs, lhs_type, bitsize, vec_bitsize;
7802   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7803   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7804   int ncopies;
7805   gimple *use_stmt;
7806   auto_vec<tree> vec_oprnds;
7807   int vec_entry = 0;
7808   poly_uint64 vec_index = 0;
7809 
7810   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7811 
7812   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7813     return false;
7814 
7815   /* FORNOW.  CHECKME.  */
7816   if (nested_in_vect_loop_p (loop, stmt_info))
7817     return false;
7818 
7819   /* If STMT is not relevant and it is a simple assignment and its inputs are
7820      invariant then it can remain in place, unvectorized.  The original last
7821      scalar value that it computes will be used.  */
7822   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7823     {
7824       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7825       if (dump_enabled_p ())
7826 	dump_printf_loc (MSG_NOTE, vect_location,
7827 			 "statement is simple and uses invariant.  Leaving in "
7828 			 "place.\n");
7829       return true;
7830     }
7831 
7832   if (slp_node)
7833     ncopies = 1;
7834   else
7835     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7836 
7837   if (slp_node)
7838     {
7839       gcc_assert (slp_index >= 0);
7840 
7841       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7842       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7843 
7844       /* Get the last occurrence of the scalar index from the concatenation of
7845 	 all the slp vectors. Calculate which slp vector it is and the index
7846 	 within.  */
7847       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7848 
7849       /* Calculate which vector contains the result, and which lane of
7850 	 that vector we need.  */
7851       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7852 	{
7853 	  if (dump_enabled_p ())
7854 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7855 			     "Cannot determine which vector holds the"
7856 			     " final result.\n");
7857 	  return false;
7858 	}
7859     }
7860 
7861   if (!vec_stmt)
7862     {
7863       /* No transformation required.  */
7864       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7865 	{
7866 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7867 					       OPTIMIZE_FOR_SPEED))
7868 	    {
7869 	      if (dump_enabled_p ())
7870 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7871 				 "can't use a fully-masked loop because "
7872 				 "the target doesn't support extract last "
7873 				 "reduction.\n");
7874 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7875 	    }
7876 	  else if (slp_node)
7877 	    {
7878 	      if (dump_enabled_p ())
7879 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7880 				 "can't use a fully-masked loop because an "
7881 				 "SLP statement is live after the loop.\n");
7882 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7883 	    }
7884 	  else if (ncopies > 1)
7885 	    {
7886 	      if (dump_enabled_p ())
7887 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7888 				 "can't use a fully-masked loop because"
7889 				 " ncopies is greater than 1.\n");
7890 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7891 	    }
7892 	  else
7893 	    {
7894 	      gcc_assert (ncopies == 1 && !slp_node);
7895 	      vect_record_loop_mask (loop_vinfo,
7896 				     &LOOP_VINFO_MASKS (loop_vinfo),
7897 				     1, vectype);
7898 	    }
7899 	}
7900       return true;
7901     }
7902 
7903   /* Use the lhs of the original scalar statement.  */
7904   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7905 
7906   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7907 	: gimple_get_lhs (stmt);
7908   lhs_type = TREE_TYPE (lhs);
7909 
7910   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7911 	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7912 	     : TYPE_SIZE (TREE_TYPE (vectype)));
7913   vec_bitsize = TYPE_SIZE (vectype);
7914 
7915   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7916   tree vec_lhs, bitstart;
7917   if (slp_node)
7918     {
7919       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7920 
7921       /* Get the correct slp vectorized stmt.  */
7922       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7923       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7924 	vec_lhs = gimple_phi_result (phi);
7925       else
7926 	vec_lhs = gimple_get_lhs (vec_stmt);
7927 
7928       /* Get entry to use.  */
7929       bitstart = bitsize_int (vec_index);
7930       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7931     }
7932   else
7933     {
7934       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7935       vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7936       gcc_checking_assert (ncopies == 1
7937 			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7938 
7939       /* For multiple copies, get the last copy.  */
7940       for (int i = 1; i < ncopies; ++i)
7941 	vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7942 
7943       /* Get the last lane in the vector.  */
7944       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7945     }
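  /* For instance (added commentary): with a 4 x 32-bit vectype and a 32-bit
     scalar lhs, bitsize = 32, vec_bitsize = 128 and bitstart = 96, so the
     BIT_FIELD_REF built further below extracts the last lane:

	 new_tree = BIT_FIELD_REF <vec_lhs', 32, 96>;  */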
7946 
7947   /* To ensure that the VEC_LHS for the lane extraction stmts satisfies the
7948      loop-closed PHI requirement, insert one phi node for it.  It looks like:
7949 	 loop;
7950        BB:
7951 	 # lhs' = PHI <lhs>
7952      ==>
7953 	 loop;
7954        BB:
7955 	 # vec_lhs' = PHI <vec_lhs>
7956 	 new_tree = lane_extract <vec_lhs', ...>;
7957 	 lhs' = new_tree;  */
7958 
7959   basic_block exit_bb = single_exit (loop)->dest;
7960   gcc_assert (single_pred_p (exit_bb));
7961 
7962   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
7963   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
7964   SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
7965 
7966   gimple_seq stmts = NULL;
7967   tree new_tree;
7968   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7969     {
7970       /* Emit:
7971 
7972 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7973 
7974 	 where VEC_LHS is the vectorized live-out result and MASK is
7975 	 the loop mask for the final iteration.  */
7976       gcc_assert (ncopies == 1 && !slp_node);
7977       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7978       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
7979 				      vectype, 0);
7980       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
7981 				      mask, vec_lhs_phi);
7982 
7983       /* Convert the extracted vector element to the required scalar type.  */
7984       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7985     }
7986   else
7987     {
7988       tree bftype = TREE_TYPE (vectype);
7989       if (VECTOR_BOOLEAN_TYPE_P (vectype))
7990 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7991       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
7992       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7993 				       &stmts, true, NULL_TREE);
7994     }
7995 
7996   if (stmts)
7997     {
7998       gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
7999       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8000 
8001       /* Remove existing phi from lhs and create one copy from new_tree.  */
8002       tree lhs_phi = NULL_TREE;
8003       gimple_stmt_iterator gsi;
8004       for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8005 	{
8006 	  gimple *phi = gsi_stmt (gsi);
8007 	  if (gimple_phi_arg_def (phi, 0) == lhs)
8008 	    {
8009 	      remove_phi_node (&gsi, false);
8010 	      lhs_phi = gimple_phi_result (phi);
8011 	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8012 	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8013 	      break;
8014 	    }
8015 	}
8016     }
8017 
8018   /* Replace uses of lhs with the newly computed result.  If the use stmt is
8019      a single-arg PHI, just replace all uses of the PHI result.  This is needed
8020      because the lcssa PHI defining lhs may come before the newly inserted stmt.  */
8021   use_operand_p use_p;
8022   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8023     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8024 	&& !is_gimple_debug (use_stmt))
8025     {
8026       if (gimple_code (use_stmt) == GIMPLE_PHI
8027 	  && gimple_phi_num_args (use_stmt) == 1)
8028 	{
8029 	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8030 	}
8031       else
8032 	{
8033 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8034 	    SET_USE (use_p, new_tree);
8035 	}
8036       update_stmt (use_stmt);
8037     }
8038 
8039   return true;
8040 }
8041 
8042 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8043 
8044 static void
8045 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8046 {
8047   ssa_op_iter op_iter;
8048   imm_use_iterator imm_iter;
8049   def_operand_p def_p;
8050   gimple *ustmt;
8051 
8052   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8053     {
8054       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8055 	{
8056 	  basic_block bb;
8057 
8058 	  if (!is_gimple_debug (ustmt))
8059 	    continue;
8060 
8061 	  bb = gimple_bb (ustmt);
8062 
8063 	  if (!flow_bb_inside_loop_p (loop, bb))
8064 	    {
8065 	      if (gimple_debug_bind_p (ustmt))
8066 		{
8067 		  if (dump_enabled_p ())
8068 		    dump_printf_loc (MSG_NOTE, vect_location,
8069                                      "killing debug use\n");
8070 
8071 		  gimple_debug_bind_reset_value (ustmt);
8072 		  update_stmt (ustmt);
8073 		}
8074 	      else
8075 		gcc_unreachable ();
8076 	    }
8077 	}
8078     }
8079 }
8080 
8081 /* Given loop represented by LOOP_VINFO, return true if computation of
8082    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8083    otherwise.  */
8084 
8085 static bool
8086 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8087 {
8088   /* Constant case.  */
8089   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8090     {
8091       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8092       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8093 
8094       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8095       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8096       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8097 	return true;
8098     }
8099 
8100   widest_int max;
8101   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8102   /* Check the upper bound of loop niters.  */
8103   if (get_max_loop_iterations (loop, &max))
8104     {
8105       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8106       signop sgn = TYPE_SIGN (type);
8107       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8108       if (max < type_max)
8109 	return true;
8110     }
8111   return false;
8112 }
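/* Example (added commentary): if NITERS has a 32-bit unsigned type and the
   loop is known to iterate at most 1000 times, the bound is below the type's
   maximum, so NITERSM1 + 1 cannot wrap and the function returns true.  If
   instead NITERSM1 could equal the type's maximum value, NITERS would wrap
   to zero and the function conservatively returns false.  */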
8113 
8114 /* Return a mask type with half the number of elements as TYPE.  */
8115 
8116 tree
8117 vect_halve_mask_nunits (tree type)
8118 {
8119   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8120   return build_truth_vector_type (nunits, current_vector_size);
8121 }
8122 
8123 /* Return a mask type with twice as many elements as TYPE.  */
8124 
8125 tree
8126 vect_double_mask_nunits (tree type)
8127 {
8128   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8129   return build_truth_vector_type (nunits, current_vector_size);
8130 }
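/* For example (added commentary): given the mask type used for a 16-element
   vector, vect_halve_mask_nunits returns the 8-element mask type for the
   same vector size, and vect_double_mask_nunits maps that 8-element type
   back to the 16-element one.  */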
8131 
8132 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8133    contain a sequence of NVECTORS masks that each control a vector of type
8134    VECTYPE.  */
8135 
8136 void
8137 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8138 		       unsigned int nvectors, tree vectype)
8139 {
8140   gcc_assert (nvectors != 0);
8141   if (masks->length () < nvectors)
8142     masks->safe_grow_cleared (nvectors);
8143   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8144   /* The number of scalars per iteration and the number of vectors are
8145      both compile-time constants.  */
8146   unsigned int nscalars_per_iter
8147     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8148 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8149   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8150     {
8151       rgm->max_nscalars_per_iter = nscalars_per_iter;
8152       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8153     }
8154 }
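/* Worked example (added commentary): with a vectorization factor of 8, a
   call with NVECTORS = 2 and a 4-element VECTYPE records an rgroup at index
   1 whose masks each control 2 * 4 / 8 = 1 scalar per iteration and whose
   mask type has the same number of lanes as VECTYPE.  */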
8155 
8156 /* Given a complete set of masks MASKS, extract mask number INDEX
8157    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8158    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8159 
8160    See the comment above vec_loop_masks for more details about the mask
8161    arrangement.  */
8162 
8163 tree
8164 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8165 		    unsigned int nvectors, tree vectype, unsigned int index)
8166 {
8167   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8168   tree mask_type = rgm->mask_type;
8169 
8170   /* Populate the rgroup's mask array, if this is the first time we've
8171      used it.  */
8172   if (rgm->masks.is_empty ())
8173     {
8174       rgm->masks.safe_grow_cleared (nvectors);
8175       for (unsigned int i = 0; i < nvectors; ++i)
8176 	{
8177 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8178 	  /* Provide a dummy definition until the real one is available.  */
8179 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8180 	  rgm->masks[i] = mask;
8181 	}
8182     }
8183 
8184   tree mask = rgm->masks[index];
8185   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8186 		TYPE_VECTOR_SUBPARTS (vectype)))
8187     {
8188       /* A loop mask for data type X can be reused for data type Y
8189 	 if X has N times more elements than Y and if Y's elements
8190 	 are N times bigger than X's.  In this case each sequence
8191 	 of N elements in the loop mask will be all-zero or all-one.
8192 	 We can then view-convert the mask so that each sequence of
8193 	 N elements is replaced by a single element.  */
8194       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8195 			      TYPE_VECTOR_SUBPARTS (vectype)));
8196       gimple_seq seq = NULL;
8197       mask_type = build_same_sized_truth_vector_type (vectype);
8198       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8199       if (seq)
8200 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8201     }
8202   return mask;
8203 }
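/* Sketch of the reuse case above (added commentary, SSA names invented for
   illustration): if the rgroup's mask was created for eight 16-bit elements
   but the caller's VECTYPE has four 32-bit elements, each pair of mask lanes
   is known to be all-zero or all-one, so something like

       mask_5 = VIEW_CONVERT_EXPR<four-lane mask type> (loop_mask_2);

   is emitted before GSI and the converted mask is returned.  */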
8204 
8205 /* Scale profiling counters by estimation for LOOP which is vectorized
8206    by factor VF.  */
8207 
8208 static void
8209 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8210 {
8211   edge preheader = loop_preheader_edge (loop);
8212   /* Reduce loop iterations by the vectorization factor.  */
8213   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8214   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8215 
8216   if (freq_h.nonzero_p ())
8217     {
8218       profile_probability p;
8219 
8220       /* Avoid dropping loop body profile counter to 0 because of zero count
8221 	 in loop's preheader.  */
8222       if (!(freq_e == profile_count::zero ()))
8223         freq_e = freq_e.force_nonzero ();
8224       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8225       scale_loop_frequencies (loop, p);
8226     }
8227 
8228   edge exit_e = single_exit (loop);
8229   exit_e->probability = profile_probability::always ()
8230 				 .apply_scale (1, new_est_niter + 1);
8231 
8232   edge exit_l = single_pred_edge (loop->latch);
8233   profile_probability prob = exit_l->probability;
8234   exit_l->probability = exit_e->probability.invert ();
8235   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8236     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8237 }
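/* Rough example (added commentary): if the loop was estimated to iterate
   about 100 times and VF is 4, niter_for_unrolled_loop yields roughly 25,
   so the exit edge probability becomes about 1/26 and the body frequencies
   are scaled down to match the reduced trip count.  */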
8238 
8239 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8240    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8241    stmt_vec_info.  */
8242 
8243 static void
8244 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8245 			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8246 {
8247   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8248   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8249 
8250   if (dump_enabled_p ())
8251     dump_printf_loc (MSG_NOTE, vect_location,
8252 		     "------>vectorizing statement: %G", stmt_info->stmt);
8253 
8254   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8255     vect_loop_kill_debug_uses (loop, stmt_info);
8256 
8257   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8258       && !STMT_VINFO_LIVE_P (stmt_info))
8259     return;
8260 
8261   if (STMT_VINFO_VECTYPE (stmt_info))
8262     {
8263       poly_uint64 nunits
8264 	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8265       if (!STMT_SLP_TYPE (stmt_info)
8266 	  && maybe_ne (nunits, vf)
8267 	  && dump_enabled_p ())
8268 	/* For SLP, VF is set according to the unrolling factor, and not
8269 	   to the vector size, hence for SLP this print is not valid.  */
8270 	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8271     }
8272 
8273   /* Pure SLP statements have already been vectorized.  We still need
8274      to apply loop vectorization to hybrid SLP statements.  */
8275   if (PURE_SLP_STMT (stmt_info))
8276     return;
8277 
8278   if (dump_enabled_p ())
8279     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8280 
8281   if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8282     *seen_store = stmt_info;
8283 }
8284 
8285 /* Function vect_transform_loop.
8286 
8287    The analysis phase has determined that the loop is vectorizable.
8288    Vectorize the loop - create vectorized stmts to replace the scalar
8289    stmts in the loop, and update the loop exit condition.
8290    Returns the scalar epilogue loop, if any.  */
8291 
8292 struct loop *
8293 vect_transform_loop (loop_vec_info loop_vinfo)
8294 {
8295   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8296   struct loop *epilogue = NULL;
8297   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8298   int nbbs = loop->num_nodes;
8299   int i;
8300   tree niters_vector = NULL_TREE;
8301   tree step_vector = NULL_TREE;
8302   tree niters_vector_mult_vf = NULL_TREE;
8303   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8304   unsigned int lowest_vf = constant_lower_bound (vf);
8305   gimple *stmt;
8306   bool check_profitability = false;
8307   unsigned int th;
8308 
8309   DUMP_VECT_SCOPE ("vec_transform_loop");
8310 
8311   loop_vinfo->shared->check_datarefs ();
8312 
8313   /* Use the more conservative vectorization threshold.  If the number
8314      of iterations is constant assume the cost check has been performed
8315      by our caller.  If the threshold makes all loops profitable that
8316      run at least the (estimated) vectorization factor number of times
8317      checking is pointless, too.  */
8318   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8319   if (th >= vect_vf_for_cost (loop_vinfo)
8320       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8321     {
8322       if (dump_enabled_p ())
8323 	dump_printf_loc (MSG_NOTE, vect_location,
8324 			 "Profitability threshold is %d loop iterations.\n",
8325                          th);
8326       check_profitability = true;
8327     }
8328 
8329   /* Make sure there exists a single-predecessor exit bb.  Do this before
8330      versioning.   */
8331   edge e = single_exit (loop);
8332   if (! single_pred_p (e->dest))
8333     {
8334       split_loop_exit_edge (e, true);
8335       if (dump_enabled_p ())
8336 	dump_printf (MSG_NOTE, "split exit edge\n");
8337     }
8338 
8339   /* Version the loop first, if required, so the profitability check
8340      comes first.  */
8341 
8342   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8343     {
8344       poly_uint64 versioning_threshold
8345 	= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8346       if (check_profitability
8347 	  && ordered_p (poly_uint64 (th), versioning_threshold))
8348 	{
8349 	  versioning_threshold = ordered_max (poly_uint64 (th),
8350 					      versioning_threshold);
8351 	  check_profitability = false;
8352 	}
8353       struct loop *sloop
8354 	= vect_loop_versioning (loop_vinfo, th, check_profitability,
8355 				versioning_threshold);
8356       sloop->force_vectorize = false;
8357       check_profitability = false;
8358     }
8359 
8360   /* Make sure there exists a single-predecessor exit bb also on the
8361      scalar loop copy.  Do this after versioning but before peeling
8362      so the CFG structure is fine for both the scalar and the if-converted
8363      loop, and slpeel_duplicate_current_defs_from_edges sees matched
8364      loop-closed PHI nodes on the exit.  */
8365   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8366     {
8367       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8368       if (! single_pred_p (e->dest))
8369 	{
8370 	  split_loop_exit_edge (e, true);
8371 	  if (dump_enabled_p ())
8372 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8373 	}
8374     }
8375 
8376   tree niters = vect_build_loop_niters (loop_vinfo);
8377   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8378   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8379   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8380   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8381 			      &step_vector, &niters_vector_mult_vf, th,
8382 			      check_profitability, niters_no_overflow);
8383 
8384   if (niters_vector == NULL_TREE)
8385     {
8386       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8387 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8388 	  && known_eq (lowest_vf, vf))
8389 	{
8390 	  niters_vector
8391 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8392 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8393 	  step_vector = build_one_cst (TREE_TYPE (niters));
8394 	}
8395       else
8396 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8397 				     &step_vector, niters_no_overflow);
8398     }
8399 
8400   /* 1) Make sure the loop header has exactly two entries
8401      2) Make sure we have a preheader basic block.  */
8402 
8403   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8404 
8405   split_edge (loop_preheader_edge (loop));
8406 
8407   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8408       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8409     /* This will deal with any possible peeling.  */
8410     vect_prepare_for_masked_peels (loop_vinfo);
8411 
8412   /* Schedule the SLP instances first, then handle loop vectorization
8413      below.  */
8414   if (!loop_vinfo->slp_instances.is_empty ())
8415     {
8416       DUMP_VECT_SCOPE ("scheduling SLP instances");
8417       vect_schedule_slp (loop_vinfo);
8418     }
8419 
8420   /* FORNOW: the vectorizer supports only loops whose body consists
8421      of one basic block (header + empty latch).  When the vectorizer
8422      supports more involved loop forms, the order in which the BBs are
8423      traversed will need to be reconsidered.  */
8424 
8425   for (i = 0; i < nbbs; i++)
8426     {
8427       basic_block bb = bbs[i];
8428       stmt_vec_info stmt_info;
8429 
8430       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8431 	   gsi_next (&si))
8432         {
8433 	  gphi *phi = si.phi ();
8434 	  if (dump_enabled_p ())
8435 	    dump_printf_loc (MSG_NOTE, vect_location,
8436 			     "------>vectorizing phi: %G", phi);
8437 	  stmt_info = loop_vinfo->lookup_stmt (phi);
8438 	  if (!stmt_info)
8439 	    continue;
8440 
8441 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8442 	    vect_loop_kill_debug_uses (loop, stmt_info);
8443 
8444 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8445 	      && !STMT_VINFO_LIVE_P (stmt_info))
8446 	    continue;
8447 
8448 	  if (STMT_VINFO_VECTYPE (stmt_info)
8449 	      && (maybe_ne
8450 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8451 	      && dump_enabled_p ())
8452 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8453 
8454 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8455 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8456 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8457 	      && ! PURE_SLP_STMT (stmt_info))
8458 	    {
8459 	      if (dump_enabled_p ())
8460 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8461 	      vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8462 	    }
8463 	}
8464 
8465       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8466 	   !gsi_end_p (si);)
8467 	{
8468 	  stmt = gsi_stmt (si);
8469 	  /* During vectorization remove existing clobber stmts.  */
8470 	  if (gimple_clobber_p (stmt))
8471 	    {
8472 	      unlink_stmt_vdef (stmt);
8473 	      gsi_remove (&si, true);
8474 	      release_defs (stmt);
8475 	    }
8476 	  else
8477 	    {
8478 	      stmt_info = loop_vinfo->lookup_stmt (stmt);
8479 
8480 	      /* vector stmts created in the outer-loop during vectorization of
8481 		 stmts in an inner-loop may not have a stmt_info, and do not
8482 		 need to be vectorized.  */
8483 	      stmt_vec_info seen_store = NULL;
8484 	      if (stmt_info)
8485 		{
8486 		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8487 		    {
8488 		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8489 		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8490 			   !gsi_end_p (subsi); gsi_next (&subsi))
8491 			{
8492 			  stmt_vec_info pat_stmt_info
8493 			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8494 			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8495 						    &si, &seen_store);
8496 			}
8497 		      stmt_vec_info pat_stmt_info
8498 			= STMT_VINFO_RELATED_STMT (stmt_info);
8499 		      vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8500 						&seen_store);
8501 		    }
8502 		  vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8503 					    &seen_store);
8504 		}
8505 	      gsi_next (&si);
8506 	      if (seen_store)
8507 		{
8508 		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8509 		    /* Interleaving.  The vectorization of the
8510 		       interleaving chain was completed - free all
8511 		       the stores in the chain.  */
8512 		    vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8513 		  else
8514 		    /* Free the attached stmt_vec_info and remove the stmt.  */
8515 		    loop_vinfo->remove_stmt (stmt_info);
8516 		}
8517 	    }
8518 	}
8519 
8520       /* Stub out scalar statements that must not survive vectorization.
8521 	 Doing this here helps with grouped statements, or statements that
8522 	 are involved in patterns.  */
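      /* For instance (added commentary, the GIMPLE below is only a sketch
	 with invented SSA names): a scalar

	     _1 = .MASK_LOAD (ptr_2, 0B, mask_3);

	 whose lhs was not vectorized is replaced by "_1 = 0;", and a
	 conditional internal function such as

	     _4 = .COND_ADD (mask_3, _5, _6, _7);

	 is replaced by an assignment of its "else" value, "_4 = _7;".  */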
8523       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8524 	   !gsi_end_p (gsi); gsi_next (&gsi))
8525 	{
8526 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8527 	  if (!call || !gimple_call_internal_p (call))
8528 	    continue;
8529 	  internal_fn ifn = gimple_call_internal_fn (call);
8530 	  if (ifn == IFN_MASK_LOAD)
8531 	    {
8532 	      tree lhs = gimple_get_lhs (call);
8533 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8534 		{
8535 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
8536 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
8537 		  gsi_replace (&gsi, new_stmt, true);
8538 		}
8539 	    }
8540 	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
8541 	    {
8542 	      tree lhs = gimple_get_lhs (call);
8543 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8544 		{
8545 		  tree else_arg
8546 		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
8547 		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
8548 		  gsi_replace (&gsi, new_stmt, true);
8549 		}
8550 	    }
8551 	}
8552     }				/* BBs in loop */
8553 
8554   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8555      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8556   if (integer_onep (step_vector))
8557     niters_no_overflow = true;
8558   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8559 			   niters_vector_mult_vf, !niters_no_overflow);
8560 
8561   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8562   scale_profile_for_vect_loop (loop, assumed_vf);
8563 
8564   /* True if the final iteration might not handle a full vector's
8565      worth of scalar iterations.  */
8566   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8567   /* The minimum number of iterations performed by the epilogue.  This
8568      is 1 when peeling for gaps because we always need a final scalar
8569      iteration.  */
8570   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8571   /* +1 to convert latch counts to loop iteration counts,
8572      -min_epilogue_iters to remove iterations that cannot be performed
8573        by the vector code.  */
8574   int bias_for_lowest = 1 - min_epilogue_iters;
8575   int bias_for_assumed = bias_for_lowest;
8576   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8577   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8578     {
8579       /* When the amount of peeling is known at compile time, the first
8580 	 iteration will have exactly alignment_npeels active elements.
8581 	 In the worst case it will have at least one.  */
8582       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8583       bias_for_lowest += lowest_vf - min_first_active;
8584       bias_for_assumed += assumed_vf - min_first_active;
8585     }
8586   /* In these calculations the "- 1" converts loop iteration counts
8587      back to latch counts.  */
8588   if (loop->any_upper_bound)
8589     loop->nb_iterations_upper_bound
8590       = (final_iter_may_be_partial
8591 	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8592 			  lowest_vf) - 1
8593 	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8594 			   lowest_vf) - 1);
8595   if (loop->any_likely_upper_bound)
8596     loop->nb_iterations_likely_upper_bound
8597       = (final_iter_may_be_partial
8598 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8599 			  + bias_for_lowest, lowest_vf) - 1
8600 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8601 			   + bias_for_lowest, lowest_vf) - 1);
8602   if (loop->any_estimate)
8603     loop->nb_iterations_estimate
8604       = (final_iter_may_be_partial
8605 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8606 			  assumed_vf) - 1
8607 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8608 			   assumed_vf) - 1);
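  /* Worked example (added commentary): for an upper bound of 99 latch
     iterations (100 loop iterations), lowest_vf = 4, no epilogue peeling and
     no masking, bias_for_lowest is 1 and the new bound is
     floor ((99 + 1) / 4) - 1 = 24 latch iterations, i.e. 25 vector
     iterations.  A fully-masked loop uses the ceiling division instead,
     since its final vector iteration may be partial.  */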

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP EPILOGUE VECTORIZED (VS=");
	  dump_dec (MSG_NOTE, current_vector_size);
	  dump_printf (MSG_NOTE, ")\n");
	}
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance, true);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field: its value is no longer valid after
     vectorization, since the vectorized loop can have loop-carried
     dependencies.  */
  loop->safelen = 0;

  /* Don't vectorize the epilogue of a loop that is itself an epilogue.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    epilogue = NULL;

  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
    epilogue = NULL;

  if (epilogue)
    {
      auto_vector_sizes vector_sizes;
      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
      unsigned int next_size = 0;

      /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
	 on niters already adjusted for the iterations of the prologue.  */
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && known_eq (vf, lowest_vf))
	{
	  unsigned HOST_WIDE_INT eiters
	    = (LOOP_VINFO_INT_NITERS (loop_vinfo)
	       - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
	  eiters
	    = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
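	  /* EITERS is now the number of scalar iterations that will be
	     left to the epilogue; e.g. NITERS == 23, lowest_vf == 8 and
	     no peeling for gaps leave 23 % 8 == 7 iterations.  */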
	  epilogue->nb_iterations_upper_bound = eiters - 1;
	  epilogue->any_upper_bound = true;

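	  /* Skip candidate vector sizes that cannot pay off: a candidate
	     is only considered when the current vector size is a constant
	     multiple of it (RATIO below) and at least lowest_vf / RATIO
	     epilogue iterations remain, i.e. enough for one full vector
	     iteration at that size.  For example, with 32-byte vectors
	     and lowest_vf == 8, a 16-byte candidate needs at least
	     8 / 2 == 4 remaining iterations.  */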
	  unsigned int ratio;
	  while (next_size < vector_sizes.length ()
		 && !(constant_multiple_p (current_vector_size,
					   vector_sizes[next_size], &ratio)
		      && eiters >= lowest_vf / ratio))
	    next_size += 1;
	}
      else
	while (next_size < vector_sizes.length ()
	       && maybe_lt (current_vector_size, vector_sizes[next_size]))
	  next_size += 1;

      if (next_size == vector_sizes.length ())
	epilogue = NULL;
    }

  if (epilogue)
    {
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->safelen = loop->safelen;
      epilogue->dont_vectorize = false;

      /* We may need to if-convert the epilogue to vectorize it.  */
      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
	tree_if_conversion (epilogue);
    }

  return epilogue;
}

/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero the store is not
   performed and, where possible, neither are the producers of the stored
   values.  For example,
     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] + 2;
	}
   this transformation will produce the following semi-hammock:

   if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/

void
optimize_mask_stores (struct loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  struct loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and an if-then structure in the CFG; then_bb belongs
	 to the same loop as if_bb.  That loop can differ from LOOP when a
	 two-level loop nest is vectorized and the mask store belongs to the
	 inner one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* The edge into STORE_BB is taken only when the mask is not all
	 zero; treat it as unlikely.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
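      /* BB now ends in "if (mask == { 0, ... })": the true edge goes
	 straight to JOIN_BB, skipping the stores, while the unlikely
	 false edge falls through to STORE_BB, where the masked stores
	 will be sunk.  */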
      /* Create a new PHI node for the vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM_3 = VDEF <.MEM_1>
	 and a new PHI node will be created in the join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having a
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

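	      /* Only sink STMT1 if its VUSE (if any) matches the VUSE of
		 LAST_STORE, i.e. it reads the same memory state; otherwise
		 give up on moving further producers.  */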
	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}