1 /* Loop Vectorization
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 
58 /* Loop Vectorization Pass.
59 
60    This pass tries to vectorize loops.
61 
62    For example, the vectorizer transforms the following simple loop:
63 
64         short a[N]; short b[N]; short c[N]; int i;
65 
66         for (i=0; i<N; i++){
67           a[i] = b[i] + c[i];
68         }
69 
   as if it had been manually vectorized by rewriting the source code into:
71 
72         typedef int __attribute__((mode(V8HI))) v8hi;
73         short a[N];  short b[N]; short c[N];   int i;
74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75         v8hi va, vb, vc;
76 
77         for (i=0; i<N/8; i++){
78           vb = pb[i];
79           vc = pc[i];
80           va = vb + vc;
81           pa[i] = va;
82         }
83 
        The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   have successfully passed the analysis phase.
        Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
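   For instance, 'b[i]' in the loop above has a simple consecutive
   (stride-1) pattern; hypothetical accesses such as 'b[2*i]' (strided) or
   'b[c[i]]' (indirect) would not fit that description.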
95 
96    Analysis phase:
97    ===============
98         The driver for the analysis phase is vect_analyze_loop().
99    It applies a set of analyses, some of which rely on the scalar evolution
100    analyzer (scev) developed by Sebastian Pop.
101 
102         During the analysis phase the vectorizer records some information
103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104    loop, as well as general information about the loop as a whole, which is
105    recorded in a "loop_vec_info" struct attached to each loop.
106 
107    Transformation phase:
108    =====================
109         The loop transformation phase scans all the stmts in the loop, and
110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111    the loop that needs to be vectorized.  It inserts the vector code sequence
112    just before the scalar stmt S, and records a pointer to the vector code
113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114    attached to S).  This pointer will be used for the vectorization of following
115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116    otherwise, we rely on dead code elimination for removing it.
117 
118         For example, say stmt S1 was vectorized into stmt VS1:
119 
120    VS1: vb = px[i];
121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122    S2:  a = b;
123 
124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
127    resulting sequence would be:
128 
129    VS1: vb = px[i];
130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131    VS2: va = vb;
132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 
        Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
136 
137    Target modeling:
138    =================
        Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
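
   For instance, a minimal sketch (not code from this pass) of such an
   optab query for an addition on V8HImode vectors, failing when there is
   no target support, would be:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;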
151 
152    For additional information on this project see:
153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155 
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 
158 /* Function vect_determine_vectorization_factor
159 
160    Determine the vectorization factor (VF).  VF is the number of data elements
161    that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.
165 
166    We currently support vectorization of loops in which all types operated upon
167    are of the same size.  Therefore this function currently sets VF according to
168    the size of the types operated upon, and fails if there are multiple sizes
169    in the loop.
170 
171    VF is also the factor by which the loop iterations are strip-mined, e.g.:
172    original loop:
173         for (i=0; i<N; i++){
174           a[i] = b[i] + c[i];
175         }
176 
177    vectorized loop:
178         for (i=0; i<N; i+=VF){
179           a[i:VF] = b[i:VF] + c[i:VF];
180         }
181 */
182 
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
185 {
186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188   unsigned nbbs = loop->num_nodes;
189   poly_uint64 vectorization_factor = 1;
190   tree scalar_type = NULL_TREE;
191   gphi *phi;
192   tree vectype;
193   stmt_vec_info stmt_info;
194   unsigned i;
195   HOST_WIDE_INT dummy;
196   gimple *stmt, *pattern_stmt = NULL;
197   gimple_seq pattern_def_seq = NULL;
198   gimple_stmt_iterator pattern_def_si = gsi_none ();
199   bool analyze_pattern_stmt = false;
200   bool bool_result;
201   auto_vec<stmt_vec_info> mask_producers;
202 
203   if (dump_enabled_p ())
204     dump_printf_loc (MSG_NOTE, vect_location,
205                      "=== vect_determine_vectorization_factor ===\n");
206 
207   for (i = 0; i < nbbs; i++)
208     {
209       basic_block bb = bbs[i];
210 
211       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 	   gsi_next (&si))
213 	{
214 	  phi = si.phi ();
215 	  stmt_info = vinfo_for_stmt (phi);
216 	  if (dump_enabled_p ())
217 	    {
218 	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 	    }
221 
222 	  gcc_assert (stmt_info);
223 
224 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
225 	      || STMT_VINFO_LIVE_P (stmt_info))
226             {
227 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
229 
230 	      if (dump_enabled_p ())
231 		{
232 		  dump_printf_loc (MSG_NOTE, vect_location,
233                                    "get vectype for scalar type:  ");
234 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235                   dump_printf (MSG_NOTE, "\n");
236 		}
237 
238 	      vectype = get_vectype_for_scalar_type (scalar_type);
239 	      if (!vectype)
240 		{
241 		  if (dump_enabled_p ())
242 		    {
243 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244                                        "not vectorized: unsupported "
245                                        "data-type ");
246 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247                                          scalar_type);
248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
249 		    }
250 		  return false;
251 		}
252 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
253 
254 	      if (dump_enabled_p ())
255 		{
256 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258                   dump_printf (MSG_NOTE, "\n");
259 		}
260 
261 	      if (dump_enabled_p ())
262 		{
263 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 		  dump_printf (MSG_NOTE, "\n");
266 		}
267 
268 	      vect_update_max_nunits (&vectorization_factor, vectype);
269 	    }
270 	}
271 
272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 	   !gsi_end_p (si) || analyze_pattern_stmt;)
274         {
275           tree vf_vectype;
276 
277           if (analyze_pattern_stmt)
278 	    stmt = pattern_stmt;
279           else
280             stmt = gsi_stmt (si);
281 
282           stmt_info = vinfo_for_stmt (stmt);
283 
284 	  if (dump_enabled_p ())
285 	    {
286 	      dump_printf_loc (MSG_NOTE, vect_location,
287                                "==> examining statement: ");
288 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
289 	    }
290 
291 	  gcc_assert (stmt_info);
292 
293 	  /* Skip stmts which do not need to be vectorized.  */
294 	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 	       && !STMT_VINFO_LIVE_P (stmt_info))
296 	      || gimple_clobber_p (stmt))
297             {
298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302                 {
303                   stmt = pattern_stmt;
304                   stmt_info = vinfo_for_stmt (pattern_stmt);
305                   if (dump_enabled_p ())
306                     {
307                       dump_printf_loc (MSG_NOTE, vect_location,
308                                        "==> examining pattern statement: ");
309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310                     }
311                 }
312               else
313 	        {
314 	          if (dump_enabled_p ())
315 	            dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316                   gsi_next (&si);
317 	          continue;
318                 }
319 	    }
320           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324             analyze_pattern_stmt = true;
325 
326 	  /* If a pattern statement has def stmts, analyze them too.  */
327 	  if (is_pattern_stmt_p (stmt_info))
328 	    {
329 	      if (pattern_def_seq == NULL)
330 		{
331 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 		  pattern_def_si = gsi_start (pattern_def_seq);
333 		}
334 	      else if (!gsi_end_p (pattern_def_si))
335 		gsi_next (&pattern_def_si);
336 	      if (pattern_def_seq != NULL)
337 		{
338 		  gimple *pattern_def_stmt = NULL;
339 		  stmt_vec_info pattern_def_stmt_info = NULL;
340 
341 		  while (!gsi_end_p (pattern_def_si))
342 		    {
343 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
344 		      pattern_def_stmt_info
345 			= vinfo_for_stmt (pattern_def_stmt);
346 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 			break;
349 		      gsi_next (&pattern_def_si);
350 		    }
351 
352 		  if (!gsi_end_p (pattern_def_si))
353 		    {
354 		      if (dump_enabled_p ())
355 			{
356 			  dump_printf_loc (MSG_NOTE, vect_location,
357                                            "==> examining pattern def stmt: ");
358 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359                                             pattern_def_stmt, 0);
360 			}
361 
362 		      stmt = pattern_def_stmt;
363 		      stmt_info = pattern_def_stmt_info;
364 		    }
365 		  else
366 		    {
367 		      pattern_def_si = gsi_none ();
368 		      analyze_pattern_stmt = false;
369 		    }
370 		}
371 	      else
372 		analyze_pattern_stmt = false;
373 	    }
374 
375 	  if (gimple_get_lhs (stmt) == NULL_TREE
376 	      /* MASK_STORE has no lhs, but is ok.  */
377 	      && (!is_gimple_call (stmt)
378 		  || !gimple_call_internal_p (stmt)
379 		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
380 	    {
381 	      if (is_gimple_call (stmt))
382 		{
		  /* Ignore calls with no lhs.  These must be calls to
		     #pragma omp simd functions, and the vectorization
		     factor they really need can't be determined until
		     vectorizable_simd_clone_call.  */
387 		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
388 		    {
389 		      pattern_def_seq = NULL;
390 		      gsi_next (&si);
391 		    }
392 		  continue;
393 		}
394 	      if (dump_enabled_p ())
395 		{
396 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397                                    "not vectorized: irregular stmt.");
398 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
399                                     0);
400 		}
401 	      return false;
402 	    }
403 
404 	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 	    {
406 	      if (dump_enabled_p ())
407 	        {
408 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409                                    "not vectorized: vector stmt in loop:");
410 	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
411 	        }
412 	      return false;
413 	    }
414 
415 	  bool_result = false;
416 
417 	  if (STMT_VINFO_VECTYPE (stmt_info))
418 	    {
	      /* The only cases in which a vectype has already been set are
		 stmts that contain a dataref, and "pattern stmts" (stmts
		 generated by the vectorizer to represent/replace a certain
		 idiom).  */
423 	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 			  || is_pattern_stmt_p (stmt_info)
425 			  || !gsi_end_p (pattern_def_si));
426 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
427 	    }
428 	  else
429 	    {
430 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 	      if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 	      else
434 		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
435 
	      /* Bool ops don't participate in the vectorization factor
		 computation.  For comparisons, use the compared types to
		 compute a factor.  */
439 	      if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 		  && is_gimple_assign (stmt)
441 		  && gimple_assign_rhs_code (stmt) != COND_EXPR)
442 		{
443 		  if (STMT_VINFO_RELEVANT_P (stmt_info)
444 		      || STMT_VINFO_LIVE_P (stmt_info))
445 		    mask_producers.safe_push (stmt_info);
446 		  bool_result = true;
447 
448 		  if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 		      == tcc_comparison
450 		      && !VECT_SCALAR_BOOLEAN_TYPE_P
451 			    (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 		    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 		  else
454 		    {
455 		      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 			{
457 			  pattern_def_seq = NULL;
458 			  gsi_next (&si);
459 			}
460 		      continue;
461 		    }
462 		}
463 
464 	      if (dump_enabled_p ())
465 		{
466 		  dump_printf_loc (MSG_NOTE, vect_location,
467                                    "get vectype for scalar type:  ");
468 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469                   dump_printf (MSG_NOTE, "\n");
470 		}
471 	      vectype = get_vectype_for_scalar_type (scalar_type);
472 	      if (!vectype)
473 		{
474 		  if (dump_enabled_p ())
475 		    {
476 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477                                        "not vectorized: unsupported "
478                                        "data-type ");
479 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480                                          scalar_type);
481                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 		    }
483 		  return false;
484 		}
485 
486 	      if (!bool_result)
487 		STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 
489 	      if (dump_enabled_p ())
490 		{
491 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493                   dump_printf (MSG_NOTE, "\n");
494 		}
495             }
496 
	  /* Don't try to compute the VF from scalar types if the stmt
	     produces a boolean vector.  Use the result vectype instead.  */
499 	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 	    vf_vectype = vectype;
501 	  else
502 	    {
	      /* The vectorization factor is determined by the smallest
		 scalar type (or the largest vector size, but we only
		 support one vector size per loop).  */
506 	      if (!bool_result)
507 		scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 							     &dummy);
509 	      if (dump_enabled_p ())
510 		{
511 		  dump_printf_loc (MSG_NOTE, vect_location,
512 				   "get vectype for scalar type:  ");
513 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 		  dump_printf (MSG_NOTE, "\n");
515 		}
516 	      vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 	    }
518 	  if (!vf_vectype)
519 	    {
520 	      if (dump_enabled_p ())
521 		{
522 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523                                    "not vectorized: unsupported data-type ");
524 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525                                      scalar_type);
526                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 		}
528 	      return false;
529 	    }
530 
531 	  if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 			GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 	    {
534 	      if (dump_enabled_p ())
535 		{
536 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537                                    "not vectorized: different sized vector "
538                                    "types in statement, ");
539 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540                                      vectype);
541 		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543                                      vf_vectype);
544                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 		}
546 	      return false;
547 	    }
548 
549 	  if (dump_enabled_p ())
550 	    {
551 	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553               dump_printf (MSG_NOTE, "\n");
554 	    }
555 
556 	  if (dump_enabled_p ())
557 	    {
558 	      dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 	      dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 	      dump_printf (MSG_NOTE, "\n");
561 	    }
562 
563 	  vect_update_max_nunits (&vectorization_factor, vf_vectype);
564 
565 	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
566 	    {
567 	      pattern_def_seq = NULL;
568 	      gsi_next (&si);
569 	    }
570         }
571     }
572 
573   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
574   if (dump_enabled_p ())
575     {
576       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577       dump_dec (MSG_NOTE, vectorization_factor);
578       dump_printf (MSG_NOTE, "\n");
579     }
580 
581   if (known_le (vectorization_factor, 1U))
582     {
583       if (dump_enabled_p ())
584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585                          "not vectorized: unsupported data-type\n");
586       return false;
587     }
588   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
589 
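  /* Now compute a vector mask type for each deferred mask producer.  For
     example (illustrative only), for a comparison "flag = a < b" with "int"
     operands, the mask type is derived from the compared type, i.e. the
     boolean vector type that corresponds to the vector type of "a".  */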
590   for (i = 0; i < mask_producers.length (); i++)
591     {
592       tree mask_type = NULL;
593 
594       stmt = STMT_VINFO_STMT (mask_producers[i]);
595 
596       if (is_gimple_assign (stmt)
597 	  && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 	  && !VECT_SCALAR_BOOLEAN_TYPE_P
599 				      (TREE_TYPE (gimple_assign_rhs1 (stmt))))
600 	{
601 	  scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 	  mask_type = get_mask_type_for_scalar_type (scalar_type);
603 
604 	  if (!mask_type)
605 	    {
606 	      if (dump_enabled_p ())
607 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 				 "not vectorized: unsupported mask\n");
609 	      return false;
610 	    }
611 	}
612       else
613 	{
614 	  tree rhs;
615 	  ssa_op_iter iter;
616 	  gimple *def_stmt;
617 	  enum vect_def_type dt;
618 
619 	  FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
620 	    {
621 	      if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 				       &def_stmt, &dt, &vectype))
623 		{
624 		  if (dump_enabled_p ())
625 		    {
626 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 				       "not vectorized: can't compute mask type "
628 				       "for statement, ");
629 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
630 					0);
631 		    }
632 		  return false;
633 		}
634 
	      /* No vectype probably means an external definition.
		 Allow it in case there is another operand which
		 allows us to determine the mask type.  */
638 	      if (!vectype)
639 		continue;
640 
641 	      if (!mask_type)
642 		mask_type = vectype;
643 	      else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 				 TYPE_VECTOR_SUBPARTS (vectype)))
645 		{
646 		  if (dump_enabled_p ())
647 		    {
648 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: different sized mask "
				       "types in statement, ");
651 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 					 mask_type);
653 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 					 vectype);
656 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
657 		    }
658 		  return false;
659 		}
660 	      else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 		       != VECTOR_BOOLEAN_TYPE_P (vectype))
662 		{
663 		  if (dump_enabled_p ())
664 		    {
665 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 				       "not vectorized: mixed mask and "
667 				       "nonmask vector types in statement, ");
668 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 					 mask_type);
670 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 					 vectype);
673 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
674 		    }
675 		  return false;
676 		}
677 	    }
678 
	  /* We may compare a boolean value loaded as a vector of integers.
	     Fix mask_type in such a case.  */
681 	  if (mask_type
682 	      && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 	      && gimple_code (stmt) == GIMPLE_ASSIGN
684 	      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 	    mask_type = build_same_sized_truth_vector_type (mask_type);
686 	}
687 
      /* No mask_type should mean a loop-invariant predicate.
	 This is probably a subject for optimization in
	 if-conversion.  */
691       if (!mask_type)
692 	{
693 	  if (dump_enabled_p ())
694 	    {
695 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 			       "not vectorized: can't compute mask type "
697 			       "for statement, ");
698 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
699 				0);
700 	    }
701 	  return false;
702 	}
703 
704       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
705     }
706 
707   return true;
708 }
709 
710 
711 /* Function vect_is_simple_iv_evolution.
712 
   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
715 
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718                              tree * step)
719 {
720   tree init_expr;
721   tree step_expr;
722   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723   basic_block bb;
724 
725   /* When there is no evolution in this loop, the evolution function
726      is not "simple".  */
727   if (evolution_part == NULL_TREE)
728     return false;
729 
730   /* When the evolution is a polynomial of degree >= 2
731      the evolution function is not "simple".  */
732   if (tree_is_chrec (evolution_part))
733     return false;
734 
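  /* At this point the evolution is affine.  For example (illustrative, in
     SCEV's {base, +, step}_loopnum notation): for a counter such as
     "for (i = 0; i < n; i++)" the access function of i is {0, +, 1}_1,
     which yields *INIT == 0 and *STEP == 1, whereas a degree-2 chrec such
     as {0, +, {0, +, 1}_1}_1 is rejected by the check above.  */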
735   step_expr = evolution_part;
736   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
737 
738   if (dump_enabled_p ())
739     {
740       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742       dump_printf (MSG_NOTE, ",  init: ");
743       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744       dump_printf (MSG_NOTE, "\n");
745     }
746 
747   *init = init_expr;
748   *step = step_expr;
749 
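  /* The step must be an INTEGER_CST, an SSA_NAME defined outside the loop
     whose type is integral (or floating-point when -fassociative-math is
     enabled), or a REAL_CST when -fassociative-math is enabled; anything
     else is reported as an unknown step.  */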
750   if (TREE_CODE (step_expr) != INTEGER_CST
751       && (TREE_CODE (step_expr) != SSA_NAME
752 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 		  || !flag_associative_math)))
757       && (TREE_CODE (step_expr) != REAL_CST
758 	  || !flag_associative_math))
759     {
760       if (dump_enabled_p ())
761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762                          "step unknown.\n");
763       return false;
764     }
765 
766   return true;
767 }
768 
769 /* Function vect_analyze_scalar_cycles_1.
770 
771    Examine the cross iteration def-use cycles of scalar variables
772    in LOOP.  LOOP_VINFO represents the loop that is now being
773    considered for vectorization (can be LOOP, or an outer-loop
774    enclosing LOOP).  */
775 
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
778 {
779   basic_block bb = loop->header;
780   tree init, step;
781   auto_vec<gimple *, 64> worklist;
782   gphi_iterator gsi;
783   bool double_reduc;
784 
785   if (dump_enabled_p ())
786     dump_printf_loc (MSG_NOTE, vect_location,
787                      "=== vect_analyze_scalar_cycles ===\n");
788 
  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified; therefore, this order must not be
     changed.  */
792   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
793     {
794       gphi *phi = gsi.phi ();
795       tree access_fn = NULL;
796       tree def = PHI_RESULT (phi);
797       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
798 
799       if (dump_enabled_p ())
800 	{
801 	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
803 	}
804 
      /* Skip virtual PHIs.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
807       if (virtual_operand_p (def))
808 	continue;
809 
810       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
811 
812       /* Analyze the evolution function.  */
813       access_fn = analyze_scalar_evolution (loop, def);
814       if (access_fn)
815 	{
816 	  STRIP_NOPS (access_fn);
817 	  if (dump_enabled_p ())
818 	    {
819 	      dump_printf_loc (MSG_NOTE, vect_location,
820                                "Access function of PHI: ");
821 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822               dump_printf (MSG_NOTE, "\n");
823 	    }
824 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 	    = initial_condition_in_loop_num (access_fn, loop->num);
826 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 	    = evolution_part_in_loop_num (access_fn, loop->num);
828 	}
829 
830       if (!access_fn
831 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 	      && TREE_CODE (step) != INTEGER_CST))
834 	{
835 	  worklist.safe_push (phi);
836 	  continue;
837 	}
838 
839       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 		  != NULL_TREE);
841       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
842 
843       if (dump_enabled_p ())
844 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
846     }
847 
848 
849   /* Second - identify all reductions and nested cycles.  */
850   while (worklist.length () > 0)
851     {
852       gimple *phi = worklist.pop ();
853       tree def = PHI_RESULT (phi);
854       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855       gimple *reduc_stmt;
856 
857       if (dump_enabled_p ())
858         {
859           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
861         }
862 
863       gcc_assert (!virtual_operand_p (def)
864 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
865 
866       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 						&double_reduc, false);
868       if (reduc_stmt)
869         {
870           if (double_reduc)
871             {
872               if (dump_enabled_p ())
873                 dump_printf_loc (MSG_NOTE, vect_location,
874 				 "Detected double reduction.\n");
875 
876               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878                                                     vect_double_reduction_def;
879             }
880           else
881             {
882               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
883                 {
884                   if (dump_enabled_p ())
885                     dump_printf_loc (MSG_NOTE, vect_location,
886 				     "Detected vectorizable nested cycle.\n");
887 
888                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890                                                              vect_nested_cycle;
891                 }
892               else
893                 {
894                   if (dump_enabled_p ())
895                     dump_printf_loc (MSG_NOTE, vect_location,
896 				     "Detected reduction.\n");
897 
898                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900                                                            vect_reduction_def;
901                   /* Store the reduction cycles for possible vectorization in
902                      loop-aware SLP if it was not detected as reduction
903 		     chain.  */
904 		  if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
906                 }
907             }
908         }
909       else
910         if (dump_enabled_p ())
911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 			   "Unknown def-use cycle pattern.\n");
913     }
914 }
915 
916 
917 /* Function vect_analyze_scalar_cycles.
918 
919    Examine the cross iteration def-use cycles of scalar variables, by
920    analyzing the loop-header PHIs of scalar variables.  Classify each
921    cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
924    Examples for scalar cycles:
925 
926    Example1: reduction:
927 
928               loop1:
929               for (i=0; i<N; i++)
930                  sum += a[i];
931 
932    Example2: induction:
933 
934               loop2:
935               for (i=0; i<N; i++)
936                  a[i] = i;  */
937 
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
940 {
941   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
942 
943   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
944 
945   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946      Reductions in such inner-loop therefore have different properties than
947      the reductions in the nest that gets vectorized:
948      1. When vectorized, they are executed in the same order as in the original
949         scalar loop, so we can't change the order of computation when
950         vectorizing them.
951      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952         current checks are too strict.  */
953 
954   if (loop->inner)
955     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
956 }
957 
958 /* Transfer group and reduction information from STMT to its pattern stmt.  */
959 
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
962 {
963   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964   gimple *stmtp;
965   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 	      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
968   do
969     {
970       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973       if (stmt)
974 	GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 	  = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
976     }
977   while (stmt);
978   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
979 }
980 
981 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
982 
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
985 {
986   gimple *first;
987   unsigned i;
988 
989   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
991       {
992 	gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 	while (next)
994 	  {
995 	    if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 	      break;
997 	    next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
998 	  }
	/* If all stmts in the chain are patterns, redirect the chain to the
	   pattern stmts; otherwise leave it alone and try to handle it
	   without patterns.  */
1001 	if (! next)
1002 	  {
1003 	    vect_fixup_reduc_chain (first);
1004 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 	      = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006 	  }
1007       }
1008 }
1009 
1010 /* Function vect_get_loop_niters.
1011 
   Determine the number of iterations for which the loop is executed and
   place it in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the niter
   information holds in ASSUMPTIONS.
1016 
1017    Return the loop exit condition.  */
1018 
1019 
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 		      tree *number_of_iterations, tree *number_of_iterationsm1)
1023 {
1024   edge exit = single_exit (loop);
1025   struct tree_niter_desc niter_desc;
1026   tree niter_assumptions, niter, may_be_zero;
1027   gcond *cond = get_loop_exit_condition (loop);
1028 
1029   *assumptions = boolean_true_node;
1030   *number_of_iterationsm1 = chrec_dont_know;
1031   *number_of_iterations = chrec_dont_know;
1032   if (dump_enabled_p ())
1033     dump_printf_loc (MSG_NOTE, vect_location,
1034 		     "=== get_loop_niters ===\n");
1035 
1036   if (!exit)
1037     return cond;
1038 
1039   niter = chrec_dont_know;
1040   may_be_zero = NULL_TREE;
1041   niter_assumptions = boolean_true_node;
1042   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043       || chrec_contains_undetermined (niter_desc.niter))
1044     return cond;
1045 
1046   niter_assumptions = niter_desc.assumptions;
1047   may_be_zero = niter_desc.may_be_zero;
1048   niter = niter_desc.niter;
1049 
1050   if (may_be_zero && integer_zerop (may_be_zero))
1051     may_be_zero = NULL_TREE;
1052 
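  /* MAY_BE_ZERO, if still set here, is a condition under which the loop
     exits already in its first iteration, i.e. its latch is never executed.
     For a loop counting from START while "i < END" it might be a condition
     such as "START >= END" (an illustrative example only).  */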
1053   if (may_be_zero)
1054     {
1055       if (COMPARISON_CLASS_P (may_be_zero))
1056 	{
	  /* Try to combine may_be_zero with assumptions; this can simplify
	     the computation of the niter expression.  */
1059 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 					     niter_assumptions,
1062 					     fold_build1 (TRUTH_NOT_EXPR,
1063 							  boolean_type_node,
1064 							  may_be_zero));
1065 	  else
1066 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 				 build_int_cst (TREE_TYPE (niter), 0),
1068 				 rewrite_to_non_trapping_overflow (niter));
1069 
1070 	  may_be_zero = NULL_TREE;
1071 	}
1072       else if (integer_nonzerop (may_be_zero))
1073 	{
1074 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 	  return cond;
1077 	}
1078       else
1079 	return cond;
1080     }
1081 
1082   *assumptions = niter_assumptions;
1083   *number_of_iterationsm1 = niter;
1084 
1085   /* We want the number of loop header executions which is the number
1086      of latch executions plus one.
1087      ???  For UINT_MAX latch executions this number overflows to zero
1088      for loops like do { n++; } while (n != 0);  */
1089   if (niter && !chrec_contains_undetermined (niter))
1090     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 			  build_int_cst (TREE_TYPE (niter), 1));
1092   *number_of_iterations = niter;
1093 
1094   return cond;
1095 }
1096 
1097 /* Function bb_in_loop_p
1098 
1099    Used as predicate for dfs order traversal of the loop bbs.  */
1100 
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1103 {
  const struct loop *const loop = (const struct loop *)data;
  return flow_bb_inside_loop_p (loop, bb);
1108 }
1109 
1110 
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1113 
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115   : vec_info (vec_info::loop, init_cost (loop_in)),
1116     loop (loop_in),
1117     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118     num_itersm1 (NULL_TREE),
1119     num_iters (NULL_TREE),
1120     num_iters_unchanged (NULL_TREE),
1121     num_iters_assumptions (NULL_TREE),
1122     th (0),
1123     versioning_threshold (0),
1124     vectorization_factor (0),
1125     max_vectorization_factor (0),
1126     mask_skip_niters (NULL_TREE),
1127     mask_compare_type (NULL_TREE),
1128     unaligned_dr (NULL),
1129     peeling_for_alignment (0),
1130     ptr_mask (0),
1131     ivexpr_map (NULL),
1132     slp_unrolling_factor (1),
1133     single_scalar_iteration_cost (0),
1134     vectorizable (false),
1135     can_fully_mask_p (true),
1136     fully_masked_p (false),
1137     peeling_for_gaps (false),
1138     peeling_for_niter (false),
1139     operands_swapped (false),
1140     no_data_dependencies (false),
1141     has_mask_store (false),
1142     scalar_loop (NULL),
1143     orig_loop_info (NULL)
1144 {
1145   /* Create/Update stmt_info for all stmts in the loop.  */
1146   basic_block *body = get_loop_body (loop);
1147   for (unsigned int i = 0; i < loop->num_nodes; i++)
1148     {
1149       basic_block bb = body[i];
1150       gimple_stmt_iterator si;
1151 
1152       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1153 	{
1154 	  gimple *phi = gsi_stmt (si);
1155 	  gimple_set_uid (phi, 0);
1156 	  set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1157 	}
1158 
1159       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1160 	{
1161 	  gimple *stmt = gsi_stmt (si);
1162 	  gimple_set_uid (stmt, 0);
1163 	  set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1164 	}
1165     }
1166   free (body);
1167 
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as a reversed postorder traversal, so we are safe.  */
1172 
1173   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174 					  bbs, loop->num_nodes, loop);
1175   gcc_assert (nbbs == loop->num_nodes);
1176 }
1177 
1178 /* Free all levels of MASKS.  */
1179 
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1182 {
1183   rgroup_masks *rgm;
1184   unsigned int i;
1185   FOR_EACH_VEC_ELT (*masks, i, rgm)
1186     rgm->masks.release ();
1187   masks->release ();
1188 }
1189 
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191    stmt_vec_info structs of all the stmts in the loop.  */
1192 
1193 _loop_vec_info::~_loop_vec_info ()
1194 {
1195   int nbbs;
1196   gimple_stmt_iterator si;
1197   int j;
1198 
1199   nbbs = loop->num_nodes;
1200   for (j = 0; j < nbbs; j++)
1201     {
1202       basic_block bb = bbs[j];
1203       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204         free_stmt_vec_info (gsi_stmt (si));
1205 
1206       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1207         {
1208 	  gimple *stmt = gsi_stmt (si);
1209 
1210 	  /* We may have broken canonical form by moving a constant
1211 	     into RHS1 of a commutative op.  Fix such occurrences.  */
1212 	  if (operands_swapped && is_gimple_assign (stmt))
1213 	    {
1214 	      enum tree_code code = gimple_assign_rhs_code (stmt);
1215 
1216 	      if ((code == PLUS_EXPR
1217 		   || code == POINTER_PLUS_EXPR
1218 		   || code == MULT_EXPR)
1219 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220 		swap_ssa_operands (stmt,
1221 				   gimple_assign_rhs1_ptr (stmt),
1222 				   gimple_assign_rhs2_ptr (stmt));
1223 	      else if (code == COND_EXPR
1224 		       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1225 		{
1226 		  tree cond_expr = gimple_assign_rhs1 (stmt);
1227 		  enum tree_code cond_code = TREE_CODE (cond_expr);
1228 
1229 		  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1230 		    {
1231 		      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232 								  0));
1233 		      cond_code = invert_tree_comparison (cond_code,
1234 							  honor_nans);
1235 		      if (cond_code != ERROR_MARK)
1236 			{
1237 			  TREE_SET_CODE (cond_expr, cond_code);
1238 			  swap_ssa_operands (stmt,
1239 					     gimple_assign_rhs2_ptr (stmt),
1240 					     gimple_assign_rhs3_ptr (stmt));
1241 			}
1242 		    }
1243 		}
1244 	    }
1245 
1246 	  /* Free stmt_vec_info.  */
1247 	  free_stmt_vec_info (stmt);
1248           gsi_next (&si);
1249         }
1250     }
1251 
1252   free (bbs);
1253 
1254   release_vec_loop_masks (&masks);
1255   delete ivexpr_map;
1256 
1257   loop->aux = NULL;
1258 }
1259 
1260 /* Return an invariant or register for EXPR and emit necessary
1261    computations in the LOOP_VINFO loop preheader.  */
1262 
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1265 {
1266   if (is_gimple_reg (expr)
1267       || is_gimple_min_invariant (expr))
1268     return expr;
1269 
1270   if (! loop_vinfo->ivexpr_map)
1271     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273   if (! cached)
1274     {
1275       gimple_seq stmts = NULL;
1276       cached = force_gimple_operand (unshare_expr (expr),
1277 				     &stmts, true, NULL_TREE);
1278       if (stmts)
1279 	{
1280 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281 	  gsi_insert_seq_on_edge_immediate (e, stmts);
1282 	}
1283     }
1284   return cached;
1285 }
1286 
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288    all masks required to mask LOOP_VINFO.  */
1289 
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1292 {
1293   rgroup_masks *rgm;
1294   unsigned int i;
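  /* Each mask would be produced by IFN_WHILE_ULT, which (roughly speaking)
     computes a mask whose element J is set iff INDEX + J < LIMIT, where
     INDEX and LIMIT are scalars of type CMP_TYPE.  */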
1295   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296     if (rgm->mask_type != NULL_TREE
1297 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298 					    cmp_type, rgm->mask_type,
1299 					    OPTIMIZE_FOR_SPEED))
1300       return false;
1301   return true;
1302 }
1303 
/* Calculate the maximum number of scalars per iteration over all the
   rgroups in LOOP_VINFO.  */
1306 
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1309 {
1310   unsigned int res = 1;
1311   unsigned int i;
1312   rgroup_masks *rgm;
1313   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314     res = MAX (res, rgm->max_nscalars_per_iter);
1315   return res;
1316 }
1317 
1318 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1319    whether we can actually generate the masks required.  Return true if so,
1320    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1321 
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1324 {
1325   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326   unsigned int min_ni_width;
1327 
1328   /* Use a normal loop if there are no statements that need masking.
1329      This only happens in rare degenerate cases: it means that the loop
1330      has no loads, no stores, and no live-out values.  */
1331   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332     return false;
1333 
1334   /* Get the maximum number of iterations that is representable
1335      in the counter type.  */
1336   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1338 
1339   /* Get a more refined estimate for the number of iterations.  */
1340   widest_int max_back_edges;
1341   if (max_loop_iterations (loop, &max_back_edges))
1342     max_ni = wi::smin (max_ni, max_back_edges + 1);
1343 
1344   /* Account for rgroup masks, in which each bit is replicated N times.  */
1345   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1346 
1347   /* Work out how many bits we need to represent the limit.  */
1348   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
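  /* For example (purely illustrative numbers): if the niter type allows up
     to 2^32 iterations, max_loop_iterations refines that to at most 1000
     latch executions, and the largest rgroup uses 2 scalars per iteration,
     then max_ni = 1001 * 2 = 2002 and min_ni_width = 11.  */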
1349 
1350   /* Find a scalar mode for which WHILE_ULT is supported.  */
1351   opt_scalar_int_mode cmp_mode_iter;
1352   tree cmp_type = NULL_TREE;
1353   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1354     {
1355       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356       if (cmp_bits >= min_ni_width
1357 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1358 	{
1359 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360 	  if (this_type
1361 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1362 	    {
1363 	      /* Although we could stop as soon as we find a valid mode,
1364 		 it's often better to continue until we hit Pmode, since the
1365 		 operands to the WHILE are more likely to be reusable in
1366 		 address calculations.  */
1367 	      cmp_type = this_type;
1368 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 		break;
1370 	    }
1371 	}
1372     }
1373 
1374   if (!cmp_type)
1375     return false;
1376 
1377   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378   return true;
1379 }
1380 
1381 /* Calculate the cost of one scalar iteration of the loop.  */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1384 {
1385   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387   int nbbs = loop->num_nodes, factor;
1388   int innerloop_iters, i;
1389 
1390   /* Gather costs for statements in the scalar loop.  */
1391 
1392   /* FORNOW.  */
1393   innerloop_iters = 1;
1394   if (loop->inner)
1395     innerloop_iters = 50; /* FIXME */
1396 
1397   for (i = 0; i < nbbs; i++)
1398     {
1399       gimple_stmt_iterator si;
1400       basic_block bb = bbs[i];
1401 
1402       if (bb->loop_father == loop->inner)
1403         factor = innerloop_iters;
1404       else
1405         factor = 1;
1406 
1407       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1408         {
1409 	  gimple *stmt = gsi_stmt (si);
1410           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411 
1412           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413             continue;
1414 
1415           /* Skip stmts that are not vectorized inside the loop.  */
1416           if (stmt_info
1417               && !STMT_VINFO_RELEVANT_P (stmt_info)
1418               && (!STMT_VINFO_LIVE_P (stmt_info)
1419                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420 	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421             continue;
1422 
1423 	  vect_cost_for_stmt kind;
1424           if (STMT_VINFO_DATA_REF (stmt_info))
1425             {
1426               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
                kind = scalar_load;
              else
                kind = scalar_store;
1430             }
1431           else
1432             kind = scalar_stmt;
1433 
1434 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435 			    factor, kind, stmt_info, 0, vect_prologue);
1436         }
1437     }
1438 
1439   /* Now accumulate cost.  */
1440   void *target_cost_data = init_cost (loop);
1441   stmt_info_for_cost *si;
1442   int j;
1443   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444 		    j, si)
1445     {
1446       struct _stmt_vec_info *stmt_info
1447 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448       (void) add_stmt_cost (target_cost_data, si->count,
1449 			    si->kind, stmt_info, si->misalign,
1450 			    vect_body);
1451     }
1452   unsigned dummy, body_cost = 0;
1453   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454   destroy_cost_data (target_cost_data);
1455   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1456 }
1457 
1458 
1459 /* Function vect_analyze_loop_form_1.
1460 
1461    Verify that certain CFG restrictions hold, including:
1462    - the loop has a pre-header
1463    - the loop has a single entry and exit
1464    - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., a countable loop.  The
     niter could be analyzed under some assumptions.  */
1467 
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470 			  tree *assumptions, tree *number_of_iterationsm1,
1471 			  tree *number_of_iterations, gcond **inner_loop_cond)
1472 {
1473   if (dump_enabled_p ())
1474     dump_printf_loc (MSG_NOTE, vect_location,
1475 		     "=== vect_analyze_loop_form ===\n");
1476 
1477   /* Different restrictions apply when we are considering an inner-most loop,
1478      vs. an outer (nested) loop.
1479      (FORNOW. May want to relax some of these restrictions in the future).  */
1480 
1481   if (!loop->inner)
1482     {
1483       /* Inner-most loop.  We currently require that the number of BBs is
1484 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1485 	 look like this:
1486 
1487                         (pre-header)
1488                            |
1489                           header <--------+
1490                            | |            |
1491                            | +--> latch --+
1492                            |
1493                         (exit-bb)  */
1494 
1495       if (loop->num_nodes != 2)
1496         {
1497           if (dump_enabled_p ())
1498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 			     "not vectorized: control flow in loop.\n");
1500           return false;
1501         }
1502 
1503       if (empty_block_p (loop->header))
1504 	{
1505 	  if (dump_enabled_p ())
1506 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 			     "not vectorized: empty loop.\n");
1508 	  return false;
1509 	}
1510     }
1511   else
1512     {
1513       struct loop *innerloop = loop->inner;
1514       edge entryedge;
1515 
1516       /* Nested loop. We currently require that the loop is doubly-nested,
1517 	 contains a single inner loop, and the number of BBs is exactly 5.
1518 	 Vectorizable outer-loops look like this:
1519 
1520 			(pre-header)
1521 			   |
1522 			  header <---+
1523 			   |         |
1524 		          inner-loop |
1525 			   |         |
1526 			  tail ------+
1527 			   |
1528 		        (exit-bb)
1529 
1530 	 The inner-loop has the properties expected of inner-most loops
1531 	 as described above.  */
1532 
1533       if ((loop->inner)->inner || (loop->inner)->next)
1534 	{
1535 	  if (dump_enabled_p ())
1536 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 			     "not vectorized: multiple nested loops.\n");
1538 	  return false;
1539 	}
1540 
1541       if (loop->num_nodes != 5)
1542         {
1543 	  if (dump_enabled_p ())
1544 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 			     "not vectorized: control flow in loop.\n");
1546 	  return false;
1547         }
1548 
1549       entryedge = loop_preheader_edge (innerloop);
1550       if (entryedge->src != loop->header
1551 	  || !single_exit (innerloop)
1552 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1553 	{
1554 	  if (dump_enabled_p ())
1555 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 			     "not vectorized: unsupported outerloop form.\n");
1557 	  return false;
1558 	}
1559 
1560       /* Analyze the inner-loop.  */
1561       tree inner_niterm1, inner_niter, inner_assumptions;
1562       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563 				      &inner_assumptions, &inner_niterm1,
1564 				      &inner_niter, NULL)
	  /* We don't support analyzing the niter under assumptions for the
	     inner loop.  */
1567 	  || !integer_onep (inner_assumptions))
1568 	{
1569 	  if (dump_enabled_p ())
1570             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 			     "not vectorized: Bad inner loop.\n");
1572 	  return false;
1573 	}
1574 
1575       if (!expr_invariant_in_loop_p (loop, inner_niter))
1576 	{
1577 	  if (dump_enabled_p ())
1578 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 			     "not vectorized: inner-loop count not"
1580                              " invariant.\n");
1581 	  return false;
1582 	}
1583 
1584       if (dump_enabled_p ())
1585         dump_printf_loc (MSG_NOTE, vect_location,
1586 			 "Considering outer-loop vectorization.\n");
1587     }
1588 
1589   if (!single_exit (loop)
1590       || EDGE_COUNT (loop->header->preds) != 2)
1591     {
1592       if (dump_enabled_p ())
1593         {
1594           if (!single_exit (loop))
1595 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 			     "not vectorized: multiple exits.\n");
1597           else if (EDGE_COUNT (loop->header->preds) != 2)
1598 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 			     "not vectorized: too many incoming edges.\n");
1600         }
1601       return false;
1602     }
1603 
  /* We assume that the loop exit condition is at the end of the loop, i.e.,
1605      that the loop is represented as a do-while (with a proper if-guard
1606      before the loop if needed), where the loop header contains all the
1607      executable statements, and the latch is empty.  */
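  /* For instance (an illustrative example only), a source loop such as

         do { a[i] = b[i] + c[i]; i++; } while (i < n);

     already has this shape: the exit test is at the bottom, all the work is
     done in the header, and the latch carries no statements.  */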
1608   if (!empty_block_p (loop->latch)
1609       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1610     {
1611       if (dump_enabled_p ())
1612 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 			 "not vectorized: latch block not empty.\n");
1614       return false;
1615     }
1616 
1617   /* Make sure the exit is not abnormal.  */
1618   edge e = single_exit (loop);
1619   if (e->flags & EDGE_ABNORMAL)
1620     {
1621       if (dump_enabled_p ())
1622 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 			 "not vectorized: abnormal loop exit edge.\n");
1624       return false;
1625     }
1626 
1627   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628 				     number_of_iterationsm1);
1629   if (!*loop_cond)
1630     {
1631       if (dump_enabled_p ())
1632 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 			 "not vectorized: complicated exit condition.\n");
1634       return false;
1635     }
1636 
1637   if (integer_zerop (*assumptions)
1638       || !*number_of_iterations
1639       || chrec_contains_undetermined (*number_of_iterations))
1640     {
1641       if (dump_enabled_p ())
1642 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 			 "not vectorized: number of iterations cannot be "
1644 			 "computed.\n");
1645       return false;
1646     }
1647 
1648   if (integer_zerop (*number_of_iterations))
1649     {
1650       if (dump_enabled_p ())
1651 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 			 "not vectorized: number of iterations = 0.\n");
1653       return false;
1654     }
1655 
1656   return true;
1657 }
1658 
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1660 
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1663 {
1664   tree assumptions, number_of_iterations, number_of_iterationsm1;
1665   gcond *loop_cond, *inner_loop_cond = NULL;
1666 
1667   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668 				  &assumptions, &number_of_iterationsm1,
1669 				  &number_of_iterations, &inner_loop_cond))
1670     return NULL;
1671 
1672   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676   if (!integer_onep (assumptions))
1677     {
1678       /* We consider to vectorize this loop by versioning it under
1679 	 some assumptions.  In order to do this, we need to clear
1680 	 existing information computed by scev and niter analyzer.  */
1681       scev_reset_htab ();
1682       free_numbers_of_iterations_estimates (loop);
1683       /* Also set flag for this loop so that following scev and niter
1684 	 analysis are done under the assumptions.  */
1685       loop_constraint_set (loop, LOOP_C_FINITE);
1686       /* Also record the assumptions for versioning.  */
1687       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1688     }
1689 
1690   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1691     {
1692       if (dump_enabled_p ())
1693         {
1694           dump_printf_loc (MSG_NOTE, vect_location,
1695 			   "Symbolic number of iterations is ");
1696 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697           dump_printf (MSG_NOTE, "\n");
1698         }
1699     }
1700 
1701   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702   if (inner_loop_cond)
1703     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704       = loop_exit_ctrl_vec_info_type;
1705 
1706   gcc_assert (!loop->aux);
1707   loop->aux = loop_vinfo;
1708   return loop_vinfo;
1709 }
1710 
1711 
1712 
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */
1715 
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1718 {
1719   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721   int nbbs = loop->num_nodes;
1722   poly_uint64 vectorization_factor;
1723   int i;
1724 
1725   if (dump_enabled_p ())
1726     dump_printf_loc (MSG_NOTE, vect_location,
1727 		     "=== vect_update_vf_for_slp ===\n");
1728 
1729   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730   gcc_assert (known_ne (vectorization_factor, 0U));
1731 
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     the vectorization factor of the loop is the unrolling factor required
     by the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
1737   bool only_slp_in_loop = true;
1738   for (i = 0; i < nbbs; i++)
1739     {
1740       basic_block bb = bbs[i];
1741       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742 	   gsi_next (&si))
1743 	{
1744 	  gimple *stmt = gsi_stmt (si);
1745 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747 	      && STMT_VINFO_RELATED_STMT (stmt_info))
1748 	    {
1749 	      stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750 	      stmt_info = vinfo_for_stmt (stmt);
1751 	    }
1752 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754 	      && !PURE_SLP_STMT (stmt_info))
1755 	    /* STMT needs both SLP and loop-based vectorization.  */
1756 	    only_slp_in_loop = false;
1757 	}
1758     }
1759 
1760   if (only_slp_in_loop)
1761     {
1762       dump_printf_loc (MSG_NOTE, vect_location,
1763 		       "Loop contains only SLP stmts\n");
1764       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1765     }
1766   else
1767     {
1768       dump_printf_loc (MSG_NOTE, vect_location,
1769 		       "Loop contains SLP and non-SLP stmts\n");
1770       /* Both the vectorization factor and unroll factor have the form
1771 	 current_vector_size * X for some rational X, so they must have
1772 	 a common multiple.  */
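      /* For example (illustrative values only), a vectorization factor of 4
	 combined with an SLP unrolling factor of 6 would give a final
	 vectorization factor of 12, their least common multiple.  */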
1773       vectorization_factor
1774 	= force_common_multiple (vectorization_factor,
1775 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1776     }
1777 
1778   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779   if (dump_enabled_p ())
1780     {
1781       dump_printf_loc (MSG_NOTE, vect_location,
1782 		       "Updating vectorization factor to ");
1783       dump_dec (MSG_NOTE, vectorization_factor);
1784       dump_printf (MSG_NOTE, ".\n");
1785     }
1786 }
1787 
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789    the other phi in the reduction is also relevant for vectorization.
1790    This rejects cases such as:
1791 
1792       outer1:
1793 	x_1 = PHI <x_3(outer2), ...>;
1794 	...
1795 
1796       inner:
1797 	x_2 = ...;
1798 	...
1799 
1800       outer2:
1801 	x_3 = PHI <x_2(inner)>;
1802 
1803    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1804 
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1807 {
1808   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809     return false;
1810 
1811   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1813 }
1814 
1815 /* Function vect_analyze_loop_operations.
1816 
1817    Scan the loop stmts and make sure they are all vectorizable.  */
1818 
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1821 {
1822   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824   int nbbs = loop->num_nodes;
1825   int i;
1826   stmt_vec_info stmt_info;
1827   bool need_to_vectorize = false;
1828   bool ok;
1829 
1830   if (dump_enabled_p ())
1831     dump_printf_loc (MSG_NOTE, vect_location,
1832 		     "=== vect_analyze_loop_operations ===\n");
1833 
1834   for (i = 0; i < nbbs; i++)
1835     {
1836       basic_block bb = bbs[i];
1837 
1838       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839 	   gsi_next (&si))
1840         {
1841           gphi *phi = si.phi ();
1842           ok = true;
1843 
1844           stmt_info = vinfo_for_stmt (phi);
1845           if (dump_enabled_p ())
1846             {
1847               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1849             }
1850 	  if (virtual_operand_p (gimple_phi_result (phi)))
1851 	    continue;
1852 
1853           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854              (i.e., a phi in the tail of the outer-loop).  */
1855           if (! is_loop_header_bb_p (bb))
1856             {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outer loop (unless it is a double
                 reduction, i.e., this phi is vect_reduction_def), because
                 this case would require us to actually do something here.  */
1861               if (STMT_VINFO_LIVE_P (stmt_info)
1862 		  && !vect_active_double_reduction_p (stmt_info))
1863                 {
1864                   if (dump_enabled_p ())
1865 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 				     "Unsupported loop-closed phi in "
1867 				     "outer-loop.\n");
1868                   return false;
1869                 }
1870 
1871               /* If PHI is used in the outer loop, we check that its operand
1872                  is defined in the inner loop.  */
1873               if (STMT_VINFO_RELEVANT_P (stmt_info))
1874                 {
1875                   tree phi_op;
1876 		  gimple *op_def_stmt;
1877 
1878                   if (gimple_phi_num_args (phi) != 1)
1879                     return false;
1880 
1881                   phi_op = PHI_ARG_DEF (phi, 0);
1882                   if (TREE_CODE (phi_op) != SSA_NAME)
1883                     return false;
1884 
1885                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886 		  if (gimple_nop_p (op_def_stmt)
1887 		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888 		      || !vinfo_for_stmt (op_def_stmt))
1889                     return false;
1890 
1891                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892                         != vect_used_in_outer
1893                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894                            != vect_used_in_outer_by_reduction)
1895                     return false;
1896                 }
1897 
1898               continue;
1899             }
1900 
1901           gcc_assert (stmt_info);
1902 
1903           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904                || STMT_VINFO_LIVE_P (stmt_info))
1905               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1906             {
1907               /* A scalar-dependence cycle that we don't support.  */
1908               if (dump_enabled_p ())
1909 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 				 "not vectorized: scalar dependence cycle.\n");
1911               return false;
1912             }
1913 
1914           if (STMT_VINFO_RELEVANT_P (stmt_info))
1915             {
1916               need_to_vectorize = true;
1917               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918 		  && ! PURE_SLP_STMT (stmt_info))
1919                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922 		       && ! PURE_SLP_STMT (stmt_info))
1923 		ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1924             }
1925 
1926 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1927 	  if (ok
1928 	      && STMT_VINFO_LIVE_P (stmt_info)
1929 	      && !PURE_SLP_STMT (stmt_info))
1930 	    ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1931 
1932           if (!ok)
1933             {
1934               if (dump_enabled_p ())
1935                 {
1936 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 				   "not vectorized: relevant phi not "
1938 				   "supported: ");
1939                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1940                 }
1941 	      return false;
1942             }
1943         }
1944 
1945       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946 	   gsi_next (&si))
1947         {
1948 	  gimple *stmt = gsi_stmt (si);
1949 	  if (!gimple_clobber_p (stmt)
1950 	      && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951 	    return false;
1952         }
1953     } /* bbs */
1954 
1955   /* All operations in the loop are either irrelevant (deal with loop
1956      control, or dead), or only used outside the loop and can be moved
1957      out of the loop (e.g. invariants, inductions).  The loop can be
1958      optimized away by scalar optimizations.  We're better off not
1959      touching this loop.  */
1960   if (!need_to_vectorize)
1961     {
1962       if (dump_enabled_p ())
1963         dump_printf_loc (MSG_NOTE, vect_location,
1964 			 "All the computation can be taken out of the loop.\n");
1965       if (dump_enabled_p ())
1966 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967 			 "not vectorized: redundant loop. no profit to "
1968 			 "vectorize.\n");
1969       return false;
1970     }
1971 
1972   return true;
1973 }
1974 
1975 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1976    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1977    definitely no, or -1 if it's worth retrying.  */
1978 
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1981 {
1982   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1984 
1985   /* Only fully-masked loops can have iteration counts less than the
1986      vectorization factor.  */
1987   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988     {
1989       HOST_WIDE_INT max_niter;
1990 
1991       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992 	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993       else
1994 	max_niter = max_stmt_executions_int (loop);
1995 
1996       if (max_niter != -1
1997 	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1998 	{
1999 	  if (dump_enabled_p ())
2000 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 			     "not vectorized: iteration count smaller than "
2002 			     "vectorization factor.\n");
2003 	  return 0;
2004 	}
2005     }
2006 
2007   int min_profitable_iters, min_profitable_estimate;
2008   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 				      &min_profitable_estimate);
2010 
2011   if (min_profitable_iters < 0)
2012     {
2013       if (dump_enabled_p ())
2014 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 			 "not vectorized: vectorization not profitable.\n");
2016       if (dump_enabled_p ())
2017 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 			 "not vectorized: vector version will never be "
2019 			 "profitable.\n");
2020       return -1;
2021     }
2022 
2023   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 			       * assumed_vf);
2025 
  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */
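  /* For example (illustrative numbers only), with --param
     min-vect-loop-bound=2 and an assumed vectorization factor of 4, the
     user-specified bound corresponds to 8 scalar iterations; if the cost
     model requires at least 10 iterations to be profitable, the resulting
     threshold TH is 10.  */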
2028   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029 				    min_profitable_iters);
2030 
2031   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2032 
2033   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2035     {
2036       if (dump_enabled_p ())
2037 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 			 "not vectorized: vectorization not profitable.\n");
2039       if (dump_enabled_p ())
2040 	dump_printf_loc (MSG_NOTE, vect_location,
2041 			 "not vectorized: iteration count smaller than user "
2042 			 "specified loop bound parameter or minimum profitable "
2043 			 "iterations (whichever is more conservative).\n");
2044       return 0;
2045     }
2046 
2047   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048   if (estimated_niter == -1)
2049     estimated_niter = likely_max_stmt_executions_int (loop);
2050   if (estimated_niter != -1
2051       && ((unsigned HOST_WIDE_INT) estimated_niter
2052 	  < MAX (th, (unsigned) min_profitable_estimate)))
2053     {
2054       if (dump_enabled_p ())
2055 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 			 "not vectorized: estimated iteration count too "
2057 			 "small.\n");
2058       if (dump_enabled_p ())
2059 	dump_printf_loc (MSG_NOTE, vect_location,
2060 			 "not vectorized: estimated iteration count smaller "
2061 			 "than specified loop bound parameter or minimum "
2062 			 "profitable iterations (whichever is more "
2063 			 "conservative).\n");
2064       return -1;
2065     }
2066 
2067   return 1;
2068 }
2069 
2070 
2071 /* Function vect_analyze_loop_2.
2072 
2073    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074    for it.  The different analyses will record information in the
2075    loop_vec_info struct.  */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2078 {
2079   bool ok;
2080   int res;
2081   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082   poly_uint64 min_vf = 2;
2083   unsigned int n_stmts = 0;
2084 
2085   /* The first group of checks is independent of the vector size.  */
2086   fatal = true;
2087 
2088   /* Find all data references in the loop (which correspond to vdefs/vuses)
2089      and analyze their evolution in the loop.  */
2090 
2091   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2092 
2093   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2095     {
2096       if (dump_enabled_p ())
2097 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 			 "not vectorized: loop nest containing two "
2099 			 "or more consecutive inner loops cannot be "
2100 			 "vectorized\n");
2101       return false;
2102     }
2103 
2104   for (unsigned i = 0; i < loop->num_nodes; i++)
2105     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106 	 !gsi_end_p (gsi); gsi_next (&gsi))
2107       {
2108 	gimple *stmt = gsi_stmt (gsi);
2109 	if (is_gimple_debug (stmt))
2110 	  continue;
2111 	++n_stmts;
2112 	if (!find_data_references_in_stmt (loop, stmt,
2113 					   &LOOP_VINFO_DATAREFS (loop_vinfo)))
2114 	  {
2115 	    if (is_gimple_call (stmt) && loop->safelen)
2116 	      {
2117 		tree fndecl = gimple_call_fndecl (stmt), op;
2118 		if (fndecl != NULL_TREE)
2119 		  {
2120 		    cgraph_node *node = cgraph_node::get (fndecl);
2121 		    if (node != NULL && node->simd_clones != NULL)
2122 		      {
2123 			unsigned int j, n = gimple_call_num_args (stmt);
2124 			for (j = 0; j < n; j++)
2125 			  {
2126 			    op = gimple_call_arg (stmt, j);
2127 			    if (DECL_P (op)
2128 				|| (REFERENCE_CLASS_P (op)
2129 				    && get_base_address (op)))
2130 			      break;
2131 			  }
2132 			op = gimple_call_lhs (stmt);
2133 			/* Ignore #pragma omp declare simd functions
2134 			   if they don't have data references in the
2135 			   call stmt itself.  */
2136 			if (j == n
2137 			    && !(op
2138 				 && (DECL_P (op)
2139 				     || (REFERENCE_CLASS_P (op)
2140 					 && get_base_address (op)))))
2141 			  continue;
2142 		      }
2143 		  }
2144 	      }
2145 	    if (dump_enabled_p ())
2146 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 			       "not vectorized: loop contains function "
2148 			       "calls or data references that cannot "
2149 			       "be analyzed\n");
2150 	    return false;
2151 	  }
2152       }
2153 
2154   /* Analyze the data references and also adjust the minimal
2155      vectorization factor according to the loads and stores.  */
2156 
2157   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158   if (!ok)
2159     {
2160       if (dump_enabled_p ())
2161 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 			 "bad data references.\n");
2163       return false;
2164     }
2165 
2166   /* Classify all cross-iteration scalar data-flow cycles.
2167      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2168   vect_analyze_scalar_cycles (loop_vinfo);
2169 
2170   vect_pattern_recog (loop_vinfo);
2171 
2172   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2173 
2174   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
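  /* For instance (illustrative only), in a loop over i the access a[i] is
     consecutive, whereas a[2*i] (strided) or a[idx[i]] (gather) is not.  */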
2176 
2177   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178   if (!ok)
2179     {
2180       if (dump_enabled_p ())
2181 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 			 "bad data access.\n");
2183       return false;
2184     }
2185 
2186   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2187 
2188   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189   if (!ok)
2190     {
2191       if (dump_enabled_p ())
2192 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 			 "unexpected pattern.\n");
2194       return false;
2195     }
2196 
  /* The rest of the analysis below depends on the vector size in some way,
     so from here on failures are no longer necessarily fatal.  */
2198   fatal = false;
2199 
2200   /* Analyze data dependences between the data-refs in the loop
2201      and adjust the maximum vectorization factor according to
2202      the dependences.
2203      FORNOW: fail at the first data dependence that we encounter.  */
2204 
2205   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206   if (!ok
2207       || (max_vf != MAX_VECTORIZATION_FACTOR
2208 	  && maybe_lt (max_vf, min_vf)))
2209     {
2210       if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bad data dependence.\n");
2213       return false;
2214     }
2215   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2216 
2217   ok = vect_determine_vectorization_factor (loop_vinfo);
2218   if (!ok)
2219     {
2220       if (dump_enabled_p ())
2221 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 			 "can't determine vectorization factor.\n");
2223       return false;
2224     }
2225   if (max_vf != MAX_VECTORIZATION_FACTOR
2226       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2227     {
2228       if (dump_enabled_p ())
2229 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 			 "bad data dependence.\n");
2231       return false;
2232     }
2233 
2234   /* Compute the scalar iteration cost.  */
2235   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2236 
2237   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238   unsigned th;
2239 
2240   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2241   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242   if (!ok)
2243     return false;
2244 
2245   /* If there are any SLP instances mark them as pure_slp.  */
2246   bool slp = vect_make_slp_decision (loop_vinfo);
2247   if (slp)
2248     {
2249       /* Find stmts that need to be both vectorized and SLPed.  */
2250       vect_detect_hybrid_slp (loop_vinfo);
2251 
2252       /* Update the vectorization factor based on the SLP decision.  */
2253       vect_update_vf_for_slp (loop_vinfo);
2254     }
2255 
2256   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2257 
2258   /* We don't expect to have to roll back to anything other than an empty
2259      set of rgroups.  */
2260   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2261 
2262   /* This is the point where we can re-start analysis with SLP forced off.  */
2263 start_over:
2264 
2265   /* Now the vectorization factor is final.  */
2266   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267   gcc_assert (known_ne (vectorization_factor, 0U));
2268 
2269   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2270     {
2271       dump_printf_loc (MSG_NOTE, vect_location,
2272 		       "vectorization_factor = ");
2273       dump_dec (MSG_NOTE, vectorization_factor);
2274       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2276     }
2277 
2278   HOST_WIDE_INT max_niter
2279     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2280 
2281   /* Analyze the alignment of the data-refs in the loop.
2282      Fail if a data reference is found that cannot be vectorized.  */
2283 
2284   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 			 "bad data alignment.\n");
2290       return false;
2291     }
2292 
2293   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294      It is important to call pruning after vect_analyze_data_ref_accesses,
2295      since we use grouping information gathered by interleaving analysis.  */
2296   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297   if (!ok)
2298     return false;
2299 
  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization.  */
2302   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2303     {
      /* This pass will decide on using loop versioning and/or loop peeling
	 in order to enhance the alignment of data references in the loop.  */
      ok = vect_enhance_data_refs_alignment (loop_vinfo);
      if (!ok)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "bad data alignment.\n");
	  return false;
	}
2314     }
2315 
2316   if (slp)
2317     {
2318       /* Analyze operations in the SLP instances.  Note this may
2319 	 remove unsupported SLP instances which makes the above
2320 	 SLP kind detection invalid.  */
2321       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322       vect_slp_analyze_operations (loop_vinfo);
2323       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 	goto again;
2325     }
2326 
2327   /* Scan all the remaining operations in the loop that are not subject
2328      to SLP and make sure they are vectorizable.  */
2329   ok = vect_analyze_loop_operations (loop_vinfo);
2330   if (!ok)
2331     {
2332       if (dump_enabled_p ())
2333 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 			 "bad operation or unsupported loop bound.\n");
2335       return false;
2336     }
2337 
2338   /* Decide whether to use a fully-masked loop for this vectorization
2339      factor.  */
2340   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342        && vect_verify_full_masking (loop_vinfo));
2343   if (dump_enabled_p ())
2344     {
2345       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 	dump_printf_loc (MSG_NOTE, vect_location,
2347 			 "using a fully-masked loop.\n");
2348       else
2349 	dump_printf_loc (MSG_NOTE, vect_location,
2350 			 "not using a fully-masked loop.\n");
2351     }
2352 
  /* If an epilogue loop is required because of data accesses with gaps,
     one additional iteration needs to be peeled.  Check if there are
     enough iterations for vectorization.  */
2356   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2359     {
2360       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2362 
2363       if (known_lt (wi::to_widest (scalar_niters), vf))
2364 	{
2365 	  if (dump_enabled_p ())
2366 	    dump_printf_loc (MSG_NOTE, vect_location,
2367 			     "loop has no enough iterations to support"
2368 			     " peeling for gaps.\n");
2369 	  return false;
2370 	}
2371     }
2372 
2373   /* Check the costings of the loop make vectorizing worthwhile.  */
2374   res = vect_analyze_loop_costing (loop_vinfo);
2375   if (res < 0)
2376     goto again;
2377   if (!res)
2378     {
2379       if (dump_enabled_p ())
2380 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 			 "Loop costings not worthwhile.\n");
2382       return false;
2383     }
2384 
2385   /* Decide whether we need to create an epilogue loop to handle
2386      remaining scalar iterations.  */
2387   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2388 
2389   unsigned HOST_WIDE_INT const_vf;
2390   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391     /* The main loop handles all iterations.  */
2392     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2395     {
2396       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2397 		       - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2398 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2399 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2400     }
2401   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2402 	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2403 	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2404 		< (unsigned) exact_log2 (const_vf))
2405 	       /* In case of versioning, check if the maximum number of
2406 		  iterations is greater than th.  If they are identical,
2407 		  the epilogue is unnecessary.  */
2408 	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409 		   || ((unsigned HOST_WIDE_INT) max_niter
2410 		       > (th / const_vf) * const_vf))))
2411     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2412 
2413   /* If an epilogue loop is required make sure we can create one.  */
2414   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2415       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2416     {
2417       if (dump_enabled_p ())
2418         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2419       if (!vect_can_advance_ivs_p (loop_vinfo)
2420 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2421 					   single_exit (LOOP_VINFO_LOOP
2422 							 (loop_vinfo))))
2423         {
2424           if (dump_enabled_p ())
2425 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2426 			     "not vectorized: can't create required "
2427 			     "epilog loop\n");
2428           goto again;
2429         }
2430     }
2431 
  /* During peeling, we need to check whether the number of loop iterations
     is enough for both the peeled prolog loop and the vector loop.  This
     check can be merged with the threshold check of loop versioning, so
     increase the threshold for this case if necessary.  */
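  /* As an illustrative example (made-up numbers), with a vectorization
     factor of 4, an unknown prolog peel of at most 4 - 1 = 3 iterations,
     and one extra iteration peeled for gaps, the versioning threshold
     computed below would be 3 + 4 + 1 = 8.  */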
2436   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2437     {
2438       poly_uint64 niters_th = 0;
2439 
2440       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2441 	{
2442 	  /* Niters for peeled prolog loop.  */
2443 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2444 	    {
2445 	      struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2446 	      tree vectype
2447 		= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2448 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2449 	    }
2450 	  else
2451 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2452 	}
2453 
2454       /* Niters for at least one iteration of vectorized loop.  */
2455       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2456 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457       /* One additional iteration because of peeling for gap.  */
2458       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2459 	niters_th += 1;
2460       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2461     }
2462 
2463   gcc_assert (known_eq (vectorization_factor,
2464 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2465 
2466   /* Ok to vectorize!  */
2467   return true;
2468 
2469 again:
2470   /* Try again with SLP forced off but if we didn't do any SLP there is
2471      no point in re-trying.  */
2472   if (!slp)
2473     return false;
2474 
2475   /* If there are reduction chains re-trying will fail anyway.  */
2476   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2477     return false;
2478 
2479   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2480      via interleaving or lane instructions.  */
2481   slp_instance instance;
2482   slp_tree node;
2483   unsigned i, j;
2484   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2485     {
2486       stmt_vec_info vinfo;
2487       vinfo = vinfo_for_stmt
2488 	  (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2489       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2490 	continue;
2491       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2492       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2493       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2494       if (! vect_store_lanes_supported (vectype, size, false)
	  && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
	  && ! vect_grouped_store_supported (vectype, size))
	return false;
2498       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2499 	{
2500 	  vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2501 	  vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2502 	  bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2503 	  size = STMT_VINFO_GROUP_SIZE (vinfo);
2504 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2505 	  if (! vect_load_lanes_supported (vectype, size, false)
2506 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2507 						size))
2508 	    return false;
2509 	}
2510     }
2511 
2512   if (dump_enabled_p ())
2513     dump_printf_loc (MSG_NOTE, vect_location,
2514 		     "re-trying with SLP disabled\n");
2515 
2516   /* Roll back state appropriately.  No SLP this time.  */
2517   slp = false;
  /* Restore the vectorization factor as it would be without SLP.  */
2519   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2520   /* Free the SLP instances.  */
2521   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2522     vect_free_slp_instance (instance);
2523   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2524   /* Reset SLP type to loop_vect on all stmts.  */
2525   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2526     {
2527       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2528       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2529 	   !gsi_end_p (si); gsi_next (&si))
2530 	{
2531 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2532 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2533 	}
2534       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2535 	   !gsi_end_p (si); gsi_next (&si))
2536 	{
2537 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2538 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2539 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2540 	    {
2541 	      stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2542 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2543 	      for (gimple_stmt_iterator pi
2544 		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2545 		   !gsi_end_p (pi); gsi_next (&pi))
2546 		{
2547 		  gimple *pstmt = gsi_stmt (pi);
2548 		  STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2549 		}
2550 	    }
2551 	}
2552     }
2553   /* Free optimized alias test DDRS.  */
2554   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2555   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2556   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2557   /* Reset target cost data.  */
2558   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2559   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2560     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2561   /* Reset accumulated rgroup information.  */
2562   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2563   /* Reset assorted flags.  */
2564   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2565   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2566   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2567   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2568   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2569 
2570   goto start_over;
2571 }
2572 
2573 /* Function vect_analyze_loop.
2574 
2575    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2576    for it.  The different analyses will record information in the
   loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL, the epilogue must
   be vectorized.  */
2579 loop_vec_info
2580 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2581 {
2582   loop_vec_info loop_vinfo;
2583   auto_vector_sizes vector_sizes;
2584 
2585   /* Autodetect first vector size we try.  */
2586   current_vector_size = 0;
2587   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2588   unsigned int next_size = 0;
2589 
2590   if (dump_enabled_p ())
2591     dump_printf_loc (MSG_NOTE, vect_location,
2592 		     "===== analyze_loop_nest =====\n");
2593 
2594   if (loop_outer (loop)
2595       && loop_vec_info_for_loop (loop_outer (loop))
2596       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2597     {
2598       if (dump_enabled_p ())
2599 	dump_printf_loc (MSG_NOTE, vect_location,
2600 			 "outer-loop already vectorized.\n");
2601       return NULL;
2602     }
2603 
2604   poly_uint64 autodetected_vector_size = 0;
2605   while (1)
2606     {
2607       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2608       loop_vinfo = vect_analyze_loop_form (loop);
2609       if (!loop_vinfo)
2610 	{
2611 	  if (dump_enabled_p ())
2612 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2613 			     "bad loop form.\n");
2614 	  return NULL;
2615 	}
2616 
2617       bool fatal = false;
2618 
2619       if (orig_loop_vinfo)
2620 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2621 
2622       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2623 	{
2624 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2625 
2626 	  return loop_vinfo;
2627 	}
2628 
2629       delete loop_vinfo;
2630 
2631       if (next_size == 0)
2632 	autodetected_vector_size = current_vector_size;
2633 
2634       if (next_size < vector_sizes.length ()
2635 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
2636 	next_size += 1;
2637 
2638       if (fatal
2639 	  || next_size == vector_sizes.length ()
2640 	  || known_eq (current_vector_size, 0U))
2641 	return NULL;
2642 
2643       /* Try the next biggest vector size.  */
2644       current_vector_size = vector_sizes[next_size++];
2645       if (dump_enabled_p ())
2646 	{
2647 	  dump_printf_loc (MSG_NOTE, vect_location,
2648 			   "***** Re-trying analysis with "
2649 			   "vector size ");
2650 	  dump_dec (MSG_NOTE, current_vector_size);
2651 	  dump_printf (MSG_NOTE, "\n");
2652 	}
2653     }
2654 }
2655 
2656 /* Return true if there is an in-order reduction function for CODE, storing
2657    it in *REDUC_FN if so.  */
2658 
2659 static bool
2660 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2661 {
2662   switch (code)
2663     {
2664     case PLUS_EXPR:
2665       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2666       return true;
2667 
2668     default:
2669       return false;
2670     }
2671 }
2672 
2673 /* Function reduction_fn_for_scalar_code
2674 
2675    Input:
   CODE - tree_code of the reduction operation.
2677 
2678    Output:
2679    REDUC_FN - the corresponding internal function to be used to reduce the
2680       vector of partial results into a single scalar result, or IFN_LAST
2681       if the operation is a supported reduction operation, but does not have
2682       such an internal function.
2683 
   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
2685 
2686 static bool
2687 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2688 {
2689   switch (code)
2690     {
2691       case MAX_EXPR:
2692         *reduc_fn = IFN_REDUC_MAX;
2693         return true;
2694 
2695       case MIN_EXPR:
2696         *reduc_fn = IFN_REDUC_MIN;
2697         return true;
2698 
2699       case PLUS_EXPR:
2700         *reduc_fn = IFN_REDUC_PLUS;
2701         return true;
2702 
2703       case BIT_AND_EXPR:
2704 	*reduc_fn = IFN_REDUC_AND;
2705 	return true;
2706 
2707       case BIT_IOR_EXPR:
2708 	*reduc_fn = IFN_REDUC_IOR;
2709 	return true;
2710 
2711       case BIT_XOR_EXPR:
2712 	*reduc_fn = IFN_REDUC_XOR;
2713 	return true;
2714 
2715       case MULT_EXPR:
2716       case MINUS_EXPR:
2717         *reduc_fn = IFN_LAST;
2718         return true;
2719 
2720       default:
2721        return false;
2722     }
2723 }
2724 
2725 /* If there is a neutral value X such that SLP reduction NODE would not
2726    be affected by the introduction of additional X elements, return that X,
2727    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2728    is true if the SLP statements perform a single reduction, false if each
2729    statement performs an independent reduction.  */
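/* As an illustrative example only: if an SLP group of three PLUS_EXPR
   reductions has to be padded to four vector lanes, filling the extra lane
   with the neutral value 0 leaves every partial sum unchanged; for a
   MULT_EXPR reduction the neutral value would be 1 instead.  */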
2730 
2731 static tree
2732 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2733 			      bool reduc_chain)
2734 {
2735   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2736   gimple *stmt = stmts[0];
2737   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2738   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2739   tree scalar_type = TREE_TYPE (vector_type);
2740   struct loop *loop = gimple_bb (stmt)->loop_father;
2741   gcc_assert (loop);
2742 
2743   switch (code)
2744     {
2745     case WIDEN_SUM_EXPR:
2746     case DOT_PROD_EXPR:
2747     case SAD_EXPR:
2748     case PLUS_EXPR:
2749     case MINUS_EXPR:
2750     case BIT_IOR_EXPR:
2751     case BIT_XOR_EXPR:
2752       return build_zero_cst (scalar_type);
2753 
2754     case MULT_EXPR:
2755       return build_one_cst (scalar_type);
2756 
2757     case BIT_AND_EXPR:
2758       return build_all_ones_cst (scalar_type);
2759 
2760     case MAX_EXPR:
2761     case MIN_EXPR:
2762       /* For MIN/MAX the initial values are neutral.  A reduction chain
2763 	 has only a single initial value, so that value is neutral for
2764 	 all statements.  */
2765       if (reduc_chain)
2766 	return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2767       return NULL_TREE;
2768 
2769     default:
2770       return NULL_TREE;
2771     }
2772 }
2773 
2774 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2775    STMT is printed with a message MSG. */
2776 
2777 static void
2778 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2779 {
2780   dump_printf_loc (msg_type, vect_location, "%s", msg);
2781   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2782 }
2783 
2784 
2785 /* Detect SLP reduction of the form:
2786 
2787    #a1 = phi <a5, a0>
2788    a2 = operation (a1)
2789    a3 = operation (a2)
2790    a4 = operation (a3)
2791    a5 = operation (a4)
2792 
2793    #a = phi <a5>
2794 
2795    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2796    FIRST_STMT is the first reduction stmt in the chain
2797    (a2 = operation (a1)).
2798 
2799    Return TRUE if a reduction chain was detected.  */
2800 
2801 static bool
2802 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2803 		       gimple *first_stmt)
2804 {
2805   struct loop *loop = (gimple_bb (phi))->loop_father;
2806   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2807   enum tree_code code;
2808   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2809   stmt_vec_info use_stmt_info, current_stmt_info;
2810   tree lhs;
2811   imm_use_iterator imm_iter;
2812   use_operand_p use_p;
2813   int nloop_uses, size = 0, n_out_of_loop_uses;
2814   bool found = false;
2815 
2816   if (loop != vect_loop)
2817     return false;
2818 
2819   lhs = PHI_RESULT (phi);
2820   code = gimple_assign_rhs_code (first_stmt);
2821   while (1)
2822     {
2823       nloop_uses = 0;
2824       n_out_of_loop_uses = 0;
2825       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2826         {
2827 	  gimple *use_stmt = USE_STMT (use_p);
2828 	  if (is_gimple_debug (use_stmt))
2829 	    continue;
2830 
2831           /* Check if we got back to the reduction phi.  */
2832 	  if (use_stmt == phi)
2833             {
2834 	      loop_use_stmt = use_stmt;
2835               found = true;
2836               break;
2837             }
2838 
2839           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2840             {
2841 	      loop_use_stmt = use_stmt;
2842 	      nloop_uses++;
2843             }
2844            else
2845              n_out_of_loop_uses++;
2846 
           /* There can be either a single use in the loop or two uses in
              phi nodes.  */
2849            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2850              return false;
2851         }
2852 
2853       if (found)
2854         break;
2855 
2856       /* We reached a statement with no loop uses.  */
2857       if (nloop_uses == 0)
2858 	return false;
2859 
2860       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2861       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2862         return false;
2863 
2864       if (!is_gimple_assign (loop_use_stmt)
2865 	  || code != gimple_assign_rhs_code (loop_use_stmt)
2866 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2867         return false;
2868 
2869       /* Insert USE_STMT into reduction chain.  */
2870       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2871       if (current_stmt)
2872         {
2873           current_stmt_info = vinfo_for_stmt (current_stmt);
2874 	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2875           GROUP_FIRST_ELEMENT (use_stmt_info)
2876             = GROUP_FIRST_ELEMENT (current_stmt_info);
2877         }
2878       else
2879 	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2880 
2881       lhs = gimple_assign_lhs (loop_use_stmt);
2882       current_stmt = loop_use_stmt;
2883       size++;
2884    }
2885 
2886   if (!found || loop_use_stmt != phi || size < 2)
2887     return false;
2888 
  /* Swap the operands, if needed, so that the reduction operand is the
     second operand.  */
2891   lhs = PHI_RESULT (phi);
2892   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2893   while (next_stmt)
2894     {
2895       if (gimple_assign_rhs2 (next_stmt) == lhs)
2896 	{
2897 	  tree op = gimple_assign_rhs1 (next_stmt);
2898 	  gimple *def_stmt = NULL;
2899 
2900           if (TREE_CODE (op) == SSA_NAME)
2901             def_stmt = SSA_NAME_DEF_STMT (op);
2902 
2903 	  /* Check that the other def is either defined in the loop
2904 	     ("vect_internal_def"), or it's an induction (defined by a
2905 	     loop-header phi-node).  */
2906           if (def_stmt
2907               && gimple_bb (def_stmt)
2908 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2909               && (is_gimple_assign (def_stmt)
2910                   || is_gimple_call (def_stmt)
2911                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2912                            == vect_induction_def
2913                   || (gimple_code (def_stmt) == GIMPLE_PHI
2914                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2915                                   == vect_internal_def
2916                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2917 	    {
2918 	      lhs = gimple_assign_lhs (next_stmt);
2919 	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2920  	      continue;
2921 	    }
2922 
2923 	  return false;
2924 	}
2925       else
2926 	{
2927           tree op = gimple_assign_rhs2 (next_stmt);
2928 	  gimple *def_stmt = NULL;
2929 
2930           if (TREE_CODE (op) == SSA_NAME)
2931             def_stmt = SSA_NAME_DEF_STMT (op);
2932 
2933           /* Check that the other def is either defined in the loop
2934             ("vect_internal_def"), or it's an induction (defined by a
2935             loop-header phi-node).  */
2936           if (def_stmt
2937               && gimple_bb (def_stmt)
2938 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2939               && (is_gimple_assign (def_stmt)
2940                   || is_gimple_call (def_stmt)
2941                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2942                               == vect_induction_def
2943                   || (gimple_code (def_stmt) == GIMPLE_PHI
2944                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2945                                   == vect_internal_def
2946                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2947   	    {
2948 	      if (dump_enabled_p ())
2949 		{
2950 		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2951 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2952 		}
2953 
2954 	      swap_ssa_operands (next_stmt,
2955 	 		         gimple_assign_rhs1_ptr (next_stmt),
2956                                  gimple_assign_rhs2_ptr (next_stmt));
2957 	      update_stmt (next_stmt);
2958 
2959 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2960 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2961 	    }
2962 	  else
2963 	    return false;
2964         }
2965 
2966       lhs = gimple_assign_lhs (next_stmt);
2967       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2968     }
2969 
2970   /* Save the chain for further analysis in SLP detection.  */
2971   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2972   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2973   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2974 
2975   return true;
2976 }
2977 
2978 /* Return true if we need an in-order reduction for operation CODE
2979    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2980    overflow must wrap.  */
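/* For instance (an illustrative example only), a loop computing

       double s = 0;
       for (int i = 0; i < n; i++)
	 s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math is in
   effect, because floating-point addition is not associative and reordering
   the additions may change the rounded result.  */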
2981 
2982 static bool
2983 needs_fold_left_reduction_p (tree type, tree_code code,
2984 			     bool need_wrapping_integral_overflow)
2985 {
2986   /* CHECKME: check for !flag_finite_math_only too?  */
2987   if (SCALAR_FLOAT_TYPE_P (type))
2988     switch (code)
2989       {
2990       case MIN_EXPR:
2991       case MAX_EXPR:
2992 	return false;
2993 
2994       default:
2995 	return !flag_associative_math;
2996       }
2997 
2998   if (INTEGRAL_TYPE_P (type))
2999     {
3000       if (!operation_no_trapping_overflow (type, code))
3001 	return true;
3002       if (need_wrapping_integral_overflow
3003 	  && !TYPE_OVERFLOW_WRAPS (type)
3004 	  && operation_can_overflow (code))
3005 	return true;
3006       return false;
3007     }
3008 
3009   if (SAT_FIXED_POINT_TYPE_P (type))
3010     return true;
3011 
3012   return false;
3013 }
3014 
3015 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3016    reduction operation CODE has a handled computation expression.  */
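/* As an illustrative example, for a PLUS_EXPR reduction of the form

       x_1 = PHI <x_4(latch), x_0(preheader)>
       ...
       x_2 = x_1 + a_5;
       x_3 = x_2 + b_6;
       x_4 = x_3 + c_7;

   the path followed from the latch value x_4 back to the PHI result x_1
   consists only of PLUS_EXPR statements, so the computation is accepted.  */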
3017 
3018 bool
3019 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3020 		      enum tree_code code)
3021 {
3022   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3023   auto_bitmap visited;
3024   tree lookfor = PHI_RESULT (phi);
3025   ssa_op_iter curri;
3026   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3027   while (USE_FROM_PTR (curr) != loop_arg)
3028     curr = op_iter_next_use (&curri);
3029   curri.i = curri.numops;
3030   do
3031     {
3032       path.safe_push (std::make_pair (curri, curr));
3033       tree use = USE_FROM_PTR (curr);
3034       if (use == lookfor)
3035 	break;
3036       gimple *def = SSA_NAME_DEF_STMT (use);
3037       if (gimple_nop_p (def)
3038 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3039 	{
3040 pop:
3041 	  do
3042 	    {
3043 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3044 	      curri = x.first;
3045 	      curr = x.second;
3046 	      do
3047 		curr = op_iter_next_use (&curri);
3048 	      /* Skip already visited or non-SSA operands (from iterating
3049 	         over PHI args).  */
3050 	      while (curr != NULL_USE_OPERAND_P
3051 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3052 			 || ! bitmap_set_bit (visited,
3053 					      SSA_NAME_VERSION
3054 					        (USE_FROM_PTR (curr)))));
3055 	    }
3056 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3057 	  if (curr == NULL_USE_OPERAND_P)
3058 	    break;
3059 	}
3060       else
3061 	{
3062 	  if (gimple_code (def) == GIMPLE_PHI)
3063 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3064 	  else
3065 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3066 	  while (curr != NULL_USE_OPERAND_P
3067 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3068 		     || ! bitmap_set_bit (visited,
3069 					  SSA_NAME_VERSION
3070 					    (USE_FROM_PTR (curr)))))
3071 	    curr = op_iter_next_use (&curri);
3072 	  if (curr == NULL_USE_OPERAND_P)
3073 	    goto pop;
3074 	}
3075     }
3076   while (1);
3077   if (dump_file && (dump_flags & TDF_DETAILS))
3078     {
3079       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3080       unsigned i;
3081       std::pair<ssa_op_iter, use_operand_p> *x;
3082       FOR_EACH_VEC_ELT (path, i, x)
3083 	{
3084 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3085 	  dump_printf (MSG_NOTE, " ");
3086 	}
3087       dump_printf (MSG_NOTE, "\n");
3088     }
3089 
3090   /* Check whether the reduction path detected is valid.  */
3091   bool fail = path.length () == 0;
3092   bool neg = false;
3093   for (unsigned i = 1; i < path.length (); ++i)
3094     {
3095       gimple *use_stmt = USE_STMT (path[i].second);
3096       tree op = USE_FROM_PTR (path[i].second);
3097       if (! has_single_use (op)
3098 	  || ! is_gimple_assign (use_stmt))
3099 	{
3100 	  fail = true;
3101 	  break;
3102 	}
3103       if (gimple_assign_rhs_code (use_stmt) != code)
3104 	{
3105 	  if (code == PLUS_EXPR
3106 	      && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3107 	    {
3108 	      /* Track whether we negate the reduction value each iteration.  */
3109 	      if (gimple_assign_rhs2 (use_stmt) == op)
3110 		neg = ! neg;
3111 	    }
3112 	  else
3113 	    {
3114 	      fail = true;
3115 	      break;
3116 	    }
3117 	}
3118     }
3119   return ! fail && ! neg;
3120 }
3121 
3122 
3123 /* Function vect_is_simple_reduction
3124 
3125    (1) Detect a cross-iteration def-use cycle that represents a simple
3126    reduction computation.  We look for the following pattern:
3127 
3128    loop_header:
3129      a1 = phi < a0, a2 >
3130      a3 = ...
3131      a2 = operation (a3, a1)
3132 
3133    or
3134 
3135    a3 = ...
3136    loop_header:
3137      a1 = phi < a0, a2 >
3138      a2 = operation (a3, a1)
3139 
3140    such that:
3141    1. operation is commutative and associative and it is safe to
3142       change the order of the computation
3143    2. no uses of a2 in the loop (a2 is used out of the loop)
3144    3. no uses of a1 in the loop besides the reduction operation
3145    4. no uses of a1 outside the loop.
3146 
3147    Conditions 1,4 are tested here.
3148    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3149 
3150    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3151    nested cycles.
3152 
3153    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3154    reductions:
3155 
3156      a1 = phi < a0, a2 >
3157      inner loop (def of a3)
3158      a2 = phi < a3 >
3159 
3160    (4) Detect condition expressions, i.e.:
3161      for (int i = 0; i < N; i++)
3162        if (a[i] < val)
3163 	ret_val = a[i];
3164 
3165 */
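/* Purely illustrative source-level examples (assumed, not taken from a
   testcase): pattern (1) above corresponds to a loop like

     s = init_val;
     for (i = 0; i < N; i++)
       s = s + a[i];

   while the double reduction of (3) corresponds to accumulating over an
   inner loop when the outer loop is the one being vectorized:

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
         s = s + a[i][j];  */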
3166 
3167 static gimple *
3168 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3169 			  bool *double_reduc,
3170 			  bool need_wrapping_integral_overflow,
3171 			  enum vect_reduction_type *v_reduc_type)
3172 {
3173   struct loop *loop = (gimple_bb (phi))->loop_father;
3174   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3175   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3176   enum tree_code orig_code, code;
3177   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3178   tree type;
3179   int nloop_uses;
3180   tree name;
3181   imm_use_iterator imm_iter;
3182   use_operand_p use_p;
3183   bool phi_def;
3184 
3185   *double_reduc = false;
3186   *v_reduc_type = TREE_CODE_REDUCTION;
3187 
3188   tree phi_name = PHI_RESULT (phi);
3189   /* ???  If there are no uses of the PHI result the inner loop reduction
3190      won't be detected as possibly double-reduction by vectorizable_reduction
3191      because that tries to walk the PHI arg from the preheader edge which
3192      can be constant.  See PR60382.  */
3193   if (has_zero_uses (phi_name))
3194     return NULL;
3195   nloop_uses = 0;
3196   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3197     {
3198       gimple *use_stmt = USE_STMT (use_p);
3199       if (is_gimple_debug (use_stmt))
3200 	continue;
3201 
3202       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3203         {
3204           if (dump_enabled_p ())
3205 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3206 			     "intermediate value used outside loop.\n");
3207 
3208           return NULL;
3209         }
3210 
3211       nloop_uses++;
3212       if (nloop_uses > 1)
3213         {
3214           if (dump_enabled_p ())
3215 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3216 			     "reduction value used in loop.\n");
3217           return NULL;
3218         }
3219 
3220       phi_use_stmt = use_stmt;
3221     }
3222 
3223   edge latch_e = loop_latch_edge (loop);
3224   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3225   if (TREE_CODE (loop_arg) != SSA_NAME)
3226     {
3227       if (dump_enabled_p ())
3228 	{
3229 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3230 			   "reduction: not ssa_name: ");
3231 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3232           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3233 	}
3234       return NULL;
3235     }
3236 
3237   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3238   if (is_gimple_assign (def_stmt))
3239     {
3240       name = gimple_assign_lhs (def_stmt);
3241       phi_def = false;
3242     }
3243   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3244     {
3245       name = PHI_RESULT (def_stmt);
3246       phi_def = true;
3247     }
3248   else
3249     {
3250       if (dump_enabled_p ())
3251 	{
3252 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3253 			   "reduction: unhandled reduction operation: ");
3254 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3255 	}
3256       return NULL;
3257     }
3258 
3259   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3260     return NULL;
3261 
3262   nloop_uses = 0;
3263   auto_vec<gphi *, 3> lcphis;
3264   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3265     {
3266       gimple *use_stmt = USE_STMT (use_p);
3267       if (is_gimple_debug (use_stmt))
3268 	continue;
3269       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3270 	nloop_uses++;
3271       else
3272 	/* We can have more than one loop-closed PHI.  */
3273 	lcphis.safe_push (as_a <gphi *> (use_stmt));
3274       if (nloop_uses > 1)
3275 	{
3276 	  if (dump_enabled_p ())
3277 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 			     "reduction used in loop.\n");
3279 	  return NULL;
3280 	}
3281     }
3282 
3283   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3284      defined in the inner loop.  */
3285   if (phi_def)
3286     {
3287       op1 = PHI_ARG_DEF (def_stmt, 0);
3288 
3289       if (gimple_phi_num_args (def_stmt) != 1
3290           || TREE_CODE (op1) != SSA_NAME)
3291         {
3292           if (dump_enabled_p ())
3293 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3294 			     "unsupported phi node definition.\n");
3295 
3296           return NULL;
3297         }
3298 
3299       def1 = SSA_NAME_DEF_STMT (op1);
3300       if (gimple_bb (def1)
3301 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3302           && loop->inner
3303           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3304           && is_gimple_assign (def1)
3305 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3306         {
3307           if (dump_enabled_p ())
3308             report_vect_op (MSG_NOTE, def_stmt,
3309 			    "detected double reduction: ");
3310 
3311           *double_reduc = true;
3312           return def_stmt;
3313         }
3314 
3315       return NULL;
3316     }
3317 
3318   /* If we are vectorizing an inner reduction we execute it in the
3319      original order only when we are not dealing with a double
3320      reduction.  */
3321   bool check_reduction = true;
3322   if (flow_loop_nested_p (vect_loop, loop))
3323     {
3324       gphi *lcphi;
3325       unsigned i;
3326       check_reduction = false;
3327       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3328 	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3329 	  {
3330 	    gimple *use_stmt = USE_STMT (use_p);
3331 	    if (is_gimple_debug (use_stmt))
3332 	      continue;
3333 	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3334 	      check_reduction = true;
3335 	  }
3336     }
3337 
3338   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3339   code = orig_code = gimple_assign_rhs_code (def_stmt);
3340 
3341   /* We can handle "res -= x[i]", which is non-associative, by
3342      simply rewriting it as "res += -x[i]".  Avoid changing the
3343      gimple instruction for the first simple tests and only do this
3344      if we're allowed to change code at all.  */
3345   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3346     code = PLUS_EXPR;
3347 
3348   if (code == COND_EXPR)
3349     {
3350       if (! nested_in_vect_loop)
3351 	*v_reduc_type = COND_REDUCTION;
3352 
3353       op3 = gimple_assign_rhs1 (def_stmt);
3354       if (COMPARISON_CLASS_P (op3))
3355         {
3356           op4 = TREE_OPERAND (op3, 1);
3357           op3 = TREE_OPERAND (op3, 0);
3358         }
3359       if (op3 == phi_name || op4 == phi_name)
3360 	{
3361 	  if (dump_enabled_p ())
3362 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3363 			    "reduction: condition depends on previous"
3364 			    " iteration: ");
3365 	  return NULL;
3366 	}
3367 
3368       op1 = gimple_assign_rhs2 (def_stmt);
3369       op2 = gimple_assign_rhs3 (def_stmt);
3370     }
3371   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3372     {
3373       if (dump_enabled_p ())
3374 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3375 			"reduction: not commutative/associative: ");
3376       return NULL;
3377     }
3378   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3379     {
3380       op1 = gimple_assign_rhs1 (def_stmt);
3381       op2 = gimple_assign_rhs2 (def_stmt);
3382     }
3383   else
3384     {
3385       if (dump_enabled_p ())
3386 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3387 			"reduction: not handled operation: ");
3388       return NULL;
3389     }
3390 
3391   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3392     {
3393       if (dump_enabled_p ())
3394 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3395 			"reduction: both uses not ssa_names: ");
3396 
3397       return NULL;
3398     }
3399 
3400   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3401   if ((TREE_CODE (op1) == SSA_NAME
3402        && !types_compatible_p (type,TREE_TYPE (op1)))
3403       || (TREE_CODE (op2) == SSA_NAME
3404           && !types_compatible_p (type, TREE_TYPE (op2)))
3405       || (op3 && TREE_CODE (op3) == SSA_NAME
3406           && !types_compatible_p (type, TREE_TYPE (op3)))
3407       || (op4 && TREE_CODE (op4) == SSA_NAME
3408           && !types_compatible_p (type, TREE_TYPE (op4))))
3409     {
3410       if (dump_enabled_p ())
3411         {
3412           dump_printf_loc (MSG_NOTE, vect_location,
3413 			   "reduction: multiple types: operation type: ");
3414           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3415           dump_printf (MSG_NOTE, ", operands types: ");
3416           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3417 			     TREE_TYPE (op1));
3418           dump_printf (MSG_NOTE, ",");
3419           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3420 			     TREE_TYPE (op2));
3421           if (op3)
3422             {
3423               dump_printf (MSG_NOTE, ",");
3424               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3425 				 TREE_TYPE (op3));
3426             }
3427 
3428           if (op4)
3429             {
3430               dump_printf (MSG_NOTE, ",");
3431               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3432 				 TREE_TYPE (op4));
3433             }
3434           dump_printf (MSG_NOTE, "\n");
3435         }
3436 
3437       return NULL;
3438     }
3439 
3440   /* Check whether it's ok to change the order of the computation.
3441      Generally, when vectorizing a reduction we change the order of the
3442      computation.  This may change the behavior of the program in some
3443      cases, so we need to check that this is ok.  One exception is when
3444      vectorizing an outer-loop: the inner-loop is executed sequentially,
3445      and therefore vectorizing reductions in the inner-loop during
3446      outer-loop vectorization is safe.  */
3447   if (check_reduction
3448       && *v_reduc_type == TREE_CODE_REDUCTION
3449       && needs_fold_left_reduction_p (type, code,
3450 				      need_wrapping_integral_overflow))
3451     *v_reduc_type = FOLD_LEFT_REDUCTION;
3452 
3453   /* Reduction is safe. We're dealing with one of the following:
3454      1) integer arithmetic and no trapv
3455      2) floating point arithmetic, and special flags permit this optimization
3456      3) nested cycle (i.e., outer loop vectorization).  */
3457   if (TREE_CODE (op1) == SSA_NAME)
3458     def1 = SSA_NAME_DEF_STMT (op1);
3459 
3460   if (TREE_CODE (op2) == SSA_NAME)
3461     def2 = SSA_NAME_DEF_STMT (op2);
3462 
3463   if (code != COND_EXPR
3464       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3465     {
3466       if (dump_enabled_p ())
3467 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3468       return NULL;
3469     }
3470 
3471   /* Check that one def is the reduction def, defined by PHI,
3472      the other def is either defined in the loop ("vect_internal_def"),
3473      or it's an induction (defined by a loop-header phi-node).  */
3474 
3475   if (def2 && def2 == phi
3476       && (code == COND_EXPR
3477 	  || !def1 || gimple_nop_p (def1)
3478 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3479           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3480               && (is_gimple_assign (def1)
3481 		  || is_gimple_call (def1)
3482   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3483                       == vect_induction_def
3484    	          || (gimple_code (def1) == GIMPLE_PHI
3485 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3486                           == vect_internal_def
3487  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
3488     {
3489       if (dump_enabled_p ())
3490 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3491       return def_stmt;
3492     }
3493 
3494   if (def1 && def1 == phi
3495       && (code == COND_EXPR
3496 	  || !def2 || gimple_nop_p (def2)
3497 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3498 	  || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3499 	      && (is_gimple_assign (def2)
3500 		  || is_gimple_call (def2)
3501 		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3502 		       == vect_induction_def
3503 		  || (gimple_code (def2) == GIMPLE_PHI
3504 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3505 			   == vect_internal_def
3506 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
3507     {
3508       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3509 	{
3510 	  /* Check if we can swap operands (just for simplicity - so that
3511 	     the rest of the code can assume that the reduction variable
3512 	     is always the last (second) argument).  */
3513 	  if (code == COND_EXPR)
3514 	    {
3515 	      /* Swap cond_expr by inverting the condition.  */
3516 	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
3517 	      enum tree_code invert_code = ERROR_MARK;
3518 	      enum tree_code cond_code = TREE_CODE (cond_expr);
3519 
3520 	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3521 		{
3522 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3523 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
3524 		}
3525 	      if (invert_code != ERROR_MARK)
3526 		{
3527 		  TREE_SET_CODE (cond_expr, invert_code);
3528 		  swap_ssa_operands (def_stmt,
3529 				     gimple_assign_rhs2_ptr (def_stmt),
3530 				     gimple_assign_rhs3_ptr (def_stmt));
3531 		}
3532 	      else
3533 		{
3534 		  if (dump_enabled_p ())
3535 		    report_vect_op (MSG_NOTE, def_stmt,
3536 				    "detected reduction: cannot swap operands "
3537 				    "for cond_expr");
3538 		  return NULL;
3539 		}
3540 	    }
3541 	  else
3542 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3543 			       gimple_assign_rhs2_ptr (def_stmt));
3544 
3545 	  if (dump_enabled_p ())
3546 	    report_vect_op (MSG_NOTE, def_stmt,
3547 			    "detected reduction: need to swap operands: ");
3548 
3549 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3550 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3551         }
3552       else
3553         {
3554           if (dump_enabled_p ())
3555             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3556         }
3557 
3558       return def_stmt;
3559     }
3560 
3561   /* Try to find an SLP reduction chain.  */
3562   if (! nested_in_vect_loop
3563       && code != COND_EXPR
3564       && orig_code != MINUS_EXPR
3565       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3566     {
3567       if (dump_enabled_p ())
3568         report_vect_op (MSG_NOTE, def_stmt,
3569 			"reduction: detected reduction chain: ");
3570 
3571       return def_stmt;
3572     }
3573 
3574   /* Dissolve any group possibly half-built by vect_is_slp_reduction.  */
3575   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3576   while (first)
3577     {
3578       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3579       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3580       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3581       first = next;
3582     }
3583 
3584   /* Look for the expression computing loop_arg from loop PHI result.  */
3585   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3586 			    code))
3587     return def_stmt;
3588 
3589   if (dump_enabled_p ())
3590     {
3591       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3592 		      "reduction: unknown pattern: ");
3593     }
3594 
3595   return NULL;
3596 }
3597 
3598 /* Wrapper around vect_is_simple_reduction, which will modify code
3599    in-place if that enables detection of more reductions.  The
3600    arguments are the same as for vect_is_simple_reduction.  */
3601 
3602 gimple *
3603 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3604 			     bool *double_reduc,
3605 			     bool need_wrapping_integral_overflow)
3606 {
3607   enum vect_reduction_type v_reduc_type;
3608   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3609 					  need_wrapping_integral_overflow,
3610 					  &v_reduc_type);
3611   if (def)
3612     {
3613       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3614       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3615       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3616       reduc_def_info = vinfo_for_stmt (def);
3617       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3618       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3619     }
3620   return def;
3621 }
3622 
3623 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3624 int
3625 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3626                              int *peel_iters_epilogue,
3627                              stmt_vector_for_cost *scalar_cost_vec,
3628 			     stmt_vector_for_cost *prologue_cost_vec,
3629 			     stmt_vector_for_cost *epilogue_cost_vec)
3630 {
3631   int retval = 0;
3632   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3633 
3634   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3635     {
3636       *peel_iters_epilogue = assumed_vf / 2;
3637       if (dump_enabled_p ())
3638         dump_printf_loc (MSG_NOTE, vect_location,
3639 			 "cost model: epilogue peel iters set to vf/2 "
3640 			 "because loop iterations are unknown.\n");
3641 
3642       /* If peeled iterations are known but the number of scalar loop
3643          iterations is unknown, count a taken branch per peeled loop.  */
3644       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3645 				 NULL, 0, vect_prologue);
3646       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3647 				 NULL, 0, vect_epilogue);
3648     }
3649   else
3650     {
3651       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3652       peel_iters_prologue = niters < peel_iters_prologue ?
3653                             niters : peel_iters_prologue;
3654       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3655       /* If we need to peel for gaps but no epilogue peeling is otherwise
3656 	 required, we have to peel VF iterations.  */
3657       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3658 	*peel_iters_epilogue = assumed_vf;
3659     }
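  /* Illustrative numbers only: with NITERS 100, ASSUMED_VF 8 and
     PEEL_ITERS_PROLOGUE 3, the computation above leaves
     (100 - 3) % 8 = 1 iteration for the epilogue.  */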
3660 
3661   stmt_info_for_cost *si;
3662   int j;
3663   if (peel_iters_prologue)
3664     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3665 	{
3666 	  stmt_vec_info stmt_info
3667 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3668 	  retval += record_stmt_cost (prologue_cost_vec,
3669 				      si->count * peel_iters_prologue,
3670 				      si->kind, stmt_info, si->misalign,
3671 				      vect_prologue);
3672 	}
3673   if (*peel_iters_epilogue)
3674     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3675 	{
3676 	  stmt_vec_info stmt_info
3677 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3678 	  retval += record_stmt_cost (epilogue_cost_vec,
3679 				      si->count * *peel_iters_epilogue,
3680 				      si->kind, stmt_info, si->misalign,
3681 				      vect_epilogue);
3682 	}
3683 
3684   return retval;
3685 }
3686 
3687 /* Function vect_estimate_min_profitable_iters
3688 
3689    Return the number of iterations required for the vector version of the
3690    loop to be profitable relative to the cost of the scalar version of the
3691    loop.
3692 
3693    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3694    of iterations for vectorization.  A value of -1 means loop
3695    vectorization is not profitable.  This returned value may be used
3696    for a dynamic profitability check.
3697 
3698    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3699    for a static check against the estimated number of iterations.  */
3700 
3701 static void
3702 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3703 				    int *ret_min_profitable_niters,
3704 				    int *ret_min_profitable_estimate)
3705 {
3706   int min_profitable_iters;
3707   int min_profitable_estimate;
3708   int peel_iters_prologue;
3709   int peel_iters_epilogue;
3710   unsigned vec_inside_cost = 0;
3711   int vec_outside_cost = 0;
3712   unsigned vec_prologue_cost = 0;
3713   unsigned vec_epilogue_cost = 0;
3714   int scalar_single_iter_cost = 0;
3715   int scalar_outside_cost = 0;
3716   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3717   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3718   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3719 
3720   /* Cost model disabled.  */
3721   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3722     {
3723       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3724       *ret_min_profitable_niters = 0;
3725       *ret_min_profitable_estimate = 0;
3726       return;
3727     }
3728 
3729   /* Requires loop versioning tests to handle misalignment.  */
3730   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3731     {
3732       /*  FIXME: Make cost depend on complexity of individual check.  */
3733       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3734       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3735 			    vect_prologue);
3736       dump_printf (MSG_NOTE,
3737                    "cost model: Adding cost of checks for loop "
3738                    "versioning to treat misalignment.\n");
3739     }
3740 
3741   /* Requires loop versioning with alias checks.  */
3742   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3743     {
3744       /*  FIXME: Make cost depend on complexity of individual check.  */
3745       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3746       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3747 			    vect_prologue);
3748       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3749       if (len)
3750 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3751 	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3752 			      NULL, 0, vect_prologue);
3753       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3754       if (len)
3755 	{
3756 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3757 	  unsigned int nstmts = len * 2 - 1;
3758 	  /* +1 for each bias that needs adding.  */
3759 	  for (unsigned int i = 0; i < len; ++i)
3760 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3761 	      nstmts += 1;
3762 	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3763 				NULL, 0, vect_prologue);
3764 	}
3765       dump_printf (MSG_NOTE,
3766                    "cost model: Adding cost of checks for loop "
3767                    "versioning aliasing.\n");
3768     }
3769 
3770   /* Requires loop versioning with niter checks.  */
3771   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3772     {
3773       /*  FIXME: Make cost depend on complexity of individual check.  */
3774       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3775 			    vect_prologue);
3776       dump_printf (MSG_NOTE,
3777 		   "cost model: Adding cost of checks for loop "
3778 		   "versioning niters.\n");
3779     }
3780 
3781   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3782     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3783 			  vect_prologue);
3784 
3785   /* Count statements in scalar loop.  Using this as scalar cost for a single
3786      iteration for now.
3787 
3788      TODO: Add outer loop support.
3789 
3790      TODO: Consider assigning different costs to different scalar
3791      statements.  */
3792 
3793   scalar_single_iter_cost
3794     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3795 
3796   /* Add additional cost for the peeled instructions in prologue and epilogue
3797      loop.  (For fully-masked loops there will be no peeling.)
3798 
3799      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3800      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3801 
3802      TODO: Build an expression that represents peel_iters for prologue and
3803      epilogue to be used in a run-time test.  */
3804 
3805   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3806     {
3807       peel_iters_prologue = 0;
3808       peel_iters_epilogue = 0;
3809 
3810       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3811 	{
3812 	  /* We need to peel exactly one iteration.  */
3813 	  peel_iters_epilogue += 1;
3814 	  stmt_info_for_cost *si;
3815 	  int j;
3816 	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3817 			    j, si)
3818 	    {
3819 	      struct _stmt_vec_info *stmt_info
3820 		= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3821 	      (void) add_stmt_cost (target_cost_data, si->count,
3822 				    si->kind, stmt_info, si->misalign,
3823 				    vect_epilogue);
3824 	    }
3825 	}
3826     }
3827   else if (npeel < 0)
3828     {
3829       peel_iters_prologue = assumed_vf / 2;
3830       dump_printf (MSG_NOTE, "cost model: "
3831                    "prologue peel iters set to vf/2.\n");
3832 
3833       /* If peeling for alignment is unknown, the loop bound of the main
3834          loop becomes unknown.  */
3835       peel_iters_epilogue = assumed_vf / 2;
3836       dump_printf (MSG_NOTE, "cost model: "
3837                    "epilogue peel iters set to vf/2 because "
3838                    "peeling for alignment is unknown.\n");
3839 
3840       /* If peeled iterations are unknown, count a taken branch and a not taken
3841          branch per peeled loop. Even if scalar loop iterations are known,
3842          vector iterations are not known since peeled prologue iterations are
3843          not known. Hence guards remain the same.  */
3844       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3845 			    NULL, 0, vect_prologue);
3846       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3847 			    NULL, 0, vect_prologue);
3848       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3849 			    NULL, 0, vect_epilogue);
3850       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3851 			    NULL, 0, vect_epilogue);
3852       stmt_info_for_cost *si;
3853       int j;
3854       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3855 	{
3856 	  struct _stmt_vec_info *stmt_info
3857 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3858 	  (void) add_stmt_cost (target_cost_data,
3859 				si->count * peel_iters_prologue,
3860 				si->kind, stmt_info, si->misalign,
3861 				vect_prologue);
3862 	  (void) add_stmt_cost (target_cost_data,
3863 				si->count * peel_iters_epilogue,
3864 				si->kind, stmt_info, si->misalign,
3865 				vect_epilogue);
3866 	}
3867     }
3868   else
3869     {
3870       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3871       stmt_info_for_cost *si;
3872       int j;
3873       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3874 
3875       prologue_cost_vec.create (2);
3876       epilogue_cost_vec.create (2);
3877       peel_iters_prologue = npeel;
3878 
3879       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3880 					  &peel_iters_epilogue,
3881 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3882 					    (loop_vinfo),
3883 					  &prologue_cost_vec,
3884 					  &epilogue_cost_vec);
3885 
3886       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3887 	{
3888 	  struct _stmt_vec_info *stmt_info
3889 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3890 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3891 				si->misalign, vect_prologue);
3892 	}
3893 
3894       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3895 	{
3896 	  struct _stmt_vec_info *stmt_info
3897 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3898 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3899 				si->misalign, vect_epilogue);
3900 	}
3901 
3902       prologue_cost_vec.release ();
3903       epilogue_cost_vec.release ();
3904     }
3905 
3906   /* FORNOW: The scalar outside cost is incremented in one of the
3907      following ways:
3908 
3909      1. The vectorizer checks for alignment and aliasing and generates
3910      a condition that allows dynamic vectorization.  A cost model
3911      check is ANDed with the versioning condition.  Hence the scalar
3912      code path now has the added cost of the versioning check.
3913 
3914        if (cost > th & versioning_check)
3915          jmp to vector code
3916 
3917      Hence the run-time scalar cost is incremented by a not-taken branch cost.
3918 
3919      2. The vectorizer then checks if a prologue is required.  If the
3920      cost model check was not done before during versioning, it has to
3921      be done before the prologue check.
3922 
3923        if (cost <= th)
3924          prologue = scalar_iters
3925        if (prologue == 0)
3926          jmp to vector code
3927        else
3928          execute prologue
3929        if (prologue == num_iters)
3930 	 go to exit
3931 
3932      Hence the run-time scalar cost is incremented by a taken branch,
3933      plus a not-taken branch, plus a taken branch cost.
3934 
3935      3. The vectorizer then checks if an epilogue is required.  If the
3936      cost model check was not done before during prologue check, it
3937      has to be done with the epilogue check.
3938 
3939        if (prologue == 0)
3940          jmp to vector code
3941        else
3942          execute prologue
3943        if (prologue == num_iters)
3944 	 go to exit
3945        vector code:
3946          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3947            jmp to epilogue
3948 
3949      Hence the run-time scalar cost should be incremented by 2 taken
3950      branches.
3951 
3952      TODO: The back end may reorder the BBs differently and reverse
3953      conditions/branch directions.  Change the estimates below to
3954      something more reasonable.  */
3955 
3956   /* If the number of iterations is known and we do not do versioning, we can
3957      decide whether to vectorize at compile time.  Hence the scalar version
3958      does not carry cost model guard costs.  */
3959   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3960       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3961     {
3962       /* Cost model check occurs at versioning.  */
3963       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3964 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3965       else
3966 	{
3967 	  /* Cost model check occurs at prologue generation.  */
3968 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3969 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3970 	      + vect_get_stmt_cost (cond_branch_not_taken);
3971 	  /* Cost model check occurs at epilogue generation.  */
3972 	  else
3973 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3974 	}
3975     }
3976 
3977   /* Complete the target-specific cost calculations.  */
3978   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3979 	       &vec_inside_cost, &vec_epilogue_cost);
3980 
3981   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3982 
3983   if (dump_enabled_p ())
3984     {
3985       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3986       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3987                    vec_inside_cost);
3988       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3989                    vec_prologue_cost);
3990       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3991                    vec_epilogue_cost);
3992       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3993                    scalar_single_iter_cost);
3994       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3995                    scalar_outside_cost);
3996       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3997                    vec_outside_cost);
3998       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3999                    peel_iters_prologue);
4000       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4001                    peel_iters_epilogue);
4002     }
4003 
4004   /* Calculate number of iterations required to make the vector version
4005      profitable, relative to the loop bodies only.  The following condition
4006      must hold true:
4007      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4008      where
4009      SIC = scalar iteration cost, VIC = vector iteration cost,
4010      VOC = vector outside cost, VF = vectorization factor,
4011      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4012      SOC = scalar outside cost for run time cost model check.  */
4013 
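  /* A worked example with made-up costs (not produced by any target):
     with SIC = 4, VIC = 12, VF = 4, VOC = 20, SOC = 6 and no peeling,
     the computation below gives ((20 - 6) * 4) / (4 * 4 - 12) = 14;
     since 4 * 4 * 14 = 224 equals 12 * 14 + (20 - 6) * 4 = 224, the
     "<=" check then bumps the result to 15 iterations.  */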
4014   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4015     {
4016       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4017 			      * assumed_vf
4018 			      - vec_inside_cost * peel_iters_prologue
4019 			      - vec_inside_cost * peel_iters_epilogue);
4020       if (min_profitable_iters <= 0)
4021         min_profitable_iters = 0;
4022       else
4023 	{
4024 	  min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4025 				   - vec_inside_cost);
4026 
4027 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4028 	      <= (((int) vec_inside_cost * min_profitable_iters)
4029 		  + (((int) vec_outside_cost - scalar_outside_cost)
4030 		     * assumed_vf)))
4031 	    min_profitable_iters++;
4032 	}
4033     }
4034   /* The vector version will never be profitable.  */
4035   else
4036     {
4037       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4038 	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4039 		    "did not happen for a simd loop");
4040 
4041       if (dump_enabled_p ())
4042         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4043 			 "cost model: the vector iteration cost = %d "
4044 			 "divided by the scalar iteration cost = %d "
4045 			 "is greater or equal to the vectorization factor = %d"
4046                          ".\n",
4047 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4048       *ret_min_profitable_niters = -1;
4049       *ret_min_profitable_estimate = -1;
4050       return;
4051     }
4052 
4053   dump_printf (MSG_NOTE,
4054 	       "  Calculated minimum iters for profitability: %d\n",
4055 	       min_profitable_iters);
4056 
4057   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4058       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4059     /* We want the vectorized loop to execute at least once.  */
4060     min_profitable_iters = assumed_vf + peel_iters_prologue;
4061 
4062   if (dump_enabled_p ())
4063     dump_printf_loc (MSG_NOTE, vect_location,
4064                      "  Runtime profitability threshold = %d\n",
4065                      min_profitable_iters);
4066 
4067   *ret_min_profitable_niters = min_profitable_iters;
4068 
4069   /* Calculate number of iterations required to make the vector version
4070      profitable, relative to the loop bodies only.
4071 
4072      The cost of the non-vectorized variant is SIC * niters and it must win
4073      over the vector variant for the expected loop trip count, i.e. the
4074      following must hold: SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
4075 
4076   if (vec_outside_cost <= 0)
4077     min_profitable_estimate = 0;
4078   else
4079     {
4080       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4081 				 * assumed_vf
4082 				 - vec_inside_cost * peel_iters_prologue
4083 				 - vec_inside_cost * peel_iters_epilogue)
4084 				 / ((scalar_single_iter_cost * assumed_vf)
4085 				   - vec_inside_cost);
4086     }
4087   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4088   if (dump_enabled_p ())
4089     dump_printf_loc (MSG_NOTE, vect_location,
4090 		     "  Static estimate profitability threshold = %d\n",
4091 		     min_profitable_estimate);
4092 
4093   *ret_min_profitable_estimate = min_profitable_estimate;
4094 }
4095 
4096 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4097    vector elements (not bits) for a vector with NELT elements.  */
4098 static void
4099 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4100 			      vec_perm_builder *sel)
4101 {
4102   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4103      by vec_perm_indices.  */
4104   sel->new_vector (nelt, 1, 3);
4105   for (unsigned int i = 0; i < 3; i++)
4106     sel->quick_push (i + offset);
4107 }
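/* For example (illustrative only): with OFFSET 2 and an 8-element vector,
   the stepped encoding above expands to the selector { 2, 3, 4, 5, 6, 7, 8, 9 };
   used in a two-input VEC_PERM_EXPR whose second input is a zero vector,
   this produces { v[2], ..., v[7], 0, 0 }, i.e. a whole-vector shift by
   two elements.  */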
4108 
4109 /* Checks whether the target supports whole-vector shifts for vectors of mode
4110    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4111    it supports vec_perm_const with masks for all necessary shift amounts.  */
4112 static bool
4113 have_whole_vector_shift (machine_mode mode)
4114 {
4115   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4116     return true;
4117 
4118   /* Variable-length vectors should be handled via the optab.  */
4119   unsigned int nelt;
4120   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4121     return false;
4122 
4123   vec_perm_builder sel;
4124   vec_perm_indices indices;
4125   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4126     {
4127       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4128       indices.new_vector (sel, 2, nelt);
4129       if (!can_vec_perm_const_p (mode, indices, false))
4130 	return false;
4131     }
4132   return true;
4133 }
4134 
4135 /* TODO: There is a close dependency between the vect_model_*_cost and
4136    vectorizable_* functions.  Improve the design to avoid maintenance issues.  */
4137 
4138 /* Function vect_model_reduction_cost.
4139 
4140    Models cost for a reduction operation, including the vector ops
4141    generated within the strip-mine loop, the initial definition before
4142    the loop, and the epilogue code that must be generated.  */
4143 
4144 static void
4145 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4146 			   int ncopies)
4147 {
4148   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4149   enum tree_code code;
4150   optab optab;
4151   tree vectype;
4152   gimple *orig_stmt;
4153   machine_mode mode;
4154   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4155   struct loop *loop = NULL;
4156   void *target_cost_data;
4157 
4158   if (loop_vinfo)
4159     {
4160       loop = LOOP_VINFO_LOOP (loop_vinfo);
4161       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4162     }
4163   else
4164     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4165 
4166   /* Condition reductions generate two reductions in the loop.  */
4167   vect_reduction_type reduction_type
4168     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4169   if (reduction_type == COND_REDUCTION)
4170     ncopies *= 2;
4171 
4172   vectype = STMT_VINFO_VECTYPE (stmt_info);
4173   mode = TYPE_MODE (vectype);
4174   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4175 
4176   if (!orig_stmt)
4177     orig_stmt = STMT_VINFO_STMT (stmt_info);
4178 
4179   code = gimple_assign_rhs_code (orig_stmt);
4180 
4181   if (reduction_type == EXTRACT_LAST_REDUCTION
4182       || reduction_type == FOLD_LEFT_REDUCTION)
4183     {
4184       /* No extra instructions needed in the prologue.  */
4185       prologue_cost = 0;
4186 
4187       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4188 	/* Count one reduction-like operation per vector.  */
4189 	inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4190 				     stmt_info, 0, vect_body);
4191       else
4192 	{
4193 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4194 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4195 	  inside_cost = add_stmt_cost (target_cost_data,  nelements,
4196 				       vec_to_scalar, stmt_info, 0,
4197 				       vect_body);
4198 	  inside_cost += add_stmt_cost (target_cost_data,  nelements,
4199 					scalar_stmt, stmt_info, 0,
4200 					vect_body);
4201 	}
4202     }
4203   else
4204     {
4205       /* Add in cost for initial definition.
4206 	 For cond reduction we have four vectors: initial index, step,
4207 	 initial result of the data reduction, initial value of the index
4208 	 reduction.  */
4209       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4210       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4211 				      scalar_to_vec, stmt_info, 0,
4212 				      vect_prologue);
4213 
4214       /* Cost of reduction op inside loop.  */
4215       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4216 				   stmt_info, 0, vect_body);
4217     }
4218 
4219   /* Determine cost of epilogue code.
4220 
4221      We have a reduction operator that will reduce the vector in one statement.
4222      Also requires scalar extract.  */
4223 
4224   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4225     {
4226       if (reduc_fn != IFN_LAST)
4227 	{
4228 	  if (reduction_type == COND_REDUCTION)
4229 	    {
4230 	      /* An EQ stmt and a COND_EXPR stmt.  */
4231 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
4232 					      vector_stmt, stmt_info, 0,
4233 					      vect_epilogue);
4234 	      /* Reduction of the max index and a reduction of the found
4235 		 values.  */
4236 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
4237 					      vec_to_scalar, stmt_info, 0,
4238 					      vect_epilogue);
4239 	      /* A broadcast of the max value.  */
4240 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4241 					      scalar_to_vec, stmt_info, 0,
4242 					      vect_epilogue);
4243 	    }
4244 	  else
4245 	    {
4246 	      epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4247 					      stmt_info, 0, vect_epilogue);
4248 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4249 					      vec_to_scalar, stmt_info, 0,
4250 					      vect_epilogue);
4251 	    }
4252 	}
4253       else if (reduction_type == COND_REDUCTION)
4254 	{
4255 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4256 	  /* Extraction of scalar elements.  */
4257 	  epilogue_cost += add_stmt_cost (target_cost_data,
4258 					  2 * estimated_nunits,
4259 					  vec_to_scalar, stmt_info, 0,
4260 					  vect_epilogue);
4261 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4262 	  epilogue_cost += add_stmt_cost (target_cost_data,
4263 					  2 * estimated_nunits - 3,
4264 					  scalar_stmt, stmt_info, 0,
4265 					  vect_epilogue);
4266 	}
4267       else if (reduction_type == EXTRACT_LAST_REDUCTION
4268 	       || reduction_type == FOLD_LEFT_REDUCTION)
4269 	/* No extra instructions are needed in the epilogue.  */
4270 	;
4271       else
4272 	{
4273 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4274 	  tree bitsize =
4275 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4276 	  int element_bitsize = tree_to_uhwi (bitsize);
4277 	  int nelements = vec_size_in_bits / element_bitsize;
4278 
4279 	  if (code == COND_EXPR)
4280 	    code = MAX_EXPR;
4281 
4282 	  optab = optab_for_tree_code (code, vectype, optab_default);
4283 
4284 	  /* We have a whole vector shift available.  */
4285 	  if (optab != unknown_optab
4286 	      && VECTOR_MODE_P (mode)
4287 	      && optab_handler (optab, mode) != CODE_FOR_nothing
4288 	      && have_whole_vector_shift (mode))
4289 	    {
4290 	      /* Final reduction via vector shifts and the reduction operator.
4291 		 Also requires scalar extract.  */
4292 	      epilogue_cost += add_stmt_cost (target_cost_data,
4293 					      exact_log2 (nelements) * 2,
4294 					      vector_stmt, stmt_info, 0,
4295 					      vect_epilogue);
4296 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4297 					      vec_to_scalar, stmt_info, 0,
4298 					      vect_epilogue);
4299 	    }
4300 	  else
4301 	    /* Use extracts and reduction op for final reduction.  For N
4302 	       elements, we have N extracts and N-1 reduction ops.  */
4303 	    epilogue_cost += add_stmt_cost (target_cost_data,
4304 					    nelements + nelements - 1,
4305 					    vector_stmt, stmt_info, 0,
4306 					    vect_epilogue);
4307 	}
4308     }
4309 
4310   if (dump_enabled_p ())
4311     dump_printf (MSG_NOTE,
4312                  "vect_model_reduction_cost: inside_cost = %d, "
4313                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4314                  prologue_cost, epilogue_cost);
4315 }
4316 
4317 
4318 /* Function vect_model_induction_cost.
4319 
4320    Models cost for induction operations.  */
4321 
4322 static void
4323 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4324 {
4325   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4326   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4327   unsigned inside_cost, prologue_cost;
4328 
4329   if (PURE_SLP_STMT (stmt_info))
4330     return;
4331 
4332   /* loop cost for vec_loop.  */
4333   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4334 			       stmt_info, 0, vect_body);
4335 
4336   /* prologue cost for vec_init and vec_step.  */
4337   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4338 				 stmt_info, 0, vect_prologue);
4339 
4340   if (dump_enabled_p ())
4341     dump_printf_loc (MSG_NOTE, vect_location,
4342                      "vect_model_induction_cost: inside_cost = %d, "
4343                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4344 }
4345 
4346 
4347 
4348 /* Function get_initial_def_for_reduction
4349 
4350    Input:
4351    STMT - a stmt that performs a reduction operation in the loop.
4352    INIT_VAL - the initial value of the reduction variable
4353 
4354    Output:
4355    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4356         of the reduction (used for adjusting the epilog - see below).
4357    Return a vector variable, initialized according to the operation that STMT
4358         performs. This vector will be used as the initial value of the
4359         vector of partial results.
4360 
4361    Option1 (adjust in epilog): Initialize the vector as follows:
4362      add/bit or/xor:    [0,0,...,0,0]
4363      mult/bit and:      [1,1,...,1,1]
4364      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4365    and when necessary (e.g. add/mult case) let the caller know
4366    that it needs to adjust the result by init_val.
4367 
4368    Option2: Initialize the vector as follows:
4369      add/bit or/xor:    [init_val,0,0,...,0]
4370      mult/bit and:      [init_val,1,1,...,1]
4371      min/max/cond_expr: [init_val,init_val,...,init_val]
4372    and no adjustments are needed.
4373 
4374    For example, for the following code:
4375 
4376    s = init_val;
4377    for (i=0;i<n;i++)
4378      s = s + a[i];
4379 
4380    STMT is 's = s + a[i]', and the reduction variable is 's'.
4381    For a vector of 4 units, we want to return either [0,0,0,init_val],
4382    or [0,0,0,0] and let the caller know that it needs to adjust
4383    the result at the end by 'init_val'.
4384 
4385    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4386    is not NULL, because this way the initialization vector is simpler (same
4387    element in all entries), and Option2 otherwise.
4388 
4389    A cost model should help decide between these two schemes.  */
4390 
4391 tree
4392 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4393                                tree *adjustment_def)
4394 {
4395   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4396   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4397   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4398   tree scalar_type = TREE_TYPE (init_val);
4399   tree vectype = get_vectype_for_scalar_type (scalar_type);
4400   enum tree_code code = gimple_assign_rhs_code (stmt);
4401   tree def_for_init;
4402   tree init_def;
4403   bool nested_in_vect_loop = false;
4404   REAL_VALUE_TYPE real_init_val = dconst0;
4405   int int_init_val = 0;
4406   gimple *def_stmt = NULL;
4407   gimple_seq stmts = NULL;
4408 
4409   gcc_assert (vectype);
4410 
4411   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4412 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4413 
4414   if (nested_in_vect_loop_p (loop, stmt))
4415     nested_in_vect_loop = true;
4416   else
4417     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4418 
4419   /* In case of double reduction we only create a vector variable to be put
4420      in the reduction phi node.  The actual statement creation is done in
4421      vect_create_epilog_for_reduction.  */
4422   if (adjustment_def && nested_in_vect_loop
4423       && TREE_CODE (init_val) == SSA_NAME
4424       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4425       && gimple_code (def_stmt) == GIMPLE_PHI
4426       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4427       && vinfo_for_stmt (def_stmt)
4428       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4429           == vect_double_reduction_def)
4430     {
4431       *adjustment_def = NULL;
4432       return vect_create_destination_var (init_val, vectype);
4433     }
4434 
4435   vect_reduction_type reduction_type
4436     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4437 
4438   /* In case of a nested reduction do not use an adjustment def, as
4439      that case is not handled correctly by the epilogue generation
4440      if ncopies is not one.  */
4441   if (adjustment_def && nested_in_vect_loop)
4442     {
4443       *adjustment_def = NULL;
4444       return vect_get_vec_def_for_operand (init_val, stmt);
4445     }
4446 
4447   switch (code)
4448     {
4449     case WIDEN_SUM_EXPR:
4450     case DOT_PROD_EXPR:
4451     case SAD_EXPR:
4452     case PLUS_EXPR:
4453     case MINUS_EXPR:
4454     case BIT_IOR_EXPR:
4455     case BIT_XOR_EXPR:
4456     case MULT_EXPR:
4457     case BIT_AND_EXPR:
4458       {
4459         /* ADJUSTMENT_DEF is NULL when called from
4460            vect_create_epilog_for_reduction to vectorize double reduction.  */
4461         if (adjustment_def)
4462 	  *adjustment_def = init_val;
4463 
4464         if (code == MULT_EXPR)
4465           {
4466             real_init_val = dconst1;
4467             int_init_val = 1;
4468           }
4469 
4470         if (code == BIT_AND_EXPR)
4471           int_init_val = -1;
4472 
4473         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4474           def_for_init = build_real (scalar_type, real_init_val);
4475         else
4476           def_for_init = build_int_cst (scalar_type, int_init_val);
4477 
4478 	if (adjustment_def)
4479 	  /* Option1: the first element is '0' or '1' as well.  */
4480 	  init_def = gimple_build_vector_from_val (&stmts, vectype,
4481 						   def_for_init);
4482 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4483 	  {
4484 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4485 	    init_def = build_vector_from_val (vectype, def_for_init);
4486 	    gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4487 						      2, init_def, init_val);
4488 	    init_def = make_ssa_name (vectype);
4489 	    gimple_call_set_lhs (call, init_def);
4490 	    gimple_seq_add_stmt (&stmts, call);
4491 	  }
4492 	else
4493 	  {
4494 	    /* Option2: the first element is INIT_VAL.  */
4495 	    tree_vector_builder elts (vectype, 1, 2);
4496 	    elts.quick_push (init_val);
4497 	    elts.quick_push (def_for_init);
4498 	    init_def = gimple_build_vector (&stmts, &elts);
4499 	  }
4500       }
4501       break;
4502 
4503     case MIN_EXPR:
4504     case MAX_EXPR:
4505     case COND_EXPR:
4506       {
4507 	if (adjustment_def)
4508           {
4509 	    *adjustment_def = NULL_TREE;
4510 	    if (reduction_type != COND_REDUCTION
4511 		&& reduction_type != EXTRACT_LAST_REDUCTION)
4512 	      {
4513 		init_def = vect_get_vec_def_for_operand (init_val, stmt);
4514 		break;
4515 	      }
4516 	  }
4517 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4518 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4519       }
4520       break;
4521 
4522     default:
4523       gcc_unreachable ();
4524     }
4525 
4526   if (stmts)
4527     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4528   return init_def;
4529 }
4530 
4531 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4532    NUMBER_OF_VECTORS is the number of vector defs to create.
4533    If NEUTRAL_OP is nonnull, introducing extra elements of that
4534    value will not change the result.  */
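/* For instance (illustrative), NEUTRAL_OP would be 0 for a PLUS_EXPR
   reduction and 1 for a MULT_EXPR reduction, since padding a vector with
   those values leaves the reduced result unchanged.  */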
4535 
4536 static void
4537 get_initial_defs_for_reduction (slp_tree slp_node,
4538 				vec<tree> *vec_oprnds,
4539 				unsigned int number_of_vectors,
4540 				bool reduc_chain, tree neutral_op)
4541 {
4542   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4543   gimple *stmt = stmts[0];
4544   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4545   unsigned HOST_WIDE_INT nunits;
4546   unsigned j, number_of_places_left_in_vector;
4547   tree vector_type;
4548   tree vop;
4549   int group_size = stmts.length ();
4550   unsigned int vec_num, i;
4551   unsigned number_of_copies = 1;
4552   vec<tree> voprnds;
4553   voprnds.create (number_of_vectors);
4554   struct loop *loop;
4555   auto_vec<tree, 16> permute_results;
4556 
4557   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4558 
4559   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4560 
4561   loop = (gimple_bb (stmt))->loop_father;
4562   gcc_assert (loop);
4563   edge pe = loop_preheader_edge (loop);
4564 
4565   gcc_assert (!reduc_chain || neutral_op);
4566 
4567   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4568      created vectors. It is greater than 1 if unrolling is performed.
4569 
4570      For example, we have two scalar operands, s1 and s2 (e.g., group of
4571      strided accesses of size two), while NUNITS is four (i.e., four scalars
4572      of this type can be packed in a vector).  The output vector will contain
4573      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4574      will be 2).
4575 
4576      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4577      containing the operands.
4578 
4579      For example, NUNITS is four as before, and the group size is 8
4580      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4581      {s5, s6, s7, s8}.  */
4582 
4583   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4584     nunits = group_size;
4585 
4586   number_of_copies = nunits * number_of_vectors / group_size;
4587 
4588   number_of_places_left_in_vector = nunits;
4589   bool constant_p = true;
4590   tree_vector_builder elts (vector_type, nunits, 1);
4591   elts.quick_grow (nunits);
4592   for (j = 0; j < number_of_copies; j++)
4593     {
4594       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4595         {
4596 	  tree op;
4597 	  /* Get the def before the loop.  In reduction chain we have only
4598 	     one initial value.  */
4599 	  if ((j != (number_of_copies - 1)
4600 	       || (reduc_chain && i != 0))
4601 	      && neutral_op)
4602 	    op = neutral_op;
4603 	  else
4604 	    op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4605 
4606           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4607           number_of_places_left_in_vector--;
4608 	  elts[number_of_places_left_in_vector] = op;
4609 	  if (!CONSTANT_CLASS_P (op))
4610 	    constant_p = false;
4611 
4612           if (number_of_places_left_in_vector == 0)
4613             {
4614 	      gimple_seq ctor_seq = NULL;
4615 	      tree init;
4616 	      if (constant_p && !neutral_op
4617 		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4618 		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4619 		/* Build the vector directly from ELTS.  */
4620 		init = gimple_build_vector (&ctor_seq, &elts);
4621 	      else if (neutral_op)
4622 		{
4623 		  /* Build a vector of the neutral value and shift the
4624 		     other elements into place.  */
4625 		  init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4626 						       neutral_op);
4627 		  int k = nunits;
4628 		  while (k > 0 && elts[k - 1] == neutral_op)
4629 		    k -= 1;
4630 		  while (k > 0)
4631 		    {
4632 		      k -= 1;
4633 		      gcall *call = gimple_build_call_internal
4634 			(IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4635 		      init = make_ssa_name (vector_type);
4636 		      gimple_call_set_lhs (call, init);
4637 		      gimple_seq_add_stmt (&ctor_seq, call);
4638 		    }
4639 		}
4640 	      else
4641 		{
4642 		  /* First time round, duplicate ELTS to fill the
4643 		     required number of vectors, then cherry pick the
4644 		     appropriate result for each iteration.  */
4645 		  if (vec_oprnds->is_empty ())
4646 		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
4647 					      number_of_vectors,
4648 					      permute_results);
4649 		  init = permute_results[number_of_vectors - j - 1];
4650 		}
4651 	      if (ctor_seq != NULL)
4652 		gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4653 	      voprnds.quick_push (init);
4654 
4655               number_of_places_left_in_vector = nunits;
4656 	      elts.new_vector (vector_type, nunits, 1);
4657 	      elts.quick_grow (nunits);
4658 	      constant_p = true;
4659             }
4660         }
4661     }
4662 
  /* Since the vectors are created in reverse order, we need to reverse
     them here.  */
4665   vec_num = voprnds.length ();
4666   for (j = vec_num; j != 0; j--)
4667     {
4668       vop = voprnds[j - 1];
4669       vec_oprnds->quick_push (vop);
4670     }
4671 
4672   voprnds.release ();
4673 
4674   /* In case that VF is greater than the unrolling factor needed for the SLP
4675      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4676      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4677      to replicate the vectors.  */
4678   tree neutral_vec = NULL;
4679   while (number_of_vectors > vec_oprnds->length ())
4680     {
4681       if (neutral_op)
4682         {
4683           if (!neutral_vec)
4684 	    {
4685 	      gimple_seq ctor_seq = NULL;
4686 	      neutral_vec = gimple_build_vector_from_val
4687 		(&ctor_seq, vector_type, neutral_op);
4688 	      if (ctor_seq != NULL)
4689 		gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4690 	    }
4691           vec_oprnds->quick_push (neutral_vec);
4692         }
4693       else
4694         {
4695           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4696             vec_oprnds->quick_push (vop);
4697         }
4698     }
4699 }
4700 
4701 
4702 /* Function vect_create_epilog_for_reduction
4703 
4704    Create code at the loop-epilog to finalize the result of a reduction
4705    computation.
4706 
4707    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4708      reduction statements.
4709    STMT is the scalar reduction stmt that is being vectorized.
4710    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4711      number of elements that we can fit in a vectype (nunits).  In this case
4712      we have to generate more than one vector stmt - i.e - we need to "unroll"
4713      the vector stmt by a factor VF/nunits.  For more details see documentation
4714      in vectorizable_operation.
4715    REDUC_FN is the internal function for the epilog reduction.
4716    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4717      computation.
4718    REDUC_INDEX is the index of the operand in the right hand side of the
4719      statement that is defined by REDUCTION_PHI.
4720    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4721    SLP_NODE is an SLP node containing a group of reduction statements. The
4722      first one in this group is STMT.
4723    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4724      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4725      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4726      any value of the IV in the loop.
   INDUC_CODE is the code to use for the epilog reduction in the
     INTEGER_INDUC_COND_REDUCTION case.
   NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
     null if this is not an SLP reduction.
4730 
4731    This function:
4732    1. Creates the reduction def-use cycles: sets the arguments for
4733       REDUCTION_PHIS:
4734       The loop-entry argument is the vectorized initial-value of the reduction.
4735       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4736       sums.
4737    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4738       by calling the function specified by REDUC_FN if available, or by
4739       other means (whole-vector shifts or a scalar loop).
4740       The function also creates a new phi node at the loop exit to preserve
4741       loop-closed form, as illustrated below.
4742 
4743      The flow at the entry to this function:
4744 
4745         loop:
4746           vec_def = phi <null, null>            # REDUCTION_PHI
4747           VECT_DEF = vector_stmt                # vectorized form of STMT
4748           s_loop = scalar_stmt                  # (scalar) STMT
4749         loop_exit:
4750           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4751           use <s_out0>
4752           use <s_out0>
4753 
4754      The above is transformed by this function into:
4755 
4756         loop:
4757           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4758           VECT_DEF = vector_stmt                # vectorized form of STMT
4759           s_loop = scalar_stmt                  # (scalar) STMT
4760         loop_exit:
4761           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4762           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4763           v_out2 = reduce <v_out1>
4764           s_out3 = extract_field <v_out2, 0>
4765           s_out4 = adjust_result <s_out3>
4766           use <s_out4>
4767           use <s_out4>
4768 */
4769 
4770 static void
4771 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4772 				  gimple *reduc_def_stmt,
4773 				  int ncopies, internal_fn reduc_fn,
4774 				  vec<gimple *> reduction_phis,
4775                                   bool double_reduc,
4776 				  slp_tree slp_node,
4777 				  slp_instance slp_node_instance,
4778 				  tree induc_val, enum tree_code induc_code,
4779 				  tree neutral_op)
4780 {
4781   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4782   stmt_vec_info prev_phi_info;
4783   tree vectype;
4784   machine_mode mode;
4785   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4786   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4787   basic_block exit_bb;
4788   tree scalar_dest;
4789   tree scalar_type;
4790   gimple *new_phi = NULL, *phi;
4791   gimple_stmt_iterator exit_gsi;
4792   tree vec_dest;
4793   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4794   gimple *epilog_stmt = NULL;
4795   enum tree_code code = gimple_assign_rhs_code (stmt);
4796   gimple *exit_phi;
4797   tree bitsize;
4798   tree adjustment_def = NULL;
4799   tree vec_initial_def = NULL;
4800   tree expr, def, initial_def = NULL;
4801   tree orig_name, scalar_result;
4802   imm_use_iterator imm_iter, phi_imm_iter;
4803   use_operand_p use_p, phi_use_p;
4804   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4805   bool nested_in_vect_loop = false;
4806   auto_vec<gimple *> new_phis;
4807   auto_vec<gimple *> inner_phis;
4808   enum vect_def_type dt = vect_unknown_def_type;
4809   int j, i;
4810   auto_vec<tree> scalar_results;
4811   unsigned int group_size = 1, k, ratio;
4812   auto_vec<tree> vec_initial_defs;
4813   auto_vec<gimple *> phis;
4814   bool slp_reduc = false;
4815   bool direct_slp_reduc;
4816   tree new_phi_result;
4817   gimple *inner_phi = NULL;
4818   tree induction_index = NULL_TREE;
4819 
4820   if (slp_node)
4821     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4822 
4823   if (nested_in_vect_loop_p (loop, stmt))
4824     {
4825       outer_loop = loop;
4826       loop = loop->inner;
4827       nested_in_vect_loop = true;
4828       gcc_assert (!slp_node);
4829     }
4830 
4831   vectype = STMT_VINFO_VECTYPE (stmt_info);
4832   gcc_assert (vectype);
4833   mode = TYPE_MODE (vectype);
4834 
4835   /* 1. Create the reduction def-use cycle:
4836      Set the arguments of REDUCTION_PHIS, i.e., transform
4837 
4838         loop:
4839           vec_def = phi <null, null>            # REDUCTION_PHI
4840           VECT_DEF = vector_stmt                # vectorized form of STMT
4841           ...
4842 
4843      into:
4844 
4845         loop:
4846           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4847           VECT_DEF = vector_stmt                # vectorized form of STMT
4848           ...
4849 
4850      (in case of SLP, do it for all the phis). */
4851 
4852   /* Get the loop-entry arguments.  */
4853   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4854   if (slp_node)
4855     {
4856       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4857       vec_initial_defs.reserve (vec_num);
4858       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4859 				      &vec_initial_defs, vec_num,
4860 				      GROUP_FIRST_ELEMENT (stmt_info),
4861 				      neutral_op);
4862     }
4863   else
4864     {
4865       /* Get at the scalar def before the loop, that defines the initial value
4866 	 of the reduction variable.  */
4867       gimple *def_stmt;
4868       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4869 					   loop_preheader_edge (loop));
      /* Optimize: for REDUC_MAX, if INITIAL_DEF is smaller than the base
	 and we can't use zero for INDUC_VAL, use INITIAL_DEF.  Similarly
	 for REDUC_MIN, if INITIAL_DEF is larger than the base.  */
4873       if (TREE_CODE (initial_def) == INTEGER_CST
4874 	  && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4875 	      == INTEGER_INDUC_COND_REDUCTION)
4876 	  && !integer_zerop (induc_val)
4877 	  && ((induc_code == MAX_EXPR
4878 	       && tree_int_cst_lt (initial_def, induc_val))
4879 	      || (induc_code == MIN_EXPR
4880 		  && tree_int_cst_lt (induc_val, initial_def))))
4881 	induc_val = initial_def;
4882       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4883       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4884 						       &adjustment_def);
4885       vec_initial_defs.create (1);
4886       vec_initial_defs.quick_push (vec_initial_def);
4887     }
4888 
4889   /* Set phi nodes arguments.  */
4890   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4891     {
4892       tree vec_init_def = vec_initial_defs[i];
4893       tree def = vect_defs[i];
4894       for (j = 0; j < ncopies; j++)
4895         {
4896 	  if (j != 0)
4897 	    {
4898 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4899 	      if (nested_in_vect_loop)
4900 		vec_init_def
4901 		  = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4902 						    vec_init_def);
4903 	    }
4904 
4905 	  /* Set the loop-entry arg of the reduction-phi.  */
4906 
4907 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4908 	      == INTEGER_INDUC_COND_REDUCTION)
4909 	    {
	      /* Initialise the reduction phi to the induction value
		 (INDUC_VAL).  This prevents non-zero initial values from
		 interfering with the reduction op.  */
4912 	      gcc_assert (ncopies == 1);
4913 	      gcc_assert (i == 0);
4914 
4915 	      tree vec_init_def_type = TREE_TYPE (vec_init_def);
4916 	      tree induc_val_vec
4917 		= build_vector_from_val (vec_init_def_type, induc_val);
4918 
4919 	      add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4920 			   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4921 	    }
4922 	  else
4923 	    add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4924 			 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4925 
4926           /* Set the loop-latch arg for the reduction-phi.  */
4927           if (j > 0)
4928             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4929 
4930           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4931 		       UNKNOWN_LOCATION);
4932 
4933           if (dump_enabled_p ())
4934             {
4935               dump_printf_loc (MSG_NOTE, vect_location,
4936 			       "transform reduction: created def-use cycle: ");
4937               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4938               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4939             }
4940         }
4941     }
4942 
4943   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4944      which is updated with the current index of the loop for every match of
4945      the original loop's cond_expr (VEC_STMT).  This results in a vector
4946      containing the last time the condition passed for that vector lane.
4947      The first match will be a 1 to allow 0 to be used for non-matching
4948      indexes.  If there are no matches at all then the vector will be all
4949      zeroes.  */
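  /* A sketch with NUNITS 4 (the match pattern is hypothetical): the index
     IV below holds {1, 2, 3, 4} on the first iteration and {5, 6, 7, 8} on
     the second.  If lane 1 matches only on the first iteration and lane 3
     only on the second, the final index vector is {0, 2, 0, 8}; lanes that
     never match stay 0.  */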
4950   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4951     {
4952       tree indx_before_incr, indx_after_incr;
4953       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4954 
4955       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4956       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4957 
4958       int scalar_precision
4959 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4960       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4961       tree cr_index_vector_type = build_vector_type
4962 	(cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4963 
4964       /* First we create a simple vector induction variable which starts
4965 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4966 	 vector size (STEP).  */
4967 
4968       /* Create a {1,2,3,...} vector.  */
4969       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4970 
4971       /* Create a vector of the step value.  */
4972       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4973       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4974 
4975       /* Create an induction variable.  */
4976       gimple_stmt_iterator incr_gsi;
4977       bool insert_after;
4978       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4979       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4980 		 insert_after, &indx_before_incr, &indx_after_incr);
4981 
4982       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4983 	 filled with zeros (VEC_ZERO).  */
4984 
4985       /* Create a vector of 0s.  */
4986       tree zero = build_zero_cst (cr_index_scalar_type);
4987       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4988 
4989       /* Create a vector phi node.  */
4990       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4991       new_phi = create_phi_node (new_phi_tree, loop->header);
4992       set_vinfo_for_stmt (new_phi,
4993 			  new_stmt_vec_info (new_phi, loop_vinfo));
4994       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4995 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4996 
      /* Now take the condition from the loop's original cond_expr
	 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which,
	 for every match, uses values from the induction variable
	 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
	 (NEW_PHI_TREE).
5002 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5003 	 the new cond_expr (INDEX_COND_EXPR).  */
5004 
5005       /* Duplicate the condition from vec_stmt.  */
5006       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5007 
      /* Create a conditional, where the condition is taken from vec_stmt
	 (CCOMPARE), the then-value is the induction index
	 (INDEX_BEFORE_INCR) and the else-value is the phi (NEW_PHI_TREE).  */
5011       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5012 				     ccompare, indx_before_incr,
5013 				     new_phi_tree);
5014       induction_index = make_ssa_name (cr_index_vector_type);
5015       gimple *index_condition = gimple_build_assign (induction_index,
5016 						     index_cond_expr);
5017       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5018       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5019 							loop_vinfo);
5020       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5021       set_vinfo_for_stmt (index_condition, index_vec_info);
5022 
5023       /* Update the phi with the vec cond.  */
5024       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5025 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
5026     }
5027 
5028   /* 2. Create epilog code.
5029         The reduction epilog code operates across the elements of the vector
5030         of partial results computed by the vectorized loop.
5031         The reduction epilog code consists of:
5032 
5033         step 1: compute the scalar result in a vector (v_out2)
5034         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5035         step 3: adjust the scalar result (s_out3) if needed.
5036 
        Step 1 can be accomplished using one of the following three schemes:
5038           (scheme 1) using reduc_fn, if available.
5039           (scheme 2) using whole-vector shifts, if available.
5040           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5041                      combined.
5042 
5043           The overall epilog code looks like this:
5044 
5045           s_out0 = phi <s_loop>         # original EXIT_PHI
5046           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5047           v_out2 = reduce <v_out1>              # step 1
5048           s_out3 = extract_field <v_out2, 0>    # step 2
5049           s_out4 = adjust_result <s_out3>       # step 3
5050 
5051           (step 3 is optional, and steps 1 and 2 may be combined).
5052           Lastly, the uses of s_out0 are replaced by s_out4.  */
5053 
5054 
5055   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5056          v_out1 = phi <VECT_DEF>
5057          Store them in NEW_PHIS.  */
5058 
5059   exit_bb = single_exit (loop)->dest;
5060   prev_phi_info = NULL;
5061   new_phis.create (vect_defs.length ());
5062   FOR_EACH_VEC_ELT (vect_defs, i, def)
5063     {
5064       for (j = 0; j < ncopies; j++)
5065         {
5066 	  tree new_def = copy_ssa_name (def);
5067           phi = create_phi_node (new_def, exit_bb);
5068           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5069           if (j == 0)
5070             new_phis.quick_push (phi);
5071           else
5072 	    {
5073 	      def = vect_get_vec_def_for_stmt_copy (dt, def);
5074 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5075 	    }
5076 
5077           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5078           prev_phi_info = vinfo_for_stmt (phi);
5079         }
5080     }
5081 
5082   /* The epilogue is created for the outer-loop, i.e., for the loop being
5083      vectorized.  Create exit phis for the outer loop.  */
5084   if (double_reduc)
5085     {
5086       loop = outer_loop;
5087       exit_bb = single_exit (loop)->dest;
5088       inner_phis.create (vect_defs.length ());
5089       FOR_EACH_VEC_ELT (new_phis, i, phi)
5090 	{
5091 	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
5092 	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
5093 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5094 			   PHI_RESULT (phi));
5095 	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5096 							    loop_vinfo));
5097 	  inner_phis.quick_push (phi);
5098 	  new_phis[i] = outer_phi;
5099 	  prev_phi_info = vinfo_for_stmt (outer_phi);
5100           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5101             {
5102 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5103 	      new_result = copy_ssa_name (PHI_RESULT (phi));
5104 	      outer_phi = create_phi_node (new_result, exit_bb);
5105 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5106 			       PHI_RESULT (phi));
5107 	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5108 								loop_vinfo));
5109 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5110 	      prev_phi_info = vinfo_for_stmt (outer_phi);
5111 	    }
5112 	}
5113     }
5114 
5115   exit_gsi = gsi_after_labels (exit_bb);
5116 
5117   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5118          (i.e. when reduc_fn is not available) and in the final adjustment
5119 	 code (if needed).  Also get the original scalar reduction variable as
5120          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5121          represents a reduction pattern), the tree-code and scalar-def are
5122          taken from the original stmt that the pattern-stmt (STMT) replaces.
5123          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5124          are taken from STMT.  */
5125 
5126   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5127   if (!orig_stmt)
5128     {
5129       /* Regular reduction  */
5130       orig_stmt = stmt;
5131     }
5132   else
5133     {
5134       /* Reduction pattern  */
5135       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5136       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5137       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5138     }
5139 
5140   code = gimple_assign_rhs_code (orig_stmt);
5141   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5142      partial results are added and not subtracted.  */
5143   if (code == MINUS_EXPR)
5144     code = PLUS_EXPR;
5145 
5146   scalar_dest = gimple_assign_lhs (orig_stmt);
5147   scalar_type = TREE_TYPE (scalar_dest);
5148   scalar_results.create (group_size);
5149   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5150   bitsize = TYPE_SIZE (scalar_type);
5151 
5152   /* In case this is a reduction in an inner-loop while vectorizing an outer
5153      loop - we don't need to extract a single scalar result at the end of the
5154      inner-loop (unless it is double reduction, i.e., the use of reduction is
5155      outside the outer-loop).  The final vector of partial results will be used
5156      in the vectorized outer-loop, or reduced to a scalar result at the end of
5157      the outer-loop.  */
5158   if (nested_in_vect_loop && !double_reduc)
5159     goto vect_finalize_reduction;
5160 
5161   /* SLP reduction without reduction chain, e.g.,
5162      # a1 = phi <a2, a0>
5163      # b1 = phi <b2, b0>
5164      a2 = operation (a1)
5165      b2 = operation (b1)  */
5166   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5167 
5168   /* True if we should implement SLP_REDUC using native reduction operations
5169      instead of scalar operations.  */
5170   direct_slp_reduc = (reduc_fn != IFN_LAST
5171 		      && slp_reduc
5172 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5173 
5174   /* In case of reduction chain, e.g.,
5175      # a1 = phi <a3, a0>
5176      a2 = operation (a1)
5177      a3 = operation (a2),
5178 
5179      we may end up with more than one vector result.  Here we reduce them to
5180      one vector.  */
5181   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5182     {
5183       tree first_vect = PHI_RESULT (new_phis[0]);
5184       gassign *new_vec_stmt = NULL;
5185       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5186       for (k = 1; k < new_phis.length (); k++)
5187         {
5188 	  gimple *next_phi = new_phis[k];
5189           tree second_vect = PHI_RESULT (next_phi);
5190           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5191           new_vec_stmt = gimple_build_assign (tem, code,
5192 					      first_vect, second_vect);
5193           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5194 	  first_vect = tem;
5195         }
5196 
5197       new_phi_result = first_vect;
5198       if (new_vec_stmt)
5199         {
5200           new_phis.truncate (0);
5201           new_phis.safe_push (new_vec_stmt);
5202         }
5203     }
  /* Likewise if we couldn't use a single def-use cycle.  */
5205   else if (ncopies > 1)
5206     {
5207       gcc_assert (new_phis.length () == 1);
5208       tree first_vect = PHI_RESULT (new_phis[0]);
5209       gassign *new_vec_stmt = NULL;
5210       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5211       gimple *next_phi = new_phis[0];
5212       for (int k = 1; k < ncopies; ++k)
5213 	{
5214 	  next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5215 	  tree second_vect = PHI_RESULT (next_phi);
5216           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5217           new_vec_stmt = gimple_build_assign (tem, code,
5218 					      first_vect, second_vect);
5219           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5220 	  first_vect = tem;
5221 	}
5222       new_phi_result = first_vect;
5223       new_phis.truncate (0);
5224       new_phis.safe_push (new_vec_stmt);
5225     }
5226   else
5227     new_phi_result = PHI_RESULT (new_phis[0]);
5228 
5229   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5230       && reduc_fn != IFN_LAST)
5231     {
5232       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5233 	 various data values where the condition matched and another vector
5234 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
5235 	 need to extract the last matching index (which will be the index with
5236 	 highest value) and use this to index into the data vector.
5237 	 For the case where there were no matches, the data vector will contain
5238 	 all default values and the index vector will be all zeros.  */
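
      /* A sketch with hypothetical values: if INDUCTION_INDEX is
	 {0, 2, 0, 8} and NEW_PHI_RESULT is {d0, d1, d2, d3}, the code below
	 computes max_index = 8, compares it against the index vector to get
	 a mask that is true only in the last lane, selects {0, 0, 0, d3}
	 from the data vector and finally reduces that to the scalar d3.  */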
5239 
5240       /* Get various versions of the type of the vector of indexes.  */
5241       tree index_vec_type = TREE_TYPE (induction_index);
5242       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5243       tree index_scalar_type = TREE_TYPE (index_vec_type);
5244       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5245 	(index_vec_type);
5246 
5247       /* Get an unsigned integer version of the type of the data vector.  */
5248       int scalar_precision
5249 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5250       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5251       tree vectype_unsigned = build_vector_type
5252 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5253 
5254       /* First we need to create a vector (ZERO_VEC) of zeros and another
5255 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5256 	 can create using a MAX reduction and then expanding.
5257 	 In the case where the loop never made any matches, the max index will
5258 	 be zero.  */
5259 
5260       /* Vector of {0, 0, 0,...}.  */
5261       tree zero_vec = make_ssa_name (vectype);
5262       tree zero_vec_rhs = build_zero_cst (vectype);
5263       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5264       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5265 
5266       /* Find maximum value from the vector of found indexes.  */
5267       tree max_index = make_ssa_name (index_scalar_type);
5268       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5269 							  1, induction_index);
5270       gimple_call_set_lhs (max_index_stmt, max_index);
5271       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5272 
5273       /* Vector of {max_index, max_index, max_index,...}.  */
5274       tree max_index_vec = make_ssa_name (index_vec_type);
5275       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5276 						      max_index);
5277       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5278 							max_index_vec_rhs);
5279       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5280 
5281       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5282 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5283 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5284 	 otherwise.  Only one value should match, resulting in a vector
5285 	 (VEC_COND) with one data value and the rest zeros.
5286 	 In the case where the loop never made any matches, every index will
5287 	 match, resulting in a vector with all data values (which will all be
5288 	 the default value).  */
5289 
5290       /* Compare the max index vector to the vector of found indexes to find
5291 	 the position of the max value.  */
5292       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5293       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5294 						      induction_index,
5295 						      max_index_vec);
5296       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5297 
5298       /* Use the compare to choose either values from the data vector or
5299 	 zero.  */
5300       tree vec_cond = make_ssa_name (vectype);
5301       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5302 						   vec_compare, new_phi_result,
5303 						   zero_vec);
5304       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5305 
5306       /* Finally we need to extract the data value from the vector (VEC_COND)
	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5308 	 reduction, but because this doesn't exist, we can use a MAX reduction
5309 	 instead.  The data value might be signed or a float so we need to cast
5310 	 it first.
5311 	 In the case where the loop never made any matches, the data values are
5312 	 all identical, and so will reduce down correctly.  */
5313 
5314       /* Make the matched data values unsigned.  */
5315       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5316       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5317 				       vec_cond);
5318       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5319 							VIEW_CONVERT_EXPR,
5320 							vec_cond_cast_rhs);
5321       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5322 
5323       /* Reduce down to a scalar value.  */
5324       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5325       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5326 							   1, vec_cond_cast);
5327       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5328       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5329 
5330       /* Convert the reduced value back to the result type and set as the
5331 	 result.  */
5332       gimple_seq stmts = NULL;
5333       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5334 			       data_reduc);
5335       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5336       scalar_results.safe_push (new_temp);
5337     }
5338   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5339 	   && reduc_fn == IFN_LAST)
5340     {
5341       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5342 	 idx = 0;
5343          idx_val = induction_index[0];
5344 	 val = data_reduc[0];
5345          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5346 	   if (induction_index[i] > idx_val)
5347 	     val = data_reduc[i], idx_val = induction_index[i];
5348 	 return val;  */
5349 
5350       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5351       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5352       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5353       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5354       /* Enforced by vectorizable_reduction, which ensures we have target
5355 	 support before allowing a conditional reduction on variable-length
5356 	 vectors.  */
5357       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5358       tree idx_val = NULL_TREE, val = NULL_TREE;
5359       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5360 	{
5361 	  tree old_idx_val = idx_val;
5362 	  tree old_val = val;
5363 	  idx_val = make_ssa_name (idx_eltype);
5364 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5365 					     build3 (BIT_FIELD_REF, idx_eltype,
5366 						     induction_index,
5367 						     bitsize_int (el_size),
5368 						     bitsize_int (off)));
5369 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5370 	  val = make_ssa_name (data_eltype);
5371 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5372 					     build3 (BIT_FIELD_REF,
5373 						     data_eltype,
5374 						     new_phi_result,
5375 						     bitsize_int (el_size),
5376 						     bitsize_int (off)));
5377 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378 	  if (off != 0)
5379 	    {
5380 	      tree new_idx_val = idx_val;
5381 	      tree new_val = val;
5382 	      if (off != v_size - el_size)
5383 		{
5384 		  new_idx_val = make_ssa_name (idx_eltype);
5385 		  epilog_stmt = gimple_build_assign (new_idx_val,
5386 						     MAX_EXPR, idx_val,
5387 						     old_idx_val);
5388 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5389 		}
5390 	      new_val = make_ssa_name (data_eltype);
5391 	      epilog_stmt = gimple_build_assign (new_val,
5392 						 COND_EXPR,
5393 						 build2 (GT_EXPR,
5394 							 boolean_type_node,
5395 							 idx_val,
5396 							 old_idx_val),
5397 						 val, old_val);
5398 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5399 	      idx_val = new_idx_val;
5400 	      val = new_val;
5401 	    }
5402 	}
5403       /* Convert the reduced value back to the result type and set as the
5404 	 result.  */
5405       gimple_seq stmts = NULL;
5406       val = gimple_convert (&stmts, scalar_type, val);
5407       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5408       scalar_results.safe_push (val);
5409     }
5410 
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  In SLP we simply need to extract all the elements from the
         vectors (without reducing them), using the scalar extraction of
         case 3 below.  */
5414   else if (reduc_fn != IFN_LAST && !slp_reduc)
5415     {
5416       tree tmp;
5417       tree vec_elem_type;
5418 
5419       /* Case 1:  Create:
5420          v_out2 = reduc_expr <v_out1>  */
5421 
5422       if (dump_enabled_p ())
5423         dump_printf_loc (MSG_NOTE, vect_location,
5424 			 "Reduce using direct vector reduction.\n");
5425 
5426       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5427       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5428 	{
5429 	  tree tmp_dest
5430 	    = vect_create_destination_var (scalar_dest, vec_elem_type);
5431 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5432 						    new_phi_result);
5433 	  gimple_set_lhs (epilog_stmt, tmp_dest);
5434 	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5435 	  gimple_set_lhs (epilog_stmt, new_temp);
5436 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5437 
5438 	  epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5439 					     new_temp);
5440 	}
5441       else
5442 	{
5443 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5444 						    new_phi_result);
5445 	  gimple_set_lhs (epilog_stmt, new_scalar_dest);
5446 	}
5447 
5448       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5449       gimple_set_lhs (epilog_stmt, new_temp);
5450       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5451 
5452       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5453 	   == INTEGER_INDUC_COND_REDUCTION)
5454 	  && !operand_equal_p (initial_def, induc_val, 0))
5455 	{
	  /* Earlier we set the initial value to be a vector of INDUC_VAL
	     values.  Check the result and if it is INDUC_VAL then replace
	     it with the original initial value, unless INDUC_VAL is
	     the same as INITIAL_DEF already.  */
5460 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5461 				  induc_val);
5462 
5463 	  tmp = make_ssa_name (new_scalar_dest);
5464 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5465 					     initial_def, new_temp);
5466 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5467 	  new_temp = tmp;
5468 	}
5469 
5470       scalar_results.safe_push (new_temp);
5471     }
5472   else if (direct_slp_reduc)
5473     {
5474       /* Here we create one vector for each of the GROUP_SIZE results,
5475 	 with the elements for other SLP statements replaced with the
5476 	 neutral value.  We can then do a normal reduction on each vector.  */
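
      /* A sketch with hypothetical values: for GROUP_SIZE 2 and
	 NEW_PHI_RESULT = {a0, b0, a1, b1} (the lanes of the two SLP
	 statements interleaved), the loop below builds {a0, N, a1, N} for
	 the first result and {N, b0, N, b1} for the second, where N is the
	 neutral value (or that statement's initial value if there is no
	 neutral value), and reduces each of these vectors with REDUC_FN.  */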
5477 
5478       /* Enforced by vectorizable_reduction.  */
5479       gcc_assert (new_phis.length () == 1);
5480       gcc_assert (pow2p_hwi (group_size));
5481 
5482       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5483       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5484       gimple_seq seq = NULL;
5485 
5486       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5487 	 and the same element size as VECTYPE.  */
5488       tree index = build_index_vector (vectype, 0, 1);
5489       tree index_type = TREE_TYPE (index);
5490       tree index_elt_type = TREE_TYPE (index_type);
5491       tree mask_type = build_same_sized_truth_vector_type (index_type);
5492 
5493       /* Create a vector that, for each element, identifies which of
5494 	 the GROUP_SIZE results should use it.  */
5495       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5496       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5497 			    build_vector_from_val (index_type, index_mask));
5498 
5499       /* Get a neutral vector value.  This is simply a splat of the neutral
5500 	 scalar value if we have one, otherwise the initial scalar value
5501 	 is itself a neutral value.  */
5502       tree vector_identity = NULL_TREE;
5503       if (neutral_op)
5504 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5505 							neutral_op);
5506       for (unsigned int i = 0; i < group_size; ++i)
5507 	{
	  /* If there's no universal neutral value, we can use the
5509 	     initial scalar value from the original PHI.  This is used
5510 	     for MIN and MAX reduction, for example.  */
5511 	  if (!neutral_op)
5512 	    {
5513 	      tree scalar_value
5514 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5515 					 loop_preheader_edge (loop));
5516 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5517 							      scalar_value);
5518 	    }
5519 
5520 	  /* Calculate the equivalent of:
5521 
5522 	     sel[j] = (index[j] == i);
5523 
5524 	     which selects the elements of NEW_PHI_RESULT that should
5525 	     be included in the result.  */
5526 	  tree compare_val = build_int_cst (index_elt_type, i);
5527 	  compare_val = build_vector_from_val (index_type, compare_val);
5528 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5529 				   index, compare_val);
5530 
5531 	  /* Calculate the equivalent of:
5532 
	     vec = sel ? new_phi_result : vector_identity;
5534 
5535 	     VEC is now suitable for a full vector reduction.  */
5536 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5537 				   sel, new_phi_result, vector_identity);
5538 
5539 	  /* Do the reduction and convert it to the appropriate type.  */
5540 	  gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5541 	  tree scalar = make_ssa_name (TREE_TYPE (vectype));
5542 	  gimple_call_set_lhs (call, scalar);
5543 	  gimple_seq_add_stmt (&seq, call);
5544 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5545 	  scalar_results.safe_push (scalar);
5546 	}
5547       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5548     }
5549   else
5550     {
5551       bool reduce_with_shift;
5552       tree vec_temp;
5553 
5554       /* COND reductions all do the final reduction with MAX_EXPR
5555 	 or MIN_EXPR.  */
5556       if (code == COND_EXPR)
5557 	{
5558 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5559 	      == INTEGER_INDUC_COND_REDUCTION)
5560 	    code = induc_code;
5561 	  else
5562 	    code = MAX_EXPR;
5563 	}
5564 
5565       /* See if the target wants to do the final (shift) reduction
5566 	 in a vector mode of smaller size and first reduce upper/lower
5567 	 halves against each other.  */
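      /* For instance (hypothetical target behaviour): if the partial sums
	 were computed in a 256-bit vector but the target prefers to do the
	 shift reduction on 128-bit vectors, split_reduction returns the
	 128-bit mode; the loop further below then extracts the two 128-bit
	 halves, combines them with CODE, and only afterwards performs the
	 shift (or scalar) reduction on the narrower vector.  */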
5568       enum machine_mode mode1 = mode;
5569       tree vectype1 = vectype;
5570       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5571       unsigned sz1 = sz;
5572       if (!slp_reduc
5573 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5574 	sz1 = GET_MODE_SIZE (mode1).to_constant ();
5575 
5576       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5577       reduce_with_shift = have_whole_vector_shift (mode1);
5578       if (!VECTOR_MODE_P (mode1))
5579 	reduce_with_shift = false;
5580       else
5581 	{
5582 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5583 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5584 	    reduce_with_shift = false;
5585 	}
5586 
      /* First reduce the vector to the vector size we should do the
	 shift reduction on, by combining upper and lower halves.  */
5589       new_temp = new_phi_result;
5590       while (sz > sz1)
5591 	{
5592 	  gcc_assert (!slp_reduc);
5593 	  sz /= 2;
5594 	  vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5595 
5596 	  /* The target has to make sure we support lowpart/highpart
	     extraction, either via a direct vector extract or through
	     punning to an integer mode vector.  */
5599 	  tree dst1, dst2;
5600 	  if (convert_optab_handler (vec_extract_optab,
5601 				     TYPE_MODE (TREE_TYPE (new_temp)),
5602 				     TYPE_MODE (vectype1))
5603 	      != CODE_FOR_nothing)
5604 	    {
5605 	      /* Extract sub-vectors directly once vec_extract becomes
5606 		 a conversion optab.  */
5607 	      dst1 = make_ssa_name (vectype1);
5608 	      epilog_stmt
5609 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5610 					 build3 (BIT_FIELD_REF, vectype1,
5611 						 new_temp, TYPE_SIZE (vectype1),
5612 						 bitsize_int (0)));
5613 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5614 	      dst2 =  make_ssa_name (vectype1);
5615 	      epilog_stmt
5616 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5617 					 build3 (BIT_FIELD_REF, vectype1,
5618 						 new_temp, TYPE_SIZE (vectype1),
5619 						 bitsize_int (sz * BITS_PER_UNIT)));
5620 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5621 	    }
5622 	  else
5623 	    {
5624 	      /* Extract via punning to appropriately sized integer mode
5625 		 vector.  */
5626 	      tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5627 							    1);
5628 	      tree etype = build_vector_type (eltype, 2);
5629 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5630 						 TYPE_MODE (etype),
5631 						 TYPE_MODE (eltype))
5632 			  != CODE_FOR_nothing);
5633 	      tree tem = make_ssa_name (etype);
5634 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5635 						 build1 (VIEW_CONVERT_EXPR,
5636 							 etype, new_temp));
5637 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5638 	      new_temp = tem;
5639 	      tem = make_ssa_name (eltype);
5640 	      epilog_stmt
5641 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5642 					 build3 (BIT_FIELD_REF, eltype,
5643 						 new_temp, TYPE_SIZE (eltype),
5644 						 bitsize_int (0)));
5645 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5646 	      dst1 = make_ssa_name (vectype1);
5647 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5648 						 build1 (VIEW_CONVERT_EXPR,
5649 							 vectype1, tem));
5650 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5651 	      tem = make_ssa_name (eltype);
5652 	      epilog_stmt
5653 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5654 					 build3 (BIT_FIELD_REF, eltype,
5655 						 new_temp, TYPE_SIZE (eltype),
5656 						 bitsize_int (sz * BITS_PER_UNIT)));
5657 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5658 	      dst2 =  make_ssa_name (vectype1);
5659 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5660 						 build1 (VIEW_CONVERT_EXPR,
5661 							 vectype1, tem));
5662 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5663 	    }
5664 
5665 	  new_temp = make_ssa_name (vectype1);
5666 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5667 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5668 	}
5669 
5670       if (reduce_with_shift && !slp_reduc)
5671 	{
5672 	  int element_bitsize = tree_to_uhwi (bitsize);
5673 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5674 	     for variable-length vectors and also requires direct target support
5675 	     for loop reductions.  */
5676 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5677 	  int nelements = vec_size_in_bits / element_bitsize;
5678 	  vec_perm_builder sel;
5679 	  vec_perm_indices indices;
5680 
5681           int elt_offset;
5682 
5683           tree zero_vec = build_zero_cst (vectype1);
5684           /* Case 2: Create:
5685              for (offset = nelements/2; offset >= 1; offset/=2)
5686                 {
5687                   Create:  va' = vec_shift <va, offset>
5688                   Create:  va = vop <va, va'>
5689                 }  */
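
          /* A sketch for a 4-element addition (values hypothetical),
             starting from va = {a, b, c, d}: shifting by two lanes gives
             {c, d, 0, 0} and the add gives {a+c, b+d, c, d}; shifting that
             by one lane gives {b+d, c, d, 0} and the add gives a vector
             whose element 0 is a+b+c+d, which is extracted as the scalar
             result below.  */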
5690 
5691           tree rhs;
5692 
5693           if (dump_enabled_p ())
5694             dump_printf_loc (MSG_NOTE, vect_location,
5695 			     "Reduce using vector shifts\n");
5696 
5697 	  mode1 = TYPE_MODE (vectype1);
5698           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5699           for (elt_offset = nelements / 2;
5700                elt_offset >= 1;
5701                elt_offset /= 2)
5702             {
5703 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5704 	      indices.new_vector (sel, 2, nelements);
5705 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5706 	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5707 						 new_temp, zero_vec, mask);
5708               new_name = make_ssa_name (vec_dest, epilog_stmt);
5709               gimple_assign_set_lhs (epilog_stmt, new_name);
5710               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5711 
5712 	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5713 						 new_temp);
5714               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5715               gimple_assign_set_lhs (epilog_stmt, new_temp);
5716               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5717             }
5718 
5719 	  /* 2.4  Extract the final scalar result.  Create:
5720 	     s_out3 = extract_field <v_out2, bitpos>  */
5721 
5722 	  if (dump_enabled_p ())
5723 	    dump_printf_loc (MSG_NOTE, vect_location,
5724 			     "extract scalar result\n");
5725 
5726 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5727 			bitsize, bitsize_zero_node);
5728 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5729 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5730 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5731 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5732 	  scalar_results.safe_push (new_temp);
5733         }
5734       else
5735         {
5736           /* Case 3: Create:
5737              s = extract_field <v_out2, 0>
5738              for (offset = element_size;
5739                   offset < vector_size;
5740                   offset += element_size;)
5741                {
5742                  Create:  s' = extract_field <v_out2, offset>
5743                  Create:  s = op <s, s'>  // For non SLP cases
5744                }  */
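
          /* A sketch for a non-SLP addition over vec_temp = {a, b, c, d}
             (values hypothetical): we extract a into s and then emit
             s = s + b, s = s + c and s = s + d in turn, so that s holds the
             complete reduction.  In the SLP case the extracted elements are
             pushed into SCALAR_RESULTS unreduced instead.  */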
5745 
5746           if (dump_enabled_p ())
5747             dump_printf_loc (MSG_NOTE, vect_location,
5748 			     "Reduce using scalar code.\n");
5749 
5750 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5751 	  int element_bitsize = tree_to_uhwi (bitsize);
5752           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5753             {
5754               int bit_offset;
5755               if (gimple_code (new_phi) == GIMPLE_PHI)
5756                 vec_temp = PHI_RESULT (new_phi);
5757               else
5758                 vec_temp = gimple_assign_lhs (new_phi);
5759               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5760 				 bitsize_zero_node);
5761               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5762               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5763               gimple_assign_set_lhs (epilog_stmt, new_temp);
5764               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5765 
              /* In SLP we don't need to apply the reduction operation, so we
                 just collect the s' values in SCALAR_RESULTS.  */
5768               if (slp_reduc)
5769                 scalar_results.safe_push (new_temp);
5770 
5771               for (bit_offset = element_bitsize;
5772                    bit_offset < vec_size_in_bits;
5773                    bit_offset += element_bitsize)
5774                 {
5775                   tree bitpos = bitsize_int (bit_offset);
5776                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5777                                      bitsize, bitpos);
5778 
5779                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5780                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5781                   gimple_assign_set_lhs (epilog_stmt, new_name);
5782                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5783 
5784                   if (slp_reduc)
5785                     {
                      /* In SLP we don't need to apply the reduction
                         operation, so we just collect the s' values in
                         SCALAR_RESULTS.  */
5788                       new_temp = new_name;
5789                       scalar_results.safe_push (new_name);
5790                     }
5791                   else
5792                     {
5793 		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5794 							 new_name, new_temp);
5795                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5796                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5797                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5798                     }
5799                 }
5800             }
5801 
          /* The only case where we need to reduce scalar results in SLP is
             unrolling.  If the size of SCALAR_RESULTS is greater than
             GROUP_SIZE, we reduce them by combining elements modulo
             GROUP_SIZE.  */
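          /* A sketch with hypothetical values: for GROUP_SIZE 2 and
             SCALAR_RESULTS = {a0, b0, a1, b1} (two unrolled copies of two
             SLP reductions), the loop below leaves
             scalar_results[0] = a0 op a1 and scalar_results[1] = b0 op b1.  */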
5806           if (slp_reduc)
5807             {
5808               tree res, first_res, new_res;
5809 	      gimple *new_stmt;
5810 
5811               /* Reduce multiple scalar results in case of SLP unrolling.  */
5812               for (j = group_size; scalar_results.iterate (j, &res);
5813                    j++)
5814                 {
5815                   first_res = scalar_results[j % group_size];
5816 		  new_stmt = gimple_build_assign (new_scalar_dest, code,
5817 						  first_res, res);
5818                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5819                   gimple_assign_set_lhs (new_stmt, new_res);
5820                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5821                   scalar_results[j % group_size] = new_res;
5822                 }
5823             }
5824           else
5825             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5826             scalar_results.safe_push (new_temp);
5827         }
5828 
5829       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5830 	   == INTEGER_INDUC_COND_REDUCTION)
5831 	  && !operand_equal_p (initial_def, induc_val, 0))
5832 	{
	  /* Earlier we set the initial value to be a vector of INDUC_VAL
	     values.  Check the result and if it is INDUC_VAL then replace
	     it with the original initial value, unless INDUC_VAL is
	     the same as INITIAL_DEF already.  */
5837 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5838 				  induc_val);
5839 
5840 	  tree tmp = make_ssa_name (new_scalar_dest);
5841 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5842 					     initial_def, new_temp);
5843 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5844 	  scalar_results[0] = tmp;
5845 	}
5846     }
5847 
5848 vect_finalize_reduction:
5849 
5850   if (double_reduc)
5851     loop = loop->inner;
5852 
5853   /* 2.5 Adjust the final result by the initial value of the reduction
5854 	 variable. (When such adjustment is not needed, then
5855 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
5856 	 new_temp = loop_exit_def + adjustment_def  */
5857 
5858   if (adjustment_def)
5859     {
5860       gcc_assert (!slp_reduc);
5861       if (nested_in_vect_loop)
5862 	{
5863           new_phi = new_phis[0];
5864 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5865 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5866 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
5867 	}
5868       else
5869 	{
5870           new_temp = scalar_results[0];
5871 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5872 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
5873 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5874 	}
5875 
5876       epilog_stmt = gimple_build_assign (new_dest, expr);
5877       new_temp = make_ssa_name (new_dest, epilog_stmt);
5878       gimple_assign_set_lhs (epilog_stmt, new_temp);
5879       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5880       if (nested_in_vect_loop)
5881         {
5882           set_vinfo_for_stmt (epilog_stmt,
5883                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5884           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5885                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5886 
5887           if (!double_reduc)
5888             scalar_results.quick_push (new_temp);
5889           else
5890             scalar_results[0] = new_temp;
5891         }
5892       else
5893         scalar_results[0] = new_temp;
5894 
5895       new_phis[0] = epilog_stmt;
5896     }
5897 
5898   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5899           phis with new adjusted scalar results, i.e., replace use <s_out0>
5900           with use <s_out4>.
5901 
5902      Transform:
5903         loop_exit:
5904           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5905           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5906           v_out2 = reduce <v_out1>
5907           s_out3 = extract_field <v_out2, 0>
5908           s_out4 = adjust_result <s_out3>
5909           use <s_out0>
5910           use <s_out0>
5911 
5912      into:
5913 
5914         loop_exit:
5915           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5916           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5917           v_out2 = reduce <v_out1>
5918           s_out3 = extract_field <v_out2, 0>
5919           s_out4 = adjust_result <s_out3>
5920           use <s_out4>
5921           use <s_out4> */
5922 
5923 
  /* In an SLP reduction chain we reduce the vector results into one vector
     if necessary; hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS
     of the last stmt in the reduction chain, since we are looking for the
     loop exit phi node.  */
5928   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5929     {
5930       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5931       /* Handle reduction patterns.  */
5932       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5933 	dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5934 
5935       scalar_dest = gimple_assign_lhs (dest_stmt);
5936       group_size = 1;
5937     }
5938 
  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
     case GROUP_SIZE is greater than the vectorization factor).  Therefore we
     need to match SCALAR_RESULTS with the corresponding statements.  The
     first (GROUP_SIZE / number of new vector stmts) scalar results correspond
     to the first vector stmt, etc.
     (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
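  /* For instance (an illustrative sketch of the matching done below):
     with GROUP_SIZE == 8 and two new vector stmts we get RATIO == 4, so
     scalar_results[0..3] are matched with new_phis[0]/reduction_phis[0]
     and scalar_results[4..7] with new_phis[1]/reduction_phis[1].  */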
5945   if (group_size > new_phis.length ())
5946     {
5947       ratio = group_size / new_phis.length ();
5948       gcc_assert (!(group_size % new_phis.length ()));
5949     }
5950   else
5951     ratio = 1;
5952 
5953   for (k = 0; k < group_size; k++)
5954     {
5955       if (k % ratio == 0)
5956         {
5957           epilog_stmt = new_phis[k / ratio];
5958           reduction_phi = reduction_phis[k / ratio];
5959 	  if (double_reduc)
5960 	    inner_phi = inner_phis[k / ratio];
5961         }
5962 
5963       if (slp_reduc)
5964         {
5965 	  gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5966 
5967           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5968           /* SLP statements can't participate in patterns.  */
5969           gcc_assert (!orig_stmt);
5970           scalar_dest = gimple_assign_lhs (current_stmt);
5971         }
5972 
5973       phis.create (3);
5974       /* Find the loop-closed-use at the loop exit of the original scalar
5975          result.  (The reduction result is expected to have two immediate uses -
5976          one at the latch block, and one at the loop exit).  */
5977       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5978         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5979 	    && !is_gimple_debug (USE_STMT (use_p)))
5980           phis.safe_push (USE_STMT (use_p));
5981 
      /* While we expect to have found an exit_phi because of loop-closed-ssa
	 form, we can end up without one if the scalar cycle is dead.  */
5984 
5985       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5986         {
5987           if (outer_loop)
5988             {
5989               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5990               gphi *vect_phi;
5991 
              /* FORNOW. Currently not supporting the case that an inner-loop
                 reduction is not used in the outer-loop (but only outside the
                 outer-loop), unless it is a double reduction.  */
5995               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5996                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5997                           || double_reduc);
5998 
5999 	      if (double_reduc)
6000 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6001 	      else
6002 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6003               if (!double_reduc
6004                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6005                       != vect_double_reduction_def)
6006                 continue;
6007 
6008               /* Handle double reduction:
6009 
6010                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
6011                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6012                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
6013                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
6014 
6015                  At that point the regular reduction (stmt2 and stmt3) is
6016                  already vectorized, as well as the exit phi node, stmt4.
6017                  Here we vectorize the phi node of double reduction, stmt1, and
6018                  update all relevant statements.  */
6019 
6020               /* Go through all the uses of s2 to find double reduction phi
6021                  node, i.e., stmt1 above.  */
6022               orig_name = PHI_RESULT (exit_phi);
6023               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6024                 {
6025                   stmt_vec_info use_stmt_vinfo;
6026                   stmt_vec_info new_phi_vinfo;
6027                   tree vect_phi_init, preheader_arg, vect_phi_res;
6028                   basic_block bb = gimple_bb (use_stmt);
6029 		  gimple *use;
6030 
                  /* Check that USE_STMT is really a double reduction phi
                     node.  */
6033                   if (gimple_code (use_stmt) != GIMPLE_PHI
6034                       || gimple_phi_num_args (use_stmt) != 2
6035                       || bb->loop_father != outer_loop)
6036                     continue;
6037                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6038                   if (!use_stmt_vinfo
6039                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6040                           != vect_double_reduction_def)
6041 		    continue;
6042 
6043                   /* Create vector phi node for double reduction:
6044                      vs1 = phi <vs0, vs2>
6045                      vs1 was created previously in this function by a call to
6046                        vect_get_vec_def_for_operand and is stored in
6047                        vec_initial_def;
6048                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6049                      vs0 is created here.  */
6050 
6051                   /* Create vector phi node.  */
6052                   vect_phi = create_phi_node (vec_initial_def, bb);
6053                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6054                                     loop_vec_info_for_loop (outer_loop));
6055                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6056 
6057                   /* Create vs0 - initial def of the double reduction phi.  */
6058                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6059                                              loop_preheader_edge (outer_loop));
6060                   vect_phi_init = get_initial_def_for_reduction
6061 		    (stmt, preheader_arg, NULL);
6062 
6063                   /* Update phi node arguments with vs0 and vs2.  */
6064                   add_phi_arg (vect_phi, vect_phi_init,
6065                                loop_preheader_edge (outer_loop),
6066                                UNKNOWN_LOCATION);
6067                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6068                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6069                   if (dump_enabled_p ())
6070                     {
6071                       dump_printf_loc (MSG_NOTE, vect_location,
6072 				       "created double reduction phi node: ");
6073                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6074                     }
6075 
6076                   vect_phi_res = PHI_RESULT (vect_phi);
6077 
6078                   /* Replace the use, i.e., set the correct vs1 in the regular
6079                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6080                      loop is redundant.  */
6081                   use = reduction_phi;
6082                   for (j = 0; j < ncopies; j++)
6083                     {
6084                       edge pr_edge = loop_preheader_edge (loop);
6085                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6086                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6087                     }
6088                 }
6089             }
6090         }
6091 
6092       phis.release ();
6093       if (nested_in_vect_loop)
6094         {
6095           if (double_reduc)
6096             loop = outer_loop;
6097           else
6098             continue;
6099         }
6100 
6101       phis.create (3);
6102       /* Find the loop-closed-use at the loop exit of the original scalar
6103          result.  (The reduction result is expected to have two immediate uses,
6104          one at the latch block, and one at the loop exit).  For double
6105          reductions we are looking for exit phis of the outer loop.  */
6106       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6107         {
6108           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6109 	    {
6110 	      if (!is_gimple_debug (USE_STMT (use_p)))
6111 		phis.safe_push (USE_STMT (use_p));
6112 	    }
6113           else
6114             {
6115               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6116                 {
6117                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6118 
6119                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6120                     {
6121                       if (!flow_bb_inside_loop_p (loop,
6122                                              gimple_bb (USE_STMT (phi_use_p)))
6123 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
6124                         phis.safe_push (USE_STMT (phi_use_p));
6125                     }
6126                 }
6127             }
6128         }
6129 
6130       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6131         {
6132           /* Replace the uses:  */
6133           orig_name = PHI_RESULT (exit_phi);
6134           scalar_result = scalar_results[k];
6135           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6136             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6137               SET_USE (use_p, scalar_result);
6138         }
6139 
6140       phis.release ();
6141     }
6142 }
6143 
6144 /* Return a vector of type VECTYPE that is equal to the vector select
6145    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6146    before GSI.  */
6147 
6148 static tree
6149 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6150 		     tree vec, tree identity)
6151 {
6152   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6153   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6154 					  mask, vec, identity);
6155   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6156   return cond;
6157 }
6158 
6159 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6160    order, starting with LHS.  Insert the extraction statements before GSI and
6161    associate the new scalar SSA names with variable SCALAR_DEST.
6162    Return the SSA name for the result.  */
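/* As an illustrative sketch, for a four-element VECTOR_RHS v and
   CODE == PLUS_EXPR this builds the strictly left-to-right chain

       t0 = LHS + v[0];
       t1 = t0 + v[1];
       t2 = t1 + v[2];
       t3 = t2 + v[3];

   (each v[i] being extracted with a BIT_FIELD_REF) and returns t3,
   preserving the association of the original scalar loop.  */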
6163 
6164 static tree
6165 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6166 		       tree_code code, tree lhs, tree vector_rhs)
6167 {
6168   tree vectype = TREE_TYPE (vector_rhs);
6169   tree scalar_type = TREE_TYPE (vectype);
6170   tree bitsize = TYPE_SIZE (scalar_type);
6171   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6172   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6173 
6174   for (unsigned HOST_WIDE_INT bit_offset = 0;
6175        bit_offset < vec_size_in_bits;
6176        bit_offset += element_bitsize)
6177     {
6178       tree bitpos = bitsize_int (bit_offset);
6179       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6180 			 bitsize, bitpos);
6181 
6182       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6183       rhs = make_ssa_name (scalar_dest, stmt);
6184       gimple_assign_set_lhs (stmt, rhs);
6185       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6186 
6187       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6188       tree new_name = make_ssa_name (scalar_dest, stmt);
6189       gimple_assign_set_lhs (stmt, new_name);
6190       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6191       lhs = new_name;
6192     }
6193   return lhs;
6194 }
6195 
6196 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6197    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6198    statement.  CODE is the operation performed by STMT and OPS are
6199    its scalar operands.  REDUC_INDEX is the index of the operand in
6200    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6201    implements in-order reduction, or IFN_LAST if we should open-code it.
6202    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6203    that should be used to control the operation in a fully-masked loop.  */
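/* Roughly, for a scalar loop of the form

       for (i = 0; i < n; i++)
	 res += a[i];

   an in-order reduction keeps RES as a scalar accumulator: each vector of
   loaded elements is either folded in with a single REDUC_FN call (taking
   the scalar accumulator and the vector and returning the updated scalar)
   or, if REDUC_FN is IFN_LAST, expanded element by element through
   vect_expand_fold_left, so no reassociation is introduced.  */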
6204 
6205 static bool
6206 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6207 			       gimple **vec_stmt, slp_tree slp_node,
6208 			       gimple *reduc_def_stmt,
6209 			       tree_code code, internal_fn reduc_fn,
6210 			       tree ops[3], tree vectype_in,
6211 			       int reduc_index, vec_loop_masks *masks)
6212 {
6213   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6214   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6215   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6216   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6217   gimple *new_stmt = NULL;
6218 
6219   int ncopies;
6220   if (slp_node)
6221     ncopies = 1;
6222   else
6223     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6224 
6225   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6226   gcc_assert (ncopies == 1);
6227   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6228   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6229   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6230 	      == FOLD_LEFT_REDUCTION);
6231 
6232   if (slp_node)
6233     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6234 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
6235 
6236   tree op0 = ops[1 - reduc_index];
6237 
6238   int group_size = 1;
6239   gimple *scalar_dest_def;
6240   auto_vec<tree> vec_oprnds0;
6241   if (slp_node)
6242     {
6243       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6244       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6245       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6246     }
6247   else
6248     {
6249       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6250       vec_oprnds0.create (1);
6251       vec_oprnds0.quick_push (loop_vec_def0);
6252       scalar_dest_def = stmt;
6253     }
6254 
6255   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6256   tree scalar_type = TREE_TYPE (scalar_dest);
6257   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6258 
6259   int vec_num = vec_oprnds0.length ();
6260   gcc_assert (vec_num == 1 || slp_node);
6261   tree vec_elem_type = TREE_TYPE (vectype_out);
6262   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6263 
6264   tree vector_identity = NULL_TREE;
6265   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6266     vector_identity = build_zero_cst (vectype_out);
6267 
6268   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6269   int i;
6270   tree def0;
6271   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6272     {
6273       tree mask = NULL_TREE;
6274       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6275 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6276 
6277       /* Handle MINUS by adding the negative.  */
6278       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6279 	{
6280 	  tree negated = make_ssa_name (vectype_out);
6281 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6282 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6283 	  def0 = negated;
6284 	}
6285 
6286       if (mask)
6287 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6288 				    vector_identity);
6289 
6290       /* On the first iteration the input is simply the scalar phi
6291 	 result, and for subsequent iterations it is the output of
6292 	 the preceding operation.  */
6293       if (reduc_fn != IFN_LAST)
6294 	{
6295 	  new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6296 	  /* For chained SLP reductions the output of the previous reduction
6297 	     operation serves as the input of the next. For the final statement
6298 	     the output cannot be a temporary - we reuse the original
6299 	     scalar destination of the last statement.  */
6300 	  if (i != vec_num - 1)
6301 	    {
6302 	      gimple_set_lhs (new_stmt, scalar_dest_var);
6303 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6304 	      gimple_set_lhs (new_stmt, reduc_var);
6305 	    }
6306 	}
6307       else
6308 	{
6309 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6310 					     reduc_var, def0);
6311 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6312 	  /* Remove the statement, so that we can use the same code paths
6313 	     as for statements that we've just created.  */
6314 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6315 	  gsi_remove (&tmp_gsi, false);
6316 	}
6317 
6318       if (i == vec_num - 1)
6319 	{
6320 	  gimple_set_lhs (new_stmt, scalar_dest);
6321 	  vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6322 	}
6323       else
6324 	vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6325 
6326       if (slp_node)
6327 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6328     }
6329 
6330   if (!slp_node)
6331     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6332 
6333   return true;
6334 }
6335 
6336 /* Function is_nonwrapping_integer_induction.
6337 
   Check whether STMT (which is part of loop LOOP) is an integer induction
   whose value is guaranteed not to wrap (overflow).  */
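/* In other words (illustrative): with constant BASE and STEP, and NI a
   bound on the number of times the induction statement executes, the
   code below requires that

       BASE + STEP * NI

   is representable in the precision of the phi result type, or that
   signed overflow is undefined for that type (so wrapping cannot validly
   occur).  */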
6340 
6341 static bool
6342 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6343 {
6344   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6345   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6346   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6347   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6348   widest_int ni, max_loop_value, lhs_max;
6349   bool overflow = false;
6350 
6351   /* Make sure the loop is integer based.  */
6352   if (TREE_CODE (base) != INTEGER_CST
6353       || TREE_CODE (step) != INTEGER_CST)
6354     return false;
6355 
6356   /* Check that the max size of the loop will not wrap.  */
6357 
6358   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6359     return true;
6360 
6361   if (! max_stmt_executions (loop, &ni))
6362     return false;
6363 
6364   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6365 			    &overflow);
6366   if (overflow)
6367     return false;
6368 
6369   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6370 			    TYPE_SIGN (lhs_type), &overflow);
6371   if (overflow)
6372     return false;
6373 
6374   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6375 	  <= TYPE_PRECISION (lhs_type));
6376 }
6377 
6378 /* Function vectorizable_reduction.
6379 
6380    Check if STMT performs a reduction operation that can be vectorized.
6381    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6382    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6383    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6384 
6385    This function also handles reduction idioms (patterns) that have been
6386    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6387    of this form:
6388      X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6390    sequence that had been detected and replaced by the pattern-stmt (STMT).
6391 
6392    This function also handles reduction of condition expressions, for example:
6393      for (int i = 0; i < N; i++)
6394        if (a[i] < value)
6395 	 last = a[i];
   This is handled by vectorizing the loop and creating an additional vector
6397    containing the loop indexes for which "a[i] < value" was true.  In the
6398    function epilogue this is reduced to a single max value and then used to
6399    index into the vector of results.
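
   As a rough sketch of that scheme, each vector iteration keeps two
   vectors in step:

     data_vec = a_vec < value_vec ? a_vec : data_vec
     idx_vec  = a_vec < value_vec ? cur_index_vec : idx_vec

   where cur_index_vec holds the (1-based) loop indexes of the current
   vector iteration; the epilogue reduces idx_vec with a MAX reduction and
   uses the winning index to select the matching element of data_vec
   (index zero meaning that no element matched).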
6400 
6401    In some cases of reduction patterns, the type of the reduction variable X is
6402    different than the type of the other arguments of STMT.
6403    In such cases, the vectype that is used when transforming STMT into a vector
6404    stmt is different than the vectype that is used to determine the
6405    vectorization factor, because it consists of a different number of elements
6406    than the actual number of elements that are being operated upon in parallel.
6407 
6408    For example, consider an accumulation of shorts into an int accumulator.
6409    On some targets it's possible to vectorize this pattern operating on 8
6410    shorts at a time (hence, the vectype for purposes of determining the
6411    vectorization factor should be V8HI); on the other hand, the vectype that
6412    is used to create the vector form is actually V4SI (the type of the result).
6413 
6414    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6415    indicates what is the actual level of parallelism (V8HI in the example), so
6416    that the right vectorization factor would be derived.  This vectype
6417    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6418    be used to create the vectorized stmt.  The right vectype for the vectorized
6419    stmt is obtained from the type of the result X:
6420         get_vectype_for_scalar_type (TREE_TYPE (X))
6421 
6422    This means that, contrary to "regular" reductions (or "regular" stmts in
6423    general), the following equation:
6424       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6425    does *NOT* necessarily hold for reduction patterns.  */
6426 
6427 bool
6428 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6429 			gimple **vec_stmt, slp_tree slp_node,
6430 			slp_instance slp_node_instance)
6431 {
6432   tree vec_dest;
6433   tree scalar_dest;
6434   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6435   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6436   tree vectype_in = NULL_TREE;
6437   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6438   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6439   enum tree_code code, orig_code;
6440   internal_fn reduc_fn;
6441   machine_mode vec_mode;
6442   int op_type;
6443   optab optab;
6444   tree new_temp = NULL_TREE;
6445   gimple *def_stmt;
6446   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6447   gimple *cond_reduc_def_stmt = NULL;
6448   enum tree_code cond_reduc_op_code = ERROR_MARK;
6449   tree scalar_type;
6450   bool is_simple_use;
6451   gimple *orig_stmt;
6452   stmt_vec_info orig_stmt_info = NULL;
6453   int i;
6454   int ncopies;
6455   int epilog_copies;
6456   stmt_vec_info prev_stmt_info, prev_phi_info;
6457   bool single_defuse_cycle = false;
6458   gimple *new_stmt = NULL;
6459   int j;
6460   tree ops[3];
6461   enum vect_def_type dts[3];
6462   bool nested_cycle = false, found_nested_cycle_def = false;
6463   bool double_reduc = false;
6464   basic_block def_bb;
6465   struct loop * def_stmt_loop, *outer_loop = NULL;
6466   tree def_arg;
6467   gimple *def_arg_stmt;
6468   auto_vec<tree> vec_oprnds0;
6469   auto_vec<tree> vec_oprnds1;
6470   auto_vec<tree> vec_oprnds2;
6471   auto_vec<tree> vect_defs;
6472   auto_vec<gimple *> phis;
6473   int vec_num;
6474   tree def0, tem;
6475   bool first_p = true;
6476   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6477   tree cond_reduc_val = NULL_TREE;
6478 
6479   /* Make sure it was already recognized as a reduction computation.  */
6480   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6481       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6482     return false;
6483 
6484   if (nested_in_vect_loop_p (loop, stmt))
6485     {
6486       outer_loop = loop;
6487       loop = loop->inner;
6488       nested_cycle = true;
6489     }
6490 
  /* In case of a reduction chain we switch to the first stmt in the chain,
     but we don't update STMT_INFO, since only the last stmt is marked as a
     reduction and has reduction properties.  */
6494   if (GROUP_FIRST_ELEMENT (stmt_info)
6495       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6496     {
6497       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6498       first_p = false;
6499     }
6500 
6501   if (gimple_code (stmt) == GIMPLE_PHI)
6502     {
6503       /* Analysis is fully done on the reduction stmt invocation.  */
6504       if (! vec_stmt)
6505 	{
6506 	  if (slp_node)
6507 	    slp_node_instance->reduc_phis = slp_node;
6508 
6509 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6510 	  return true;
6511 	}
6512 
6513       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6514 	/* Leave the scalar phi in place.  Note that checking
6515 	   STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6516 	   for reductions involving a single statement.  */
6517 	return true;
6518 
6519       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6520       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6521 	reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6522 
6523       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6524 	  == EXTRACT_LAST_REDUCTION)
6525 	/* Leave the scalar phi in place.  */
6526 	return true;
6527 
6528       gcc_assert (is_gimple_assign (reduc_stmt));
6529       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6530 	{
6531 	  tree op = gimple_op (reduc_stmt, k);
6532 	  if (op == gimple_phi_result (stmt))
6533 	    continue;
6534 	  if (k == 1
6535 	      && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6536 	    continue;
6537 	  if (!vectype_in
6538 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6539 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6540 	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6541 	  break;
6542 	}
6543       gcc_assert (vectype_in);
6544 
6545       if (slp_node)
6546 	ncopies = 1;
6547       else
6548 	ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6549 
6550       use_operand_p use_p;
6551       gimple *use_stmt;
6552       if (ncopies > 1
6553 	  && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6554 	      <= vect_used_only_live)
6555 	  && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6556 	  && (use_stmt == reduc_stmt
6557 	      || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6558 		  == reduc_stmt)))
6559 	single_defuse_cycle = true;
6560 
      /* Create the destination vector.  */
6562       scalar_dest = gimple_assign_lhs (reduc_stmt);
6563       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6564 
6565       if (slp_node)
6566 	/* The size vect_schedule_slp_instance computes is off for us.  */
6567 	vec_num = vect_get_num_vectors
6568 	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6569 	   * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6570 	   vectype_in);
6571       else
6572 	vec_num = 1;
6573 
6574       /* Generate the reduction PHIs upfront.  */
6575       prev_phi_info = NULL;
6576       for (j = 0; j < ncopies; j++)
6577 	{
6578 	  if (j == 0 || !single_defuse_cycle)
6579 	    {
6580 	      for (i = 0; i < vec_num; i++)
6581 		{
6582 		  /* Create the reduction-phi that defines the reduction
6583 		     operand.  */
6584 		  gimple *new_phi = create_phi_node (vec_dest, loop->header);
6585 		  set_vinfo_for_stmt (new_phi,
6586 				      new_stmt_vec_info (new_phi, loop_vinfo));
6587 
6588 		  if (slp_node)
6589 		    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6590 		  else
6591 		    {
6592 		      if (j == 0)
6593 			STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6594 		      else
6595 			STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6596 		      prev_phi_info = vinfo_for_stmt (new_phi);
6597 		    }
6598 		}
6599 	    }
6600 	}
6601 
6602       return true;
6603     }
6604 
6605   /* 1. Is vectorizable reduction?  */
6606   /* Not supportable if the reduction variable is used in the loop, unless
6607      it's a reduction chain.  */
6608   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6609       && !GROUP_FIRST_ELEMENT (stmt_info))
6610     return false;
6611 
  /* Reductions that are not used even in an enclosing outer-loop
     are expected to be "live" (used out of the loop).  */
6614   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6615       && !STMT_VINFO_LIVE_P (stmt_info))
6616     return false;
6617 
6618   /* 2. Has this been recognized as a reduction pattern?
6619 
6620      Check if STMT represents a pattern that has been recognized
6621      in earlier analysis stages.  For stmts that represent a pattern,
6622      the STMT_VINFO_RELATED_STMT field records the last stmt in
6623      the original sequence that constitutes the pattern.  */
6624 
6625   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6626   if (orig_stmt)
6627     {
6628       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6629       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6630       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6631     }
6632 
6633   /* 3. Check the operands of the operation.  The first operands are defined
6634         inside the loop body. The last operand is the reduction variable,
6635         which is defined by the loop-header-phi.  */
6636 
6637   gcc_assert (is_gimple_assign (stmt));
6638 
6639   /* Flatten RHS.  */
6640   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6641     {
6642     case GIMPLE_BINARY_RHS:
6643       code = gimple_assign_rhs_code (stmt);
6644       op_type = TREE_CODE_LENGTH (code);
6645       gcc_assert (op_type == binary_op);
6646       ops[0] = gimple_assign_rhs1 (stmt);
6647       ops[1] = gimple_assign_rhs2 (stmt);
6648       break;
6649 
6650     case GIMPLE_TERNARY_RHS:
6651       code = gimple_assign_rhs_code (stmt);
6652       op_type = TREE_CODE_LENGTH (code);
6653       gcc_assert (op_type == ternary_op);
6654       ops[0] = gimple_assign_rhs1 (stmt);
6655       ops[1] = gimple_assign_rhs2 (stmt);
6656       ops[2] = gimple_assign_rhs3 (stmt);
6657       break;
6658 
6659     case GIMPLE_UNARY_RHS:
6660       return false;
6661 
6662     default:
6663       gcc_unreachable ();
6664     }
6665 
6666   if (code == COND_EXPR && slp_node)
6667     return false;
6668 
6669   scalar_dest = gimple_assign_lhs (stmt);
6670   scalar_type = TREE_TYPE (scalar_dest);
6671   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6672       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6673     return false;
6674 
6675   /* Do not try to vectorize bit-precision reductions.  */
6676   if (!type_has_mode_precision_p (scalar_type))
6677     return false;
6678 
6679   /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of a nested cycle this
6681      assumption is not true: we use reduc_index to record the index of the
6682      reduction variable.  */
6683   gimple *reduc_def_stmt = NULL;
6684   int reduc_index = -1;
6685   for (i = 0; i < op_type; i++)
6686     {
6687       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6688       if (i == 0 && code == COND_EXPR)
6689         continue;
6690 
6691       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6692 					  &def_stmt, &dts[i], &tem);
6693       dt = dts[i];
6694       gcc_assert (is_simple_use);
6695       if (dt == vect_reduction_def)
6696 	{
6697           reduc_def_stmt = def_stmt;
6698 	  reduc_index = i;
6699 	  continue;
6700 	}
6701       else if (tem)
6702 	{
6703 	  /* To properly compute ncopies we are interested in the widest
6704 	     input type in case we're looking at a widening accumulation.  */
6705 	  if (!vectype_in
6706 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6707 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6708 	    vectype_in = tem;
6709 	}
6710 
6711       if (dt != vect_internal_def
6712 	  && dt != vect_external_def
6713 	  && dt != vect_constant_def
6714 	  && dt != vect_induction_def
6715           && !(dt == vect_nested_cycle && nested_cycle))
6716 	return false;
6717 
6718       if (dt == vect_nested_cycle)
6719         {
6720           found_nested_cycle_def = true;
6721           reduc_def_stmt = def_stmt;
6722           reduc_index = i;
6723         }
6724 
6725       if (i == 1 && code == COND_EXPR)
6726 	{
6727 	  /* Record how value of COND_EXPR is defined.  */
6728 	  if (dt == vect_constant_def)
6729 	    {
6730 	      cond_reduc_dt = dt;
6731 	      cond_reduc_val = ops[i];
6732 	    }
6733 	  if (dt == vect_induction_def
6734 	      && def_stmt != NULL
6735 	      && is_nonwrapping_integer_induction (def_stmt, loop))
6736 	    {
6737 	      cond_reduc_dt = dt;
6738 	      cond_reduc_def_stmt = def_stmt;
6739 	    }
6740 	}
6741     }
6742 
6743   if (!vectype_in)
6744     vectype_in = vectype_out;
6745 
  /* When vectorizing a reduction chain without SLP, the reduction PHI is not
     directly used in STMT.  */
6748   if (reduc_index == -1)
6749     {
6750       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6751 	{
6752 	  if (dump_enabled_p ())
6753 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6754 			     "in-order reduction chain without SLP.\n");
6755 	  return false;
6756 	}
6757 
6758       if (orig_stmt)
6759 	reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6760       else
6761 	reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6762     }
6763 
6764   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6765     return false;
6766 
6767   if (!(reduc_index == -1
6768 	|| dts[reduc_index] == vect_reduction_def
6769 	|| dts[reduc_index] == vect_nested_cycle
6770 	|| ((dts[reduc_index] == vect_internal_def
6771 	     || dts[reduc_index] == vect_external_def
6772 	     || dts[reduc_index] == vect_constant_def
6773 	     || dts[reduc_index] == vect_induction_def)
6774 	    && nested_cycle && found_nested_cycle_def)))
6775     {
6776       /* For pattern recognized stmts, orig_stmt might be a reduction,
6777 	 but some helper statements for the pattern might not, or
6778 	 might be COND_EXPRs with reduction uses in the condition.  */
6779       gcc_assert (orig_stmt);
6780       return false;
6781     }
6782 
6783   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6784   enum vect_reduction_type v_reduc_type
6785     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6786   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6787 
6788   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6789   /* If we have a condition reduction, see if we can simplify it further.  */
6790   if (v_reduc_type == COND_REDUCTION)
6791     {
6792       /* TODO: We can't yet handle reduction chains, since we need to treat
6793 	 each COND_EXPR in the chain specially, not just the last one.
6794 	 E.g. for:
6795 
6796 	    x_1 = PHI <x_3, ...>
6797 	    x_2 = a_2 ? ... : x_1;
6798 	    x_3 = a_3 ? ... : x_2;
6799 
6800 	 we're interested in the last element in x_3 for which a_2 || a_3
6801 	 is true, whereas the current reduction chain handling would
6802 	 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6803 	 as a reduction operation.  */
6804       if (reduc_index == -1)
6805 	{
6806 	  if (dump_enabled_p ())
6807 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6808 			     "conditional reduction chains not supported\n");
6809 	  return false;
6810 	}
6811 
6812       /* vect_is_simple_reduction ensured that operand 2 is the
6813 	 loop-carried operand.  */
6814       gcc_assert (reduc_index == 2);
6815 
      /* Loop peeling modifies the initial value of the reduction PHI, which
	 makes the reduction stmt that is transformed different from the
	 original stmt analyzed.  We need to record the reduction code for a
	 CONST_COND_REDUCTION type reduction at the analysis stage so that
	 it can be used directly at the transform stage.  */
6821       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6822 	  || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6823 	{
6824 	  /* Also set the reduction type to CONST_COND_REDUCTION.  */
6825 	  gcc_assert (cond_reduc_dt == vect_constant_def);
6826 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6827 	}
6828       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6829 					       vectype_in, OPTIMIZE_FOR_SPEED))
6830 	{
6831 	  if (dump_enabled_p ())
6832 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833 			     "optimizing condition reduction with"
6834 			     " FOLD_EXTRACT_LAST.\n");
6835 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6836 	}
6837       else if (cond_reduc_dt == vect_induction_def)
6838 	{
6839 	  stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6840 	  tree base
6841 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6842 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6843 
6844 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6845 		      && TREE_CODE (step) == INTEGER_CST);
6846 	  cond_reduc_val = NULL_TREE;
	  /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
	     MIN_EXPR.  Punt for now if BASE is the minimum value of the type
	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6850 	  if (tree_int_cst_sgn (step) == -1)
6851 	    {
6852 	      cond_reduc_op_code = MIN_EXPR;
6853 	      if (tree_int_cst_sgn (base) == -1)
6854 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6855 	      else if (tree_int_cst_lt (base,
6856 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6857 		cond_reduc_val
6858 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6859 	    }
6860 	  else
6861 	    {
6862 	      cond_reduc_op_code = MAX_EXPR;
6863 	      if (tree_int_cst_sgn (base) == 1)
6864 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6865 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6866 					base))
6867 		cond_reduc_val
6868 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6869 	    }
6870 	  if (cond_reduc_val)
6871 	    {
6872 	      if (dump_enabled_p ())
6873 		dump_printf_loc (MSG_NOTE, vect_location,
6874 				 "condition expression based on "
6875 				 "integer induction.\n");
6876 	      STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6877 		= INTEGER_INDUC_COND_REDUCTION;
6878 	    }
6879 	}
6880       else if (cond_reduc_dt == vect_constant_def)
6881 	{
6882 	  enum vect_def_type cond_initial_dt;
6883 	  gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6884 	  tree cond_initial_val
6885 	    = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6886 
6887 	  gcc_assert (cond_reduc_val != NULL_TREE);
6888 	  vect_is_simple_use (cond_initial_val, loop_vinfo,
6889 			      &def_stmt, &cond_initial_dt);
6890 	  if (cond_initial_dt == vect_constant_def
6891 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6892 				     TREE_TYPE (cond_reduc_val)))
6893 	    {
6894 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6895 				    cond_initial_val, cond_reduc_val);
6896 	      if (e && (integer_onep (e) || integer_zerop (e)))
6897 		{
6898 		  if (dump_enabled_p ())
6899 		    dump_printf_loc (MSG_NOTE, vect_location,
6900 				     "condition expression based on "
6901 				     "compile time constant.\n");
6902 		  /* Record reduction code at analysis stage.  */
6903 		  STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6904 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6905 		  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6906 		    = CONST_COND_REDUCTION;
6907 		}
6908 	    }
6909 	}
6910     }
6911 
6912   if (orig_stmt)
6913     gcc_assert (tmp == orig_stmt
6914 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6915   else
    /* We changed STMT to be the first stmt in the reduction chain, hence we
       check that in this case the first element in the chain is STMT.  */
6918     gcc_assert (stmt == tmp
6919 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6920 
6921   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6922     return false;
6923 
6924   if (slp_node)
6925     ncopies = 1;
6926   else
6927     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6928 
6929   gcc_assert (ncopies >= 1);
6930 
6931   vec_mode = TYPE_MODE (vectype_in);
6932   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6933 
6934   if (code == COND_EXPR)
6935     {
6936       /* Only call during the analysis stage, otherwise we'll lose
6937 	 STMT_VINFO_TYPE.  */
6938       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6939 						ops[reduc_index], 0, NULL))
6940         {
6941           if (dump_enabled_p ())
6942 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943 			     "unsupported condition in reduction\n");
6944 	  return false;
6945         }
6946     }
6947   else
6948     {
6949       /* 4. Supportable by target?  */
6950 
6951       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6952 	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
6953 	{
	  /* Shifts and rotates are only supported by vectorizable_shift,
	     not vectorizable_reduction.  */
6956           if (dump_enabled_p ())
6957 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6958 			     "unsupported shift or rotation.\n");
6959 	  return false;
6960 	}
6961 
      /* 4.1. Check support for the operation in the loop.  */
6963       optab = optab_for_tree_code (code, vectype_in, optab_default);
6964       if (!optab)
6965         {
6966           if (dump_enabled_p ())
6967 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6968 			     "no optab.\n");
6969 
6970           return false;
6971         }
6972 
6973       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6974         {
6975           if (dump_enabled_p ())
6976             dump_printf (MSG_NOTE, "op not supported by target.\n");
6977 
6978 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6979 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6980             return false;
6981 
6982           if (dump_enabled_p ())
6983   	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6984         }
6985 
6986       /* Worthwhile without SIMD support?  */
6987       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6988 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6989         {
6990           if (dump_enabled_p ())
6991 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6992 			     "not worthwhile without SIMD support.\n");
6993 
6994           return false;
6995         }
6996     }
6997 
6998   /* 4.2. Check support for the epilog operation.
6999 
7000           If STMT represents a reduction pattern, then the type of the
7001           reduction variable may be different than the type of the rest
7002           of the arguments.  For example, consider the case of accumulation
7003           of shorts into an int accumulator; The original code:
7004                         S1: int_a = (int) short_a;
7005           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7006 
7007           was replaced with:
7008                         STMT: int_acc = widen_sum <short_a, int_acc>
7009 
7010           This means that:
7011           1. The tree-code that is used to create the vector operation in the
7012              epilog code (that reduces the partial results) is not the
7013              tree-code of STMT, but is rather the tree-code of the original
7014              stmt from the pattern that STMT is replacing.  I.e, in the example
7015              above we want to use 'widen_sum' in the loop, but 'plus' in the
7016              epilog.
7017           2. The type (mode) we use to check available target support
7018              for the vector operation to be created in the *epilog*, is
7019              determined by the type of the reduction variable (in the example
7020              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7021              However the type (mode) we use to check available target support
7022              for the vector operation to be created *inside the loop*, is
7023              determined by the type of the other arguments to STMT (in the
7024              example we'd check this: optab_handler (widen_sum_optab,
7025 	     vect_short_mode)).
7026 
7027           This is contrary to "regular" reductions, in which the types of all
7028           the arguments are the same as the type of the reduction variable.
7029           For "regular" reductions we can therefore use the same vector type
7030           (and also the same tree-code) when generating the epilog code and
7031           when generating the code inside the loop.  */
7032 
7033   vect_reduction_type reduction_type
7034     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7035   if (orig_stmt
7036       && (reduction_type == TREE_CODE_REDUCTION
7037 	  || reduction_type == FOLD_LEFT_REDUCTION))
7038     {
7039       /* This is a reduction pattern: get the vectype from the type of the
7040          reduction variable, and get the tree-code from orig_stmt.  */
7041       orig_code = gimple_assign_rhs_code (orig_stmt);
7042       gcc_assert (vectype_out);
7043       vec_mode = TYPE_MODE (vectype_out);
7044     }
7045   else
7046     {
      /* Regular reduction: the same vectype and tree-code used for the
	 vector code inside the loop can also be used for the epilog code.  */
7049       orig_code = code;
7050 
7051       if (code == MINUS_EXPR)
7052 	orig_code = PLUS_EXPR;
7053 
7054       /* For simple condition reductions, replace with the actual expression
7055 	 we want to base our reduction around.  */
7056       if (reduction_type == CONST_COND_REDUCTION)
7057 	{
7058 	  orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7059 	  gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7060 	}
7061       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7062 	orig_code = cond_reduc_op_code;
7063     }
7064 
7065   if (nested_cycle)
7066     {
7067       def_bb = gimple_bb (reduc_def_stmt);
7068       def_stmt_loop = def_bb->loop_father;
7069       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7070                                        loop_preheader_edge (def_stmt_loop));
7071       if (TREE_CODE (def_arg) == SSA_NAME
7072           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7073           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7074           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7075           && vinfo_for_stmt (def_arg_stmt)
7076           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7077               == vect_double_reduction_def)
7078         double_reduc = true;
7079     }
7080 
7081   reduc_fn = IFN_LAST;
7082 
7083   if (reduction_type == TREE_CODE_REDUCTION
7084       || reduction_type == FOLD_LEFT_REDUCTION
7085       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7086       || reduction_type == CONST_COND_REDUCTION)
7087     {
7088       if (reduction_type == FOLD_LEFT_REDUCTION
7089 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
7090 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7091 	{
7092 	  if (reduc_fn != IFN_LAST
7093 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7094 						  OPTIMIZE_FOR_SPEED))
7095 	    {
7096 	      if (dump_enabled_p ())
7097 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7098 				 "reduc op not supported by target.\n");
7099 
7100 	      reduc_fn = IFN_LAST;
7101 	    }
7102 	}
7103       else
7104 	{
7105 	  if (!nested_cycle || double_reduc)
7106 	    {
7107 	      if (dump_enabled_p ())
7108 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 				 "no reduc code for scalar code.\n");
7110 
7111 	      return false;
7112 	    }
7113 	}
7114     }
7115   else if (reduction_type == COND_REDUCTION)
7116     {
7117       int scalar_precision
7118 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7119       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7120       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7121 						nunits_out);
7122 
7123       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7124 					  OPTIMIZE_FOR_SPEED))
7125 	reduc_fn = IFN_REDUC_MAX;
7126     }
7127 
7128   if (reduction_type != EXTRACT_LAST_REDUCTION
7129       && reduc_fn == IFN_LAST
7130       && !nunits_out.is_constant ())
7131     {
7132       if (dump_enabled_p ())
7133 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7134 			 "missing target support for reduction on"
7135 			 " variable-length vectors.\n");
7136       return false;
7137     }
7138 
7139   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7140       && ncopies > 1)
7141     {
7142       if (dump_enabled_p ())
7143 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7144 			 "multiple types in double reduction or condition "
7145 			 "reduction.\n");
7146       return false;
7147     }
7148 
7149   /* For SLP reductions, see if there is a neutral value we can use.  */
7150   tree neutral_op = NULL_TREE;
7151   if (slp_node)
7152     neutral_op
7153       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7154 				      GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7155 
7156   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7157     {
7158       /* We can't support in-order reductions of code such as this:
7159 
7160 	   for (int i = 0; i < n1; ++i)
7161 	     for (int j = 0; j < n2; ++j)
7162 	       l += a[j];
7163 
7164 	 since GCC effectively transforms the loop when vectorizing:
7165 
7166 	   for (int i = 0; i < n1 / VF; ++i)
7167 	     for (int j = 0; j < n2; ++j)
7168 	       for (int k = 0; k < VF; ++k)
7169 		 l += a[j];
7170 
7171 	 which is a reassociation of the original operation.  */
7172       if (dump_enabled_p ())
7173 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7174 			 "in-order double reduction not supported.\n");
7175 
7176       return false;
7177     }
7178 
7179   if (reduction_type == FOLD_LEFT_REDUCTION
7180       && slp_node
7181       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7182     {
7183       /* We cannot use in-order reductions in this case because there is
7184 	 an implicit reassociation of the operations involved.  */
7185       if (dump_enabled_p ())
7186 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7187 			 "in-order unchained SLP reductions not supported.\n");
7188       return false;
7189     }
7190 
7191   /* For double reductions, and for SLP reductions with a neutral value,
7192      we construct a variable-length initial vector by loading a vector
7193      full of the neutral value and then shift-and-inserting the start
7194      values into the low-numbered elements.  */
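  /* Illustrative sketch: for a single start value S and neutral value
     NEUTRAL (e.g. 0 for PLUS_EXPR), the initial vector is built as

	 { NEUTRAL, ..., NEUTRAL }  --shift-and-insert S-->
	 { S, NEUTRAL, ..., NEUTRAL }

     which is why IFN_VEC_SHL_INSERT support is required below.  */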
7195   if ((double_reduc || neutral_op)
7196       && !nunits_out.is_constant ()
7197       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7198 					  vectype_out, OPTIMIZE_FOR_SPEED))
7199     {
7200       if (dump_enabled_p ())
7201 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7202 			 "reduction on variable-length vectors requires"
7203 			 " target support for a vector-shift-and-insert"
7204 			 " operation.\n");
7205       return false;
7206     }
7207 
7208   /* Check extra constraints for variable-length unchained SLP reductions.  */
7209   if (STMT_SLP_TYPE (stmt_info)
7210       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7211       && !nunits_out.is_constant ())
7212     {
7213       /* We checked above that we could build the initial vector when
7214 	 there's a neutral element value.  Check here for the case in
7215 	 which each SLP statement has its own initial value and in which
7216 	 that value needs to be repeated for every instance of the
7217 	 statement within the initial vector.  */
7218       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7219       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7220       if (!neutral_op
7221 	  && !can_duplicate_and_interleave_p (group_size, elt_mode))
7222 	{
7223 	  if (dump_enabled_p ())
7224 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7225 			     "unsupported form of SLP reduction for"
7226 			     " variable-length vectors: cannot build"
7227 			     " initial vector.\n");
7228 	  return false;
7229 	}
7230       /* The epilogue code relies on the number of elements being a multiple
7231 	 of the group size.  The duplicate-and-interleave approach to setting
	 up the initial vector does too.  */
7233       if (!multiple_p (nunits_out, group_size))
7234 	{
7235 	  if (dump_enabled_p ())
7236 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7237 			     "unsupported form of SLP reduction for"
7238 			     " variable-length vectors: the vector size"
7239 			     " is not a multiple of the number of results.\n");
7240 	  return false;
7241 	}
7242     }
7243 
  /* In case of a widening multiplication by a constant, we update the type
7245      of the constant to be the type of the other operand.  We check that the
7246      constant fits the type in the pattern recognition pass.  */
7247   if (code == DOT_PROD_EXPR
7248       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7249     {
7250       if (TREE_CODE (ops[0]) == INTEGER_CST)
7251         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7252       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7253         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7254       else
7255         {
7256           if (dump_enabled_p ())
7257 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258 			     "invalid types in dot-prod\n");
7259 
7260           return false;
7261         }
7262     }
7263 
7264   if (reduction_type == COND_REDUCTION)
7265     {
7266       widest_int ni;
7267 
7268       if (! max_loop_iterations (loop, &ni))
7269 	{
7270 	  if (dump_enabled_p ())
7271 	    dump_printf_loc (MSG_NOTE, vect_location,
7272 			     "loop count not known, cannot create cond "
7273 			     "reduction.\n");
7274 	  return false;
7275 	}
7276       /* Convert backedges to iterations.  */
7277       ni += 1;
7278 
      /* The additional index will be the same type as the condition.  Check
	 that the loop iteration count fits into this type less one (because
	 we use up the zero slot for the case when there are no matches).  */
7282       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7283       if (wi::geu_p (ni, wi::to_widest (max_index)))
7284 	{
7285 	  if (dump_enabled_p ())
7286 	    dump_printf_loc (MSG_NOTE, vect_location,
7287 			     "loop size is greater than data size.\n");
7288 	  return false;
7289 	}
7290     }
7291 
7292   /* In case the vectorization factor (VF) is bigger than the number
7293      of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt, i.e., we need to "unroll" the
7295      vector stmt by a factor VF/nunits.  For more details see documentation
7296      in vectorizable_operation.  */
7297 
7298   /* If the reduction is used in an outer loop we need to generate
7299      VF intermediate results, like so (e.g. for ncopies=2):
7300 	r0 = phi (init, r0)
7301 	r1 = phi (init, r1)
7302 	r0 = x0 + r0;
7303         r1 = x1 + r1;
7304     (i.e. we generate VF results in 2 registers).
7305     In this case we have a separate def-use cycle for each copy, and therefore
7306     for each copy we get the vector def for the reduction variable from the
7307     respective phi node created for this copy.
7308 
7309     Otherwise (the reduction is unused in the loop nest), we can combine
7310     together intermediate results, like so (e.g. for ncopies=2):
7311 	r = phi (init, r)
7312 	r = x0 + r;
7313 	r = x1 + r;
7314    (i.e. we generate VF/2 results in a single register).
7315    In this case for each copy we get the vector def for the reduction variable
7316    from the vectorized reduction operation generated in the previous iteration.
7317 
7318    This only works when we see both the reduction PHI and its only consumer
7319    in vectorizable_reduction and there are no intermediate stmts
7320    participating.  */
7321   use_operand_p use_p;
7322   gimple *use_stmt;
7323   if (ncopies > 1
7324       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7325       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7326       && (use_stmt == stmt
7327 	  || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7328     {
7329       single_defuse_cycle = true;
7330       epilog_copies = 1;
7331     }
7332   else
7333     epilog_copies = ncopies;
7334 
7335   /* If the reduction stmt is one of the patterns that have a lane-reducing
7336      operation embedded, we cannot handle the case of !single_defuse_cycle.  */
7337   if ((ncopies > 1
7338        && ! single_defuse_cycle)
7339       && (code == DOT_PROD_EXPR
7340 	  || code == WIDEN_SUM_EXPR
7341 	  || code == SAD_EXPR))
7342     {
7343       if (dump_enabled_p ())
7344 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7345 			 "multi def-use cycle not possible for lane-reducing "
7346 			 "reduction operation\n");
7347       return false;
7348     }
7349 
7350   if (slp_node)
7351     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7352   else
7353     vec_num = 1;
7354 
7355   internal_fn cond_fn = get_conditional_internal_fn (code);
7356   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7357 
7358   if (!vec_stmt) /* transformation not required.  */
7359     {
7360       if (first_p)
7361 	vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7362       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7363 	{
7364 	  if (reduction_type != FOLD_LEFT_REDUCTION
7365 	      && (cond_fn == IFN_LAST
7366 		  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7367 						      OPTIMIZE_FOR_SPEED)))
7368 	    {
7369 	      if (dump_enabled_p ())
7370 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371 				 "can't use a fully-masked loop because no"
7372 				 " conditional operation is available.\n");
7373 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7374 	    }
7375 	  else if (reduc_index == -1)
7376 	    {
7377 	      if (dump_enabled_p ())
7378 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7379 				 "can't use a fully-masked loop for chained"
7380 				 " reductions.\n");
7381 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7382 	    }
7383 	  else
7384 	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7385 				   vectype_in);
7386 	}
7387       if (dump_enabled_p ()
7388 	  && reduction_type == FOLD_LEFT_REDUCTION)
7389 	dump_printf_loc (MSG_NOTE, vect_location,
7390 			 "using an in-order (fold-left) reduction.\n");
7391       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7392       return true;
7393     }
7394 
7395   /* Transform.  */
7396 
7397   if (dump_enabled_p ())
7398     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7399 
7400   /* FORNOW: Multiple types are not supported for condition.  */
7401   if (code == COND_EXPR)
7402     gcc_assert (ncopies == 1);
7403 
7404   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7405 
7406   if (reduction_type == FOLD_LEFT_REDUCTION)
7407     return vectorize_fold_left_reduction
7408       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7409        reduc_fn, ops, vectype_in, reduc_index, masks);
7410 
7411   if (reduction_type == EXTRACT_LAST_REDUCTION)
7412     {
7413       gcc_assert (!slp_node);
7414       return vectorizable_condition (stmt, gsi, vec_stmt,
7415 				     NULL, reduc_index, NULL);
7416     }
7417 
7418   /* Create the destination vector.  */
7419   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7420 
7421   prev_stmt_info = NULL;
7422   prev_phi_info = NULL;
7423   if (!slp_node)
7424     {
7425       vec_oprnds0.create (1);
7426       vec_oprnds1.create (1);
7427       if (op_type == ternary_op)
7428         vec_oprnds2.create (1);
7429     }
7430 
7431   phis.create (vec_num);
7432   vect_defs.create (vec_num);
7433   if (!slp_node)
7434     vect_defs.quick_push (NULL_TREE);
7435 
7436   if (slp_node)
7437     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7438   else
7439     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7440 
7441   for (j = 0; j < ncopies; j++)
7442     {
7443       if (code == COND_EXPR)
7444         {
7445           gcc_assert (!slp_node);
7446           vectorizable_condition (stmt, gsi, vec_stmt,
7447                                   PHI_RESULT (phis[0]),
7448                                   reduc_index, NULL);
7449           /* Multiple types are not supported for condition.  */
7450           break;
7451         }
7452 
7453       /* Handle uses.  */
7454       if (j == 0)
7455         {
7456 	  if (slp_node)
7457 	    {
7458 	      /* Get vec defs for all the operands except the reduction index,
7459 		 ensuring the ordering of the ops in the vector is kept.  */
7460 	      auto_vec<tree, 3> slp_ops;
7461 	      auto_vec<vec<tree>, 3> vec_defs;
7462 
7463 	      slp_ops.quick_push (ops[0]);
7464 	      slp_ops.quick_push (ops[1]);
7465 	      if (op_type == ternary_op)
7466 		slp_ops.quick_push (ops[2]);
7467 
7468 	      vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7469 
7470 	      vec_oprnds0.safe_splice (vec_defs[0]);
7471 	      vec_defs[0].release ();
7472 	      vec_oprnds1.safe_splice (vec_defs[1]);
7473 	      vec_defs[1].release ();
7474 	      if (op_type == ternary_op)
7475 		{
7476 		  vec_oprnds2.safe_splice (vec_defs[2]);
7477 		  vec_defs[2].release ();
7478 		}
7479 	    }
7480           else
7481 	    {
7482               vec_oprnds0.quick_push
7483 		(vect_get_vec_def_for_operand (ops[0], stmt));
7484               vec_oprnds1.quick_push
7485 		(vect_get_vec_def_for_operand (ops[1], stmt));
7486               if (op_type == ternary_op)
7487 		vec_oprnds2.quick_push
7488 		  (vect_get_vec_def_for_operand (ops[2], stmt));
7489 	    }
7490         }
7491       else
7492         {
7493           if (!slp_node)
7494             {
7495 	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7496 
7497 	      if (single_defuse_cycle && reduc_index == 0)
7498 		vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7499 	      else
7500 		vec_oprnds0[0]
7501 		  = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7502 	      if (single_defuse_cycle && reduc_index == 1)
7503 		vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7504 	      else
7505 		vec_oprnds1[0]
7506 		  = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7507 	      if (op_type == ternary_op)
7508 		{
7509 		  if (single_defuse_cycle && reduc_index == 2)
7510 		    vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7511 		  else
7512 		    vec_oprnds2[0]
7513 		      = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7514 		}
7515             }
7516         }
7517 
7518       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7519         {
7520 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7521 	  if (masked_loop_p)
7522 	    {
7523 	      /* Make sure that the reduction accumulator is vop[0].  */
7524 	      if (reduc_index == 1)
7525 		{
7526 		  gcc_assert (commutative_tree_code (code));
7527 		  std::swap (vop[0], vop[1]);
7528 		}
7529 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7530 					      vectype_in, i * ncopies + j);
7531 	      gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7532 							vop[0], vop[1]);
7533 	      new_temp = make_ssa_name (vec_dest, call);
7534 	      gimple_call_set_lhs (call, new_temp);
7535 	      gimple_call_set_nothrow (call, true);
7536 	      new_stmt = call;
7537 	    }
7538 	  else
7539 	    {
7540 	      if (op_type == ternary_op)
7541 		vop[2] = vec_oprnds2[i];
7542 
7543 	      new_temp = make_ssa_name (vec_dest, new_stmt);
7544 	      new_stmt = gimple_build_assign (new_temp, code,
7545 					      vop[0], vop[1], vop[2]);
7546 	    }
7547 	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
7548 
7549           if (slp_node)
7550             {
7551               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7552               vect_defs.quick_push (new_temp);
7553             }
7554           else
7555             vect_defs[0] = new_temp;
7556         }
7557 
7558       if (slp_node)
7559         continue;
7560 
7561       if (j == 0)
7562 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7563       else
7564 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7565 
7566       prev_stmt_info = vinfo_for_stmt (new_stmt);
7567     }
7568 
7569   /* Finalize the reduction-phi (set its arguments) and create the
7570      epilog reduction code.  */
7571   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7572     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7573 
7574   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7575 				    epilog_copies, reduc_fn, phis,
7576 				    double_reduc, slp_node, slp_node_instance,
7577 				    cond_reduc_val, cond_reduc_op_code,
7578 				    neutral_op);
7579 
7580   return true;
7581 }
7582 
7583 /* Function vect_min_worthwhile_factor.
7584 
7585    For a loop where we could vectorize the operation indicated by CODE,
7586    return the minimum vectorization factor that makes it worthwhile
7587    to use generic vectors.  */
7588 static unsigned int
7589 vect_min_worthwhile_factor (enum tree_code code)
7590 {
7591   switch (code)
7592     {
7593     case PLUS_EXPR:
7594     case MINUS_EXPR:
7595     case NEGATE_EXPR:
7596       return 4;
7597 
7598     case BIT_AND_EXPR:
7599     case BIT_IOR_EXPR:
7600     case BIT_XOR_EXPR:
7601     case BIT_NOT_EXPR:
7602       return 2;
7603 
7604     default:
7605       return INT_MAX;
7606     }
7607 }
7608 
7609 /* Return true if VINFO indicates we are doing loop vectorization and if
7610    it is worth decomposing CODE operations into scalar operations for
7611    that loop's vectorization factor.  */
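/* A minimal usage sketch (hypothetical caller and flag, for illustration
   only):

     if (!target_supports_vector_op
	 && !vect_worthwhile_without_simd_p (vinfo, code))
       return false;

   i.e. callers give up unless the loop's constant vectorization factor is
   at least vect_min_worthwhile_factor (CODE).  */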
7612 
7613 bool
7614 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7615 {
7616   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7617   unsigned HOST_WIDE_INT value;
7618   return (loop_vinfo
7619 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7620 	  && value >= vect_min_worthwhile_factor (code));
7621 }
7622 
7623 /* Function vectorizable_induction
7624 
7625    Check if PHI performs an induction computation that can be vectorized.
7626    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7627    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7628    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7629 
7630 bool
7631 vectorizable_induction (gimple *phi,
7632 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7633 			gimple **vec_stmt, slp_tree slp_node)
7634 {
7635   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7636   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7637   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7638   unsigned ncopies;
7639   bool nested_in_vect_loop = false;
7640   struct loop *iv_loop;
7641   tree vec_def;
7642   edge pe = loop_preheader_edge (loop);
7643   basic_block new_bb;
7644   tree new_vec, vec_init, vec_step, t;
7645   tree new_name;
7646   gimple *new_stmt;
7647   gphi *induction_phi;
7648   tree induc_def, vec_dest;
7649   tree init_expr, step_expr;
7650   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7651   unsigned i;
7652   tree expr;
7653   gimple_seq stmts;
7654   imm_use_iterator imm_iter;
7655   use_operand_p use_p;
7656   gimple *exit_phi;
7657   edge latch_e;
7658   tree loop_arg;
7659   gimple_stmt_iterator si;
7660   basic_block bb = gimple_bb (phi);
7661 
7662   if (gimple_code (phi) != GIMPLE_PHI)
7663     return false;
7664 
7665   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7666     return false;
7667 
7668   /* Make sure it was recognized as induction computation.  */
7669   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7670     return false;
7671 
7672   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7673   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7674 
7675   if (slp_node)
7676     ncopies = 1;
7677   else
7678     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7679   gcc_assert (ncopies >= 1);
7680 
7681   /* FORNOW. These restrictions should be relaxed.  */
7682   if (nested_in_vect_loop_p (loop, phi))
7683     {
7684       imm_use_iterator imm_iter;
7685       use_operand_p use_p;
7686       gimple *exit_phi;
7687       edge latch_e;
7688       tree loop_arg;
7689 
7690       if (ncopies > 1)
7691 	{
7692 	  if (dump_enabled_p ())
7693 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7694 			     "multiple types in nested loop.\n");
7695 	  return false;
7696 	}
7697 
7698       /* FORNOW: outer loop induction with SLP not supported.  */
7699       if (STMT_SLP_TYPE (stmt_info))
7700 	return false;
7701 
7702       exit_phi = NULL;
7703       latch_e = loop_latch_edge (loop->inner);
7704       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7705       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7706 	{
7707 	  gimple *use_stmt = USE_STMT (use_p);
7708 	  if (is_gimple_debug (use_stmt))
7709 	    continue;
7710 
7711 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7712 	    {
7713 	      exit_phi = use_stmt;
7714 	      break;
7715 	    }
7716 	}
7717       if (exit_phi)
7718 	{
7719 	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7720 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7721 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7722 	    {
7723 	      if (dump_enabled_p ())
7724 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7725 				 "inner-loop induction only used outside "
7726 				 "of the outer vectorized loop.\n");
7727 	      return false;
7728 	    }
7729 	}
7730 
7731       nested_in_vect_loop = true;
7732       iv_loop = loop->inner;
7733     }
7734   else
7735     iv_loop = loop;
7736   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7737 
7738   if (slp_node && !nunits.is_constant ())
7739     {
7740       /* The current SLP code creates the initial value element-by-element.  */
7741       if (dump_enabled_p ())
7742 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7743 			 "SLP induction not supported for variable-length"
7744 			 " vectors.\n");
7745       return false;
7746     }
7747 
7748   if (!vec_stmt) /* transformation not required.  */
7749     {
7750       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7751       if (dump_enabled_p ())
7752         dump_printf_loc (MSG_NOTE, vect_location,
7753                          "=== vectorizable_induction ===\n");
7754       vect_model_induction_cost (stmt_info, ncopies);
7755       return true;
7756     }
7757 
7758   /* Transform.  */
7759 
7760   /* Compute a vector variable, initialized with the first VF values of
7761      the induction variable.  E.g., for an iv with IV_PHI='X' and
7762      evolution S, for a vector of 4 units, we want to compute:
7763      [X, X + S, X + 2*S, X + 3*S].  */
7764 
7765   if (dump_enabled_p ())
7766     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7767 
7768   latch_e = loop_latch_edge (iv_loop);
7769   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7770 
7771   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7772   gcc_assert (step_expr != NULL_TREE);
7773 
7774   pe = loop_preheader_edge (iv_loop);
7775   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7776 				     loop_preheader_edge (iv_loop));
7777 
7778   stmts = NULL;
7779   if (!nested_in_vect_loop)
7780     {
7781       /* Convert the initial value to the desired type.  */
7782       tree new_type = TREE_TYPE (vectype);
7783       init_expr = gimple_convert (&stmts, new_type, init_expr);
7784 
7785       /* If we are using the loop mask to "peel" for alignment then we need
7786 	 to adjust the start value here.  */
7787       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7788       if (skip_niters != NULL_TREE)
7789 	{
7790 	  if (FLOAT_TYPE_P (vectype))
7791 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7792 					skip_niters);
7793 	  else
7794 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7795 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7796 					 skip_niters, step_expr);
7797 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7798 				    init_expr, skip_step);
7799 	}
7800     }
7801 
7802   /* Convert the step to the desired type.  */
7803   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7804 
7805   if (stmts)
7806     {
7807       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7808       gcc_assert (!new_bb);
7809     }
7810 
7811   /* Find the first insertion point in the BB.  */
7812   si = gsi_after_labels (bb);
7813 
7814   /* For SLP induction we have to generate several IVs as for example
7815      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7816      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7817      [VF*S, VF*S, VF*S, VF*S] for all.  */
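  /* For illustration only, one hypothetical way such a group arises is a
     loop in which the same induction value feeds three SLP lanes per scalar
     iteration, e.g. roughly

	  for (i = 0; i < n; i++, j += S)
	    { a[3*i] = j;  a[3*i+1] = j;  a[3*i+2] = j; }

     so the concatenated lane values are j, j, j, j+S, j+S, j+S, j+2*S, ...
     and packing them into vectors of 4 yields exactly the three initial
     vectors shown above, with a uniform per-vector step of [4*S, ...].  */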
7818   if (slp_node)
7819     {
7820       /* Enforced above.  */
7821       unsigned int const_nunits = nunits.to_constant ();
7822 
7823       /* Generate [VF*S, VF*S, ... ].  */
7824       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7825 	{
7826 	  expr = build_int_cst (integer_type_node, vf);
7827 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7828 	}
7829       else
7830 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7831       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7832 			      expr, step_expr);
7833       if (! CONSTANT_CLASS_P (new_name))
7834 	new_name = vect_init_vector (phi, new_name,
7835 				     TREE_TYPE (step_expr), NULL);
7836       new_vec = build_vector_from_val (vectype, new_name);
7837       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7838 
7839       /* Now generate the IVs.  */
7840       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7841       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7842       unsigned elts = const_nunits * nvects;
7843       unsigned nivs = least_common_multiple (group_size,
7844 					     const_nunits) / const_nunits;
7845       gcc_assert (elts % group_size == 0);
7846       tree elt = init_expr;
7847       unsigned ivn;
7848       for (ivn = 0; ivn < nivs; ++ivn)
7849 	{
7850 	  tree_vector_builder elts (vectype, const_nunits, 1);
7851 	  stmts = NULL;
7852 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7853 	    {
7854 	      if (ivn*const_nunits + eltn >= group_size
7855 		  && (ivn * const_nunits + eltn) % group_size == 0)
7856 		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7857 				    elt, step_expr);
7858 	      elts.quick_push (elt);
7859 	    }
7860 	  vec_init = gimple_build_vector (&stmts, &elts);
7861 	  if (stmts)
7862 	    {
7863 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7864 	      gcc_assert (!new_bb);
7865 	    }
7866 
7867 	  /* Create the induction-phi that defines the induction-operand.  */
7868 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7869 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7870 	  set_vinfo_for_stmt (induction_phi,
7871 			      new_stmt_vec_info (induction_phi, loop_vinfo));
7872 	  induc_def = PHI_RESULT (induction_phi);
7873 
7874 	  /* Create the iv update inside the loop.  */
7875 	  vec_def = make_ssa_name (vec_dest);
7876 	  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7877 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7878 	  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7879 
7880 	  /* Set the arguments of the phi node:  */
7881 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7882 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7883 		       UNKNOWN_LOCATION);
7884 
7885 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7886 	}
7887 
7888       /* Re-use IVs when we can.  */
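      /* A numeric sketch (illustrative values only): with group_size = 3,
	 const_nunits = 4 and nvects = 6, the loop above built
	 nivs = lcm (3, 4) / 4 = 3 IVs; here vfp = lcm (3, 4) / 3 = 4, so
	 vector 3 is vector 0 plus [4*S, ...], vector 4 is vector 1 plus
	 [4*S, ...], and so on.  */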
7889       if (ivn < nvects)
7890 	{
7891 	  unsigned vfp
7892 	    = least_common_multiple (group_size, const_nunits) / group_size;
7893 	  /* Generate [VF'*S, VF'*S, ... ].  */
7894 	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7895 	    {
7896 	      expr = build_int_cst (integer_type_node, vfp);
7897 	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7898 	    }
7899 	  else
7900 	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7901 	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7902 				  expr, step_expr);
7903 	  if (! CONSTANT_CLASS_P (new_name))
7904 	    new_name = vect_init_vector (phi, new_name,
7905 					 TREE_TYPE (step_expr), NULL);
7906 	  new_vec = build_vector_from_val (vectype, new_name);
7907 	  vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7908 	  for (; ivn < nvects; ++ivn)
7909 	    {
7910 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7911 	      tree def;
7912 	      if (gimple_code (iv) == GIMPLE_PHI)
7913 		def = gimple_phi_result (iv);
7914 	      else
7915 		def = gimple_assign_lhs (iv);
7916 	      new_stmt = gimple_build_assign (make_ssa_name (vectype),
7917 					      PLUS_EXPR,
7918 					      def, vec_step);
7919 	      if (gimple_code (iv) == GIMPLE_PHI)
7920 		gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7921 	      else
7922 		{
7923 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7924 		  gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7925 		}
7926 	      set_vinfo_for_stmt (new_stmt,
7927 				  new_stmt_vec_info (new_stmt, loop_vinfo));
7928 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7929 	    }
7930 	}
7931 
7932       return true;
7933     }
7934 
7935   /* Create the vector that holds the initial_value of the induction.  */
7936   if (nested_in_vect_loop)
7937     {
7938       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7939 	 been created during vectorization of previous stmts.  We obtain it
7940 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7941       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7942       /* If the initial value is not of proper type, convert it.  */
7943       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7944 	{
7945 	  new_stmt
7946 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7947 							  vect_simple_var,
7948 							  "vec_iv_"),
7949 				   VIEW_CONVERT_EXPR,
7950 				   build1 (VIEW_CONVERT_EXPR, vectype,
7951 					   vec_init));
7952 	  vec_init = gimple_assign_lhs (new_stmt);
7953 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7954 						 new_stmt);
7955 	  gcc_assert (!new_bb);
7956 	  set_vinfo_for_stmt (new_stmt,
7957 			      new_stmt_vec_info (new_stmt, loop_vinfo));
7958 	}
7959     }
7960   else
7961     {
7962       /* iv_loop is the loop to be vectorized. Create:
7963 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7964       stmts = NULL;
7965       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7966 
7967       unsigned HOST_WIDE_INT const_nunits;
7968       if (nunits.is_constant (&const_nunits))
7969 	{
7970 	  tree_vector_builder elts (vectype, const_nunits, 1);
7971 	  elts.quick_push (new_name);
7972 	  for (i = 1; i < const_nunits; i++)
7973 	    {
7974 	      /* Create: new_name_i = new_name + step_expr  */
7975 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7976 				       new_name, step_expr);
7977 	      elts.quick_push (new_name);
7978 	    }
7979 	  /* Create a vector from [new_name_0, new_name_1, ...,
7980 	     new_name_nunits-1]  */
7981 	  vec_init = gimple_build_vector (&stmts, &elts);
7982 	}
7983       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7984 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7985 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7986 				 new_name, step_expr);
7987       else
7988 	{
7989 	  /* Build:
7990 	        [base, base, base, ...]
7991 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7992 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7993 	  gcc_assert (flag_associative_math);
7994 	  tree index = build_index_vector (vectype, 0, 1);
7995 	  tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7996 							new_name);
7997 	  tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7998 							step_expr);
7999 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
8000 	  vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
8001 				   vec_init, step_vec);
8002 	  vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
8003 				   vec_init, base_vec);
8004 	}
8005 
8006       if (stmts)
8007 	{
8008 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8009 	  gcc_assert (!new_bb);
8010 	}
8011     }
8012 
8013 
8014   /* Create the vector that holds the step of the induction.  */
8015   if (nested_in_vect_loop)
8016     /* iv_loop is nested in the loop to be vectorized. Generate:
8017        vec_step = [S, S, S, S]  */
8018     new_name = step_expr;
8019   else
8020     {
8021       /* iv_loop is the loop to be vectorized. Generate:
8022 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8023       gimple_seq seq = NULL;
8024       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8025 	{
8026 	  expr = build_int_cst (integer_type_node, vf);
8027 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8028 	}
8029       else
8030 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
8031       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8032 			       expr, step_expr);
8033       if (seq)
8034 	{
8035 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8036 	  gcc_assert (!new_bb);
8037 	}
8038     }
8039 
8040   t = unshare_expr (new_name);
8041   gcc_assert (CONSTANT_CLASS_P (new_name)
8042 	      || TREE_CODE (new_name) == SSA_NAME);
8043   new_vec = build_vector_from_val (vectype, t);
8044   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8045 
8046 
8047   /* Create the following def-use cycle:
8048      loop prolog:
8049          vec_init = ...
8050 	 vec_step = ...
8051      loop:
8052          vec_iv = PHI <vec_init, vec_loop>
8053          ...
8054          STMT
8055          ...
8056          vec_loop = vec_iv + vec_step;  */
8057 
8058   /* Create the induction-phi that defines the induction-operand.  */
8059   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8060   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8061   set_vinfo_for_stmt (induction_phi,
8062 		      new_stmt_vec_info (induction_phi, loop_vinfo));
8063   induc_def = PHI_RESULT (induction_phi);
8064 
8065   /* Create the iv update inside the loop.  */
8066   vec_def = make_ssa_name (vec_dest);
8067   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8068   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8069   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8070 
8071   /* Set the arguments of the phi node:  */
8072   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8073   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8074 	       UNKNOWN_LOCATION);
8075 
8076   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8077 
8078   /* If the vectorization factor (VF) is bigger than the number
8079      of elements that we can fit in a vectype (nunits), we have to generate
8080      more than one vector stmt - i.e. we need to "unroll" the
8081      vector stmt by a factor of VF/nunits.  For more details see the
8082      documentation in vectorizable_operation.  */
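  /* A numeric sketch (illustrative values only): for a vectype with 4
     elements and VF = 8 we get ncopies = 2; vec_init covers
     [X, X+S, X+2*S, X+3*S], the second copy below adds [4*S, ...] to cover
     [X+4*S, ..., X+7*S], and the PHI itself steps by [8*S, ...] per vector
     iteration.  */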
8083 
8084   if (ncopies > 1)
8085     {
8086       gimple_seq seq = NULL;
8087       stmt_vec_info prev_stmt_vinfo;
8088       /* FORNOW. This restriction should be relaxed.  */
8089       gcc_assert (!nested_in_vect_loop);
8090 
8091       /* Create the vector that holds the step of the induction.  */
8092       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8093 	{
8094 	  expr = build_int_cst (integer_type_node, nunits);
8095 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8096 	}
8097       else
8098 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8099       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8100 			       expr, step_expr);
8101       if (seq)
8102 	{
8103 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8104 	  gcc_assert (!new_bb);
8105 	}
8106 
8107       t = unshare_expr (new_name);
8108       gcc_assert (CONSTANT_CLASS_P (new_name)
8109 		  || TREE_CODE (new_name) == SSA_NAME);
8110       new_vec = build_vector_from_val (vectype, t);
8111       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8112 
8113       vec_def = induc_def;
8114       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8115       for (i = 1; i < ncopies; i++)
8116 	{
8117 	  /* vec_i = vec_prev + vec_step  */
8118 	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8119 					  vec_def, vec_step);
8120 	  vec_def = make_ssa_name (vec_dest, new_stmt);
8121 	  gimple_assign_set_lhs (new_stmt, vec_def);
8122 
8123 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8124 	  set_vinfo_for_stmt (new_stmt,
8125 			      new_stmt_vec_info (new_stmt, loop_vinfo));
8126 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8127 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8128 	}
8129     }
8130 
8131   if (nested_in_vect_loop)
8132     {
8133       /* Find the loop-closed exit-phi of the induction, and record
8134          the final vector of induction results:  */
8135       exit_phi = NULL;
8136       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8137         {
8138 	  gimple *use_stmt = USE_STMT (use_p);
8139 	  if (is_gimple_debug (use_stmt))
8140 	    continue;
8141 
8142 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8143 	    {
8144 	      exit_phi = use_stmt;
8145 	      break;
8146 	    }
8147         }
8148       if (exit_phi)
8149 	{
8150 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8151 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
8152 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
8153 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8154 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
8155 
8156 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8157 	  if (dump_enabled_p ())
8158 	    {
8159 	      dump_printf_loc (MSG_NOTE, vect_location,
8160 			       "vector of inductions after inner-loop:");
8161 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8162 	    }
8163 	}
8164     }
8165 
8166 
8167   if (dump_enabled_p ())
8168     {
8169       dump_printf_loc (MSG_NOTE, vect_location,
8170 		       "transform induction: created def-use cycle: ");
8171       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8172       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8173 			SSA_NAME_DEF_STMT (vec_def), 0);
8174     }
8175 
8176   return true;
8177 }
8178 
8179 /* Function vectorizable_live_operation.
8180 
8181    STMT computes a value that is used outside the loop.  Check if
8182    it can be supported.  */
8183 
8184 bool
8185 vectorizable_live_operation (gimple *stmt,
8186 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8187 			     slp_tree slp_node, int slp_index,
8188 			     gimple **vec_stmt)
8189 {
8190   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8191   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8192   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8193   imm_use_iterator imm_iter;
8194   tree lhs, lhs_type, bitsize, vec_bitsize;
8195   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8196   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8197   int ncopies;
8198   gimple *use_stmt;
8199   auto_vec<tree> vec_oprnds;
8200   int vec_entry = 0;
8201   poly_uint64 vec_index = 0;
8202 
8203   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8204 
8205   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8206     return false;
8207 
8208   /* FORNOW.  CHECKME.  */
8209   if (nested_in_vect_loop_p (loop, stmt))
8210     return false;
8211 
8212   /* If STMT is not relevant and it is a simple assignment and its inputs are
8213      invariant then it can remain in place, unvectorized.  The original last
8214      scalar value that it computes will be used.  */
8215   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8216     {
8217       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8218       if (dump_enabled_p ())
8219 	dump_printf_loc (MSG_NOTE, vect_location,
8220 			 "statement is simple and uses invariant.  Leaving in "
8221 			 "place.\n");
8222       return true;
8223     }
8224 
8225   if (slp_node)
8226     ncopies = 1;
8227   else
8228     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8229 
8230   if (slp_node)
8231     {
8232       gcc_assert (slp_index >= 0);
8233 
8234       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8235       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8236 
8237       /* Get the last occurrence of the scalar index from the concatenation of
8238 	 all the slp vectors. Calculate which slp vector it is and the index
8239 	 within.  */
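      /* A worked example (illustrative numbers only): with num_scalar = 3,
	 num_vec = 2 and nunits = 4, the last group of scalars starts at lane
	 2*4 - 3 = 5, so for slp_index = 1 we get pos = 6, vec_entry = 1 and
	 vec_index = 2, i.e. lane 2 of the second vector.  */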
8240       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8241 
8242       /* Calculate which vector contains the result, and which lane of
8243 	 that vector we need.  */
8244       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8245 	{
8246 	  if (dump_enabled_p ())
8247 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 			     "Cannot determine which vector holds the"
8249 			     " final result.\n");
8250 	  return false;
8251 	}
8252     }
8253 
8254   if (!vec_stmt)
8255     {
8256       /* No transformation required.  */
8257       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8258 	{
8259 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8260 					       OPTIMIZE_FOR_SPEED))
8261 	    {
8262 	      if (dump_enabled_p ())
8263 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8264 				 "can't use a fully-masked loop because "
8265 				 "the target doesn't support extract last "
8266 				 "reduction.\n");
8267 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8268 	    }
8269 	  else if (slp_node)
8270 	    {
8271 	      if (dump_enabled_p ())
8272 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273 				 "can't use a fully-masked loop because an "
8274 				 "SLP statement is live after the loop.\n");
8275 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8276 	    }
8277 	  else if (ncopies > 1)
8278 	    {
8279 	      if (dump_enabled_p ())
8280 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8281 				 "can't use a fully-masked loop because"
8282 				 " ncopies is greater than 1.\n");
8283 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8284 	    }
8285 	  else
8286 	    {
8287 	      gcc_assert (ncopies == 1 && !slp_node);
8288 	      vect_record_loop_mask (loop_vinfo,
8289 				     &LOOP_VINFO_MASKS (loop_vinfo),
8290 				     1, vectype);
8291 	    }
8292 	}
8293       return true;
8294     }
8295 
8296   /* If stmt has a related stmt, then use that for getting the lhs.  */
8297   if (is_pattern_stmt_p (stmt_info))
8298     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8299 
8300   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8301 	: gimple_get_lhs (stmt);
8302   lhs_type = TREE_TYPE (lhs);
8303 
8304   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8305 	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8306 	     : TYPE_SIZE (TREE_TYPE (vectype)));
8307   vec_bitsize = TYPE_SIZE (vectype);
8308 
8309   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8310   tree vec_lhs, bitstart;
8311   if (slp_node)
8312     {
8313       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8314 
8315       /* Get the correct slp vectorized stmt.  */
8316       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8317       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8318 	vec_lhs = gimple_phi_result (phi);
8319       else
8320 	vec_lhs = gimple_get_lhs (vec_stmt);
8321 
8322       /* Get entry to use.  */
8323       bitstart = bitsize_int (vec_index);
8324       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8325     }
8326   else
8327     {
8328       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8329       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8330       gcc_checking_assert (ncopies == 1
8331 			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8332 
8333       /* For multiple copies, get the last copy.  */
8334       for (int i = 1; i < ncopies; ++i)
8335 	vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8336 						  vec_lhs);
8337 
8338       /* Get the last lane in the vector.  */
8339       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8340     }
8341 
8342   gimple_seq stmts = NULL;
8343   tree new_tree;
8344   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8345     {
8346       /* Emit:
8347 
8348 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8349 
8350 	 where VEC_LHS is the vectorized live-out result and MASK is
8351 	 the loop mask for the final iteration.  */
8352       gcc_assert (ncopies == 1 && !slp_node);
8353       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8354       tree scalar_res = make_ssa_name (scalar_type);
8355       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8356 				      1, vectype, 0);
8357       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8358 						    2, mask, vec_lhs);
8359       gimple_call_set_lhs (new_stmt, scalar_res);
8360       gimple_seq_add_stmt (&stmts, new_stmt);
8361 
8362       /* Convert the extracted vector element to the required scalar type.  */
8363       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8364     }
8365   else
8366     {
8367       tree bftype = TREE_TYPE (vectype);
8368       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8369 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8370       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8371       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8372 				       &stmts, true, NULL_TREE);
8373     }
8374 
8375   if (stmts)
8376     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8377 
8378   /* Replace uses of lhs with the newly computed result.  If the use stmt
8379      is a single-arg PHI, just replace all uses of the PHI result.  This is
8380      necessary because the lcssa PHI defining lhs may precede the new stmt.  */
8381   use_operand_p use_p;
8382   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8383     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8384 	&& !is_gimple_debug (use_stmt))
8385     {
8386       if (gimple_code (use_stmt) == GIMPLE_PHI
8387 	  && gimple_phi_num_args (use_stmt) == 1)
8388 	{
8389 	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8390 	}
8391       else
8392 	{
8393 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8394 	    SET_USE (use_p, new_tree);
8395 	}
8396       update_stmt (use_stmt);
8397     }
8398 
8399   return true;
8400 }
8401 
8402 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8403 
8404 static void
8405 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8406 {
8407   ssa_op_iter op_iter;
8408   imm_use_iterator imm_iter;
8409   def_operand_p def_p;
8410   gimple *ustmt;
8411 
8412   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8413     {
8414       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8415 	{
8416 	  basic_block bb;
8417 
8418 	  if (!is_gimple_debug (ustmt))
8419 	    continue;
8420 
8421 	  bb = gimple_bb (ustmt);
8422 
8423 	  if (!flow_bb_inside_loop_p (loop, bb))
8424 	    {
8425 	      if (gimple_debug_bind_p (ustmt))
8426 		{
8427 		  if (dump_enabled_p ())
8428 		    dump_printf_loc (MSG_NOTE, vect_location,
8429                                      "killing debug use\n");
8430 
8431 		  gimple_debug_bind_reset_value (ustmt);
8432 		  update_stmt (ustmt);
8433 		}
8434 	      else
8435 		gcc_unreachable ();
8436 	    }
8437 	}
8438     }
8439 }
8440 
8441 /* Given loop represented by LOOP_VINFO, return true if computation of
8442    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8443    otherwise.  */
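/* A concrete sketch (illustrative only): if the loop's niter expression has
   type unsigned char and the loop runs 256 times, LOOP_VINFO_NITERSM1 is 255
   but LOOP_VINFO_NITERS wraps around to 0; neither the constant check nor
   the max-iterations check below then succeeds, so we return false.  */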
8444 
8445 static bool
8446 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8447 {
8448   /* Constant case.  */
8449   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8450     {
8451       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8452       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8453 
8454       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8455       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8456       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8457 	return true;
8458     }
8459 
8460   widest_int max;
8461   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8462   /* Check the upper bound of loop niters.  */
8463   if (get_max_loop_iterations (loop, &max))
8464     {
8465       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8466       signop sgn = TYPE_SIGN (type);
8467       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8468       if (max < type_max)
8469 	return true;
8470     }
8471   return false;
8472 }
8473 
8474 /* Return a mask type with half as many elements as TYPE.  */
8475 
8476 tree
8477 vect_halve_mask_nunits (tree type)
8478 {
8479   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8480   return build_truth_vector_type (nunits, current_vector_size);
8481 }
8482 
8483 /* Return a mask type with twice as many elements as TYPE.  */
8484 
8485 tree
8486 vect_double_mask_nunits (tree type)
8487 {
8488   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8489   return build_truth_vector_type (nunits, current_vector_size);
8490 }
8491 
8492 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8493    contain a sequence of NVECTORS masks that each control a vector of type
8494    VECTYPE.  */
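/* A numeric sketch (illustrative values only): with a vectorization factor
   of 16, recording NVECTORS = 4 masks for a VECTYPE with 8 elements gives
   nscalars_per_iter = 4 * 8 / 16 = 2 for the rgroup of 4-vector masks.  */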
8495 
8496 void
8497 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8498 		       unsigned int nvectors, tree vectype)
8499 {
8500   gcc_assert (nvectors != 0);
8501   if (masks->length () < nvectors)
8502     masks->safe_grow_cleared (nvectors);
8503   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8504   /* The number of scalars per iteration and the number of vectors are
8505      both compile-time constants.  */
8506   unsigned int nscalars_per_iter
8507     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8508 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8509   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8510     {
8511       rgm->max_nscalars_per_iter = nscalars_per_iter;
8512       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8513     }
8514 }
8515 
8516 /* Given a complete set of masks MASKS, extract mask number INDEX
8517    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8518    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8519 
8520    See the comment above vec_loop_masks for more details about the mask
8521    arrangement.  */
8522 
8523 tree
8524 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8525 		    unsigned int nvectors, tree vectype, unsigned int index)
8526 {
8527   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8528   tree mask_type = rgm->mask_type;
8529 
8530   /* Populate the rgroup's mask array, if this is the first time we've
8531      used it.  */
8532   if (rgm->masks.is_empty ())
8533     {
8534       rgm->masks.safe_grow_cleared (nvectors);
8535       for (unsigned int i = 0; i < nvectors; ++i)
8536 	{
8537 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8538 	  /* Provide a dummy definition until the real one is available.  */
8539 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8540 	  rgm->masks[i] = mask;
8541 	}
8542     }
8543 
8544   tree mask = rgm->masks[index];
8545   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8546 		TYPE_VECTOR_SUBPARTS (vectype)))
8547     {
8548       /* A loop mask for data type X can be reused for data type Y
8549 	 if X has N times more elements than Y and if Y's elements
8550 	 are N times bigger than X's.  In this case each sequence
8551 	 of N elements in the loop mask will be all-zero or all-one.
8552 	 We can then view-convert the mask so that each sequence of
8553 	 N elements is replaced by a single element.  */
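      /* E.g., a sketch with illustrative element counts: a mask recorded for
	 a vector of 16 byte elements can be reused for a vector of 8
	 halfword elements (N = 2); each adjacent pair of mask elements is
	 all-zero or all-one, so the VIEW_CONVERT_EXPR below yields the
	 8-element mask we need.  */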
8554       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8555 			      TYPE_VECTOR_SUBPARTS (vectype)));
8556       gimple_seq seq = NULL;
8557       mask_type = build_same_sized_truth_vector_type (vectype);
8558       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8559       if (seq)
8560 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8561     }
8562   return mask;
8563 }
8564 
8565 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8566    according to the estimated iteration count of the vector loop.  */
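/* A numeric sketch (illustrative only): if niter_for_unrolled_loop estimates
   24 iterations of the vector loop, the exit edge gets probability 1/25 and
   the body counts are scaled so that the header count becomes roughly 25
   times the preheader count.  */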
8567 
8568 static void
8569 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8570 {
8571   edge preheader = loop_preheader_edge (loop);
8572   /* Reduce loop iterations by the vectorization factor.  */
8573   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8574   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8575 
8576   if (freq_h.nonzero_p ())
8577     {
8578       profile_probability p;
8579 
8580       /* Avoid dropping loop body profile counter to 0 because of zero count
8581 	 in loop's preheader.  */
8582       if (!(freq_e == profile_count::zero ()))
8583         freq_e = freq_e.force_nonzero ();
8584       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8585       scale_loop_frequencies (loop, p);
8586     }
8587 
8588   edge exit_e = single_exit (loop);
8589   exit_e->probability = profile_probability::always ()
8590 				 .apply_scale (1, new_est_niter + 1);
8591 
8592   edge exit_l = single_pred_edge (loop->latch);
8593   profile_probability prob = exit_l->probability;
8594   exit_l->probability = exit_e->probability.invert ();
8595   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8596     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8597 }
8598 
8599 /* Function vect_transform_loop.
8600 
8601    The analysis phase has determined that the loop is vectorizable.
8602    Vectorize the loop - create vectorized stmts to replace the scalar
8603    stmts in the loop, and update the loop exit condition.
8604    Returns the scalar epilogue loop if any.  */
8605 
8606 struct loop *
8607 vect_transform_loop (loop_vec_info loop_vinfo)
8608 {
8609   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8610   struct loop *epilogue = NULL;
8611   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8612   int nbbs = loop->num_nodes;
8613   int i;
8614   tree niters_vector = NULL_TREE;
8615   tree step_vector = NULL_TREE;
8616   tree niters_vector_mult_vf = NULL_TREE;
8617   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8618   unsigned int lowest_vf = constant_lower_bound (vf);
8619   bool grouped_store;
8620   bool slp_scheduled = false;
8621   gimple *stmt, *pattern_stmt;
8622   gimple_seq pattern_def_seq = NULL;
8623   gimple_stmt_iterator pattern_def_si = gsi_none ();
8624   bool transform_pattern_stmt = false;
8625   bool check_profitability = false;
8626   unsigned int th;
8627 
8628   if (dump_enabled_p ())
8629     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8630 
8631   /* Use the more conservative vectorization threshold.  If the number
8632      of iterations is constant, assume the cost check has been performed
8633      by our caller.  If the threshold makes all loops profitable that
8634      run at least the (estimated) vectorization factor number of times,
8635      checking is pointless, too.  */
8636   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8637   if (th >= vect_vf_for_cost (loop_vinfo)
8638       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8639     {
8640       if (dump_enabled_p ())
8641 	dump_printf_loc (MSG_NOTE, vect_location,
8642 			 "Profitability threshold is %d loop iterations.\n",
8643                          th);
8644       check_profitability = true;
8645     }
8646 
8647   /* Make sure there exists a single-predecessor exit bb.  Do this before
8648      versioning.   */
8649   edge e = single_exit (loop);
8650   if (! single_pred_p (e->dest))
8651     {
8652       split_loop_exit_edge (e);
8653       if (dump_enabled_p ())
8654 	dump_printf (MSG_NOTE, "split exit edge\n");
8655     }
8656 
8657   /* Version the loop first, if required, so the profitability check
8658      comes first.  */
8659 
8660   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8661     {
8662       poly_uint64 versioning_threshold
8663 	= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8664       if (check_profitability
8665 	  && ordered_p (poly_uint64 (th), versioning_threshold))
8666 	{
8667 	  versioning_threshold = ordered_max (poly_uint64 (th),
8668 					      versioning_threshold);
8669 	  check_profitability = false;
8670 	}
8671       vect_loop_versioning (loop_vinfo, th, check_profitability,
8672 			    versioning_threshold);
8673       check_profitability = false;
8674     }
8675 
8676   /* Make sure there exists a single-predecessor exit bb also on the
8677      scalar loop copy.  Do this after versioning but before peeling,
8678      so the CFG structure is fine for both the scalar and the if-converted
8679      loop and slpeel_duplicate_current_defs_from_edges sees matched
8680      loop-closed PHI nodes on the exit.  */
8681   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8682     {
8683       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8684       if (! single_pred_p (e->dest))
8685 	{
8686 	  split_loop_exit_edge (e);
8687 	  if (dump_enabled_p ())
8688 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8689 	}
8690     }
8691 
8692   tree niters = vect_build_loop_niters (loop_vinfo);
8693   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8694   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8695   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8696   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8697 			      &step_vector, &niters_vector_mult_vf, th,
8698 			      check_profitability, niters_no_overflow);
8699 
8700   if (niters_vector == NULL_TREE)
8701     {
8702       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8703 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8704 	  && known_eq (lowest_vf, vf))
8705 	{
8706 	  niters_vector
8707 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8708 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8709 	  step_vector = build_one_cst (TREE_TYPE (niters));
8710 	}
8711       else
8712 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8713 				     &step_vector, niters_no_overflow);
8714     }
8715 
8716   /* 1) Make sure the loop header has exactly two entries
8717      2) Make sure we have a preheader basic block.  */
8718 
8719   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8720 
8721   split_edge (loop_preheader_edge (loop));
8722 
8723   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8724       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8725     /* This will deal with any possible peeling.  */
8726     vect_prepare_for_masked_peels (loop_vinfo);
8727 
8728   /* FORNOW: the vectorizer supports only loops whose body consists
8729      of one basic block (header + empty latch).  When the vectorizer
8730      supports more involved loop forms, the order in which the BBs are
8731      traversed needs to be reconsidered.  */
8732 
8733   for (i = 0; i < nbbs; i++)
8734     {
8735       basic_block bb = bbs[i];
8736       stmt_vec_info stmt_info;
8737 
8738       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8739 	   gsi_next (&si))
8740         {
8741 	  gphi *phi = si.phi ();
8742 	  if (dump_enabled_p ())
8743 	    {
8744 	      dump_printf_loc (MSG_NOTE, vect_location,
8745                                "------>vectorizing phi: ");
8746 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8747 	    }
8748 	  stmt_info = vinfo_for_stmt (phi);
8749 	  if (!stmt_info)
8750 	    continue;
8751 
8752 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8753 	    vect_loop_kill_debug_uses (loop, phi);
8754 
8755 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8756 	      && !STMT_VINFO_LIVE_P (stmt_info))
8757 	    continue;
8758 
8759 	  if (STMT_VINFO_VECTYPE (stmt_info)
8760 	      && (maybe_ne
8761 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8762 	      && dump_enabled_p ())
8763 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8764 
8765 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8766 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8767 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8768 	      && ! PURE_SLP_STMT (stmt_info))
8769 	    {
8770 	      if (dump_enabled_p ())
8771 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8772 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8773 	    }
8774 	}
8775 
8776       pattern_stmt = NULL;
8777       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8778 	   !gsi_end_p (si) || transform_pattern_stmt;)
8779 	{
8780 	  bool is_store;
8781 
8782           if (transform_pattern_stmt)
8783 	    stmt = pattern_stmt;
8784           else
8785 	    {
8786 	      stmt = gsi_stmt (si);
8787 	      /* During vectorization remove existing clobber stmts.  */
8788 	      if (gimple_clobber_p (stmt))
8789 		{
8790 		  unlink_stmt_vdef (stmt);
8791 		  gsi_remove (&si, true);
8792 		  release_defs (stmt);
8793 		  continue;
8794 		}
8795 	    }
8796 
8797 	  if (dump_enabled_p ())
8798 	    {
8799 	      dump_printf_loc (MSG_NOTE, vect_location,
8800 			       "------>vectorizing statement: ");
8801 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8802 	    }
8803 
8804 	  stmt_info = vinfo_for_stmt (stmt);
8805 
8806 	  /* vector stmts created in the outer-loop during vectorization of
8807 	     stmts in an inner-loop may not have a stmt_info, and do not
8808 	     need to be vectorized.  */
8809 	  if (!stmt_info)
8810 	    {
8811 	      gsi_next (&si);
8812 	      continue;
8813 	    }
8814 
8815 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8816 	    vect_loop_kill_debug_uses (loop, stmt);
8817 
8818 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8819 	      && !STMT_VINFO_LIVE_P (stmt_info))
8820             {
8821               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8822                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8823                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8824                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8825                 {
8826                   stmt = pattern_stmt;
8827                   stmt_info = vinfo_for_stmt (stmt);
8828                 }
8829               else
8830 	        {
8831    	          gsi_next (&si);
8832 	          continue;
8833                 }
8834 	    }
8835           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8836                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8837                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8838                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8839             transform_pattern_stmt = true;
8840 
8841 	  /* If pattern statement has def stmts, vectorize them too.  */
8842 	  if (is_pattern_stmt_p (stmt_info))
8843 	    {
8844 	      if (pattern_def_seq == NULL)
8845 		{
8846 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8847 		  pattern_def_si = gsi_start (pattern_def_seq);
8848 		}
8849 	      else if (!gsi_end_p (pattern_def_si))
8850 		gsi_next (&pattern_def_si);
8851 	      if (pattern_def_seq != NULL)
8852 		{
8853 		  gimple *pattern_def_stmt = NULL;
8854 		  stmt_vec_info pattern_def_stmt_info = NULL;
8855 
8856 		  while (!gsi_end_p (pattern_def_si))
8857 		    {
8858 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
8859 		      pattern_def_stmt_info
8860 			= vinfo_for_stmt (pattern_def_stmt);
8861 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8862 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8863 			break;
8864 		      gsi_next (&pattern_def_si);
8865 		    }
8866 
8867 		  if (!gsi_end_p (pattern_def_si))
8868 		    {
8869 		      if (dump_enabled_p ())
8870 			{
8871 			  dump_printf_loc (MSG_NOTE, vect_location,
8872 					   "==> vectorizing pattern def "
8873 					   "stmt: ");
8874 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8875 					    pattern_def_stmt, 0);
8876 			}
8877 
8878 		      stmt = pattern_def_stmt;
8879 		      stmt_info = pattern_def_stmt_info;
8880 		    }
8881 		  else
8882 		    {
8883 		      pattern_def_si = gsi_none ();
8884 		      transform_pattern_stmt = false;
8885 		    }
8886 		}
8887 	      else
8888 		transform_pattern_stmt = false;
8889             }
8890 
8891 	  if (STMT_VINFO_VECTYPE (stmt_info))
8892 	    {
8893 	      poly_uint64 nunits
8894 		= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8895 	      if (!STMT_SLP_TYPE (stmt_info)
8896 		  && maybe_ne (nunits, vf)
8897 		  && dump_enabled_p ())
8898 		  /* For SLP, VF is set according to the unrolling factor, not
8899 		     to the vector size, so this message is not valid for SLP.  */
8900 		dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8901 	    }
8902 
8903 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
8904 	     reached.  */
8905 	  if (STMT_SLP_TYPE (stmt_info))
8906 	    {
8907 	      if (!slp_scheduled)
8908 		{
8909 		  slp_scheduled = true;
8910 
8911 		  if (dump_enabled_p ())
8912 		    dump_printf_loc (MSG_NOTE, vect_location,
8913 				     "=== scheduling SLP instances ===\n");
8914 
8915 		  vect_schedule_slp (loop_vinfo);
8916 		}
8917 
8918 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8919 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8920 		{
8921 		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8922 		    {
8923 		      pattern_def_seq = NULL;
8924 		      gsi_next (&si);
8925 		    }
8926 		  continue;
8927 		}
8928 	    }
8929 
8930 	  /* -------- vectorize statement ------------ */
8931 	  if (dump_enabled_p ())
8932 	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8933 
8934 	  grouped_store = false;
8935 	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8936           if (is_store)
8937             {
8938 	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8939 		{
8940 		  /* Interleaving.  If IS_STORE is TRUE, the vectorization of the
8941 		     interleaving chain was completed; free all the stores in
8942 		     the chain.  */
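		  /* E.g. (illustrative) for a group such as
		       a[2*i]   = x_1;
		       a[2*i+1] = y_2;
		     the whole group was vectorized when the last scalar store
		     in it was reached, so every store in the chain, starting
		     at GROUP_FIRST_ELEMENT, can be removed now.  */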
8943 		  gsi_next (&si);
8944 		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8945 		}
8946 	      else
8947 		{
8948 		  /* Free the attached stmt_vec_info and remove the stmt.  */
8949 		  gimple *store = gsi_stmt (si);
8950 		  free_stmt_vec_info (store);
8951 		  unlink_stmt_vdef (store);
8952 		  gsi_remove (&si, true);
8953 		  release_defs (store);
8954 		}
8955 
8956 	      /* Stores can only appear at the end of pattern statements.  */
8957 	      gcc_assert (!transform_pattern_stmt);
8958 	      pattern_def_seq = NULL;
8959 	    }
8960 	  else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8961 	    {
8962 	      pattern_def_seq = NULL;
8963 	      gsi_next (&si);
8964 	    }
8965 	}		        /* stmts in BB */
8966 
8967       /* Stub out scalar statements that must not survive vectorization.
8968 	 Doing this here helps with grouped statements, or statements that
8969 	 are involved in patterns.  */
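      /* For instance (illustrative GIMPLE), a scalar statement left over
	 from if-conversion such as
	   _1 = MASK_LOAD (ptr_2, 0B, mask_3);
	 has been superseded by the vector code; it is replaced below by
	   _1 = 0;
	 (a zero of the appropriate type) so that later clean-up passes can
	 remove it trivially.  */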
8970       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8971 	   !gsi_end_p (gsi); gsi_next (&gsi))
8972 	{
8973 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8974 	  if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8975 	    {
8976 	      tree lhs = gimple_get_lhs (call);
8977 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8978 		{
8979 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
8980 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
8981 		  gsi_replace (&gsi, new_stmt, true);
8982 		}
8983 	    }
8984 	}
8985     }				/* BBs in loop */
8986 
8987   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8988      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8989   if (integer_onep (step_vector))
8990     niters_no_overflow = true;
8991   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8992 			   niters_vector_mult_vf, !niters_no_overflow);
8993 
8994   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8995   scale_profile_for_vect_loop (loop, assumed_vf);
8996 
8997   /* True if the final iteration might not handle a full vector's
8998      worth of scalar iterations.  */
8999   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
9000   /* The minimum number of iterations performed by the epilogue.  This
9001      is 1 when peeling for gaps because we always need a final scalar
9002      iteration.  */
9003   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9004   /* +1 to convert latch counts to loop iteration counts,
9005      -min_epilogue_iters to remove iterations that cannot be performed
9006        by the vector code.  */
9007   int bias_for_lowest = 1 - min_epilogue_iters;
9008   int bias_for_assumed = bias_for_lowest;
9009   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9010   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9011     {
9012       /* When the amount of peeling is known at compile time, the first
9013 	 iteration will have exactly alignment_npeels active elements.
9014 	 In the worst case it will have at least one.  */
9015       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9016       bias_for_lowest += lowest_vf - min_first_active;
9017       bias_for_assumed += assumed_vf - min_first_active;
9018     }
9019   /* In these calculations the "- 1" converts loop iteration counts
9020      back to latch counts.  */
9021   if (loop->any_upper_bound)
9022     loop->nb_iterations_upper_bound
9023       = (final_iter_may_be_partial
9024 	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9025 			  lowest_vf) - 1
9026 	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9027 			   lowest_vf) - 1);
9028   if (loop->any_likely_upper_bound)
9029     loop->nb_iterations_likely_upper_bound
9030       = (final_iter_may_be_partial
9031 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9032 			  + bias_for_lowest, lowest_vf) - 1
9033 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9034 			   + bias_for_lowest, lowest_vf) - 1);
9035   if (loop->any_estimate)
9036     loop->nb_iterations_estimate
9037       = (final_iter_may_be_partial
9038 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9039 			  assumed_vf) - 1
9040 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9041 			   assumed_vf) - 1);
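  /* An illustration of the adjustment above, with made-up numbers: assume
     lowest_vf == 4, no peeling for gaps and no fully-masked loop, so that
     min_epilogue_iters == 0, bias_for_lowest == 1 and the division rounds
     down.  An upper bound of 11 latch iterations (at most 12 scalar
     iterations) then becomes udiv_floor (11 + 1, 4) - 1 == 2, i.e. the
     vector loop is known to iterate at most 3 times.  */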
9042 
9043   if (dump_enabled_p ())
9044     {
9045       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9046 	{
9047 	  dump_printf_loc (MSG_NOTE, vect_location,
9048 			   "LOOP VECTORIZED\n");
9049 	  if (loop->inner)
9050 	    dump_printf_loc (MSG_NOTE, vect_location,
9051 			     "OUTER LOOP VECTORIZED\n");
9052 	  dump_printf (MSG_NOTE, "\n");
9053 	}
9054       else
9055 	{
9056 	  dump_printf_loc (MSG_NOTE, vect_location,
9057 			   "LOOP EPILOGUE VECTORIZED (VS=");
9058 	  dump_dec (MSG_NOTE, current_vector_size);
9059 	  dump_printf (MSG_NOTE, ")\n");
9060 	}
9061     }
9062 
9063   /* Free SLP instances here because otherwise stmt reference counting
9064      won't work.  */
9065   slp_instance instance;
9066   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9067     vect_free_slp_instance (instance);
9068   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9069   /* Clear the safelen field since its value is invalid after vectorization:
9070      the vectorized loop can now have loop-carried dependencies.  */
9071   loop->safelen = 0;
9072 
9073   /* Don't vectorize the epilogue of an epilogue loop.  */
9074   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9075     epilogue = NULL;
9076 
9077   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9078     epilogue = NULL;
9079 
9080   if (epilogue)
9081     {
9082       auto_vector_sizes vector_sizes;
9083       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9084       unsigned int next_size = 0;
9085 
9086       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9087 	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9088 	  && known_eq (vf, lowest_vf))
9089 	{
9090 	  unsigned int eiters
9091 	    = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9092 	       - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9093 	  eiters = eiters % lowest_vf;
9094 	  epilogue->nb_iterations_upper_bound = eiters - 1;
9095 
9096 	  unsigned int ratio;
9097 	  while (next_size < vector_sizes.length ()
9098 		 && !(constant_multiple_p (current_vector_size,
9099 					   vector_sizes[next_size], &ratio)
9100 		      && eiters >= lowest_vf / ratio))
9101 	    next_size += 1;
9102 	}
9103       else
9104 	while (next_size < vector_sizes.length ()
9105 	       && maybe_lt (current_vector_size, vector_sizes[next_size]))
9106 	  next_size += 1;
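      /* A worked example with made-up numbers: suppose the target advertises
	 vector sizes of 64, 32 and 16 bytes, the main loop used
	 current_vector_size == 32 with lowest_vf == 8, and NITERS == 13 with
	 no alignment peeling, so eiters == 13 % 8 == 5.  The 64-byte entry is
	 skipped (32 bytes is not a multiple of it), the 32-byte entry would
	 need eiters >= 8, but the 16-byte entry (ratio 2) only needs
	 eiters >= 4, so the search stops there and the epilogue is kept for a
	 later vectorization attempt.  */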
9107 
9108       if (next_size == vector_sizes.length ())
9109 	epilogue = NULL;
9110     }
9111 
9112   if (epilogue)
9113     {
9114       epilogue->force_vectorize = loop->force_vectorize;
9115       epilogue->safelen = loop->safelen;
9116       epilogue->dont_vectorize = false;
9117 
9118       /* We may need to if-convert the epilogue in order to vectorize it.  */
9119       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9120 	tree_if_conversion (epilogue);
9121     }
9122 
9123   return epilogue;
9124 }
9125 
9126 /* The code below performs a simple optimization: it reverts if-conversion
9127    for masked stores, i.e. if the mask of a store is all-zero, neither the
9128    store nor, where possible, the producers of the stored value are executed.
9129    For example,
9130      for (i=0; i<n; i++)
9131        if (c[i])
9132 	{
9133 	  p1[i] += 1;
9134 	  p2[i] = p3[i] +2;
9135 	}
9136    this transformation will produce the following semi-hammock:
9137 
9138    if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9139      {
9140        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9141        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9142        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9143        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9144        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9145        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9146      }
9147 */
9148 
9149 void
9150 optimize_mask_stores (struct loop *loop)
9151 {
9152   basic_block *bbs = get_loop_body (loop);
9153   unsigned nbbs = loop->num_nodes;
9154   unsigned i;
9155   basic_block bb;
9156   struct loop *bb_loop;
9157   gimple_stmt_iterator gsi;
9158   gimple *stmt;
9159   auto_vec<gimple *> worklist;
9160 
9161   vect_location = find_loop_location (loop);
9162   /* Pick up all masked stores in the loop, if any.  */
9163   for (i = 0; i < nbbs; i++)
9164     {
9165       bb = bbs[i];
9166       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9167 	   gsi_next (&gsi))
9168 	{
9169 	  stmt = gsi_stmt (gsi);
9170 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9171 	    worklist.safe_push (stmt);
9172 	}
9173     }
9174 
9175   free (bbs);
9176   if (worklist.is_empty ())
9177     return;
9178 
9179   /* Loop has masked stores.  */
9180   while (!worklist.is_empty ())
9181     {
9182       gimple *last, *last_store;
9183       edge e, efalse;
9184       tree mask;
9185       basic_block store_bb, join_bb;
9186       gimple_stmt_iterator gsi_to;
9187       tree vdef, new_vdef;
9188       gphi *phi;
9189       tree vectype;
9190       tree zero;
9191 
9192       last = worklist.pop ();
9193       mask = gimple_call_arg (last, 2);
9194       bb = gimple_bb (last);
9195       /* Create then_bb and the if-then structure in the CFG; then_bb
9196 	 belongs to the same loop as if_bb.  That loop can be different from
9197 	 LOOP when a two-level loop nest is vectorized and the mask store
9198 	 belongs to the inner one.  */
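      /* The result is roughly this shape (sketch only):

	   bb:        ...
		      if (mask == { 0, ..., 0 })   <- TRUE edge skips the stores
		       |          \
		       |       store_bb:  the masked stores and, if possible,
		       |                  the producers of the stored values
		       |          /
	   join_bb:   .MEM PHI merging both paths  */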
9199       e = split_block (bb, last);
9200       bb_loop = bb->loop_father;
9201       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9202       join_bb = e->dest;
9203       store_bb = create_empty_bb (bb);
9204       add_bb_to_loop (store_bb, bb_loop);
9205       e->flags = EDGE_TRUE_VALUE;
9206       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9207       /* Put STORE_BB on the likely path.  */
9208       efalse->probability = profile_probability::unlikely ();
9209       store_bb->count = efalse->count ();
9210       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9211       if (dom_info_available_p (CDI_DOMINATORS))
9212 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9213       if (dump_enabled_p ())
9214 	dump_printf_loc (MSG_NOTE, vect_location,
9215 			 "Create new block %d to sink mask stores.",
9216 			 store_bb->index);
9217       /* Create vector comparison with boolean result.  */
9218       vectype = TREE_TYPE (mask);
9219       zero = build_zero_cst (vectype);
9220       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9221       gsi = gsi_last_bb (bb);
9222       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9223       /* Create a new PHI node for the vdef of the last masked store:
9224 	 .MEM_2 = VDEF <.MEM_1>
9225 	 will be converted to
9226 	 .MEM_3 = VDEF <.MEM_1>
9227 	 and a new PHI node will be created in the join bb:
9228 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
9229       */
9230       vdef = gimple_vdef (last);
9231       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9232       gimple_set_vdef (last, new_vdef);
9233       phi = create_phi_node (vdef, join_bb);
9234       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9235 
9236       /* Put all masked stores with the same mask into STORE_BB if possible.  */
9237       while (true)
9238 	{
9239 	  gimple_stmt_iterator gsi_from;
9240 	  gimple *stmt1 = NULL;
9241 
9242 	  /* Move masked store to STORE_BB.  */
9243 	  last_store = last;
9244 	  gsi = gsi_for_stmt (last);
9245 	  gsi_from = gsi;
9246 	  /* Shift GSI to the previous stmt for further traversal.  */
9247 	  gsi_prev (&gsi);
9248 	  gsi_to = gsi_start_bb (store_bb);
9249 	  gsi_move_before (&gsi_from, &gsi_to);
9250 	  /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9251 	  gsi_to = gsi_start_bb (store_bb);
9252 	  if (dump_enabled_p ())
9253 	    {
9254 	      dump_printf_loc (MSG_NOTE, vect_location,
9255 			       "Move stmt to created bb\n");
9256 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9257 	    }
9258 	  /* Move all stored value producers if possible.  */
9259 	  while (!gsi_end_p (gsi))
9260 	    {
9261 	      tree lhs;
9262 	      imm_use_iterator imm_iter;
9263 	      use_operand_p use_p;
9264 	      bool res;
9265 
9266 	      /* Skip debug statements.  */
9267 	      if (is_gimple_debug (gsi_stmt (gsi)))
9268 		{
9269 		  gsi_prev (&gsi);
9270 		  continue;
9271 		}
9272 	      stmt1 = gsi_stmt (gsi);
9273 	      /* Do not consider statements that write to memory or have
9274 		 a volatile operand.  */
9275 	      if (gimple_vdef (stmt1)
9276 		  || gimple_has_volatile_ops (stmt1))
9277 		break;
9278 	      gsi_from = gsi;
9279 	      gsi_prev (&gsi);
9280 	      lhs = gimple_get_lhs (stmt1);
9281 	      if (!lhs)
9282 		break;
9283 
9284 	      /* LHS of vectorized stmt must be SSA_NAME.  */
9285 	      if (TREE_CODE (lhs) != SSA_NAME)
9286 		break;
9287 
9288 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9289 		{
9290 		  /* Remove dead scalar statement.  */
9291 		  if (has_zero_uses (lhs))
9292 		    {
9293 		      gsi_remove (&gsi_from, true);
9294 		      continue;
9295 		    }
9296 		}
9297 
9298 	      /* Check that LHS does not have uses outside of STORE_BB.  */
9299 	      res = true;
9300 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9301 		{
9302 		  gimple *use_stmt;
9303 		  use_stmt = USE_STMT (use_p);
9304 		  if (is_gimple_debug (use_stmt))
9305 		    continue;
9306 		  if (gimple_bb (use_stmt) != store_bb)
9307 		    {
9308 		      res = false;
9309 		      break;
9310 		    }
9311 		}
9312 	      if (!res)
9313 		break;
9314 
9315 	      if (gimple_vuse (stmt1)
9316 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
9317 		break;
9318 
9319 	      /* Can move STMT1 to STORE_BB.  */
9320 	      if (dump_enabled_p ())
9321 		{
9322 		  dump_printf_loc (MSG_NOTE, vect_location,
9323 				   "Move stmt to created bb\n");
9324 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9325 		}
9326 	      gsi_move_before (&gsi_from, &gsi_to);
9327 	      /* Shift GSI_TO for further insertion.  */
9328 	      gsi_prev (&gsi_to);
9329 	    }
9330 	  /* Put other masked stores with the same mask into STORE_BB.  */
9331 	  if (worklist.is_empty ()
9332 	      || gimple_call_arg (worklist.last (), 2) != mask
9333 	      || worklist.last () != stmt1)
9334 	    break;
9335 	  last = worklist.pop ();
9336 	}
9337       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9338     }
9339 }
9340