1 /* Loop Vectorization
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 
58 #define vec_step vec_step_
59 
60 /* Loop Vectorization Pass.
61 
62    This pass tries to vectorize loops.
63 
64    For example, the vectorizer transforms the following simple loop:
65 
66         short a[N]; short b[N]; short c[N]; int i;
67 
68         for (i=0; i<N; i++){
69           a[i] = b[i] + c[i];
70         }
71 
72    as if it had been manually vectorized by rewriting the source code into:
73 
74         typedef int __attribute__((mode(V8HI))) v8hi;
75         short a[N];  short b[N]; short c[N];   int i;
76         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77         v8hi va, vb, vc;
78 
79         for (i=0; i<N/8; i++){
80           vb = pb[i];
81           vc = pc[i];
82           va = vb + vc;
83           pa[i] = va;
84         }
85 
86         The main entry to this pass is vectorize_loops(), in which
87    the vectorizer applies a set of analyses on a given set of loops,
88    followed by the actual vectorization transformation for the loops that
89    had successfully passed the analysis phase.
90         Throughout this pass we make a distinction between two types of
91    data: scalars (which are represented by SSA_NAMES), and memory references
92    ("data-refs").  These two types of data require different handling both
93    during analysis and transformation. The types of data-refs that the
94    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96    accesses are required to have a simple (consecutive) access pattern.
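
   For example, an access such as "a[i]" or "*(p + i)" has the simple
   consecutive pattern described above, whereas an indexed access such as
   "a[b[i]]" does not.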
97 
98    Analysis phase:
99    ===============
100         The driver for the analysis phase is vect_analyze_loop().
101    It applies a set of analyses, some of which rely on the scalar evolution
102    analyzer (scev) developed by Sebastian Pop.
103 
104         During the analysis phase the vectorizer records some information
105    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106    loop, as well as general information about the loop as a whole, which is
107    recorded in a "loop_vec_info" struct attached to each loop.
108 
109    Transformation phase:
110    =====================
111         The loop transformation phase scans all the stmts in the loop, and
112    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113    the loop that needs to be vectorized.  It inserts the vector code sequence
114    just before the scalar stmt S, and records a pointer to the vector code
115    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116    attached to S).  This pointer will be used for the vectorization of following
117    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118    otherwise, we rely on dead code elimination for removing it.
119 
120         For example, say stmt S1 was vectorized into stmt VS1:
121 
122    VS1: vb = px[i];
123    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124    S2:  a = b;
125 
126    To vectorize stmt S2, the vectorizer first finds the stmt that defines
127    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
129    resulting sequence would be:
130 
131    VS1: vb = px[i];
132    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133    VS2: va = vb;
134    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 
136         Operands that are not SSA_NAMEs are data-refs that appear in
137    load/store operations (like 'x[i]' in S1), and are handled differently.
138 
139    Target modeling:
140    =================
141         Currently the only target specific information that is used is the
142    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
143    Targets that support different vector sizes will, for now, need
144    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
145    flexibility will be added in the future.
146 
147         Since we only vectorize operations whose vector form can be
148    expressed using existing tree codes, to verify that an operation is
149    supported, the vectorizer checks the relevant optab at the relevant
150    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
151    the value found is CODE_FOR_nothing, then there's no target support, and
152    we can't vectorize the stmt.
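
   A minimal sketch of such a check (the optab and mode are only
   illustrative here):

        optab op = add_optab;                // vector addition
        machine_mode vmode = V8HImode;       // 8 x 16-bit elements
        if (optab_handler (op, vmode) == CODE_FOR_nothing)
          return false;                      // no target support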
153 
154    For additional information on this project see:
155    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
157 
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159 
160 /* Function vect_determine_vectorization_factor
161 
162    Determine the vectorization factor (VF).  VF is the number of data elements
163    that are operated upon in parallel in a single iteration of the vectorized
164    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
165    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
166    elements can fit in a single vector register.
167 
168    We currently support vectorization of loops in which all types operated upon
169    are of the same size.  Therefore this function currently sets VF according to
170    the size of the types operated upon, and fails if there are multiple sizes
171    in the loop.
172 
173    VF is also the factor by which the loop iterations are strip-mined, e.g.:
174    original loop:
175         for (i=0; i<N; i++){
176           a[i] = b[i] + c[i];
177         }
178 
179    vectorized loop:
180         for (i=0; i<N; i+=VF){
181           a[i:VF] = b[i:VF] + c[i:VF];
182         }
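
   For example, with the 16-byte vector size mentioned above, a loop over
   "short" (2-byte) elements gets vectype V8HI, so VF is 8 and each vector
   iteration computes a[i:8] = b[i:8] + c[i:8].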
183 */
184 
185 static bool
186 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
187 {
188   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
190   unsigned nbbs = loop->num_nodes;
191   poly_uint64 vectorization_factor = 1;
192   tree scalar_type = NULL_TREE;
193   gphi *phi;
194   tree vectype;
195   stmt_vec_info stmt_info;
196   unsigned i;
197   HOST_WIDE_INT dummy;
198   gimple *stmt, *pattern_stmt = NULL;
199   gimple_seq pattern_def_seq = NULL;
200   gimple_stmt_iterator pattern_def_si = gsi_none ();
201   bool analyze_pattern_stmt = false;
202   bool bool_result;
203   auto_vec<stmt_vec_info> mask_producers;
204 
205   if (dump_enabled_p ())
206     dump_printf_loc (MSG_NOTE, vect_location,
207                      "=== vect_determine_vectorization_factor ===\n");
208 
209   for (i = 0; i < nbbs; i++)
210     {
211       basic_block bb = bbs[i];
212 
213       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
214 	   gsi_next (&si))
215 	{
216 	  phi = si.phi ();
217 	  stmt_info = vinfo_for_stmt (phi);
218 	  if (dump_enabled_p ())
219 	    {
220 	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
221 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 	    }
223 
224 	  gcc_assert (stmt_info);
225 
226 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
227 	      || STMT_VINFO_LIVE_P (stmt_info))
228             {
229 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
230               scalar_type = TREE_TYPE (PHI_RESULT (phi));
231 
232 	      if (dump_enabled_p ())
233 		{
234 		  dump_printf_loc (MSG_NOTE, vect_location,
235                                    "get vectype for scalar type:  ");
236 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
237                   dump_printf (MSG_NOTE, "\n");
238 		}
239 
240 	      vectype = get_vectype_for_scalar_type (scalar_type);
241 	      if (!vectype)
242 		{
243 		  if (dump_enabled_p ())
244 		    {
245 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
246                                        "not vectorized: unsupported "
247                                        "data-type ");
248 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
249                                          scalar_type);
250                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
251 		    }
252 		  return false;
253 		}
254 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
255 
256 	      if (dump_enabled_p ())
257 		{
258 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
259 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
260                   dump_printf (MSG_NOTE, "\n");
261 		}
262 
263 	      if (dump_enabled_p ())
264 		{
265 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
266 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
267 		  dump_printf (MSG_NOTE, "\n");
268 		}
269 
270 	      vect_update_max_nunits (&vectorization_factor, vectype);
271 	    }
272 	}
273 
274       for (gimple_stmt_iterator si = gsi_start_bb (bb);
275 	   !gsi_end_p (si) || analyze_pattern_stmt;)
276         {
277           tree vf_vectype;
278 
279           if (analyze_pattern_stmt)
280 	    stmt = pattern_stmt;
281           else
282             stmt = gsi_stmt (si);
283 
284           stmt_info = vinfo_for_stmt (stmt);
285 
286 	  if (dump_enabled_p ())
287 	    {
288 	      dump_printf_loc (MSG_NOTE, vect_location,
289                                "==> examining statement: ");
290 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 	    }
292 
293 	  gcc_assert (stmt_info);
294 
295 	  /* Skip stmts which do not need to be vectorized.  */
296 	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
297 	       && !STMT_VINFO_LIVE_P (stmt_info))
298 	      || gimple_clobber_p (stmt))
299             {
300               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
301                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
302                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
303                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
304                 {
305                   stmt = pattern_stmt;
306                   stmt_info = vinfo_for_stmt (pattern_stmt);
307                   if (dump_enabled_p ())
308                     {
309                       dump_printf_loc (MSG_NOTE, vect_location,
310                                        "==> examining pattern statement: ");
311                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312                     }
313                 }
314               else
315 	        {
316 	          if (dump_enabled_p ())
317 	            dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
318                   gsi_next (&si);
319 	          continue;
320                 }
321 	    }
322           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
323                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
324                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
325                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
326             analyze_pattern_stmt = true;
327 
328 	  /* If a pattern statement has def stmts, analyze them too.  */
329 	  if (is_pattern_stmt_p (stmt_info))
330 	    {
331 	      if (pattern_def_seq == NULL)
332 		{
333 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
334 		  pattern_def_si = gsi_start (pattern_def_seq);
335 		}
336 	      else if (!gsi_end_p (pattern_def_si))
337 		gsi_next (&pattern_def_si);
338 	      if (pattern_def_seq != NULL)
339 		{
340 		  gimple *pattern_def_stmt = NULL;
341 		  stmt_vec_info pattern_def_stmt_info = NULL;
342 
343 		  while (!gsi_end_p (pattern_def_si))
344 		    {
345 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
346 		      pattern_def_stmt_info
347 			= vinfo_for_stmt (pattern_def_stmt);
348 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
349 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
350 			break;
351 		      gsi_next (&pattern_def_si);
352 		    }
353 
354 		  if (!gsi_end_p (pattern_def_si))
355 		    {
356 		      if (dump_enabled_p ())
357 			{
358 			  dump_printf_loc (MSG_NOTE, vect_location,
359                                            "==> examining pattern def stmt: ");
360 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
361                                             pattern_def_stmt, 0);
362 			}
363 
364 		      stmt = pattern_def_stmt;
365 		      stmt_info = pattern_def_stmt_info;
366 		    }
367 		  else
368 		    {
369 		      pattern_def_si = gsi_none ();
370 		      analyze_pattern_stmt = false;
371 		    }
372 		}
373 	      else
374 		analyze_pattern_stmt = false;
375 	    }
376 
377 	  if (gimple_get_lhs (stmt) == NULL_TREE
378 	      /* MASK_STORE has no lhs, but is ok.  */
379 	      && (!is_gimple_call (stmt)
380 		  || !gimple_call_internal_p (stmt)
381 		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
382 	    {
383 	      if (is_gimple_call (stmt))
384 		{
385 		  /* Ignore calls with no lhs.  These must be calls to
386 		     #pragma omp simd functions, and what vectorization factor
387 		     it really needs can't be determined until
388 		     vectorizable_simd_clone_call.  */
389 		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
390 		    {
391 		      pattern_def_seq = NULL;
392 		      gsi_next (&si);
393 		    }
394 		  continue;
395 		}
396 	      if (dump_enabled_p ())
397 		{
398 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399                                    "not vectorized: irregular stmt.");
400 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
401                                     0);
402 		}
403 	      return false;
404 	    }
405 
406 	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
407 	    {
408 	      if (dump_enabled_p ())
409 	        {
410 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
411                                    "not vectorized: vector stmt in loop:");
412 	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
413 	        }
414 	      return false;
415 	    }
416 
417 	  bool_result = false;
418 
419 	  if (STMT_VINFO_VECTYPE (stmt_info))
420 	    {
421 	      /* The only case in which a vectype has already been set is for stmts
422 	         that contain a dataref, or for "pattern-stmts" (stmts
423 		 generated by the vectorizer to represent/replace a certain
424 		 idiom).  */
425 	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
426 			  || is_pattern_stmt_p (stmt_info)
427 			  || !gsi_end_p (pattern_def_si));
428 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
429 	    }
430 	  else
431 	    {
432 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
433 	      if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
434 		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
435 	      else
436 		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
437 
438 	      /* Bool ops don't participate in vectorization factor
439 		 computation.  For comparisons, use the compared types to
440 		 compute a factor.  */
441 	      if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
442 		  && is_gimple_assign (stmt)
443 		  && gimple_assign_rhs_code (stmt) != COND_EXPR)
444 		{
445 		  if (STMT_VINFO_RELEVANT_P (stmt_info)
446 		      || STMT_VINFO_LIVE_P (stmt_info))
447 		    mask_producers.safe_push (stmt_info);
448 		  bool_result = true;
449 
450 		  if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
451 		      == tcc_comparison
452 		      && !VECT_SCALAR_BOOLEAN_TYPE_P
453 			    (TREE_TYPE (gimple_assign_rhs1 (stmt))))
454 		    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
455 		  else
456 		    {
457 		      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
458 			{
459 			  pattern_def_seq = NULL;
460 			  gsi_next (&si);
461 			}
462 		      continue;
463 		    }
464 		}
465 
466 	      if (dump_enabled_p ())
467 		{
468 		  dump_printf_loc (MSG_NOTE, vect_location,
469                                    "get vectype for scalar type:  ");
470 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
471                   dump_printf (MSG_NOTE, "\n");
472 		}
473 	      vectype = get_vectype_for_scalar_type (scalar_type);
474 	      if (!vectype)
475 		{
476 		  if (dump_enabled_p ())
477 		    {
478 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479                                        "not vectorized: unsupported "
480                                        "data-type ");
481 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
482                                          scalar_type);
483                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
484 		    }
485 		  return false;
486 		}
487 
488 	      if (!bool_result)
489 		STMT_VINFO_VECTYPE (stmt_info) = vectype;
490 
491 	      if (dump_enabled_p ())
492 		{
493 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
494 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
495                   dump_printf (MSG_NOTE, "\n");
496 		}
497             }
498 
499 	  /* Don't try to compute VF from scalar types if the stmt
500 	     produces a boolean vector.  Use the result vectype instead.  */
501 	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
502 	    vf_vectype = vectype;
503 	  else
504 	    {
505 	      /* The vectorization factor is determined by the smallest
506 		 scalar type (or the largest vector size, but we only
507 		 support one vector size per loop).  */
508 	      if (!bool_result)
509 		scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
510 							     &dummy);
511 	      if (dump_enabled_p ())
512 		{
513 		  dump_printf_loc (MSG_NOTE, vect_location,
514 				   "get vectype for scalar type:  ");
515 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
516 		  dump_printf (MSG_NOTE, "\n");
517 		}
518 	      vf_vectype = get_vectype_for_scalar_type (scalar_type);
519 	    }
520 	  if (!vf_vectype)
521 	    {
522 	      if (dump_enabled_p ())
523 		{
524 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
525                                    "not vectorized: unsupported data-type ");
526 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
527                                      scalar_type);
528                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
529 		}
530 	      return false;
531 	    }
532 
533 	  if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
534 			GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
535 	    {
536 	      if (dump_enabled_p ())
537 		{
538 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
539                                    "not vectorized: different sized vector "
540                                    "types in statement, ");
541 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542                                      vectype);
543 		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
544 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
545                                      vf_vectype);
546                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
547 		}
548 	      return false;
549 	    }
550 
551 	  if (dump_enabled_p ())
552 	    {
553 	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
554 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
555               dump_printf (MSG_NOTE, "\n");
556 	    }
557 
558 	  if (dump_enabled_p ())
559 	    {
560 	      dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
561 	      dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
562 	      dump_printf (MSG_NOTE, "\n");
563 	    }
564 
565 	  vect_update_max_nunits (&vectorization_factor, vf_vectype);
566 
567 	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
568 	    {
569 	      pattern_def_seq = NULL;
570 	      gsi_next (&si);
571 	    }
572         }
573     }
574 
575   /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
576   if (dump_enabled_p ())
577     {
578       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
579       dump_dec (MSG_NOTE, vectorization_factor);
580       dump_printf (MSG_NOTE, "\n");
581     }
582 
583   if (known_le (vectorization_factor, 1U))
584     {
585       if (dump_enabled_p ())
586         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587                          "not vectorized: unsupported data-type\n");
588       return false;
589     }
590   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
591 
592   for (i = 0; i < mask_producers.length (); i++)
593     {
594       tree mask_type = NULL;
595 
596       stmt = STMT_VINFO_STMT (mask_producers[i]);
597 
598       if (is_gimple_assign (stmt)
599 	  && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
600 	  && !VECT_SCALAR_BOOLEAN_TYPE_P
601 				      (TREE_TYPE (gimple_assign_rhs1 (stmt))))
602 	{
603 	  scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
604 	  mask_type = get_mask_type_for_scalar_type (scalar_type);
605 
606 	  if (!mask_type)
607 	    {
608 	      if (dump_enabled_p ())
609 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 				 "not vectorized: unsupported mask\n");
611 	      return false;
612 	    }
613 	}
614       else
615 	{
616 	  tree rhs;
617 	  ssa_op_iter iter;
618 	  gimple *def_stmt;
619 	  enum vect_def_type dt;
620 
621 	  FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
622 	    {
623 	      if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
624 				       &def_stmt, &dt, &vectype))
625 		{
626 		  if (dump_enabled_p ())
627 		    {
628 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
629 				       "not vectorized: can't compute mask type "
630 				       "for statement, ");
631 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
632 					0);
633 		    }
634 		  return false;
635 		}
636 
637 	      /* No vectype probably means an external definition.
638 		 Allow it in case there is another operand from which
639 		 the mask type can be determined.  */
640 	      if (!vectype)
641 		continue;
642 
643 	      if (!mask_type)
644 		mask_type = vectype;
645 	      else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
646 				 TYPE_VECTOR_SUBPARTS (vectype)))
647 		{
648 		  if (dump_enabled_p ())
649 		    {
650 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
651 				       "not vectorized: different sized mask "
652 				       "types in statement, ");
653 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
654 					 mask_type);
655 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
656 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
657 					 vectype);
658 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
659 		    }
660 		  return false;
661 		}
662 	      else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
663 		       != VECTOR_BOOLEAN_TYPE_P (vectype))
664 		{
665 		  if (dump_enabled_p ())
666 		    {
667 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
668 				       "not vectorized: mixed mask and "
669 				       "nonmask vector types in statement, ");
670 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
671 					 mask_type);
672 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
673 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
674 					 vectype);
675 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
676 		    }
677 		  return false;
678 		}
679 	    }
680 
681 	  /* We may compare a boolean value loaded as a vector of integers.
682 	     Fix mask_type in such a case.  */
683 	  if (mask_type
684 	      && !VECTOR_BOOLEAN_TYPE_P (mask_type)
685 	      && gimple_code (stmt) == GIMPLE_ASSIGN
686 	      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
687 	    mask_type = build_same_sized_truth_vector_type (mask_type);
688 	}
689 
690       /* No mask_type should mean a loop-invariant predicate.
691 	 This is probably a subject for optimization in
692 	 if-conversion.  */
693       if (!mask_type)
694 	{
695 	  if (dump_enabled_p ())
696 	    {
697 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
698 			       "not vectorized: can't compute mask type "
699 			       "for statement, ");
700 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
701 				0);
702 	    }
703 	  return false;
704 	}
705 
706       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707     }
708 
709   return true;
710 }
711 
712 
713 /* Function vect_is_simple_iv_evolution.
714 
715    FORNOW: A simple evolution of an induction variable in the loop is
716    considered a polynomial evolution.  */
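
/* For example, an induction variable that starts at 0 and is incremented
   by 1 in each iteration has the scalar-evolution form {0, +, 1}_loop;
   INIT would then be 0 and STEP would be 1.  */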
717 
718 static bool
719 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
720                              tree * step)
721 {
722   tree init_expr;
723   tree step_expr;
724   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
725   basic_block bb;
726 
727   /* When there is no evolution in this loop, the evolution function
728      is not "simple".  */
729   if (evolution_part == NULL_TREE)
730     return false;
731 
732   /* When the evolution is a polynomial of degree >= 2
733      the evolution function is not "simple".  */
734   if (tree_is_chrec (evolution_part))
735     return false;
736 
737   step_expr = evolution_part;
738   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
739 
740   if (dump_enabled_p ())
741     {
742       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
743       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
744       dump_printf (MSG_NOTE, ",  init: ");
745       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
746       dump_printf (MSG_NOTE, "\n");
747     }
748 
749   *init = init_expr;
750   *step = step_expr;
751 
752   if (TREE_CODE (step_expr) != INTEGER_CST
753       && (TREE_CODE (step_expr) != SSA_NAME
754 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
755 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
756 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
757 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
758 		  || !flag_associative_math)))
759       && (TREE_CODE (step_expr) != REAL_CST
760 	  || !flag_associative_math))
761     {
762       if (dump_enabled_p ())
763         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
764                          "step unknown.\n");
765       return false;
766     }
767 
768   return true;
769 }
770 
771 /* Function vect_analyze_scalar_cycles_1.
772 
773    Examine the cross iteration def-use cycles of scalar variables
774    in LOOP.  LOOP_VINFO represents the loop that is now being
775    considered for vectorization (can be LOOP, or an outer-loop
776    enclosing LOOP).  */
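
/* For example, the scalar cycle of a sum reduction appears in the IL as

       sum_1 = PHI <0(preheader), sum_3(latch)>
       ...
       sum_3 = sum_1 + a[i];

   i.e. a loop-header PHI whose value on the latch edge is (transitively)
   defined in terms of the PHI result itself.  */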
777 
778 static void
779 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
780 {
781   basic_block bb = loop->header;
782   tree init, step;
783   auto_vec<gimple *, 64> worklist;
784   gphi_iterator gsi;
785   bool double_reduc;
786 
787   if (dump_enabled_p ())
788     dump_printf_loc (MSG_NOTE, vect_location,
789                      "=== vect_analyze_scalar_cycles ===\n");
790 
791   /* First - identify all inductions.  Reduction detection assumes that all the
792      inductions have been identified; therefore, this order must not be
793      changed.  */
794   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
795     {
796       gphi *phi = gsi.phi ();
797       tree access_fn = NULL;
798       tree def = PHI_RESULT (phi);
799       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
800 
801       if (dump_enabled_p ())
802 	{
803 	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
804 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 	}
806 
807       /* Skip virtual phi's.  The data dependences that are associated with
808          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
809       if (virtual_operand_p (def))
810 	continue;
811 
812       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
813 
814       /* Analyze the evolution function.  */
815       access_fn = analyze_scalar_evolution (loop, def);
816       if (access_fn)
817 	{
818 	  STRIP_NOPS (access_fn);
819 	  if (dump_enabled_p ())
820 	    {
821 	      dump_printf_loc (MSG_NOTE, vect_location,
822                                "Access function of PHI: ");
823 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
824               dump_printf (MSG_NOTE, "\n");
825 	    }
826 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
827 	    = initial_condition_in_loop_num (access_fn, loop->num);
828 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
829 	    = evolution_part_in_loop_num (access_fn, loop->num);
830 	}
831 
832       if (!access_fn
833 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
834 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
835 	      && TREE_CODE (step) != INTEGER_CST))
836 	{
837 	  worklist.safe_push (phi);
838 	  continue;
839 	}
840 
841       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
842 		  != NULL_TREE);
843       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
844 
845       if (dump_enabled_p ())
846 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
847       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
848     }
849 
850 
851   /* Second - identify all reductions and nested cycles.  */
852   while (worklist.length () > 0)
853     {
854       gimple *phi = worklist.pop ();
855       tree def = PHI_RESULT (phi);
856       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
857       gimple *reduc_stmt;
858 
859       if (dump_enabled_p ())
860         {
861           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
862           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863         }
864 
865       gcc_assert (!virtual_operand_p (def)
866 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
867 
868       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
869 						&double_reduc, false);
870       if (reduc_stmt)
871         {
872           if (double_reduc)
873             {
874               if (dump_enabled_p ())
875                 dump_printf_loc (MSG_NOTE, vect_location,
876 				 "Detected double reduction.\n");
877 
878               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
879               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
880                                                     vect_double_reduction_def;
881             }
882           else
883             {
884               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
885                 {
886                   if (dump_enabled_p ())
887                     dump_printf_loc (MSG_NOTE, vect_location,
888 				     "Detected vectorizable nested cycle.\n");
889 
890                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
891                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
892                                                              vect_nested_cycle;
893                 }
894               else
895                 {
896                   if (dump_enabled_p ())
897                     dump_printf_loc (MSG_NOTE, vect_location,
898 				     "Detected reduction.\n");
899 
900                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
901                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
902                                                            vect_reduction_def;
903                   /* Store the reduction cycles for possible vectorization in
904                      loop-aware SLP if it was not detected as a reduction
905 		     chain.  */
906 		  if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
907 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
908                 }
909             }
910         }
911       else
912         if (dump_enabled_p ())
913           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 			   "Unknown def-use cycle pattern.\n");
915     }
916 }
917 
918 
919 /* Function vect_analyze_scalar_cycles.
920 
921    Examine the cross iteration def-use cycles of scalar variables, by
922    analyzing the loop-header PHIs of scalar variables.  Classify each
923    cycle as one of the following: invariant, induction, reduction, unknown.
924    We do that for the loop represented by LOOP_VINFO, and also for its
925    inner-loop, if one exists.
926    Examples for scalar cycles:
927 
928    Example1: reduction:
929 
930               loop1:
931               for (i=0; i<N; i++)
932                  sum += a[i];
933 
934    Example2: induction:
935 
936               loop2:
937               for (i=0; i<N; i++)
938                  a[i] = i;  */
939 
940 static void
941 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
942 {
943   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
944 
945   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
946 
947   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
948      Reductions in such an inner-loop therefore have different properties than
949      the reductions in the nest that gets vectorized:
950      1. When vectorized, they are executed in the same order as in the original
951         scalar loop, so we can't change the order of computation when
952         vectorizing them.
953      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
954         current checks are too strict.  */
955 
956   if (loop->inner)
957     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 }
959 
960 /* Transfer group and reduction information from STMT to its pattern stmt.  */
961 
962 static void
963 vect_fixup_reduc_chain (gimple *stmt)
964 {
965   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
966   gimple *stmtp;
967   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
968 	      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
969   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970   do
971     {
972       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
973       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
974       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
975       if (stmt)
976 	GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
977 	  = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
978     }
979   while (stmt);
980   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 }
982 
983 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
984 
985 static void
986 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
987 {
988   gimple *first;
989   unsigned i;
990 
991   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
992     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
993       {
994 	gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
995 	while (next)
996 	  {
997 	    if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
998 	      break;
999 	    next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1000 	  }
1001 	/* If not all stmts in the chain are patterns, try to handle
1002 	   the chain without patterns.  */
1003 	if (! next)
1004 	  {
1005 	    vect_fixup_reduc_chain (first);
1006 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1007 	      = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1008 	  }
1009       }
1010 }
1011 
1012 /* Function vect_get_loop_niters.
1013 
1014    Determine the number of iterations the loop executes and place it
1015    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1016    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1017    niter information holds in ASSUMPTIONS.
1018 
1019    Return the loop exit condition.  */
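
/* For example, for "for (i = 0; i < n; i++)" with n > 0, the loop latch
   runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is n.  */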
1020 
1021 
1022 static gcond *
1023 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1024 		      tree *number_of_iterations, tree *number_of_iterationsm1)
1025 {
1026   edge exit = single_exit (loop);
1027   struct tree_niter_desc niter_desc;
1028   tree niter_assumptions, niter, may_be_zero;
1029   gcond *cond = get_loop_exit_condition (loop);
1030 
1031   *assumptions = boolean_true_node;
1032   *number_of_iterationsm1 = chrec_dont_know;
1033   *number_of_iterations = chrec_dont_know;
1034   if (dump_enabled_p ())
1035     dump_printf_loc (MSG_NOTE, vect_location,
1036 		     "=== get_loop_niters ===\n");
1037 
1038   if (!exit)
1039     return cond;
1040 
1041   niter = chrec_dont_know;
1042   may_be_zero = NULL_TREE;
1043   niter_assumptions = boolean_true_node;
1044   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1045       || chrec_contains_undetermined (niter_desc.niter))
1046     return cond;
1047 
1048   niter_assumptions = niter_desc.assumptions;
1049   may_be_zero = niter_desc.may_be_zero;
1050   niter = niter_desc.niter;
1051 
1052   if (may_be_zero && integer_zerop (may_be_zero))
1053     may_be_zero = NULL_TREE;
1054 
1055   if (may_be_zero)
1056     {
1057       if (COMPARISON_CLASS_P (may_be_zero))
1058 	{
1059 	  /* Try to combine may_be_zero with assumptions; this can simplify
1060 	     the computation of the niter expression.  */
1061 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1062 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1063 					     niter_assumptions,
1064 					     fold_build1 (TRUTH_NOT_EXPR,
1065 							  boolean_type_node,
1066 							  may_be_zero));
1067 	  else
1068 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1069 				 build_int_cst (TREE_TYPE (niter), 0),
1070 				 rewrite_to_non_trapping_overflow (niter));
1071 
1072 	  may_be_zero = NULL_TREE;
1073 	}
1074       else if (integer_nonzerop (may_be_zero))
1075 	{
1076 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1077 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1078 	  return cond;
1079 	}
1080       else
1081 	return cond;
1082     }
1083 
1084   *assumptions = niter_assumptions;
1085   *number_of_iterationsm1 = niter;
1086 
1087   /* We want the number of loop header executions which is the number
1088      of latch executions plus one.
1089      ???  For UINT_MAX latch executions this number overflows to zero
1090      for loops like do { n++; } while (n != 0);  */
1091   if (niter && !chrec_contains_undetermined (niter))
1092     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1093 			  build_int_cst (TREE_TYPE (niter), 1));
1094   *number_of_iterations = niter;
1095 
1096   return cond;
1097 }
1098 
1099 /* Function bb_in_loop_p
1100 
1101    Used as predicate for dfs order traversal of the loop bbs.  */
1102 
1103 static bool
1104 bb_in_loop_p (const_basic_block bb, const void *data)
1105 {
1106   const struct loop *const loop = (const struct loop *)data;
1107   if (flow_bb_inside_loop_p (loop, bb))
1108     return true;
1109   return false;
1110 }
1111 
1112 
1113 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1114    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1115 
1116 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1117   : vec_info (vec_info::loop, init_cost (loop_in)),
1118     loop (loop_in),
1119     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1120     num_itersm1 (NULL_TREE),
1121     num_iters (NULL_TREE),
1122     num_iters_unchanged (NULL_TREE),
1123     num_iters_assumptions (NULL_TREE),
1124     th (0),
1125     versioning_threshold (0),
1126     vectorization_factor (0),
1127     max_vectorization_factor (0),
1128     mask_skip_niters (NULL_TREE),
1129     mask_compare_type (NULL_TREE),
1130     unaligned_dr (NULL),
1131     peeling_for_alignment (0),
1132     ptr_mask (0),
1133     ivexpr_map (NULL),
1134     slp_unrolling_factor (1),
1135     single_scalar_iteration_cost (0),
1136     vectorizable (false),
1137     can_fully_mask_p (true),
1138     fully_masked_p (false),
1139     peeling_for_gaps (false),
1140     peeling_for_niter (false),
1141     operands_swapped (false),
1142     no_data_dependencies (false),
1143     has_mask_store (false),
1144     scalar_loop (NULL),
1145     orig_loop_info (NULL)
1146 {
1147   /* Create/Update stmt_info for all stmts in the loop.  */
1148   basic_block *body = get_loop_body (loop);
1149   for (unsigned int i = 0; i < loop->num_nodes; i++)
1150     {
1151       basic_block bb = body[i];
1152       gimple_stmt_iterator si;
1153 
1154       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1155 	{
1156 	  gimple *phi = gsi_stmt (si);
1157 	  gimple_set_uid (phi, 0);
1158 	  set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1159 	}
1160 
1161       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1162 	{
1163 	  gimple *stmt = gsi_stmt (si);
1164 	  gimple_set_uid (stmt, 0);
1165 	  set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1166 	}
1167     }
1168   free (body);
1169 
1170   /* CHECKME: We want to visit all BBs before their successors (except for
1171      latch blocks, for which this assertion wouldn't hold).  In the simple
1172      case of the loop forms we allow, a dfs order of the BBs would be the same
1173      as reversed postorder traversal, so we are safe.  */
1174 
1175   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1176 					  bbs, loop->num_nodes, loop);
1177   gcc_assert (nbbs == loop->num_nodes);
1178 }
1179 
1180 /* Free all levels of MASKS.  */
1181 
1182 void
1183 release_vec_loop_masks (vec_loop_masks *masks)
1184 {
1185   rgroup_masks *rgm;
1186   unsigned int i;
1187   FOR_EACH_VEC_ELT (*masks, i, rgm)
1188     rgm->masks.release ();
1189   masks->release ();
1190 }
1191 
1192 /* Free all memory used by the _loop_vec_info, as well as all the
1193    stmt_vec_info structs of all the stmts in the loop.  */
1194 
1195 _loop_vec_info::~_loop_vec_info ()
1196 {
1197   int nbbs;
1198   gimple_stmt_iterator si;
1199   int j;
1200 
1201   nbbs = loop->num_nodes;
1202   for (j = 0; j < nbbs; j++)
1203     {
1204       basic_block bb = bbs[j];
1205       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1206         free_stmt_vec_info (gsi_stmt (si));
1207 
1208       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1209         {
1210 	  gimple *stmt = gsi_stmt (si);
1211 
1212 	  /* We may have broken canonical form by moving a constant
1213 	     into RHS1 of a commutative op.  Fix such occurrences.  */
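	  /* For example, "x_1 = 5 + a_2" produced by such swapping is
	     rewritten back to "x_1 = a_2 + 5", putting the constant back
	     in the second operand position.  */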
1214 	  if (operands_swapped && is_gimple_assign (stmt))
1215 	    {
1216 	      enum tree_code code = gimple_assign_rhs_code (stmt);
1217 
1218 	      if ((code == PLUS_EXPR
1219 		   || code == POINTER_PLUS_EXPR
1220 		   || code == MULT_EXPR)
1221 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1222 		swap_ssa_operands (stmt,
1223 				   gimple_assign_rhs1_ptr (stmt),
1224 				   gimple_assign_rhs2_ptr (stmt));
1225 	      else if (code == COND_EXPR
1226 		       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1227 		{
1228 		  tree cond_expr = gimple_assign_rhs1 (stmt);
1229 		  enum tree_code cond_code = TREE_CODE (cond_expr);
1230 
1231 		  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1232 		    {
1233 		      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1234 								  0));
1235 		      cond_code = invert_tree_comparison (cond_code,
1236 							  honor_nans);
1237 		      if (cond_code != ERROR_MARK)
1238 			{
1239 			  TREE_SET_CODE (cond_expr, cond_code);
1240 			  swap_ssa_operands (stmt,
1241 					     gimple_assign_rhs2_ptr (stmt),
1242 					     gimple_assign_rhs3_ptr (stmt));
1243 			}
1244 		    }
1245 		}
1246 	    }
1247 
1248 	  /* Free stmt_vec_info.  */
1249 	  free_stmt_vec_info (stmt);
1250           gsi_next (&si);
1251         }
1252     }
1253 
1254   free (bbs);
1255 
1256   release_vec_loop_masks (&masks);
1257   delete ivexpr_map;
1258 
1259   loop->aux = NULL;
1260 }
1261 
1262 /* Return an invariant or register for EXPR and emit necessary
1263    computations in the LOOP_VINFO loop preheader.  */
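
/* A hypothetical use: a caller needing some loop-invariant EXPR as an SSA
   name can do

       tree val = cse_and_gimplify_to_preheader (loop_vinfo, expr);

   Repeated calls with operand-equal expressions return the same cached
   name, so the computation is emitted in the preheader only once.  */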
1264 
1265 tree
1266 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1267 {
1268   if (is_gimple_reg (expr)
1269       || is_gimple_min_invariant (expr))
1270     return expr;
1271 
1272   if (! loop_vinfo->ivexpr_map)
1273     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1274   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1275   if (! cached)
1276     {
1277       gimple_seq stmts = NULL;
1278       cached = force_gimple_operand (unshare_expr (expr),
1279 				     &stmts, true, NULL_TREE);
1280       if (stmts)
1281 	{
1282 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1283 	  gsi_insert_seq_on_edge_immediate (e, stmts);
1284 	}
1285     }
1286   return cached;
1287 }
1288 
1289 /* Return true if we can use CMP_TYPE as the comparison type to produce
1290    all masks required to mask LOOP_VINFO.  */
1291 
1292 static bool
1293 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1294 {
1295   rgroup_masks *rgm;
1296   unsigned int i;
1297   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1298     if (rgm->mask_type != NULL_TREE
1299 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1300 					    cmp_type, rgm->mask_type,
1301 					    OPTIMIZE_FOR_SPEED))
1302       return false;
1303   return true;
1304 }
1305 
1306 /* Calculate the maximum number of scalars per iteration for every
1307    rgroup in LOOP_VINFO.  */
1308 
1309 static unsigned int
1310 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1311 {
1312   unsigned int res = 1;
1313   unsigned int i;
1314   rgroup_masks *rgm;
1315   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1316     res = MAX (res, rgm->max_nscalars_per_iter);
1317   return res;
1318 }
1319 
1320 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1321    whether we can actually generate the masks required.  Return true if so,
1322    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
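
/* For example (a sketch): if the loop is known to run at most 1000
   iterations and the widest rgroup has max_nscalars_per_iter == 2, the
   limit to represent is 1000 * 2 = 2000, so the comparison type needs at
   least 11 bits of precision (2^11 = 2048 >= 2000).  */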
1323 
1324 static bool
1325 vect_verify_full_masking (loop_vec_info loop_vinfo)
1326 {
1327   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1328   unsigned int min_ni_width;
1329 
1330   /* Use a normal loop if there are no statements that need masking.
1331      This only happens in rare degenerate cases: it means that the loop
1332      has no loads, no stores, and no live-out values.  */
1333   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1334     return false;
1335 
1336   /* Get the maximum number of iterations that is representable
1337      in the counter type.  */
1338   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1339   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1340 
1341   /* Get a more refined estimate for the number of iterations.  */
1342   widest_int max_back_edges;
1343   if (max_loop_iterations (loop, &max_back_edges))
1344     max_ni = wi::smin (max_ni, max_back_edges + 1);
1345 
1346   /* Account for rgroup masks, in which each bit is replicated N times.  */
1347   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1348 
1349   /* Work out how many bits we need to represent the limit.  */
1350   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1351 
1352   /* Find a scalar mode for which WHILE_ULT is supported.  */
1353   opt_scalar_int_mode cmp_mode_iter;
1354   tree cmp_type = NULL_TREE;
1355   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1356     {
1357       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1358       if (cmp_bits >= min_ni_width
1359 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1360 	{
1361 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1362 	  if (this_type
1363 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1364 	    {
1365 	      /* Although we could stop as soon as we find a valid mode,
1366 		 it's often better to continue until we hit Pmode, since the
1367 		 operands to the WHILE are more likely to be reusable in
1368 		 address calculations.  */
1369 	      cmp_type = this_type;
1370 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1371 		break;
1372 	    }
1373 	}
1374     }
1375 
1376   if (!cmp_type)
1377     return false;
1378 
1379   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1380   return true;
1381 }
1382 
1383 /* Calculate the cost of one scalar iteration of the loop.  */
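/* For example, a loop body containing one load, one add and one store is
   accumulated as one scalar_load, one scalar_stmt and one scalar_store,
   each weighted by the target's cost hooks.  */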
1384 static void
1385 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1386 {
1387   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1388   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1389   int nbbs = loop->num_nodes, factor;
1390   int innerloop_iters, i;
1391 
1392   /* Gather costs for statements in the scalar loop.  */
1393 
1394   /* FORNOW.  */
1395   innerloop_iters = 1;
1396   if (loop->inner)
1397     innerloop_iters = 50; /* FIXME */
1398 
1399   for (i = 0; i < nbbs; i++)
1400     {
1401       gimple_stmt_iterator si;
1402       basic_block bb = bbs[i];
1403 
1404       if (bb->loop_father == loop->inner)
1405         factor = innerloop_iters;
1406       else
1407         factor = 1;
1408 
1409       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1410         {
1411 	  gimple *stmt = gsi_stmt (si);
1412           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1413 
1414           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1415             continue;
1416 
1417           /* Skip stmts that are not vectorized inside the loop.  */
1418           if (stmt_info
1419               && !STMT_VINFO_RELEVANT_P (stmt_info)
1420               && (!STMT_VINFO_LIVE_P (stmt_info)
1421                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1422 	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1423             continue;
1424 
1425 	  vect_cost_for_stmt kind;
1426           if (STMT_VINFO_DATA_REF (stmt_info))
1427             {
1428               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1429                kind = scalar_load;
1430              else
1431                kind = scalar_store;
1432             }
1433           else
1434             kind = scalar_stmt;
1435 
1436 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1437 			    factor, kind, stmt_info, 0, vect_prologue);
1438         }
1439     }
1440 
1441   /* Now accumulate cost.  */
1442   void *target_cost_data = init_cost (loop);
1443   stmt_info_for_cost *si;
1444   int j;
1445   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1446 		    j, si)
1447     {
1448       struct _stmt_vec_info *stmt_info
1449 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1450       (void) add_stmt_cost (target_cost_data, si->count,
1451 			    si->kind, stmt_info, si->misalign,
1452 			    vect_body);
1453     }
1454   unsigned dummy, body_cost = 0;
1455   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1456   destroy_cost_data (target_cost_data);
1457   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1458 }
1459 
1460 
1461 /* Function vect_analyze_loop_form_1.
1462 
1463    Verify that certain CFG restrictions hold, including:
1464    - the loop has a pre-header
1465    - the loop has a single entry and exit
1466    - the loop exit condition is simple enough
1467    - the number of iterations can be analyzed, i.e., it is a countable loop.
1468      The niter may be analyzed under some assumptions.  */
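
/* For example, "for (i = 0; i < n; i++) body;" with a single exit fits
   these restrictions, whereas a pointer-chasing loop such as
   "while (p) p = p->next;" has no analyzable iteration count.  */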
1469 
1470 bool
1471 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1472 			  tree *assumptions, tree *number_of_iterationsm1,
1473 			  tree *number_of_iterations, gcond **inner_loop_cond)
1474 {
1475   if (dump_enabled_p ())
1476     dump_printf_loc (MSG_NOTE, vect_location,
1477 		     "=== vect_analyze_loop_form ===\n");
1478 
1479   /* Different restrictions apply when we are considering an inner-most loop,
1480      vs. an outer (nested) loop.
1481      (FORNOW. May want to relax some of these restrictions in the future).  */
1482 
1483   if (!loop->inner)
1484     {
1485       /* Inner-most loop.  We currently require that the number of BBs is
1486 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1487 	 look like this:
1488 
1489                         (pre-header)
1490                            |
1491                           header <--------+
1492                            | |            |
1493                            | +--> latch --+
1494                            |
1495                         (exit-bb)  */
1496 
1497       if (loop->num_nodes != 2)
1498         {
1499           if (dump_enabled_p ())
1500             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1501 			     "not vectorized: control flow in loop.\n");
1502           return false;
1503         }
1504 
1505       if (empty_block_p (loop->header))
1506 	{
1507 	  if (dump_enabled_p ())
1508 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1509 			     "not vectorized: empty loop.\n");
1510 	  return false;
1511 	}
1512     }
1513   else
1514     {
1515       struct loop *innerloop = loop->inner;
1516       edge entryedge;
1517 
1518       /* Nested loop. We currently require that the loop is doubly-nested,
1519 	 contains a single inner loop, and the number of BBs is exactly 5.
1520 	 Vectorizable outer-loops look like this:
1521 
1522 			(pre-header)
1523 			   |
1524 			  header <---+
1525 			   |         |
1526 		          inner-loop |
1527 			   |         |
1528 			  tail ------+
1529 			   |
1530 		        (exit-bb)
1531 
1532 	 The inner-loop has the properties expected of inner-most loops
1533 	 as described above.  */
1534 
1535       if ((loop->inner)->inner || (loop->inner)->next)
1536 	{
1537 	  if (dump_enabled_p ())
1538 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1539 			     "not vectorized: multiple nested loops.\n");
1540 	  return false;
1541 	}
1542 
1543       if (loop->num_nodes != 5)
1544         {
1545 	  if (dump_enabled_p ())
1546 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1547 			     "not vectorized: control flow in loop.\n");
1548 	  return false;
1549         }
1550 
1551       entryedge = loop_preheader_edge (innerloop);
1552       if (entryedge->src != loop->header
1553 	  || !single_exit (innerloop)
1554 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1555 	{
1556 	  if (dump_enabled_p ())
1557 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558 			     "not vectorized: unsupported outerloop form.\n");
1559 	  return false;
1560 	}
1561 
1562       /* Analyze the inner-loop.  */
1563       tree inner_niterm1, inner_niter, inner_assumptions;
1564       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1565 				      &inner_assumptions, &inner_niterm1,
1566 				      &inner_niter, NULL)
1567 	  /* Don't support analyzing niter under assumptions for inner
1568 	     loop.  */
1569 	  || !integer_onep (inner_assumptions))
1570 	{
1571 	  if (dump_enabled_p ())
1572             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573 			     "not vectorized: Bad inner loop.\n");
1574 	  return false;
1575 	}
1576 
1577       if (!expr_invariant_in_loop_p (loop, inner_niter))
1578 	{
1579 	  if (dump_enabled_p ())
1580 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1581 			     "not vectorized: inner-loop count not"
1582                              " invariant.\n");
1583 	  return false;
1584 	}
1585 
1586       if (dump_enabled_p ())
1587         dump_printf_loc (MSG_NOTE, vect_location,
1588 			 "Considering outer-loop vectorization.\n");
1589     }
1590 
1591   if (!single_exit (loop)
1592       || EDGE_COUNT (loop->header->preds) != 2)
1593     {
1594       if (dump_enabled_p ())
1595         {
1596           if (!single_exit (loop))
1597 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1598 			     "not vectorized: multiple exits.\n");
1599           else if (EDGE_COUNT (loop->header->preds) != 2)
1600 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 			     "not vectorized: too many incoming edges.\n");
1602         }
1603       return false;
1604     }
1605 
1606   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1607      that the loop is represented as a do-while (with a proper if-guard
1608      before the loop if needed), where the loop header contains all the
1609      executable statements, and the latch is empty.  */
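  /* Illustrative sketch (not the exact GIMPLE): a source loop

         for (i = 0; i < n; i++)
           a[i] = b[i];

     is expected to arrive here in do-while shape,

         i = 0;
         do { a[i] = b[i]; i++; } while (i < n);

     guarded by an "if (n > 0)" where needed, with the exit comparison as
     the last statement of the header and an empty latch.  */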
1610   if (!empty_block_p (loop->latch)
1611       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1612     {
1613       if (dump_enabled_p ())
1614 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615 			 "not vectorized: latch block not empty.\n");
1616       return false;
1617     }
1618 
1619   /* Make sure the exit is not abnormal.  */
1620   edge e = single_exit (loop);
1621   if (e->flags & EDGE_ABNORMAL)
1622     {
1623       if (dump_enabled_p ())
1624 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1625 			 "not vectorized: abnormal loop exit edge.\n");
1626       return false;
1627     }
1628 
1629   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1630 				     number_of_iterationsm1);
1631   if (!*loop_cond)
1632     {
1633       if (dump_enabled_p ())
1634 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1635 			 "not vectorized: complicated exit condition.\n");
1636       return false;
1637     }
1638 
1639   if (integer_zerop (*assumptions)
1640       || !*number_of_iterations
1641       || chrec_contains_undetermined (*number_of_iterations))
1642     {
1643       if (dump_enabled_p ())
1644 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 			 "not vectorized: number of iterations cannot be "
1646 			 "computed.\n");
1647       return false;
1648     }
1649 
1650   if (integer_zerop (*number_of_iterations))
1651     {
1652       if (dump_enabled_p ())
1653 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 			 "not vectorized: number of iterations = 0.\n");
1655       return false;
1656     }
1657 
1658   return true;
1659 }
1660 
1661 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1662 
1663 loop_vec_info
1664 vect_analyze_loop_form (struct loop *loop)
1665 {
1666   tree assumptions, number_of_iterations, number_of_iterationsm1;
1667   gcond *loop_cond, *inner_loop_cond = NULL;
1668 
1669   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1670 				  &assumptions, &number_of_iterationsm1,
1671 				  &number_of_iterations, &inner_loop_cond))
1672     return NULL;
1673 
1674   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1675   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1676   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1677   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1678   if (!integer_onep (assumptions))
1679     {
1680       /* We consider to vectorize this loop by versioning it under
1681 	 some assumptions.  In order to do this, we need to clear
1682 	 existing information computed by scev and niter analyzer.  */
1683       scev_reset_htab ();
1684       free_numbers_of_iterations_estimates (loop);
1685       /* Also set flag for this loop so that following scev and niter
1686 	 analysis are done under the assumptions.  */
1687       loop_constraint_set (loop, LOOP_C_FINITE);
1688       /* Also record the assumptions for versioning.  */
1689       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1690     }
1691 
1692   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1693     {
1694       if (dump_enabled_p ())
1695         {
1696           dump_printf_loc (MSG_NOTE, vect_location,
1697 			   "Symbolic number of iterations is ");
1698 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1699           dump_printf (MSG_NOTE, "\n");
1700         }
1701     }
1702 
1703   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1704   if (inner_loop_cond)
1705     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1706       = loop_exit_ctrl_vec_info_type;
1707 
1708   gcc_assert (!loop->aux);
1709   loop->aux = loop_vinfo;
1710   return loop_vinfo;
1711 }
1712 
1713 
1714 
1715 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1716    statements update the vectorization factor.  */
1717 
1718 static void
1719 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1720 {
1721   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1722   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1723   int nbbs = loop->num_nodes;
1724   poly_uint64 vectorization_factor;
1725   int i;
1726 
1727   if (dump_enabled_p ())
1728     dump_printf_loc (MSG_NOTE, vect_location,
1729 		     "=== vect_update_vf_for_slp ===\n");
1730 
1731   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1732   gcc_assert (known_ne (vectorization_factor, 0U));
1733 
1734   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1735      vectorization factor of the loop is the unrolling factor required by
1736      the SLP instances.  If that unrolling factor is 1, we say that we
1737      perform pure SLP on the loop - cross-iteration parallelism is not
1738      exploited.  */
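  /* Illustrative example (assuming 4-lane int vectors): an SLP group of
     four independent statements per iteration already fills a vector, so
     the required unrolling factor is 1 and the loop is pure SLP; a group
     of two statements would instead need an unrolling factor of 2, which
     then feeds into the vectorization factor below.  */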
1739   bool only_slp_in_loop = true;
1740   for (i = 0; i < nbbs; i++)
1741     {
1742       basic_block bb = bbs[i];
1743       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1744 	   gsi_next (&si))
1745 	{
1746 	  gimple *stmt = gsi_stmt (si);
1747 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1748 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1749 	      && STMT_VINFO_RELATED_STMT (stmt_info))
1750 	    {
1751 	      stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1752 	      stmt_info = vinfo_for_stmt (stmt);
1753 	    }
1754 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1755 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1756 	      && !PURE_SLP_STMT (stmt_info))
1757 	    /* STMT needs both SLP and loop-based vectorization.  */
1758 	    only_slp_in_loop = false;
1759 	}
1760     }
1761 
1762   if (only_slp_in_loop)
1763     {
1764       dump_printf_loc (MSG_NOTE, vect_location,
1765 		       "Loop contains only SLP stmts\n");
1766       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1767     }
1768   else
1769     {
1770       dump_printf_loc (MSG_NOTE, vect_location,
1771 		       "Loop contains SLP and non-SLP stmts\n");
1772       /* Both the vectorization factor and unroll factor have the form
1773 	 current_vector_size * X for some rational X, so they must have
1774 	 a common multiple.  */
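      /* For example (hypothetical numbers): a loop-based factor of 8
	 combined with an SLP unrolling factor of 4 yields 8; with
	 variable-length vectors the two factors may be different
	 non-constant multiples of current_vector_size, hence the call
	 to force_common_multiple below.  */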
1775       vectorization_factor
1776 	= force_common_multiple (vectorization_factor,
1777 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1778     }
1779 
1780   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1781   if (dump_enabled_p ())
1782     {
1783       dump_printf_loc (MSG_NOTE, vect_location,
1784 		       "Updating vectorization factor to ");
1785       dump_dec (MSG_NOTE, vectorization_factor);
1786       dump_printf (MSG_NOTE, ".\n");
1787     }
1788 }
1789 
1790 /* Return true if STMT_INFO describes a double reduction phi and if
1791    the other phi in the reduction is also relevant for vectorization.
1792    This rejects cases such as:
1793 
1794       outer1:
1795 	x_1 = PHI <x_3(outer2), ...>;
1796 	...
1797 
1798       inner:
1799 	x_2 = ...;
1800 	...
1801 
1802       outer2:
1803 	x_3 = PHI <x_2(inner)>;
1804 
1805    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1806 
1807 static bool
1808 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1809 {
1810   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1811     return false;
1812 
1813   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1814   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1815 }
1816 
1817 /* Function vect_analyze_loop_operations.
1818 
1819    Scan the loop stmts and make sure they are all vectorizable.  */
1820 
1821 static bool
1822 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1823 {
1824   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1825   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1826   int nbbs = loop->num_nodes;
1827   int i;
1828   stmt_vec_info stmt_info;
1829   bool need_to_vectorize = false;
1830   bool ok;
1831 
1832   if (dump_enabled_p ())
1833     dump_printf_loc (MSG_NOTE, vect_location,
1834 		     "=== vect_analyze_loop_operations ===\n");
1835 
1836   for (i = 0; i < nbbs; i++)
1837     {
1838       basic_block bb = bbs[i];
1839 
1840       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1841 	   gsi_next (&si))
1842         {
1843           gphi *phi = si.phi ();
1844           ok = true;
1845 
1846           stmt_info = vinfo_for_stmt (phi);
1847           if (dump_enabled_p ())
1848             {
1849               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1850               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1851             }
1852 	  if (virtual_operand_p (gimple_phi_result (phi)))
1853 	    continue;
1854 
1855           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1856              (i.e., a phi in the tail of the outer-loop).  */
1857           if (! is_loop_header_bb_p (bb))
1858             {
1859               /* FORNOW: we currently don't support the case that these phis
1860                  are not used in the outer loop (unless it is a double
1861                  reduction, i.e., this phi is vect_reduction_def), because
1862                  this case would require us to actually do something here.  */
1863               if (STMT_VINFO_LIVE_P (stmt_info)
1864 		  && !vect_active_double_reduction_p (stmt_info))
1865                 {
1866                   if (dump_enabled_p ())
1867 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 				     "Unsupported loop-closed phi in "
1869 				     "outer-loop.\n");
1870                   return false;
1871                 }
1872 
1873               /* If PHI is used in the outer loop, we check that its operand
1874                  is defined in the inner loop.  */
1875               if (STMT_VINFO_RELEVANT_P (stmt_info))
1876                 {
1877                   tree phi_op;
1878 		  gimple *op_def_stmt;
1879 
1880                   if (gimple_phi_num_args (phi) != 1)
1881                     return false;
1882 
1883                   phi_op = PHI_ARG_DEF (phi, 0);
1884                   if (TREE_CODE (phi_op) != SSA_NAME)
1885                     return false;
1886 
1887                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1888 		  if (gimple_nop_p (op_def_stmt)
1889 		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1890 		      || !vinfo_for_stmt (op_def_stmt))
1891                     return false;
1892 
1893                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894                         != vect_used_in_outer
1895                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1896                            != vect_used_in_outer_by_reduction)
1897                     return false;
1898                 }
1899 
1900               continue;
1901             }
1902 
1903           gcc_assert (stmt_info);
1904 
1905           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1906                || STMT_VINFO_LIVE_P (stmt_info))
1907               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1908             {
1909               /* A scalar-dependence cycle that we don't support.  */
1910               if (dump_enabled_p ())
1911 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 				 "not vectorized: scalar dependence cycle.\n");
1913               return false;
1914             }
1915 
1916           if (STMT_VINFO_RELEVANT_P (stmt_info))
1917             {
1918               need_to_vectorize = true;
1919               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1920 		  && ! PURE_SLP_STMT (stmt_info))
1921                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1922 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1923 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1924 		       && ! PURE_SLP_STMT (stmt_info))
1925 		ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1926             }
1927 
1928 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1929 	  if (ok
1930 	      && STMT_VINFO_LIVE_P (stmt_info)
1931 	      && !PURE_SLP_STMT (stmt_info))
1932 	    ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1933 
1934           if (!ok)
1935             {
1936               if (dump_enabled_p ())
1937                 {
1938 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 				   "not vectorized: relevant phi not "
1940 				   "supported: ");
1941                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1942                 }
1943 	      return false;
1944             }
1945         }
1946 
1947       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1948 	   gsi_next (&si))
1949         {
1950 	  gimple *stmt = gsi_stmt (si);
1951 	  if (!gimple_clobber_p (stmt)
1952 	      && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1953 	    return false;
1954         }
1955     } /* bbs */
1956 
1957   /* All operations in the loop are either irrelevant (they deal with loop
1958      control, or are dead), or only used outside the loop and can be moved
1959      out of the loop (e.g. invariants, inductions).  The loop can be
1960      optimized away by scalar optimizations.  We're better off not
1961      touching this loop.  */
1962   if (!need_to_vectorize)
1963     {
1964       if (dump_enabled_p ())
1965         dump_printf_loc (MSG_NOTE, vect_location,
1966 			 "All the computation can be taken out of the loop.\n");
1967       if (dump_enabled_p ())
1968 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1969 			 "not vectorized: redundant loop. no profit to "
1970 			 "vectorize.\n");
1971       return false;
1972     }
1973 
1974   return true;
1975 }
1976 
1977 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1978    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1979    definitely no, or -1 if it's worth retrying.  */
1980 
1981 static int
1982 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1983 {
1984   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1985   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1986 
1987   /* Only fully-masked loops can have iteration counts less than the
1988      vectorization factor.  */
1989   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1990     {
1991       HOST_WIDE_INT max_niter;
1992 
1993       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1994 	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1995       else
1996 	max_niter = max_stmt_executions_int (loop);
1997 
1998       if (max_niter != -1
1999 	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2000 	{
2001 	  if (dump_enabled_p ())
2002 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 			     "not vectorized: iteration count smaller than "
2004 			     "vectorization factor.\n");
2005 	  return 0;
2006 	}
2007     }
2008 
2009   int min_profitable_iters, min_profitable_estimate;
2010   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2011 				      &min_profitable_estimate);
2012 
2013   if (min_profitable_iters < 0)
2014     {
2015       if (dump_enabled_p ())
2016 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017 			 "not vectorized: vectorization not profitable.\n");
2018       if (dump_enabled_p ())
2019 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 			 "not vectorized: vector version will never be "
2021 			 "profitable.\n");
2022       return -1;
2023     }
2024 
2025   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2026 			       * assumed_vf);
2027 
2028   /* Use the cost model only if it is more conservative than user specified
2029      threshold.  */
2030   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2031 				    min_profitable_iters);
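
  /* Worked example (hypothetical numbers): with --param min-vect-loop-bound=2
     and assumed_vf == 4, min_scalar_loop_bound is 8; if the cost model
     computed min_profitable_iters == 11, TH becomes 11, the more
     conservative of the two.  */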
2032 
2033   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2034 
2035   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2036       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2037     {
2038       if (dump_enabled_p ())
2039 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040 			 "not vectorized: vectorization not profitable.\n");
2041       if (dump_enabled_p ())
2042 	dump_printf_loc (MSG_NOTE, vect_location,
2043 			 "not vectorized: iteration count smaller than user "
2044 			 "specified loop bound parameter or minimum profitable "
2045 			 "iterations (whichever is more conservative).\n");
2046       return 0;
2047     }
2048 
2049   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2050   if (estimated_niter == -1)
2051     estimated_niter = likely_max_stmt_executions_int (loop);
2052   if (estimated_niter != -1
2053       && ((unsigned HOST_WIDE_INT) estimated_niter
2054 	  < MAX (th, (unsigned) min_profitable_estimate)))
2055     {
2056       if (dump_enabled_p ())
2057 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 			 "not vectorized: estimated iteration count too "
2059 			 "small.\n");
2060       if (dump_enabled_p ())
2061 	dump_printf_loc (MSG_NOTE, vect_location,
2062 			 "not vectorized: estimated iteration count smaller "
2063 			 "than specified loop bound parameter or minimum "
2064 			 "profitable iterations (whichever is more "
2065 			 "conservative).\n");
2066       return -1;
2067     }
2068 
2069   return 1;
2070 }
2071 
2072 
2073 /* Function vect_analyze_loop_2.
2074 
2075    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2076    for it.  The different analyses will record information in the
2077    loop_vec_info struct.  */
2078 static bool
2079 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2080 {
2081   bool ok;
2082   int res;
2083   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2084   poly_uint64 min_vf = 2;
2085   unsigned int n_stmts = 0;
2086 
2087   /* The first group of checks is independent of the vector size.  */
2088   fatal = true;
2089 
2090   /* Find all data references in the loop (which correspond to vdefs/vuses)
2091      and analyze their evolution in the loop.  */
2092 
2093   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2094 
2095   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2096   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2097     {
2098       if (dump_enabled_p ())
2099 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 			 "not vectorized: loop nest containing two "
2101 			 "or more consecutive inner loops cannot be "
2102 			 "vectorized\n");
2103       return false;
2104     }
2105 
2106   for (unsigned i = 0; i < loop->num_nodes; i++)
2107     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2108 	 !gsi_end_p (gsi); gsi_next (&gsi))
2109       {
2110 	gimple *stmt = gsi_stmt (gsi);
2111 	if (is_gimple_debug (stmt))
2112 	  continue;
2113 	++n_stmts;
2114 	if (!find_data_references_in_stmt (loop, stmt,
2115 					   &LOOP_VINFO_DATAREFS (loop_vinfo)))
2116 	  {
2117 	    if (is_gimple_call (stmt) && loop->safelen)
2118 	      {
2119 		tree fndecl = gimple_call_fndecl (stmt), op;
2120 		if (fndecl != NULL_TREE)
2121 		  {
2122 		    cgraph_node *node = cgraph_node::get (fndecl);
2123 		    if (node != NULL && node->simd_clones != NULL)
2124 		      {
2125 			unsigned int j, n = gimple_call_num_args (stmt);
2126 			for (j = 0; j < n; j++)
2127 			  {
2128 			    op = gimple_call_arg (stmt, j);
2129 			    if (DECL_P (op)
2130 				|| (REFERENCE_CLASS_P (op)
2131 				    && get_base_address (op)))
2132 			      break;
2133 			  }
2134 			op = gimple_call_lhs (stmt);
2135 			/* Ignore #pragma omp declare simd functions
2136 			   if they don't have data references in the
2137 			   call stmt itself.  */
2138 			if (j == n
2139 			    && !(op
2140 				 && (DECL_P (op)
2141 				     || (REFERENCE_CLASS_P (op)
2142 					 && get_base_address (op)))))
2143 			  continue;
2144 		      }
2145 		  }
2146 	      }
2147 	    if (dump_enabled_p ())
2148 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 			       "not vectorized: loop contains function "
2150 			       "calls or data references that cannot "
2151 			       "be analyzed\n");
2152 	    return false;
2153 	  }
2154       }
2155 
2156   /* Analyze the data references and also adjust the minimal
2157      vectorization factor according to the loads and stores.  */
2158 
2159   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2160   if (!ok)
2161     {
2162       if (dump_enabled_p ())
2163 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2164 			 "bad data references.\n");
2165       return false;
2166     }
2167 
2168   /* Classify all cross-iteration scalar data-flow cycles.
2169      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2170   vect_analyze_scalar_cycles (loop_vinfo);
2171 
2172   vect_pattern_recog (loop_vinfo);
2173 
2174   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2175 
2176   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2177      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2178 
2179   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2180   if (!ok)
2181     {
2182       if (dump_enabled_p ())
2183 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 			 "bad data access.\n");
2185       return false;
2186     }
2187 
2188   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2189 
2190   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2191   if (!ok)
2192     {
2193       if (dump_enabled_p ())
2194 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2195 			 "unexpected pattern.\n");
2196       return false;
2197     }
2198 
2199   /* The rest of the analysis below depends on the vector size in some way.  */
2200   fatal = false;
2201 
2202   /* Analyze data dependences between the data-refs in the loop
2203      and adjust the maximum vectorization factor according to
2204      the dependences.
2205      FORNOW: fail at the first data dependence that we encounter.  */
2206 
2207   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2208   if (!ok
2209       || (max_vf != MAX_VECTORIZATION_FACTOR
2210 	  && maybe_lt (max_vf, min_vf)))
2211     {
2212       if (dump_enabled_p ())
2213 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 			     "bad data dependence.\n");
2215       return false;
2216     }
2217   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2218 
2219   ok = vect_determine_vectorization_factor (loop_vinfo);
2220   if (!ok)
2221     {
2222       if (dump_enabled_p ())
2223 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 			 "can't determine vectorization factor.\n");
2225       return false;
2226     }
2227   if (max_vf != MAX_VECTORIZATION_FACTOR
2228       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2229     {
2230       if (dump_enabled_p ())
2231 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 			 "bad data dependence.\n");
2233       return false;
2234     }
2235 
2236   /* Compute the scalar iteration cost.  */
2237   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2238 
2239   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2240   unsigned th;
2241 
2242   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2243   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2244   if (!ok)
2245     return false;
2246 
2247   /* If there are any SLP instances mark them as pure_slp.  */
2248   bool slp = vect_make_slp_decision (loop_vinfo);
2249   if (slp)
2250     {
2251       /* Find stmts that need to be both vectorized and SLPed.  */
2252       vect_detect_hybrid_slp (loop_vinfo);
2253 
2254       /* Update the vectorization factor based on the SLP decision.  */
2255       vect_update_vf_for_slp (loop_vinfo);
2256     }
2257 
2258   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2259 
2260   /* We don't expect to have to roll back to anything other than an empty
2261      set of rgroups.  */
2262   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2263 
2264   /* This is the point where we can re-start analysis with SLP forced off.  */
2265 start_over:
2266 
2267   /* Now the vectorization factor is final.  */
2268   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2269   gcc_assert (known_ne (vectorization_factor, 0U));
2270 
2271   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2272     {
2273       dump_printf_loc (MSG_NOTE, vect_location,
2274 		       "vectorization_factor = ");
2275       dump_dec (MSG_NOTE, vectorization_factor);
2276       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2277 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2278     }
2279 
2280   HOST_WIDE_INT max_niter
2281     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2282 
2283   /* Analyze the alignment of the data-refs in the loop.
2284      Fail if a data reference is found that cannot be vectorized.  */
2285 
2286   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2287   if (!ok)
2288     {
2289       if (dump_enabled_p ())
2290 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 			 "bad data alignment.\n");
2292       return false;
2293     }
2294 
2295   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2296      It is important to call pruning after vect_analyze_data_ref_accesses,
2297      since we use grouping information gathered by interleaving analysis.  */
2298   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2299   if (!ok)
2300     return false;
2301 
2302   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2303      vectorization.  */
2304   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2305     {
2306     /* This pass will decide on using loop versioning and/or loop peeling in
2307        order to enhance the alignment of data references in the loop.  */
2308     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2309     if (!ok)
2310       {
2311 	if (dump_enabled_p ())
2312 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313 			   "bad data alignment.\n");
2314         return false;
2315       }
2316     }
2317 
2318   if (slp)
2319     {
2320       /* Analyze operations in the SLP instances.  Note this may
2321 	 remove unsupported SLP instances which makes the above
2322 	 SLP kind detection invalid.  */
2323       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2324       vect_slp_analyze_operations (loop_vinfo);
2325       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2326 	goto again;
2327     }
2328 
2329   /* Scan all the remaining operations in the loop that are not subject
2330      to SLP and make sure they are vectorizable.  */
2331   ok = vect_analyze_loop_operations (loop_vinfo);
2332   if (!ok)
2333     {
2334       if (dump_enabled_p ())
2335 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 			 "bad operation or unsupported loop bound.\n");
2337       return false;
2338     }
2339 
2340   /* Decide whether to use a fully-masked loop for this vectorization
2341      factor.  */
2342   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2343     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2344        && vect_verify_full_masking (loop_vinfo));
2345   if (dump_enabled_p ())
2346     {
2347       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2348 	dump_printf_loc (MSG_NOTE, vect_location,
2349 			 "using a fully-masked loop.\n");
2350       else
2351 	dump_printf_loc (MSG_NOTE, vect_location,
2352 			 "not using a fully-masked loop.\n");
2353     }
2354 
2355   /* If epilog loop is required because of data accesses with gaps,
2356      one additional iteration needs to be peeled.  Check if there is
2357      enough iterations for vectorization.  */
2358   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2359       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2360       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2361     {
2362       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2363       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2364 
2365       if (known_lt (wi::to_widest (scalar_niters), vf))
2366 	{
2367 	  if (dump_enabled_p ())
2368 	    dump_printf_loc (MSG_NOTE, vect_location,
2369 			     "loop has too few iterations to support"
2370 			     " peeling for gaps.\n");
2371 	  return false;
2372 	}
2373     }
2374 
2375   /* Check the costings of the loop make vectorizing worthwhile.  */
2376   res = vect_analyze_loop_costing (loop_vinfo);
2377   if (res < 0)
2378     goto again;
2379   if (!res)
2380     {
2381       if (dump_enabled_p ())
2382 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383 			 "Loop costings not worthwhile.\n");
2384       return false;
2385     }
2386 
2387   /* Decide whether we need to create an epilogue loop to handle
2388      remaining scalar iterations.  */
2389   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2390 
2391   unsigned HOST_WIDE_INT const_vf;
2392   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2393     /* The main loop handles all iterations.  */
2394     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2395   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2396 	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2397     {
2398       /* Work out the (constant) number of iterations that need to be
2399 	 peeled for reasons other than niters.  */
2400       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2401       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2402 	peel_niter += 1;
2403       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2404 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2405 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2406     }
2407   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2408 	   /* ??? When peeling for gaps but not alignment, we could
2409 	      try to check whether the (variable) niters is known to be
2410 	      VF * N + 1.  That's something of a niche case though.  */
2411 	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2412 	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2413 	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2414 		< (unsigned) exact_log2 (const_vf))
2415 	       /* In case of versioning, check if the maximum number of
2416 		  iterations is greater than th.  If they are identical,
2417 		  the epilogue is unnecessary.  */
2418 	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2419 		   || ((unsigned HOST_WIDE_INT) max_niter
2420 		       > (th / const_vf) * const_vf))))
2421     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
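
  /* For example (hypothetical constants): with const_vf == 4, i.e.
     exact_log2 (const_vf) == 2, a symbolic niters known to be a multiple
     of 4 has tree_ctz (niters) >= 2, so the vector loop covers all
     iterations and no epilogue is needed; otherwise an epilogue is
     normally required, subject to the versioning check above.  */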
2422 
2423   /* If an epilogue loop is required make sure we can create one.  */
2424   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2425       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2426     {
2427       if (dump_enabled_p ())
2428         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2429       if (!vect_can_advance_ivs_p (loop_vinfo)
2430 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2431 					   single_exit (LOOP_VINFO_LOOP
2432 							 (loop_vinfo))))
2433         {
2434           if (dump_enabled_p ())
2435 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2436 			     "not vectorized: can't create required "
2437 			     "epilog loop\n");
2438           goto again;
2439         }
2440     }
2441 
2442   /* During peeling, we need to check that the number of loop iterations is
2443      enough for both the peeled prolog loop and the vector loop.  This check
2444      can be merged with the threshold check of loop versioning, so
2445      increase the threshold for this case if necessary.  */
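  /* Sketch of the threshold computed below (assuming alignment is not
     handled by loop masks):

         niters_th = prolog peeling iterations
                       (or the vector element count minus 1 if unknown)
                     + VF   (unless the loop is fully masked)
                     + 1    (if peeling for gaps)

     i.e. roughly the smallest scalar iteration count for which the
     versioned, vectorized path is worth entering.  */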
2446   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2447     {
2448       poly_uint64 niters_th = 0;
2449 
2450       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2451 	{
2452 	  /* Niters for peeled prolog loop.  */
2453 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2454 	    {
2455 	      struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2456 	      tree vectype
2457 		= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2458 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2459 	    }
2460 	  else
2461 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2462 	}
2463 
2464       /* Niters for at least one iteration of vectorized loop.  */
2465       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2466 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2467       /* One additional iteration because of peeling for gap.  */
2468       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2469 	niters_th += 1;
2470       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2471     }
2472 
2473   gcc_assert (known_eq (vectorization_factor,
2474 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2475 
2476   /* Ok to vectorize!  */
2477   return true;
2478 
2479 again:
2480   /* Try again with SLP forced off, but if we didn't do any SLP there is
2481      no point in re-trying.  */
2482   if (!slp)
2483     return false;
2484 
2485   /* If there are reduction chains re-trying will fail anyway.  */
2486   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2487     return false;
2488 
2489   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2490      via interleaving or lane instructions.  */
2491   slp_instance instance;
2492   slp_tree node;
2493   unsigned i, j;
2494   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2495     {
2496       stmt_vec_info vinfo;
2497       vinfo = vinfo_for_stmt
2498 	  (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2499       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2500 	continue;
2501       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2502       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2503       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2504       if (! vect_store_lanes_supported (vectype, size, false)
2505 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2506 	 && ! vect_grouped_store_supported (vectype, size))
2507        return false;
2508       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2509 	{
2510 	  vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2511 	  vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2512 	  bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2513 	  size = STMT_VINFO_GROUP_SIZE (vinfo);
2514 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2515 	  if (! vect_load_lanes_supported (vectype, size, false)
2516 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2517 						size))
2518 	    return false;
2519 	}
2520     }
2521 
2522   if (dump_enabled_p ())
2523     dump_printf_loc (MSG_NOTE, vect_location,
2524 		     "re-trying with SLP disabled\n");
2525 
2526   /* Roll back state appropriately.  No SLP this time.  */
2527   slp = false;
2528   /* Restore the vectorization factor as it was without SLP.  */
2529   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2530   /* Free the SLP instances.  */
2531   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2532     vect_free_slp_instance (instance);
2533   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2534   /* Reset SLP type to loop_vect on all stmts.  */
2535   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2536     {
2537       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2538       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2539 	   !gsi_end_p (si); gsi_next (&si))
2540 	{
2541 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2542 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2543 	}
2544       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2545 	   !gsi_end_p (si); gsi_next (&si))
2546 	{
2547 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2548 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2549 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2550 	    {
2551 	      stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2552 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2553 	      for (gimple_stmt_iterator pi
2554 		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2555 		   !gsi_end_p (pi); gsi_next (&pi))
2556 		{
2557 		  gimple *pstmt = gsi_stmt (pi);
2558 		  STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2559 		}
2560 	    }
2561 	}
2562     }
2563   /* Free optimized alias test DDRS.  */
2564   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2565   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2566   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2567   /* Reset target cost data.  */
2568   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2569   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2570     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2571   /* Reset accumulated rgroup information.  */
2572   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2573   /* Reset assorted flags.  */
2574   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2575   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2576   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2577   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2578   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2579 
2580   goto start_over;
2581 }
2582 
2583 /* Function vect_analyze_loop.
2584 
2585    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2586    for it.  The different analyses will record information in the
2587    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2588    be vectorized.  */
2589 loop_vec_info
2590 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2591 {
2592   loop_vec_info loop_vinfo;
2593   auto_vector_sizes vector_sizes;
2594 
2595   /* Autodetect first vector size we try.  */
2596   current_vector_size = 0;
2597   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2598   unsigned int next_size = 0;
2599 
2600   if (dump_enabled_p ())
2601     dump_printf_loc (MSG_NOTE, vect_location,
2602 		     "===== analyze_loop_nest =====\n");
2603 
2604   if (loop_outer (loop)
2605       && loop_vec_info_for_loop (loop_outer (loop))
2606       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2607     {
2608       if (dump_enabled_p ())
2609 	dump_printf_loc (MSG_NOTE, vect_location,
2610 			 "outer-loop already vectorized.\n");
2611       return NULL;
2612     }
2613 
2614   poly_uint64 autodetected_vector_size = 0;
2615   while (1)
2616     {
2617       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2618       loop_vinfo = vect_analyze_loop_form (loop);
2619       if (!loop_vinfo)
2620 	{
2621 	  if (dump_enabled_p ())
2622 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623 			     "bad loop form.\n");
2624 	  return NULL;
2625 	}
2626 
2627       bool fatal = false;
2628 
2629       if (orig_loop_vinfo)
2630 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2631 
2632       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2633 	{
2634 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2635 
2636 	  return loop_vinfo;
2637 	}
2638 
2639       delete loop_vinfo;
2640 
2641       if (next_size == 0)
2642 	autodetected_vector_size = current_vector_size;
2643 
2644       if (next_size < vector_sizes.length ()
2645 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
2646 	next_size += 1;
2647 
2648       if (fatal
2649 	  || next_size == vector_sizes.length ()
2650 	  || known_eq (current_vector_size, 0U))
2651 	return NULL;
2652 
2653       /* Try the next biggest vector size.  */
2654       current_vector_size = vector_sizes[next_size++];
2655       if (dump_enabled_p ())
2656 	{
2657 	  dump_printf_loc (MSG_NOTE, vect_location,
2658 			   "***** Re-trying analysis with "
2659 			   "vector size ");
2660 	  dump_dec (MSG_NOTE, current_vector_size);
2661 	  dump_printf (MSG_NOTE, "\n");
2662 	}
2663     }
2664 }
2665 
2666 /* Return true if there is an in-order reduction function for CODE, storing
2667    it in *REDUC_FN if so.  */
2668 
2669 static bool
2670 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2671 {
2672   switch (code)
2673     {
2674     case PLUS_EXPR:
2675       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2676       return true;
2677 
2678     default:
2679       return false;
2680     }
2681 }
2682 
2683 /* Function reduction_fn_for_scalar_code
2684 
2685    Input:
2686    CODE - tree_code of a reduction operation.
2687 
2688    Output:
2689    REDUC_FN - the corresponding internal function to be used to reduce the
2690       vector of partial results into a single scalar result, or IFN_LAST
2691       if the operation is a supported reduction operation, but does not have
2692       such an internal function.
2693 
2694    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2695 
2696 static bool
2697 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2698 {
2699   switch (code)
2700     {
2701       case MAX_EXPR:
2702         *reduc_fn = IFN_REDUC_MAX;
2703         return true;
2704 
2705       case MIN_EXPR:
2706         *reduc_fn = IFN_REDUC_MIN;
2707         return true;
2708 
2709       case PLUS_EXPR:
2710         *reduc_fn = IFN_REDUC_PLUS;
2711         return true;
2712 
2713       case BIT_AND_EXPR:
2714 	*reduc_fn = IFN_REDUC_AND;
2715 	return true;
2716 
2717       case BIT_IOR_EXPR:
2718 	*reduc_fn = IFN_REDUC_IOR;
2719 	return true;
2720 
2721       case BIT_XOR_EXPR:
2722 	*reduc_fn = IFN_REDUC_XOR;
2723 	return true;
2724 
2725       case MULT_EXPR:
2726       case MINUS_EXPR:
2727         *reduc_fn = IFN_LAST;
2728         return true;
2729 
2730       default:
2731        return false;
2732     }
2733 }
2734 
2735 /* If there is a neutral value X such that SLP reduction NODE would not
2736    be affected by the introduction of additional X elements, return that X,
2737    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2738    is true if the SLP statements perform a single reduction, false if each
2739    statement performs an independent reduction.  */
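/* For example, appending extra zero elements to a PLUS_EXPR (or MINUS_EXPR)
   reduction, extra one elements to a MULT_EXPR reduction, or all-ones
   elements to a BIT_AND_EXPR reduction leaves the final result unchanged,
   which is what makes such padding safe.  */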
2740 
2741 static tree
2742 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2743 			      bool reduc_chain)
2744 {
2745   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2746   gimple *stmt = stmts[0];
2747   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2748   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2749   tree scalar_type = TREE_TYPE (vector_type);
2750   struct loop *loop = gimple_bb (stmt)->loop_father;
2751   gcc_assert (loop);
2752 
2753   switch (code)
2754     {
2755     case WIDEN_SUM_EXPR:
2756     case DOT_PROD_EXPR:
2757     case SAD_EXPR:
2758     case PLUS_EXPR:
2759     case MINUS_EXPR:
2760     case BIT_IOR_EXPR:
2761     case BIT_XOR_EXPR:
2762       return build_zero_cst (scalar_type);
2763 
2764     case MULT_EXPR:
2765       return build_one_cst (scalar_type);
2766 
2767     case BIT_AND_EXPR:
2768       return build_all_ones_cst (scalar_type);
2769 
2770     case MAX_EXPR:
2771     case MIN_EXPR:
2772       /* For MIN/MAX the initial values are neutral.  A reduction chain
2773 	 has only a single initial value, so that value is neutral for
2774 	 all statements.  */
2775       if (reduc_chain)
2776 	return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2777       return NULL_TREE;
2778 
2779     default:
2780       return NULL_TREE;
2781     }
2782 }
2783 
2784 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2785    STMT is printed with a message MSG. */
2786 
2787 static void
2788 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2789 {
2790   dump_printf_loc (msg_type, vect_location, "%s", msg);
2791   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2792 }
2793 
2794 
2795 /* Detect SLP reduction of the form:
2796 
2797    #a1 = phi <a5, a0>
2798    a2 = operation (a1)
2799    a3 = operation (a2)
2800    a4 = operation (a3)
2801    a5 = operation (a4)
2802 
2803    #a = phi <a5>
2804 
2805    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2806    FIRST_STMT is the first reduction stmt in the chain
2807    (a2 = operation (a1)).
2808 
2809    Return TRUE if a reduction chain was detected.  */
2810 
2811 static bool
2812 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2813 		       gimple *first_stmt)
2814 {
2815   struct loop *loop = (gimple_bb (phi))->loop_father;
2816   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2817   enum tree_code code;
2818   gimple *loop_use_stmt = NULL;
2819   stmt_vec_info use_stmt_info;
2820   tree lhs;
2821   imm_use_iterator imm_iter;
2822   use_operand_p use_p;
2823   int nloop_uses, size = 0, n_out_of_loop_uses;
2824   bool found = false;
2825 
2826   if (loop != vect_loop)
2827     return false;
2828 
2829   auto_vec<stmt_vec_info, 8> reduc_chain;
2830   lhs = PHI_RESULT (phi);
2831   code = gimple_assign_rhs_code (first_stmt);
2832   while (1)
2833     {
2834       nloop_uses = 0;
2835       n_out_of_loop_uses = 0;
2836       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2837         {
2838 	  gimple *use_stmt = USE_STMT (use_p);
2839 	  if (is_gimple_debug (use_stmt))
2840 	    continue;
2841 
2842           /* Check if we got back to the reduction phi.  */
2843 	  if (use_stmt == phi)
2844             {
2845 	      loop_use_stmt = use_stmt;
2846               found = true;
2847               break;
2848             }
2849 
2850           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2851             {
2852 	      loop_use_stmt = use_stmt;
2853 	      nloop_uses++;
2854             }
2855            else
2856              n_out_of_loop_uses++;
2857 
2858            /* There can be either a single use in the loop or two uses in
2859               phi nodes.  */
2860            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2861              return false;
2862         }
2863 
2864       if (found)
2865         break;
2866 
2867       /* We reached a statement with no loop uses.  */
2868       if (nloop_uses == 0)
2869 	return false;
2870 
2871       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2872       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2873         return false;
2874 
2875       if (!is_gimple_assign (loop_use_stmt)
2876 	  || code != gimple_assign_rhs_code (loop_use_stmt)
2877 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2878         return false;
2879 
2880       /* Insert USE_STMT into reduction chain.  */
2881       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2882       reduc_chain.safe_push (use_stmt_info);
2883 
2884       lhs = gimple_assign_lhs (loop_use_stmt);
2885       size++;
2886    }
2887 
2888   if (!found || loop_use_stmt != phi || size < 2)
2889     return false;
2890 
2891   /* Swap the operands, if needed, to make the reduction operand be the second
2892      operand.  */
2893   lhs = PHI_RESULT (phi);
2894   for (unsigned i = 0; i < reduc_chain.length (); ++i)
2895     {
2896       gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2897       if (gimple_assign_rhs2 (next_stmt) == lhs)
2898 	{
2899 	  tree op = gimple_assign_rhs1 (next_stmt);
2900 	  gimple *def_stmt = NULL;
2901 
2902           if (TREE_CODE (op) == SSA_NAME)
2903             def_stmt = SSA_NAME_DEF_STMT (op);
2904 
2905 	  /* Check that the other def is either defined in the loop
2906 	     ("vect_internal_def"), or it's an induction (defined by a
2907 	     loop-header phi-node).  */
2908           if (def_stmt
2909               && gimple_bb (def_stmt)
2910 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2911               && (is_gimple_assign (def_stmt)
2912                   || is_gimple_call (def_stmt)
2913                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2914                            == vect_induction_def
2915                   || (gimple_code (def_stmt) == GIMPLE_PHI
2916                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2917                                   == vect_internal_def
2918                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2919 	    {
2920 	      lhs = gimple_assign_lhs (next_stmt);
2921  	      continue;
2922 	    }
2923 
2924 	  return false;
2925 	}
2926       else
2927 	{
2928           tree op = gimple_assign_rhs2 (next_stmt);
2929 	  gimple *def_stmt = NULL;
2930 
2931           if (TREE_CODE (op) == SSA_NAME)
2932             def_stmt = SSA_NAME_DEF_STMT (op);
2933 
2934           /* Check that the other def is either defined in the loop
2935             ("vect_internal_def"), or it's an induction (defined by a
2936             loop-header phi-node).  */
2937           if (def_stmt
2938               && gimple_bb (def_stmt)
2939 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2940               && (is_gimple_assign (def_stmt)
2941                   || is_gimple_call (def_stmt)
2942                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2943                               == vect_induction_def
2944                   || (gimple_code (def_stmt) == GIMPLE_PHI
2945                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2946                                   == vect_internal_def
2947                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2948   	    {
2949 	      if (dump_enabled_p ())
2950 		{
2951 		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2952 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2953 		}
2954 
2955 	      swap_ssa_operands (next_stmt,
2956 	 		         gimple_assign_rhs1_ptr (next_stmt),
2957                                  gimple_assign_rhs2_ptr (next_stmt));
2958 	      update_stmt (next_stmt);
2959 
2960 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2961 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2962 	    }
2963 	  else
2964 	    return false;
2965         }
2966 
2967       lhs = gimple_assign_lhs (next_stmt);
2968     }
2969 
2970   /* Build up the actual chain.  */
2971   for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2972     {
2973       GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt;
2974       GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt;
2975     }
2976   GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt;
2977   GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2978 
2979   /* Save the chain for further analysis in SLP detection.  */
2980   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt);
2981   GROUP_SIZE (reduc_chain[0]) = size;
2982 
2983   return true;
2984 }
2985 
2986 /* Return true if we need an in-order reduction for operation CODE
2987    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2988    overflow must wrap.  */
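/* For example, a floating-point accumulation

       for (i = 0; i < n; i++)
         sum += a[i];

   must generally be evaluated in the original order unless
   -fassociative-math is in effect, because FP addition is not
   associative; such reductions are what the fold-left strategy is for.  */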
2989 
2990 static bool
2991 needs_fold_left_reduction_p (tree type, tree_code code,
2992 			     bool need_wrapping_integral_overflow)
2993 {
2994   /* CHECKME: check for !flag_finite_math_only too?  */
2995   if (SCALAR_FLOAT_TYPE_P (type))
2996     switch (code)
2997       {
2998       case MIN_EXPR:
2999       case MAX_EXPR:
3000 	return false;
3001 
3002       default:
3003 	return !flag_associative_math;
3004       }
3005 
3006   if (INTEGRAL_TYPE_P (type))
3007     {
3008       if (!operation_no_trapping_overflow (type, code))
3009 	return true;
3010       if (need_wrapping_integral_overflow
3011 	  && !TYPE_OVERFLOW_WRAPS (type)
3012 	  && operation_can_overflow (code))
3013 	return true;
3014       return false;
3015     }
3016 
3017   if (SAT_FIXED_POINT_TYPE_P (type))
3018     return true;
3019 
3020   return false;
3021 }
3022 
3023 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3024    reduction operation CODE has a handled computation expression.  */
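/* Illustrative sketch of an accepted path for CODE == PLUS_EXPR:

       sum_1 = PHI <sum_0(preheader), sum_3(latch)>
       sum_2 = sum_1 + a[i];
       sum_3 = sum_2 + b[i];

   walking back from the latch argument sum_3 reaches the PHI result
   through single-use PLUS_EXPR statements only; a MINUS_EXPR is also
   tolerated as long as the running value is not negated overall.  */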
3025 
3026 bool
3027 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3028 		      enum tree_code code)
3029 {
3030   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3031   auto_bitmap visited;
3032   tree lookfor = PHI_RESULT (phi);
3033   ssa_op_iter curri;
3034   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3035   while (USE_FROM_PTR (curr) != loop_arg)
3036     curr = op_iter_next_use (&curri);
3037   curri.i = curri.numops;
3038   do
3039     {
3040       path.safe_push (std::make_pair (curri, curr));
3041       tree use = USE_FROM_PTR (curr);
3042       if (use == lookfor)
3043 	break;
3044       gimple *def = SSA_NAME_DEF_STMT (use);
3045       if (gimple_nop_p (def)
3046 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3047 	{
3048 pop:
3049 	  do
3050 	    {
3051 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3052 	      curri = x.first;
3053 	      curr = x.second;
3054 	      do
3055 		curr = op_iter_next_use (&curri);
3056 	      /* Skip already visited or non-SSA operands (from iterating
3057 	         over PHI args).  */
3058 	      while (curr != NULL_USE_OPERAND_P
3059 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3060 			 || ! bitmap_set_bit (visited,
3061 					      SSA_NAME_VERSION
3062 					        (USE_FROM_PTR (curr)))));
3063 	    }
3064 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3065 	  if (curr == NULL_USE_OPERAND_P)
3066 	    break;
3067 	}
3068       else
3069 	{
3070 	  if (gimple_code (def) == GIMPLE_PHI)
3071 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3072 	  else
3073 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3074 	  while (curr != NULL_USE_OPERAND_P
3075 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3076 		     || ! bitmap_set_bit (visited,
3077 					  SSA_NAME_VERSION
3078 					    (USE_FROM_PTR (curr)))))
3079 	    curr = op_iter_next_use (&curri);
3080 	  if (curr == NULL_USE_OPERAND_P)
3081 	    goto pop;
3082 	}
3083     }
3084   while (1);
3085   if (dump_file && (dump_flags & TDF_DETAILS))
3086     {
3087       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3088       unsigned i;
3089       std::pair<ssa_op_iter, use_operand_p> *x;
3090       FOR_EACH_VEC_ELT (path, i, x)
3091 	{
3092 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3093 	  dump_printf (MSG_NOTE, " ");
3094 	}
3095       dump_printf (MSG_NOTE, "\n");
3096     }
3097 
3098   /* Check whether the reduction path detected is valid.  */
3099   bool fail = path.length () == 0;
3100   bool neg = false;
3101   for (unsigned i = 1; i < path.length (); ++i)
3102     {
3103       gimple *use_stmt = USE_STMT (path[i].second);
3104       tree op = USE_FROM_PTR (path[i].second);
3105       if (! has_single_use (op)
3106 	  || ! is_gimple_assign (use_stmt))
3107 	{
3108 	  fail = true;
3109 	  break;
3110 	}
3111       if (gimple_assign_rhs_code (use_stmt) != code)
3112 	{
3113 	  if (code == PLUS_EXPR
3114 	      && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3115 	    {
3116 	      /* Track whether we negate the reduction value each iteration.  */
3117 	      if (gimple_assign_rhs2 (use_stmt) == op)
3118 		neg = ! neg;
3119 	    }
3120 	  else
3121 	    {
3122 	      fail = true;
3123 	      break;
3124 	    }
3125 	}
3126     }
3127   return ! fail && ! neg;
3128 }
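/* An illustrative sketch of the walk above: for a simple two-statement
   summation chain

     sum_1 = PHI <sum_0 (preheader), sum_4 (latch)>
     sum_3 = sum_1 + a_5;
     sum_4 = sum_3 + b_6;

   the search starts at the latch argument sum_4 and follows SSA defs
   back through sum_3 until it reaches the PHI result sum_1, recording
   the path "sum_4 sum_3 sum_1".  Each of sum_3 and sum_1 is used once,
   in a PLUS_EXPR assignment matching CODE, so the path is accepted.  */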
3129 
3130 
3131 /* Function vect_is_simple_reduction
3132 
3133    (1) Detect a cross-iteration def-use cycle that represents a simple
3134    reduction computation.  We look for the following pattern:
3135 
3136    loop_header:
3137      a1 = phi < a0, a2 >
3138      a3 = ...
3139      a2 = operation (a3, a1)
3140 
3141    or
3142 
3143    a3 = ...
3144    loop_header:
3145      a1 = phi < a0, a2 >
3146      a2 = operation (a3, a1)
3147 
3148    such that:
3149    1. operation is commutative and associative and it is safe to
3150       change the order of the computation
3151    2. no uses for a2 in the loop (a2 is used out of the loop)
3152    3. no uses of a1 in the loop besides the reduction operation
3153    4. no uses of a1 outside the loop.
3154 
3155    Conditions 1,4 are tested here.
3156    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3157 
3158    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3159    nested cycles.
3160 
3161    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3162    reductions:
3163 
3164      a1 = phi < a0, a2 >
3165      inner loop (def of a3)
3166      a2 = phi < a3 >
3167 
3168    (4) Detect condition expressions, i.e.:
3169      for (int i = 0; i < N; i++)
3170        if (a[i] < val)
3171 	ret_val = a[i];
3172 
3173 */
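/* As a concrete (illustrative) source-level instance of pattern (1),

     s = a0;
     for (i = 0; i < n; i++)
       s += x[i];

   is represented inside the loop roughly as

     a1 = phi <a0, a2>
     a3 = x[i];
     a2 = a3 + a1;

   where the addition is the reduction operation.  */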
3174 
3175 static gimple *
3176 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3177 			  bool *double_reduc,
3178 			  bool need_wrapping_integral_overflow,
3179 			  enum vect_reduction_type *v_reduc_type)
3180 {
3181   struct loop *loop = (gimple_bb (phi))->loop_father;
3182   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3183   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3184   enum tree_code orig_code, code;
3185   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3186   tree type;
3187   int nloop_uses;
3188   tree name;
3189   imm_use_iterator imm_iter;
3190   use_operand_p use_p;
3191   bool phi_def;
3192 
3193   *double_reduc = false;
3194   *v_reduc_type = TREE_CODE_REDUCTION;
3195 
3196   tree phi_name = PHI_RESULT (phi);
3197   /* ???  If there are no uses of the PHI result the inner loop reduction
3198      won't be detected as a possible double reduction by
3199      vectorizable_reduction because that tries to walk the PHI arg from
3200      the preheader edge which can be constant.  See PR60382.  */
3201   if (has_zero_uses (phi_name))
3202     return NULL;
3203   nloop_uses = 0;
3204   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3205     {
3206       gimple *use_stmt = USE_STMT (use_p);
3207       if (is_gimple_debug (use_stmt))
3208 	continue;
3209 
3210       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3211         {
3212           if (dump_enabled_p ())
3213 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3214 			     "intermediate value used outside loop.\n");
3215 
3216           return NULL;
3217         }
3218 
3219       nloop_uses++;
3220       if (nloop_uses > 1)
3221         {
3222           if (dump_enabled_p ())
3223 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3224 			     "reduction value used in loop.\n");
3225           return NULL;
3226         }
3227 
3228       phi_use_stmt = use_stmt;
3229     }
3230 
3231   edge latch_e = loop_latch_edge (loop);
3232   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3233   if (TREE_CODE (loop_arg) != SSA_NAME)
3234     {
3235       if (dump_enabled_p ())
3236 	{
3237 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3238 			   "reduction: not ssa_name: ");
3239 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3240           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3241 	}
3242       return NULL;
3243     }
3244 
3245   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3246   if (is_gimple_assign (def_stmt))
3247     {
3248       name = gimple_assign_lhs (def_stmt);
3249       phi_def = false;
3250     }
3251   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3252     {
3253       name = PHI_RESULT (def_stmt);
3254       phi_def = true;
3255     }
3256   else
3257     {
3258       if (dump_enabled_p ())
3259 	{
3260 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3261 			   "reduction: unhandled reduction operation: ");
3262 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3263 	}
3264       return NULL;
3265     }
3266 
3267   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3268     return NULL;
3269 
3270   nloop_uses = 0;
3271   auto_vec<gphi *, 3> lcphis;
3272   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3273     {
3274       gimple *use_stmt = USE_STMT (use_p);
3275       if (is_gimple_debug (use_stmt))
3276 	continue;
3277       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3278 	nloop_uses++;
3279       else
3280 	/* We can have more than one loop-closed PHI.  */
3281 	lcphis.safe_push (as_a <gphi *> (use_stmt));
3282       if (nloop_uses > 1)
3283 	{
3284 	  if (dump_enabled_p ())
3285 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3286 			     "reduction used in loop.\n");
3287 	  return NULL;
3288 	}
3289     }
3290 
3291   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3292      defined in the inner loop.  */
3293   if (phi_def)
3294     {
3295       op1 = PHI_ARG_DEF (def_stmt, 0);
3296 
3297       if (gimple_phi_num_args (def_stmt) != 1
3298           || TREE_CODE (op1) != SSA_NAME)
3299         {
3300           if (dump_enabled_p ())
3301 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3302 			     "unsupported phi node definition.\n");
3303 
3304           return NULL;
3305         }
3306 
3307       def1 = SSA_NAME_DEF_STMT (op1);
3308       if (gimple_bb (def1)
3309 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3310           && loop->inner
3311           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3312           && is_gimple_assign (def1)
3313 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3314         {
3315           if (dump_enabled_p ())
3316             report_vect_op (MSG_NOTE, def_stmt,
3317 			    "detected double reduction: ");
3318 
3319           *double_reduc = true;
3320           return def_stmt;
3321         }
3322 
3323       return NULL;
3324     }
3325 
3326   /* If we are vectorizing an inner reduction, we execute it in the
3327      original order only when we are not dealing with a double
3328      reduction.  */
3329   bool check_reduction = true;
3330   if (flow_loop_nested_p (vect_loop, loop))
3331     {
3332       gphi *lcphi;
3333       unsigned i;
3334       check_reduction = false;
3335       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3336 	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3337 	  {
3338 	    gimple *use_stmt = USE_STMT (use_p);
3339 	    if (is_gimple_debug (use_stmt))
3340 	      continue;
3341 	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3342 	      check_reduction = true;
3343 	  }
3344     }
3345 
3346   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3347   code = orig_code = gimple_assign_rhs_code (def_stmt);
3348 
3349   /* We can handle "res -= x[i]", which is non-associative, by
3350      rewriting it into "res += -x[i]".  Avoid changing the gimple
3351      statement for the first simple tests and only do this if we're
3352      allowed to change the code at all.  */
3353   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3354     code = PLUS_EXPR;
3355 
3356   if (code == COND_EXPR)
3357     {
3358       if (! nested_in_vect_loop)
3359 	*v_reduc_type = COND_REDUCTION;
3360 
3361       op3 = gimple_assign_rhs1 (def_stmt);
3362       if (COMPARISON_CLASS_P (op3))
3363         {
3364           op4 = TREE_OPERAND (op3, 1);
3365           op3 = TREE_OPERAND (op3, 0);
3366         }
3367       if (op3 == phi_name || op4 == phi_name)
3368 	{
3369 	  if (dump_enabled_p ())
3370 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3371 			    "reduction: condition depends on previous"
3372 			    " iteration: ");
3373 	  return NULL;
3374 	}
3375 
3376       op1 = gimple_assign_rhs2 (def_stmt);
3377       op2 = gimple_assign_rhs3 (def_stmt);
3378     }
3379   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3380     {
3381       if (dump_enabled_p ())
3382 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3383 			"reduction: not commutative/associative: ");
3384       return NULL;
3385     }
3386   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3387     {
3388       op1 = gimple_assign_rhs1 (def_stmt);
3389       op2 = gimple_assign_rhs2 (def_stmt);
3390     }
3391   else
3392     {
3393       if (dump_enabled_p ())
3394 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3395 			"reduction: not handled operation: ");
3396       return NULL;
3397     }
3398 
3399   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3400     {
3401       if (dump_enabled_p ())
3402 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3403 			"reduction: both uses not ssa_names: ");
3404 
3405       return NULL;
3406     }
3407 
3408   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3409   if ((TREE_CODE (op1) == SSA_NAME
3410        && !types_compatible_p (type,TREE_TYPE (op1)))
3411       || (TREE_CODE (op2) == SSA_NAME
3412           && !types_compatible_p (type, TREE_TYPE (op2)))
3413       || (op3 && TREE_CODE (op3) == SSA_NAME
3414           && !types_compatible_p (type, TREE_TYPE (op3)))
3415       || (op4 && TREE_CODE (op4) == SSA_NAME
3416           && !types_compatible_p (type, TREE_TYPE (op4))))
3417     {
3418       if (dump_enabled_p ())
3419         {
3420           dump_printf_loc (MSG_NOTE, vect_location,
3421 			   "reduction: multiple types: operation type: ");
3422           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3423           dump_printf (MSG_NOTE, ", operands types: ");
3424           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3425 			     TREE_TYPE (op1));
3426           dump_printf (MSG_NOTE, ",");
3427           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3428 			     TREE_TYPE (op2));
3429           if (op3)
3430             {
3431               dump_printf (MSG_NOTE, ",");
3432               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3433 				 TREE_TYPE (op3));
3434             }
3435 
3436           if (op4)
3437             {
3438               dump_printf (MSG_NOTE, ",");
3439               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3440 				 TREE_TYPE (op4));
3441             }
3442           dump_printf (MSG_NOTE, "\n");
3443         }
3444 
3445       return NULL;
3446     }
3447 
3448   /* Check whether it's ok to change the order of the computation.
3449      Generally, when vectorizing a reduction we change the order of the
3450      computation.  This may change the behavior of the program in some
3451      cases, so we need to check that this is ok.  One exception is when
3452      vectorizing an outer-loop: the inner-loop is executed sequentially,
3453      and therefore vectorizing reductions in the inner-loop during
3454      outer-loop vectorization is safe.  */
3455   if (check_reduction
3456       && *v_reduc_type == TREE_CODE_REDUCTION
3457       && needs_fold_left_reduction_p (type, code,
3458 				      need_wrapping_integral_overflow))
3459     *v_reduc_type = FOLD_LEFT_REDUCTION;
3460 
3461   /* Reduction is safe. We're dealing with one of the following:
3462      1) integer arithmetic and no trapv
3463      2) floating point arithmetic, and special flags permit this optimization
3464      3) nested cycle (i.e., outer loop vectorization).  */
3465   if (TREE_CODE (op1) == SSA_NAME)
3466     def1 = SSA_NAME_DEF_STMT (op1);
3467 
3468   if (TREE_CODE (op2) == SSA_NAME)
3469     def2 = SSA_NAME_DEF_STMT (op2);
3470 
3471   if (code != COND_EXPR
3472       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3473     {
3474       if (dump_enabled_p ())
3475 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3476       return NULL;
3477     }
3478 
3479   /* Check that one def is the reduction def, defined by PHI,
3480      the other def is either defined in the loop ("vect_internal_def"),
3481      or it's an induction (defined by a loop-header phi-node).  */
3482 
3483   if (def2 && def2 == phi
3484       && (code == COND_EXPR
3485 	  || !def1 || gimple_nop_p (def1)
3486 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3487           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3488               && (is_gimple_assign (def1)
3489 		  || is_gimple_call (def1)
3490   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3491                       == vect_induction_def
3492    	          || (gimple_code (def1) == GIMPLE_PHI
3493 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3494                           == vect_internal_def
3495  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
3496     {
3497       if (dump_enabled_p ())
3498 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3499       return def_stmt;
3500     }
3501 
3502   if (def1 && def1 == phi
3503       && (code == COND_EXPR
3504 	  || !def2 || gimple_nop_p (def2)
3505 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3506 	  || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3507 	      && (is_gimple_assign (def2)
3508 		  || is_gimple_call (def2)
3509 		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3510 		       == vect_induction_def
3511 		  || (gimple_code (def2) == GIMPLE_PHI
3512 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3513 			   == vect_internal_def
3514 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
3515     {
3516       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3517 	{
3518 	  /* Check if we can swap operands (just for simplicity - so that
3519 	     the rest of the code can assume that the reduction variable
3520 	     is always the last (second) argument).  */
3521 	  if (code == COND_EXPR)
3522 	    {
3523 	      /* Swap cond_expr by inverting the condition.  */
3524 	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
3525 	      enum tree_code invert_code = ERROR_MARK;
3526 	      enum tree_code cond_code = TREE_CODE (cond_expr);
3527 
3528 	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3529 		{
3530 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3531 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
3532 		}
3533 	      if (invert_code != ERROR_MARK)
3534 		{
3535 		  TREE_SET_CODE (cond_expr, invert_code);
3536 		  swap_ssa_operands (def_stmt,
3537 				     gimple_assign_rhs2_ptr (def_stmt),
3538 				     gimple_assign_rhs3_ptr (def_stmt));
3539 		}
3540 	      else
3541 		{
3542 		  if (dump_enabled_p ())
3543 		    report_vect_op (MSG_NOTE, def_stmt,
3544 				    "detected reduction: cannot swap operands "
3545 				    "for cond_expr");
3546 		  return NULL;
3547 		}
3548 	    }
3549 	  else
3550 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3551 			       gimple_assign_rhs2_ptr (def_stmt));
3552 
3553 	  if (dump_enabled_p ())
3554 	    report_vect_op (MSG_NOTE, def_stmt,
3555 			    "detected reduction: need to swap operands: ");
3556 
3557 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3558 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3559         }
3560       else
3561         {
3562           if (dump_enabled_p ())
3563             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3564         }
3565 
3566       return def_stmt;
3567     }
3568 
3569   /* Try to find an SLP reduction chain.  */
3570   if (! nested_in_vect_loop
3571       && code != COND_EXPR
3572       && orig_code != MINUS_EXPR
3573       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3574     {
3575       if (dump_enabled_p ())
3576         report_vect_op (MSG_NOTE, def_stmt,
3577 			"reduction: detected reduction chain: ");
3578 
3579       return def_stmt;
3580     }
3581 
3582   /* Dissolve any group that vect_is_slp_reduction may have half-built.  */
3583   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3584   while (first)
3585     {
3586       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3587       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3588       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3589       first = next;
3590     }
3591 
3592   /* Look for the expression computing loop_arg from loop PHI result.  */
3593   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3594 			    code))
3595     return def_stmt;
3596 
3597   if (dump_enabled_p ())
3598     {
3599       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3600 		      "reduction: unknown pattern: ");
3601     }
3602 
3603   return NULL;
3604 }
3605 
3606 /* Wrapper around vect_is_simple_reduction, which will modify code
3607    in-place if doing so enables detection of more reductions.
3608    The arguments are the same as for vect_is_simple_reduction.  */
3609 
3610 gimple *
3611 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3612 			     bool *double_reduc,
3613 			     bool need_wrapping_integral_overflow)
3614 {
3615   enum vect_reduction_type v_reduc_type;
3616   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3617 					  need_wrapping_integral_overflow,
3618 					  &v_reduc_type);
3619   if (def)
3620     {
3621       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3622       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3623       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3624       reduc_def_info = vinfo_for_stmt (def);
3625       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3626       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3627     }
3628   return def;
3629 }
3630 
3631 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3632 int
3633 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3634                              int *peel_iters_epilogue,
3635                              stmt_vector_for_cost *scalar_cost_vec,
3636 			     stmt_vector_for_cost *prologue_cost_vec,
3637 			     stmt_vector_for_cost *epilogue_cost_vec)
3638 {
3639   int retval = 0;
3640   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3641 
3642   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3643     {
3644       *peel_iters_epilogue = assumed_vf / 2;
3645       if (dump_enabled_p ())
3646         dump_printf_loc (MSG_NOTE, vect_location,
3647 			 "cost model: epilogue peel iters set to vf/2 "
3648 			 "because loop iterations are unknown.\n");
3649 
3650       /* If peeled iterations are known but the number of scalar loop
3651          iterations is unknown, count a taken branch per peeled loop.  */
3652       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3653 				 NULL, 0, vect_prologue);
3654       retval += record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3655 				 NULL, 0, vect_epilogue);
3656     }
3657   else
3658     {
3659       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3660       peel_iters_prologue = niters < peel_iters_prologue ?
3661                             niters : peel_iters_prologue;
3662       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3663       /* If we need to peel for gaps, but no epilogue peeling would otherwise
3664 	 be required, we have to peel VF iterations.  */
3665       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3666 	*peel_iters_epilogue = assumed_vf;
3667     }
3668 
3669   stmt_info_for_cost *si;
3670   int j;
3671   if (peel_iters_prologue)
3672     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3673 	{
3674 	  stmt_vec_info stmt_info
3675 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3676 	  retval += record_stmt_cost (prologue_cost_vec,
3677 				      si->count * peel_iters_prologue,
3678 				      si->kind, stmt_info, si->misalign,
3679 				      vect_prologue);
3680 	}
3681   if (*peel_iters_epilogue)
3682     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3683 	{
3684 	  stmt_vec_info stmt_info
3685 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3686 	  retval += record_stmt_cost (epilogue_cost_vec,
3687 				      si->count * *peel_iters_epilogue,
3688 				      si->kind, stmt_info, si->misalign,
3689 				      vect_epilogue);
3690 	}
3691 
3692   return retval;
3693 }
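/* An illustrative sketch of the arithmetic above, with made-up numbers:
   for a known niters of 100, a peel_iters_prologue of 3 and an assumed
   vectorization factor of 8, the epilogue gets (100 - 3) % 8 == 1 peeled
   iteration; if the loop also needs peeling for gaps and that remainder
   had been 0, a full 8 iterations would be peeled instead.  Each peeled
   iteration is then costed as one copy of the scalar body taken from
   SCALAR_COST_VEC.  */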
3694 
3695 /* Function vect_estimate_min_profitable_iters
3696 
3697    Return the number of iterations required for the vector version of the
3698    loop to be profitable relative to the cost of the scalar version of the
3699    loop.
3700 
3701    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3702    of iterations for vectorization.  A value of -1 means loop vectorization
3703    is not profitable.  This returned value may be used for a dynamic
3704    profitability check.
3705 
3706    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3707    for static check against estimated number of iterations.  */
3708 
3709 static void
3710 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3711 				    int *ret_min_profitable_niters,
3712 				    int *ret_min_profitable_estimate)
3713 {
3714   int min_profitable_iters;
3715   int min_profitable_estimate;
3716   int peel_iters_prologue;
3717   int peel_iters_epilogue;
3718   unsigned vec_inside_cost = 0;
3719   int vec_outside_cost = 0;
3720   unsigned vec_prologue_cost = 0;
3721   unsigned vec_epilogue_cost = 0;
3722   int scalar_single_iter_cost = 0;
3723   int scalar_outside_cost = 0;
3724   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3725   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3726   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3727 
3728   /* Cost model disabled.  */
3729   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3730     {
3731       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3732       *ret_min_profitable_niters = 0;
3733       *ret_min_profitable_estimate = 0;
3734       return;
3735     }
3736 
3737   /* Requires loop versioning tests to handle misalignment.  */
3738   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3739     {
3740       /*  FIXME: Make cost depend on complexity of individual check.  */
3741       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3742       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3743 			    vect_prologue);
3744       dump_printf (MSG_NOTE,
3745                    "cost model: Adding cost of checks for loop "
3746                    "versioning to treat misalignment.\n");
3747     }
3748 
3749   /* Requires loop versioning with alias checks.  */
3750   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3751     {
3752       /*  FIXME: Make cost depend on complexity of individual check.  */
3753       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3754       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3755 			    vect_prologue);
3756       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3757       if (len)
3758 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3759 	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3760 			      NULL, 0, vect_prologue);
3761       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3762       if (len)
3763 	{
3764 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3765 	  unsigned int nstmts = len * 2 - 1;
3766 	  /* +1 for each bias that needs adding.  */
3767 	  for (unsigned int i = 0; i < len; ++i)
3768 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3769 	      nstmts += 1;
3770 	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3771 				NULL, 0, vect_prologue);
3772 	}
3773       dump_printf (MSG_NOTE,
3774                    "cost model: Adding cost of checks for loop "
3775                    "versioning aliasing.\n");
3776     }
3777 
3778   /* Requires loop versioning with niter checks.  */
3779   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3780     {
3781       /*  FIXME: Make cost depend on complexity of individual check.  */
3782       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3783 			    vect_prologue);
3784       dump_printf (MSG_NOTE,
3785 		   "cost model: Adding cost of checks for loop "
3786 		   "versioning niters.\n");
3787     }
3788 
3789   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3790     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3791 			  vect_prologue);
3792 
3793   /* Count statements in scalar loop.  Using this as scalar cost for a single
3794      iteration for now.
3795 
3796      TODO: Add outer loop support.
3797 
3798      TODO: Consider assigning different costs to different scalar
3799      statements.  */
3800 
3801   scalar_single_iter_cost
3802     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3803 
3804   /* Add additional cost for the peeled instructions in prologue and epilogue
3805      loop.  (For fully-masked loops there will be no peeling.)
3806 
3807      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3808      at compile time, we assume it's vf/2 (the worst would be vf-1).
3809 
3810      TODO: Build an expression that represents peel_iters for prologue and
3811      epilogue to be used in a run-time test.  */
3812 
3813   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3814     {
3815       peel_iters_prologue = 0;
3816       peel_iters_epilogue = 0;
3817 
3818       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3819 	{
3820 	  /* We need to peel exactly one iteration.  */
3821 	  peel_iters_epilogue += 1;
3822 	  stmt_info_for_cost *si;
3823 	  int j;
3824 	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3825 			    j, si)
3826 	    {
3827 	      struct _stmt_vec_info *stmt_info
3828 		= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3829 	      (void) add_stmt_cost (target_cost_data, si->count,
3830 				    si->kind, stmt_info, si->misalign,
3831 				    vect_epilogue);
3832 	    }
3833 	}
3834     }
3835   else if (npeel < 0)
3836     {
3837       peel_iters_prologue = assumed_vf / 2;
3838       dump_printf (MSG_NOTE, "cost model: "
3839                    "prologue peel iters set to vf/2.\n");
3840 
3841       /* If peeling for alignment is unknown, loop bound of main loop becomes
3842          unknown.  */
3843       peel_iters_epilogue = assumed_vf / 2;
3844       dump_printf (MSG_NOTE, "cost model: "
3845                    "epilogue peel iters set to vf/2 because "
3846                    "peeling for alignment is unknown.\n");
3847 
3848       /* If peeled iterations are unknown, count a taken branch and a not taken
3849          branch per peeled loop. Even if scalar loop iterations are known,
3850          vector iterations are not known since peeled prologue iterations are
3851          not known. Hence guards remain the same.  */
3852       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3853 			    NULL, 0, vect_prologue);
3854       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3855 			    NULL, 0, vect_prologue);
3856       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3857 			    NULL, 0, vect_epilogue);
3858       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3859 			    NULL, 0, vect_epilogue);
3860       stmt_info_for_cost *si;
3861       int j;
3862       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3863 	{
3864 	  struct _stmt_vec_info *stmt_info
3865 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3866 	  (void) add_stmt_cost (target_cost_data,
3867 				si->count * peel_iters_prologue,
3868 				si->kind, stmt_info, si->misalign,
3869 				vect_prologue);
3870 	  (void) add_stmt_cost (target_cost_data,
3871 				si->count * peel_iters_epilogue,
3872 				si->kind, stmt_info, si->misalign,
3873 				vect_epilogue);
3874 	}
3875     }
3876   else
3877     {
3878       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3879       stmt_info_for_cost *si;
3880       int j;
3881       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3882 
3883       prologue_cost_vec.create (2);
3884       epilogue_cost_vec.create (2);
3885       peel_iters_prologue = npeel;
3886 
3887       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3888 					  &peel_iters_epilogue,
3889 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3890 					    (loop_vinfo),
3891 					  &prologue_cost_vec,
3892 					  &epilogue_cost_vec);
3893 
3894       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3895 	{
3896 	  struct _stmt_vec_info *stmt_info
3897 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3898 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3899 				si->misalign, vect_prologue);
3900 	}
3901 
3902       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3903 	{
3904 	  struct _stmt_vec_info *stmt_info
3905 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3906 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3907 				si->misalign, vect_epilogue);
3908 	}
3909 
3910       prologue_cost_vec.release ();
3911       epilogue_cost_vec.release ();
3912     }
3913 
3914   /* FORNOW: The scalar outside cost is incremented in one of the
3915      following ways:
3916 
3917      1. The vectorizer checks for alignment and aliasing and generates
3918      a condition that allows dynamic vectorization.  A cost model
3919      check is ANDED with the versioning condition.  Hence scalar code
3920      path now has the added cost of the versioning check.
3921 
3922        if (cost > th & versioning_check)
3923          jmp to vector code
3924 
3925      Hence run-time scalar is incremented by not-taken branch cost.
3926 
3927      2. The vectorizer then checks if a prologue is required.  If the
3928      cost model check was not done before during versioning, it has to
3929      be done before the prologue check.
3930 
3931        if (cost <= th)
3932          prologue = scalar_iters
3933        if (prologue == 0)
3934          jmp to vector code
3935        else
3936          execute prologue
3937        if (prologue == num_iters)
3938 	 go to exit
3939 
3940      Hence the run-time scalar cost is incremented by a taken branch,
3941      plus a not-taken branch, plus a taken branch cost.
3942 
3943      3. The vectorizer then checks if an epilogue is required.  If the
3944      cost model check was not done before during prologue check, it
3945      has to be done with the epilogue check.
3946 
3947        if (prologue == 0)
3948          jmp to vector code
3949        else
3950          execute prologue
3951        if (prologue == num_iters)
3952 	 go to exit
3953        vector code:
3954          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3955            jmp to epilogue
3956 
3957      Hence the run-time scalar cost should be incremented by 2 taken
3958      branches.
3959 
3960      TODO: The back end may reorder the BBS's differently and reverse
3961      conditions/branch directions.  Change the estimates below to
3962      something more reasonable.  */
3963 
3964   /* If the number of iterations is known and we do not do versioning, we can
3965      decide whether to vectorize at compile time.  Hence the scalar version
3966      does not carry cost model guard costs.  */
3967   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3968       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3969     {
3970       /* Cost model check occurs at versioning.  */
3971       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3972 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3973       else
3974 	{
3975 	  /* Cost model check occurs at prologue generation.  */
3976 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3977 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3978 	      + vect_get_stmt_cost (cond_branch_not_taken);
3979 	  /* Cost model check occurs at epilogue generation.  */
3980 	  else
3981 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3982 	}
3983     }
3984 
3985   /* Complete the target-specific cost calculations.  */
3986   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3987 	       &vec_inside_cost, &vec_epilogue_cost);
3988 
3989   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3990 
3991   if (dump_enabled_p ())
3992     {
3993       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3994       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3995                    vec_inside_cost);
3996       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3997                    vec_prologue_cost);
3998       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3999                    vec_epilogue_cost);
4000       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4001                    scalar_single_iter_cost);
4002       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4003                    scalar_outside_cost);
4004       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4005                    vec_outside_cost);
4006       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4007                    peel_iters_prologue);
4008       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4009                    peel_iters_epilogue);
4010     }
4011 
4012   /* Calculate number of iterations required to make the vector version
4013      profitable, relative to the loop bodies only.  The following condition
4014      must hold true:
4015      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4016      where
4017      SIC = scalar iteration cost, VIC = vector iteration cost,
4018      VOC = vector outside cost, VF = vectorization factor,
4019      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4020      SOC = scalar outside cost for run time cost model check.  */
4021 
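  /* A purely illustrative set of numbers for the inequality above: with
     SIC = 4, VIC = 8, VF = 4, VOC = 20, SOC = 6 and no peeling, the code
     below computes ((20 - 6) * 4) / (4 * 4 - 8) = 7 and then bumps the
     result to 8 because at exactly 7 iterations both sides are equal, so
     at least 8 iterations are needed before the vector loop wins.  */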
4022   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4023     {
4024       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4025 			      * assumed_vf
4026 			      - vec_inside_cost * peel_iters_prologue
4027 			      - vec_inside_cost * peel_iters_epilogue);
4028       if (min_profitable_iters <= 0)
4029         min_profitable_iters = 0;
4030       else
4031 	{
4032 	  min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4033 				   - vec_inside_cost);
4034 
4035 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4036 	      <= (((int) vec_inside_cost * min_profitable_iters)
4037 		  + (((int) vec_outside_cost - scalar_outside_cost)
4038 		     * assumed_vf)))
4039 	    min_profitable_iters++;
4040 	}
4041     }
4042   /* vector version will never be profitable.  */
4043   else
4044     {
4045       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4046 	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4047 		    "did not happen for a simd loop");
4048 
4049       if (dump_enabled_p ())
4050         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4051 			 "cost model: the vector iteration cost = %d "
4052 			 "divided by the scalar iteration cost = %d "
4053 			 "is greater than or equal to the vectorization factor = %d"
4054                          ".\n",
4055 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4056       *ret_min_profitable_niters = -1;
4057       *ret_min_profitable_estimate = -1;
4058       return;
4059     }
4060 
4061   dump_printf (MSG_NOTE,
4062 	       "  Calculated minimum iters for profitability: %d\n",
4063 	       min_profitable_iters);
4064 
4065   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4066       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4067     /* We want the vectorized loop to execute at least once.  */
4068     min_profitable_iters = assumed_vf + peel_iters_prologue;
4069 
4070   if (dump_enabled_p ())
4071     dump_printf_loc (MSG_NOTE, vect_location,
4072                      "  Runtime profitability threshold = %d\n",
4073                      min_profitable_iters);
4074 
4075   *ret_min_profitable_niters = min_profitable_iters;
4076 
4077   /* Calculate number of iterations required to make the vector version
4078      profitable, relative to the loop bodies only.
4079 
4080      The non-vectorized variant costs SIC * niters, and it must win over
4081      the vector variant on the expected loop trip count, i.e.:
4082      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
4083 
4084   if (vec_outside_cost <= 0)
4085     min_profitable_estimate = 0;
4086   else
4087     {
4088       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4089 				 * assumed_vf
4090 				 - vec_inside_cost * peel_iters_prologue
4091 				 - vec_inside_cost * peel_iters_epilogue)
4092 				 / ((scalar_single_iter_cost * assumed_vf)
4093 				   - vec_inside_cost);
4094     }
4095   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4096   if (dump_enabled_p ())
4097     dump_printf_loc (MSG_NOTE, vect_location,
4098 		     "  Static estimate profitability threshold = %d\n",
4099 		     min_profitable_estimate);
4100 
4101   *ret_min_profitable_estimate = min_profitable_estimate;
4102 }
4103 
4104 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4105    vector elements (not bits) for a vector with NELT elements.  */
4106 static void
4107 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4108 			      vec_perm_builder *sel)
4109 {
4110   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4111      by vec_perm_indices.  */
4112   sel->new_vector (nelt, 1, 3);
4113   for (unsigned int i = 0; i < 3; i++)
4114     sel->quick_push (i + offset);
4115 }
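/* For example (illustrative): with OFFSET == 2 and NELT == 8 the three
   encoded elements are {2, 3, 4}, which vec_perm_indices extends to the
   full selector {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a shift down by two
   elements, where indices >= NELT select elements from the second
   input vector.  */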
4116 
4117 /* Checks whether the target supports whole-vector shifts for vectors of mode
4118    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4119    it supports vec_perm_const with masks for all necessary shift amounts.  */
4120 static bool
4121 have_whole_vector_shift (machine_mode mode)
4122 {
4123   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4124     return true;
4125 
4126   /* Variable-length vectors should be handled via the optab.  */
4127   unsigned int nelt;
4128   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4129     return false;
4130 
4131   vec_perm_builder sel;
4132   vec_perm_indices indices;
4133   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4134     {
4135       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4136       indices.new_vector (sel, 2, nelt);
4137       if (!can_vec_perm_const_p (mode, indices, false))
4138 	return false;
4139     }
4140   return true;
4141 }
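/* For instance (illustrative), for an 8-element vector mode the loop above
   checks shifts by 4, 2 and 1 elements, matching the halving shifts that a
   vector-shift reduction epilogue performs.  */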
4142 
4143 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4144    functions. Design better to avoid maintenance issues.  */
4145 
4146 /* Function vect_model_reduction_cost.
4147 
4148    Models cost for a reduction operation, including the vector ops
4149    generated within the strip-mine loop, the initial definition before
4150    the loop, and the epilogue code that must be generated.  */
4151 
4152 static void
4153 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4154 			   int ncopies)
4155 {
4156   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4157   enum tree_code code;
4158   optab optab;
4159   tree vectype;
4160   gimple *orig_stmt;
4161   machine_mode mode;
4162   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4163   struct loop *loop = NULL;
4164   void *target_cost_data;
4165 
4166   if (loop_vinfo)
4167     {
4168       loop = LOOP_VINFO_LOOP (loop_vinfo);
4169       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4170     }
4171   else
4172     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4173 
4174   /* Condition reductions generate two reductions in the loop.  */
4175   vect_reduction_type reduction_type
4176     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4177   if (reduction_type == COND_REDUCTION)
4178     ncopies *= 2;
4179 
4180   vectype = STMT_VINFO_VECTYPE (stmt_info);
4181   mode = TYPE_MODE (vectype);
4182   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4183 
4184   if (!orig_stmt)
4185     orig_stmt = STMT_VINFO_STMT (stmt_info);
4186 
4187   code = gimple_assign_rhs_code (orig_stmt);
4188 
4189   if (reduction_type == EXTRACT_LAST_REDUCTION
4190       || reduction_type == FOLD_LEFT_REDUCTION)
4191     {
4192       /* No extra instructions needed in the prologue.  */
4193       prologue_cost = 0;
4194 
4195       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4196 	/* Count one reduction-like operation per vector.  */
4197 	inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4198 				     stmt_info, 0, vect_body);
4199       else
4200 	{
4201 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4202 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4203 	  inside_cost = add_stmt_cost (target_cost_data,  nelements,
4204 				       vec_to_scalar, stmt_info, 0,
4205 				       vect_body);
4206 	  inside_cost += add_stmt_cost (target_cost_data,  nelements,
4207 					scalar_stmt, stmt_info, 0,
4208 					vect_body);
4209 	}
4210     }
4211   else
4212     {
4213       /* Add in cost for initial definition.
4214 	 For cond reduction we have four vectors: initial index, step,
4215 	 initial result of the data reduction, initial value of the index
4216 	 reduction.  */
4217       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4218       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4219 				      scalar_to_vec, stmt_info, 0,
4220 				      vect_prologue);
4221 
4222       /* Cost of reduction op inside loop.  */
4223       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4224 				   stmt_info, 0, vect_body);
4225     }
4226 
4227   /* Determine cost of epilogue code.
4228 
4229      We have a reduction operator that will reduce the vector in one statement.
4230      Also requires scalar extract.  */
4231 
4232   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4233     {
4234       if (reduc_fn != IFN_LAST)
4235 	{
4236 	  if (reduction_type == COND_REDUCTION)
4237 	    {
4238 	      /* An EQ stmt and a COND_EXPR stmt.  */
4239 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
4240 					      vector_stmt, stmt_info, 0,
4241 					      vect_epilogue);
4242 	      /* Reduction of the max index and a reduction of the found
4243 		 values.  */
4244 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
4245 					      vec_to_scalar, stmt_info, 0,
4246 					      vect_epilogue);
4247 	      /* A broadcast of the max value.  */
4248 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4249 					      scalar_to_vec, stmt_info, 0,
4250 					      vect_epilogue);
4251 	    }
4252 	  else
4253 	    {
4254 	      epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4255 					      stmt_info, 0, vect_epilogue);
4256 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4257 					      vec_to_scalar, stmt_info, 0,
4258 					      vect_epilogue);
4259 	    }
4260 	}
4261       else if (reduction_type == COND_REDUCTION)
4262 	{
4263 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4264 	  /* Extraction of scalar elements.  */
4265 	  epilogue_cost += add_stmt_cost (target_cost_data,
4266 					  2 * estimated_nunits,
4267 					  vec_to_scalar, stmt_info, 0,
4268 					  vect_epilogue);
4269 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4270 	  epilogue_cost += add_stmt_cost (target_cost_data,
4271 					  2 * estimated_nunits - 3,
4272 					  scalar_stmt, stmt_info, 0,
4273 					  vect_epilogue);
4274 	}
4275       else if (reduction_type == EXTRACT_LAST_REDUCTION
4276 	       || reduction_type == FOLD_LEFT_REDUCTION)
4277 	/* No extra instructions are needed in the epilogue.  */
4278 	;
4279       else
4280 	{
4281 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4282 	  tree bitsize =
4283 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4284 	  int element_bitsize = tree_to_uhwi (bitsize);
4285 	  int nelements = vec_size_in_bits / element_bitsize;
4286 
4287 	  if (code == COND_EXPR)
4288 	    code = MAX_EXPR;
4289 
4290 	  optab = optab_for_tree_code (code, vectype, optab_default);
4291 
4292 	  /* We have a whole vector shift available.  */
4293 	  if (optab != unknown_optab
4294 	      && VECTOR_MODE_P (mode)
4295 	      && optab_handler (optab, mode) != CODE_FOR_nothing
4296 	      && have_whole_vector_shift (mode))
4297 	    {
4298 	      /* Final reduction via vector shifts and the reduction operator.
4299 		 Also requires scalar extract.  */
4300 	      epilogue_cost += add_stmt_cost (target_cost_data,
4301 					      exact_log2 (nelements) * 2,
4302 					      vector_stmt, stmt_info, 0,
4303 					      vect_epilogue);
4304 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4305 					      vec_to_scalar, stmt_info, 0,
4306 					      vect_epilogue);
4307 	    }
4308 	  else
4309 	    /* Use extracts and reduction op for final reduction.  For N
4310 	       elements, we have N extracts and N-1 reduction ops.  */
4311 	    epilogue_cost += add_stmt_cost (target_cost_data,
4312 					    nelements + nelements - 1,
4313 					    vector_stmt, stmt_info, 0,
4314 					    vect_epilogue);
4315 	}
4316     }
4317 
4318   if (dump_enabled_p ())
4319     dump_printf (MSG_NOTE,
4320                  "vect_model_reduction_cost: inside_cost = %d, "
4321                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4322                  prologue_cost, epilogue_cost);
4323 }
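/* A small illustrative costing: for a plain sum reduction over 8-element
   vectors with no target reduction ifn (reduc_fn == IFN_LAST) and a mode
   that supports whole-vector shifts, the epilogue above is costed as
   exact_log2 (8) * 2 == 6 vector statements plus one vec_to_scalar
   extract; without whole-vector shifts it falls back to 8 + 8 - 1 == 15
   statements of extracts and scalar reduction ops.  */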
4324 
4325 
4326 /* Function vect_model_induction_cost.
4327 
4328    Models cost for induction operations.  */
4329 
4330 static void
4331 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4332 {
4333   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4334   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4335   unsigned inside_cost, prologue_cost;
4336 
4337   if (PURE_SLP_STMT (stmt_info))
4338     return;
4339 
4340   /* loop cost for vec_loop.  */
4341   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4342 			       stmt_info, 0, vect_body);
4343 
4344   /* prologue cost for vec_init and vec_step.  */
4345   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4346 				 stmt_info, 0, vect_prologue);
4347 
4348   if (dump_enabled_p ())
4349     dump_printf_loc (MSG_NOTE, vect_location,
4350                      "vect_model_induction_cost: inside_cost = %d, "
4351                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4352 }
4353 
4354 
4355 
4356 /* Function get_initial_def_for_reduction
4357 
4358    Input:
4359    STMT - a stmt that performs a reduction operation in the loop.
4360    INIT_VAL - the initial value of the reduction variable
4361 
4362    Output:
4363    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4364         of the reduction (used for adjusting the epilog - see below).
4365    Return a vector variable, initialized according to the operation that STMT
4366         performs. This vector will be used as the initial value of the
4367         vector of partial results.
4368 
4369    Option1 (adjust in epilog): Initialize the vector as follows:
4370      add/bit or/xor:    [0,0,...,0,0]
4371      mult/bit and:      [1,1,...,1,1]
4372      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4373    and when necessary (e.g. add/mult case) let the caller know
4374    that it needs to adjust the result by init_val.
4375 
4376    Option2: Initialize the vector as follows:
4377      add/bit or/xor:    [init_val,0,0,...,0]
4378      mult/bit and:      [init_val,1,1,...,1]
4379      min/max/cond_expr: [init_val,init_val,...,init_val]
4380    and no adjustments are needed.
4381 
4382    For example, for the following code:
4383 
4384    s = init_val;
4385    for (i=0;i<n;i++)
4386      s = s + a[i];
4387 
4388    STMT is 's = s + a[i]', and the reduction variable is 's'.
4389    For a vector of 4 units, we want to return either [0,0,0,init_val],
4390    or [0,0,0,0] and let the caller know that it needs to adjust
4391    the result at the end by 'init_val'.
4392 
4393    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4394    is not NULL, because its initialization vector is simpler (the same
4395    element in all entries), and Option2 otherwise.
4396 
4397    A cost model should help decide between these two schemes.  */
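/* A concrete (illustrative) instance with 4-element vectors: a product
   reduction with initial value s0 gets [1,1,1,1] under Option1, with the
   final result later adjusted by multiplying it by s0, or [s0,1,1,1]
   under Option2 with no adjustment; a MAX reduction gets [s0,s0,s0,s0]
   under either option and never needs an adjustment.  */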
4398 
4399 tree
4400 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4401                                tree *adjustment_def)
4402 {
4403   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4404   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4405   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4406   tree scalar_type = TREE_TYPE (init_val);
4407   tree vectype = get_vectype_for_scalar_type (scalar_type);
4408   enum tree_code code = gimple_assign_rhs_code (stmt);
4409   tree def_for_init;
4410   tree init_def;
4411   bool nested_in_vect_loop = false;
4412   REAL_VALUE_TYPE real_init_val = dconst0;
4413   int int_init_val = 0;
4414   gimple *def_stmt = NULL;
4415   gimple_seq stmts = NULL;
4416 
4417   gcc_assert (vectype);
4418 
4419   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4420 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4421 
4422   if (nested_in_vect_loop_p (loop, stmt))
4423     nested_in_vect_loop = true;
4424   else
4425     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4426 
4427   /* In case of double reduction we only create a vector variable to be put
4428      in the reduction phi node.  The actual statement creation is done in
4429      vect_create_epilog_for_reduction.  */
4430   if (adjustment_def && nested_in_vect_loop
4431       && TREE_CODE (init_val) == SSA_NAME
4432       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4433       && gimple_code (def_stmt) == GIMPLE_PHI
4434       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4435       && vinfo_for_stmt (def_stmt)
4436       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4437           == vect_double_reduction_def)
4438     {
4439       *adjustment_def = NULL;
4440       return vect_create_destination_var (init_val, vectype);
4441     }
4442 
4443   vect_reduction_type reduction_type
4444     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4445 
4446   /* In the case of a nested reduction do not use an adjustment def,
4447      as that case is not handled correctly by the epilogue generation
4448      when ncopies is not one.  */
4449   if (adjustment_def && nested_in_vect_loop)
4450     {
4451       *adjustment_def = NULL;
4452       return vect_get_vec_def_for_operand (init_val, stmt);
4453     }
4454 
4455   switch (code)
4456     {
4457     case WIDEN_SUM_EXPR:
4458     case DOT_PROD_EXPR:
4459     case SAD_EXPR:
4460     case PLUS_EXPR:
4461     case MINUS_EXPR:
4462     case BIT_IOR_EXPR:
4463     case BIT_XOR_EXPR:
4464     case MULT_EXPR:
4465     case BIT_AND_EXPR:
4466       {
4467         /* ADJUSTMENT_DEF is NULL when called from
4468            vect_create_epilog_for_reduction to vectorize double reduction.  */
4469         if (adjustment_def)
4470 	  *adjustment_def = init_val;
4471 
4472         if (code == MULT_EXPR)
4473           {
4474             real_init_val = dconst1;
4475             int_init_val = 1;
4476           }
4477 
4478         if (code == BIT_AND_EXPR)
4479           int_init_val = -1;
4480 
4481         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4482           def_for_init = build_real (scalar_type, real_init_val);
4483         else
4484           def_for_init = build_int_cst (scalar_type, int_init_val);
4485 
4486 	if (adjustment_def)
4487 	  /* Option1: the first element is '0' or '1' as well.  */
4488 	  init_def = gimple_build_vector_from_val (&stmts, vectype,
4489 						   def_for_init);
4490 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4491 	  {
4492 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4493 	    init_def = build_vector_from_val (vectype, def_for_init);
4494 	    gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4495 						      2, init_def, init_val);
4496 	    init_def = make_ssa_name (vectype);
4497 	    gimple_call_set_lhs (call, init_def);
4498 	    gimple_seq_add_stmt (&stmts, call);
4499 	  }
4500 	else
4501 	  {
4502 	    /* Option2: the first element is INIT_VAL.  */
4503 	    tree_vector_builder elts (vectype, 1, 2);
4504 	    elts.quick_push (init_val);
4505 	    elts.quick_push (def_for_init);
4506 	    init_def = gimple_build_vector (&stmts, &elts);
4507 	  }
4508       }
4509       break;
4510 
4511     case MIN_EXPR:
4512     case MAX_EXPR:
4513     case COND_EXPR:
4514       {
4515 	if (adjustment_def)
4516           {
4517 	    *adjustment_def = NULL_TREE;
4518 	    if (reduction_type != COND_REDUCTION
4519 		&& reduction_type != EXTRACT_LAST_REDUCTION)
4520 	      {
4521 		init_def = vect_get_vec_def_for_operand (init_val, stmt);
4522 		break;
4523 	      }
4524 	  }
4525 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4526 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4527       }
4528       break;
4529 
4530     default:
4531       gcc_unreachable ();
4532     }
4533 
4534   if (stmts)
4535     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4536   return init_def;
4537 }
4538 
4539 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4540    NUMBER_OF_VECTORS is the number of vector defs to create.
4541    If NEUTRAL_OP is nonnull, introducing extra elements of that
4542    value will not change the result.  */
4543 
4544 static void
4545 get_initial_defs_for_reduction (slp_tree slp_node,
4546 				vec<tree> *vec_oprnds,
4547 				unsigned int number_of_vectors,
4548 				bool reduc_chain, tree neutral_op)
4549 {
4550   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4551   gimple *stmt = stmts[0];
4552   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4553   unsigned HOST_WIDE_INT nunits;
4554   unsigned j, number_of_places_left_in_vector;
4555   tree vector_type;
4556   unsigned int group_size = stmts.length ();
4557   unsigned int i;
4558   struct loop *loop;
4559 
4560   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4561 
4562   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4563 
4564   loop = (gimple_bb (stmt))->loop_father;
4565   gcc_assert (loop);
4566   edge pe = loop_preheader_edge (loop);
4567 
4568   gcc_assert (!reduc_chain || neutral_op);
4569 
4570   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4571      created vectors. It is greater than 1 if unrolling is performed.
4572 
4573      For example, we have two scalar operands, s1 and s2 (e.g., group of
4574      strided accesses of size two), while NUNITS is four (i.e., four scalars
4575      of this type can be packed in a vector).  The output vector will contain
4576      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4577      will be 2).
4578 
4579      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4580      containing the operands.
4581 
4582      For example, NUNITS is four as before, and the group size is 8
4583      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4584      {s5, s6, s7, s8}.  */
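
  /* Once the initial value of each PHI in the group has been placed,
     any remaining lanes are filled with NEUTRAL_OP (when it is nonnull)
     so that they do not change the result.  E.g. with two reduction PHIs
     whose initial values are a0 and b0, NUNITS of four, a PLUS reduction
     (neutral value zero) and a single output vector, the result is
     {a0, b0, 0, 0}.  */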
4585 
4586   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4587     nunits = group_size;
4588 
4589   number_of_places_left_in_vector = nunits;
4590   bool constant_p = true;
4591   tree_vector_builder elts (vector_type, nunits, 1);
4592   elts.quick_grow (nunits);
4593   gimple_seq ctor_seq = NULL;
4594   for (j = 0; j < nunits * number_of_vectors; ++j)
4595     {
4596       tree op;
4597       i = j % group_size;
4598       stmt_vinfo = vinfo_for_stmt (stmts[i]);
4599 
4600       /* Get the def before the loop.  In a reduction chain we have only
4601 	 one initial value.  Otherwise there is one per PHI in the group.  */
4602       if (reduc_chain)
4603 	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4604       else if (((vec_oprnds->length () + 1) * nunits
4605 		- number_of_places_left_in_vector >= group_size)
4606 	       && neutral_op)
4607 	op = neutral_op;
4608       else
4609 	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4610 
4611       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4612       number_of_places_left_in_vector--;
4613       elts[nunits - number_of_places_left_in_vector - 1] = op;
4614       if (!CONSTANT_CLASS_P (op))
4615 	constant_p = false;
4616 
4617       if (number_of_places_left_in_vector == 0)
4618 	{
4619 	  tree init;
4620 	  if (constant_p && !neutral_op
4621 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4622 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4623 	    /* Build the vector directly from ELTS.  */
4624 	    init = gimple_build_vector (&ctor_seq, &elts);
4625 	  else if (neutral_op)
4626 	    {
4627 	      /* Build a vector of the neutral value and shift the
4628 		 other elements into place.  */
4629 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4630 						   neutral_op);
4631 	      int k = nunits;
4632 	      while (k > 0 && elts[k - 1] == neutral_op)
4633 		k -= 1;
4634 	      while (k > 0)
4635 		{
4636 		  k -= 1;
4637 		  gcall *call = gimple_build_call_internal
4638 		      (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4639 		  init = make_ssa_name (vector_type);
4640 		  gimple_call_set_lhs (call, init);
4641 		  gimple_seq_add_stmt (&ctor_seq, call);
4642 		}
4643 	    }
4644 	  else
4645 	    {
4646 	      /* First time round, duplicate ELTS to fill the
4647 		 required number of vectors.  */
4648 	      duplicate_and_interleave (&ctor_seq, vector_type, elts,
4649 					number_of_vectors, *vec_oprnds);
4650 	      break;
4651 	    }
4652 	  vec_oprnds->quick_push (init);
4653 
4654 	  number_of_places_left_in_vector = nunits;
4655 	  elts.new_vector (vector_type, nunits, 1);
4656 	  elts.quick_grow (nunits);
4657 	  constant_p = true;
4658 	}
4659     }
4660   if (ctor_seq != NULL)
4661     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4662 }
4663 
4664 
4665 /* Function vect_create_epilog_for_reduction
4666 
4667    Create code at the loop-epilog to finalize the result of a reduction
4668    computation.
4669 
4670    VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
4671      reduction statements.
4672    STMT is the scalar reduction stmt that is being vectorized.
4673    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4674      number of elements that we can fit in a vectype (nunits).  In this case
4675      we have to generate more than one vector stmt - i.e - we need to "unroll"
4676      the vector stmt by a factor VF/nunits.  For more details see documentation
4677      in vectorizable_operation.
4678    REDUC_FN is the internal function for the epilog reduction.
4679    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4680      computation.
4681    REDUC_INDEX is the index of the operand in the right hand side of the
4682      statement that is defined by REDUCTION_PHI.
4683    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4684    SLP_NODE is an SLP node containing a group of reduction statements. The
4685      first one in this group is STMT.
4686    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4687      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4688      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4689      any value of the IV in the loop.
4690    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4691    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4692      null if this is not an SLP reduction.
4693 
4694    This function:
4695    1. Creates the reduction def-use cycles: sets the arguments for
4696       REDUCTION_PHIS:
4697       The loop-entry argument is the vectorized initial-value of the reduction.
4698       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4699       sums.
4700    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4701       by calling the function specified by REDUC_FN if available, or by
4702       other means (whole-vector shifts or a scalar loop).
4703       The function also creates a new phi node at the loop exit to preserve
4704       loop-closed form, as illustrated below.
4705 
4706      The flow at the entry to this function:
4707 
4708         loop:
4709           vec_def = phi <null, null>            # REDUCTION_PHI
4710           VECT_DEF = vector_stmt                # vectorized form of STMT
4711           s_loop = scalar_stmt                  # (scalar) STMT
4712         loop_exit:
4713           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4714           use <s_out0>
4715           use <s_out0>
4716 
4717      The above is transformed by this function into:
4718 
4719         loop:
4720           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4721           VECT_DEF = vector_stmt                # vectorized form of STMT
4722           s_loop = scalar_stmt                  # (scalar) STMT
4723         loop_exit:
4724           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4725           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4726           v_out2 = reduce <v_out1>
4727           s_out3 = extract_field <v_out2, 0>
4728           s_out4 = adjust_result <s_out3>
4729           use <s_out4>
4730           use <s_out4>
4731 */
4732 
4733 static void
4734 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4735 				  gimple *reduc_def_stmt,
4736 				  int ncopies, internal_fn reduc_fn,
4737 				  vec<gimple *> reduction_phis,
4738                                   bool double_reduc,
4739 				  slp_tree slp_node,
4740 				  slp_instance slp_node_instance,
4741 				  tree induc_val, enum tree_code induc_code,
4742 				  tree neutral_op)
4743 {
4744   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4745   stmt_vec_info prev_phi_info;
4746   tree vectype;
4747   machine_mode mode;
4748   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4749   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4750   basic_block exit_bb;
4751   tree scalar_dest;
4752   tree scalar_type;
4753   gimple *new_phi = NULL, *phi;
4754   gimple_stmt_iterator exit_gsi;
4755   tree vec_dest;
4756   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4757   gimple *epilog_stmt = NULL;
4758   enum tree_code code = gimple_assign_rhs_code (stmt);
4759   gimple *exit_phi;
4760   tree bitsize;
4761   tree adjustment_def = NULL;
4762   tree vec_initial_def = NULL;
4763   tree expr, def, initial_def = NULL;
4764   tree orig_name, scalar_result;
4765   imm_use_iterator imm_iter, phi_imm_iter;
4766   use_operand_p use_p, phi_use_p;
4767   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4768   bool nested_in_vect_loop = false;
4769   auto_vec<gimple *> new_phis;
4770   auto_vec<gimple *> inner_phis;
4771   enum vect_def_type dt = vect_unknown_def_type;
4772   int j, i;
4773   auto_vec<tree> scalar_results;
4774   unsigned int group_size = 1, k, ratio;
4775   auto_vec<tree> vec_initial_defs;
4776   auto_vec<gimple *> phis;
4777   bool slp_reduc = false;
4778   bool direct_slp_reduc;
4779   tree new_phi_result;
4780   gimple *inner_phi = NULL;
4781   tree induction_index = NULL_TREE;
4782 
4783   if (slp_node)
4784     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4785 
4786   if (nested_in_vect_loop_p (loop, stmt))
4787     {
4788       outer_loop = loop;
4789       loop = loop->inner;
4790       nested_in_vect_loop = true;
4791       gcc_assert (!slp_node);
4792     }
4793 
4794   vectype = STMT_VINFO_VECTYPE (stmt_info);
4795   gcc_assert (vectype);
4796   mode = TYPE_MODE (vectype);
4797 
4798   /* 1. Create the reduction def-use cycle:
4799      Set the arguments of REDUCTION_PHIS, i.e., transform
4800 
4801         loop:
4802           vec_def = phi <null, null>            # REDUCTION_PHI
4803           VECT_DEF = vector_stmt                # vectorized form of STMT
4804           ...
4805 
4806      into:
4807 
4808         loop:
4809           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4810           VECT_DEF = vector_stmt                # vectorized form of STMT
4811           ...
4812 
4813      (in case of SLP, do it for all the phis). */
4814 
4815   /* Get the loop-entry arguments.  */
4816   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4817   if (slp_node)
4818     {
4819       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4820       vec_initial_defs.reserve (vec_num);
4821       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4822 				      &vec_initial_defs, vec_num,
4823 				      GROUP_FIRST_ELEMENT (stmt_info),
4824 				      neutral_op);
4825     }
4826   else
4827     {
4828       /* Get at the scalar def before the loop, that defines the initial value
4829 	 of the reduction variable.  */
4830       gimple *def_stmt;
4831       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4832 					   loop_preheader_edge (loop));
4833       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4834 	 and we can't use zero for induc_val, use initial_def.  Similarly
4835 	 for REDUC_MIN and initial_def larger than the base.  */
4836       if (TREE_CODE (initial_def) == INTEGER_CST
4837 	  && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4838 	      == INTEGER_INDUC_COND_REDUCTION)
4839 	  && !integer_zerop (induc_val)
4840 	  && ((induc_code == MAX_EXPR
4841 	       && tree_int_cst_lt (initial_def, induc_val))
4842 	      || (induc_code == MIN_EXPR
4843 		  && tree_int_cst_lt (induc_val, initial_def))))
4844 	induc_val = initial_def;
4845       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4846       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4847 						       &adjustment_def);
4848       vec_initial_defs.create (1);
4849       vec_initial_defs.quick_push (vec_initial_def);
4850     }
4851 
4852   /* Set phi nodes arguments.  */
4853   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4854     {
4855       tree vec_init_def = vec_initial_defs[i];
4856       tree def = vect_defs[i];
4857       for (j = 0; j < ncopies; j++)
4858         {
4859 	  if (j != 0)
4860 	    {
4861 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4862 	      if (nested_in_vect_loop)
4863 		vec_init_def
4864 		  = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4865 						    vec_init_def);
4866 	    }
4867 
4868 	  /* Set the loop-entry arg of the reduction-phi.  */
4869 
4870 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4871 	      == INTEGER_INDUC_COND_REDUCTION)
4872 	    {
4873 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4874 		 initial values from interfering with the reduction op.  */
4875 	      gcc_assert (ncopies == 1);
4876 	      gcc_assert (i == 0);
4877 
4878 	      tree vec_init_def_type = TREE_TYPE (vec_init_def);
4879 	      tree induc_val_vec
4880 		= build_vector_from_val (vec_init_def_type, induc_val);
4881 
4882 	      add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4883 			   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4884 	    }
4885 	  else
4886 	    add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4887 			 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4888 
4889           /* Set the loop-latch arg for the reduction-phi.  */
4890           if (j > 0)
4891             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4892 
4893           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4894 		       UNKNOWN_LOCATION);
4895 
4896           if (dump_enabled_p ())
4897             {
4898               dump_printf_loc (MSG_NOTE, vect_location,
4899 			       "transform reduction: created def-use cycle: ");
4900               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4901               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4902             }
4903         }
4904     }
4905 
4906   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4907      which is updated with the current index of the loop for every match of
4908      the original loop's cond_expr (VEC_STMT).  This results in a vector
4909      containing the last time the condition passed for that vector lane.
4910      The first match will be a 1 to allow 0 to be used for non-matching
4911      indexes.  If there are no matches at all then the vector will be all
4912      zeroes.  */
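  /* For instance, with a four-lane vector the index IV holds {1, 2, 3, 4}
     in the first iteration and {5, 6, 7, 8} in the second.  If lane 2
     matches only in the first iteration and lane 0 only in the second,
     the final index vector is {5, 0, 3, 0}; lanes that never match
     stay 0.  */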
4913   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4914     {
4915       tree indx_before_incr, indx_after_incr;
4916       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4917 
4918       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4919       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4920 
4921       int scalar_precision
4922 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4923       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4924       tree cr_index_vector_type = build_vector_type
4925 	(cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4926 
4927       /* First we create a simple vector induction variable which starts
4928 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4929 	 vector size (STEP).  */
4930 
4931       /* Create a {1,2,3,...} vector.  */
4932       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4933 
4934       /* Create a vector of the step value.  */
4935       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4936       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4937 
4938       /* Create an induction variable.  */
4939       gimple_stmt_iterator incr_gsi;
4940       bool insert_after;
4941       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4942       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4943 		 insert_after, &indx_before_incr, &indx_after_incr);
4944 
4945       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4946 	 filled with zeros (VEC_ZERO).  */
4947 
4948       /* Create a vector of 0s.  */
4949       tree zero = build_zero_cst (cr_index_scalar_type);
4950       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4951 
4952       /* Create a vector phi node.  */
4953       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4954       new_phi = create_phi_node (new_phi_tree, loop->header);
4955       set_vinfo_for_stmt (new_phi,
4956 			  new_stmt_vec_info (new_phi, loop_vinfo));
4957       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4958 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4959 
4960       /* Now take the condition from the loops original cond_expr
4961 	 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4962 	 every match uses values from the induction variable
4963 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4964 	 (NEW_PHI_TREE).
4965 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4966 	 the new cond_expr (INDEX_COND_EXPR).  */
4967 
4968       /* Duplicate the condition from vec_stmt.  */
4969       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4970 
4971       /* Create a conditional, where the condition is taken from vec_stmt
4972 	 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4973 	 else is the phi (NEW_PHI_TREE).  */
4974       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4975 				     ccompare, indx_before_incr,
4976 				     new_phi_tree);
4977       induction_index = make_ssa_name (cr_index_vector_type);
4978       gimple *index_condition = gimple_build_assign (induction_index,
4979 						     index_cond_expr);
4980       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4981       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4982 							loop_vinfo);
4983       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4984       set_vinfo_for_stmt (index_condition, index_vec_info);
4985 
4986       /* Update the phi with the vec cond.  */
4987       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4988 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
4989     }
4990 
4991   /* 2. Create epilog code.
4992         The reduction epilog code operates across the elements of the vector
4993         of partial results computed by the vectorized loop.
4994         The reduction epilog code consists of:
4995 
4996         step 1: compute the scalar result in a vector (v_out2)
4997         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4998         step 3: adjust the scalar result (s_out3) if needed.
4999 
5000         Step 1 can be accomplished using one of the following three schemes:
5001           (scheme 1) using reduc_fn, if available.
5002           (scheme 2) using whole-vector shifts, if available.
5003           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5004                      combined.
5005 
5006           The overall epilog code looks like this:
5007 
5008           s_out0 = phi <s_loop>         # original EXIT_PHI
5009           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5010           v_out2 = reduce <v_out1>              # step 1
5011           s_out3 = extract_field <v_out2, 0>    # step 2
5012           s_out4 = adjust_result <s_out3>       # step 3
5013 
5014           (step 3 is optional, and steps 1 and 2 may be combined).
5015           Lastly, the uses of s_out0 are replaced by s_out4.  */
5016 
5017 
5018   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5019          v_out1 = phi <VECT_DEF>
5020          Store them in NEW_PHIS.  */
5021 
5022   exit_bb = single_exit (loop)->dest;
5023   prev_phi_info = NULL;
5024   new_phis.create (vect_defs.length ());
5025   FOR_EACH_VEC_ELT (vect_defs, i, def)
5026     {
5027       for (j = 0; j < ncopies; j++)
5028         {
5029 	  tree new_def = copy_ssa_name (def);
5030           phi = create_phi_node (new_def, exit_bb);
5031           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5032           if (j == 0)
5033             new_phis.quick_push (phi);
5034           else
5035 	    {
5036 	      def = vect_get_vec_def_for_stmt_copy (dt, def);
5037 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5038 	    }
5039 
5040           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5041           prev_phi_info = vinfo_for_stmt (phi);
5042         }
5043     }
5044 
5045   /* The epilogue is created for the outer-loop, i.e., for the loop being
5046      vectorized.  Create exit phis for the outer loop.  */
5047   if (double_reduc)
5048     {
5049       loop = outer_loop;
5050       exit_bb = single_exit (loop)->dest;
5051       inner_phis.create (vect_defs.length ());
5052       FOR_EACH_VEC_ELT (new_phis, i, phi)
5053 	{
5054 	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
5055 	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
5056 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5057 			   PHI_RESULT (phi));
5058 	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5059 							    loop_vinfo));
5060 	  inner_phis.quick_push (phi);
5061 	  new_phis[i] = outer_phi;
5062 	  prev_phi_info = vinfo_for_stmt (outer_phi);
5063           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5064             {
5065 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5066 	      new_result = copy_ssa_name (PHI_RESULT (phi));
5067 	      outer_phi = create_phi_node (new_result, exit_bb);
5068 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5069 			       PHI_RESULT (phi));
5070 	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5071 								loop_vinfo));
5072 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5073 	      prev_phi_info = vinfo_for_stmt (outer_phi);
5074 	    }
5075 	}
5076     }
5077 
5078   exit_gsi = gsi_after_labels (exit_bb);
5079 
5080   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5081          (i.e. when reduc_fn is not available) and in the final adjustment
5082 	 code (if needed).  Also get the original scalar reduction variable as
5083          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5084          represents a reduction pattern), the tree-code and scalar-def are
5085          taken from the original stmt that the pattern-stmt (STMT) replaces.
5086          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5087          are taken from STMT.  */
5088 
5089   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5090   if (!orig_stmt)
5091     {
5092       /* Regular reduction  */
5093       orig_stmt = stmt;
5094     }
5095   else
5096     {
5097       /* Reduction pattern  */
5098       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5099       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5100       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5101     }
5102 
5103   code = gimple_assign_rhs_code (orig_stmt);
5104   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5105      partial results are added and not subtracted.  */
5106   if (code == MINUS_EXPR)
5107     code = PLUS_EXPR;
5108 
5109   scalar_dest = gimple_assign_lhs (orig_stmt);
5110   scalar_type = TREE_TYPE (scalar_dest);
5111   scalar_results.create (group_size);
5112   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5113   bitsize = TYPE_SIZE (scalar_type);
5114 
5115   /* In case this is a reduction in an inner-loop while vectorizing an outer
5116      loop - we don't need to extract a single scalar result at the end of the
5117      inner-loop (unless it is double reduction, i.e., the use of reduction is
5118      outside the outer-loop).  The final vector of partial results will be used
5119      in the vectorized outer-loop, or reduced to a scalar result at the end of
5120      the outer-loop.  */
5121   if (nested_in_vect_loop && !double_reduc)
5122     goto vect_finalize_reduction;
5123 
5124   /* SLP reduction without reduction chain, e.g.,
5125      # a1 = phi <a2, a0>
5126      # b1 = phi <b2, b0>
5127      a2 = operation (a1)
5128      b2 = operation (b1)  */
5129   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5130 
5131   /* True if we should implement SLP_REDUC using native reduction operations
5132      instead of scalar operations.  */
5133   direct_slp_reduc = (reduc_fn != IFN_LAST
5134 		      && slp_reduc
5135 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5136 
5137   /* In case of reduction chain, e.g.,
5138      # a1 = phi <a3, a0>
5139      a2 = operation (a1)
5140      a3 = operation (a2),
5141 
5142      we may end up with more than one vector result.  Here we reduce them to
5143      one vector.  */
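  /* E.g. if a PLUS reduction chain produced two partial result vectors
     V0 and V1, this emits TMP = V0 + V1 and the epilog below then
     reduces TMP to the final scalar.  */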
5144   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5145     {
5146       tree first_vect = PHI_RESULT (new_phis[0]);
5147       gassign *new_vec_stmt = NULL;
5148       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5149       for (k = 1; k < new_phis.length (); k++)
5150         {
5151 	  gimple *next_phi = new_phis[k];
5152           tree second_vect = PHI_RESULT (next_phi);
5153           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5154           new_vec_stmt = gimple_build_assign (tem, code,
5155 					      first_vect, second_vect);
5156           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5157 	  first_vect = tem;
5158         }
5159 
5160       new_phi_result = first_vect;
5161       if (new_vec_stmt)
5162         {
5163           new_phis.truncate (0);
5164           new_phis.safe_push (new_vec_stmt);
5165         }
5166     }
5167   /* Likewise if we couldn't use a single def-use cycle.  */
5168   else if (ncopies > 1)
5169     {
5170       gcc_assert (new_phis.length () == 1);
5171       tree first_vect = PHI_RESULT (new_phis[0]);
5172       gassign *new_vec_stmt = NULL;
5173       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5174       gimple *next_phi = new_phis[0];
5175       for (int k = 1; k < ncopies; ++k)
5176 	{
5177 	  next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5178 	  tree second_vect = PHI_RESULT (next_phi);
5179           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5180           new_vec_stmt = gimple_build_assign (tem, code,
5181 					      first_vect, second_vect);
5182           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5183 	  first_vect = tem;
5184 	}
5185       new_phi_result = first_vect;
5186       new_phis.truncate (0);
5187       new_phis.safe_push (new_vec_stmt);
5188     }
5189   else
5190     new_phi_result = PHI_RESULT (new_phis[0]);
5191 
5192   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5193       && reduc_fn != IFN_LAST)
5194     {
5195       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5196 	 various data values where the condition matched and another vector
5197 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
5198 	 need to extract the last matching index (which will be the index with
5199 	 highest value) and use this to index into the data vector.
5200 	 For the case where there were no matches, the data vector will contain
5201 	 all default values and the index vector will be all zeros.  */
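      /* For instance, with INDUCTION_INDEX {5, 0, 3, 0} and data vector
	 {d0, d1, d2, d3}: IFN_REDUC_MAX over the indexes gives 5, the
	 comparison below builds the mask {1, 0, 0, 0}, the VEC_COND then
	 keeps {d0, 0, 0, 0}, and a final IFN_REDUC_MAX over the unsigned
	 view of that vector extracts d0 as the scalar result.  */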
5202 
5203       /* Get various versions of the type of the vector of indexes.  */
5204       tree index_vec_type = TREE_TYPE (induction_index);
5205       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5206       tree index_scalar_type = TREE_TYPE (index_vec_type);
5207       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5208 	(index_vec_type);
5209 
5210       /* Get an unsigned integer version of the type of the data vector.  */
5211       int scalar_precision
5212 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5213       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5214       tree vectype_unsigned = build_vector_type
5215 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5216 
5217       /* First we need to create a vector (ZERO_VEC) of zeros and another
5218 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5219 	 can create using a MAX reduction and then expanding.
5220 	 In the case where the loop never made any matches, the max index will
5221 	 be zero.  */
5222 
5223       /* Vector of {0, 0, 0,...}.  */
5224       tree zero_vec = make_ssa_name (vectype);
5225       tree zero_vec_rhs = build_zero_cst (vectype);
5226       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5227       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5228 
5229       /* Find maximum value from the vector of found indexes.  */
5230       tree max_index = make_ssa_name (index_scalar_type);
5231       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5232 							  1, induction_index);
5233       gimple_call_set_lhs (max_index_stmt, max_index);
5234       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5235 
5236       /* Vector of {max_index, max_index, max_index,...}.  */
5237       tree max_index_vec = make_ssa_name (index_vec_type);
5238       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5239 						      max_index);
5240       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5241 							max_index_vec_rhs);
5242       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5243 
5244       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5245 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5246 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5247 	 otherwise.  Only one value should match, resulting in a vector
5248 	 (VEC_COND) with one data value and the rest zeros.
5249 	 In the case where the loop never made any matches, every index will
5250 	 match, resulting in a vector with all data values (which will all be
5251 	 the default value).  */
5252 
5253       /* Compare the max index vector to the vector of found indexes to find
5254 	 the position of the max value.  */
5255       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5256       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5257 						      induction_index,
5258 						      max_index_vec);
5259       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5260 
5261       /* Use the compare to choose either values from the data vector or
5262 	 zero.  */
5263       tree vec_cond = make_ssa_name (vectype);
5264       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5265 						   vec_compare, new_phi_result,
5266 						   zero_vec);
5267       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5268 
5269       /* Finally we need to extract the data value from the vector (VEC_COND)
5270 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5271 	 reduction, but because this doesn't exist, we can use a MAX reduction
5272 	 instead.  The data value might be signed or a float so we need to cast
5273 	 it first.
5274 	 In the case where the loop never made any matches, the data values are
5275 	 all identical, and so will reduce down correctly.  */
5276 
5277       /* Make the matched data values unsigned.  */
5278       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5279       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5280 				       vec_cond);
5281       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5282 							VIEW_CONVERT_EXPR,
5283 							vec_cond_cast_rhs);
5284       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5285 
5286       /* Reduce down to a scalar value.  */
5287       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5288       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5289 							   1, vec_cond_cast);
5290       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5291       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5292 
5293       /* Convert the reduced value back to the result type and set as the
5294 	 result.  */
5295       gimple_seq stmts = NULL;
5296       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5297 			       data_reduc);
5298       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5299       scalar_results.safe_push (new_temp);
5300     }
5301   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5302 	   && reduc_fn == IFN_LAST)
5303     {
5304       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5305 	 idx = 0;
5306          idx_val = induction_index[0];
5307 	 val = data_reduc[0];
5308          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5309 	   if (induction_index[i] > idx_val)
5310 	     val = data_reduc[i], idx_val = induction_index[i];
5311 	 return val;  */
5312 
5313       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5314       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5315       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5316       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5317       /* Enforced by vectorizable_reduction, which ensures we have target
5318 	 support before allowing a conditional reduction on variable-length
5319 	 vectors.  */
5320       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5321       tree idx_val = NULL_TREE, val = NULL_TREE;
5322       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5323 	{
5324 	  tree old_idx_val = idx_val;
5325 	  tree old_val = val;
5326 	  idx_val = make_ssa_name (idx_eltype);
5327 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5328 					     build3 (BIT_FIELD_REF, idx_eltype,
5329 						     induction_index,
5330 						     bitsize_int (el_size),
5331 						     bitsize_int (off)));
5332 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5333 	  val = make_ssa_name (data_eltype);
5334 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5335 					     build3 (BIT_FIELD_REF,
5336 						     data_eltype,
5337 						     new_phi_result,
5338 						     bitsize_int (el_size),
5339 						     bitsize_int (off)));
5340 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5341 	  if (off != 0)
5342 	    {
5343 	      tree new_idx_val = idx_val;
5344 	      tree new_val = val;
5345 	      if (off != v_size - el_size)
5346 		{
5347 		  new_idx_val = make_ssa_name (idx_eltype);
5348 		  epilog_stmt = gimple_build_assign (new_idx_val,
5349 						     MAX_EXPR, idx_val,
5350 						     old_idx_val);
5351 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352 		}
5353 	      new_val = make_ssa_name (data_eltype);
5354 	      epilog_stmt = gimple_build_assign (new_val,
5355 						 COND_EXPR,
5356 						 build2 (GT_EXPR,
5357 							 boolean_type_node,
5358 							 idx_val,
5359 							 old_idx_val),
5360 						 val, old_val);
5361 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5362 	      idx_val = new_idx_val;
5363 	      val = new_val;
5364 	    }
5365 	}
5366       /* Convert the reduced value back to the result type and set as the
5367 	 result.  */
5368       gimple_seq stmts = NULL;
5369       val = gimple_convert (&stmts, scalar_type, val);
5370       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5371       scalar_results.safe_push (val);
5372     }
5373 
5374   /* 2.3 Create the reduction code, using one of the three schemes described
5375          above. In SLP we simply need to extract all the elements from the
5376          vector (without reducing them), so we use scalar shifts.  */
5377   else if (reduc_fn != IFN_LAST && !slp_reduc)
5378     {
5379       tree tmp;
5380       tree vec_elem_type;
5381 
5382       /* Case 1:  Create:
5383          v_out2 = reduc_expr <v_out1>  */
5384 
5385       if (dump_enabled_p ())
5386         dump_printf_loc (MSG_NOTE, vect_location,
5387 			 "Reduce using direct vector reduction.\n");
5388 
5389       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5390       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5391 	{
5392 	  tree tmp_dest
5393 	    = vect_create_destination_var (scalar_dest, vec_elem_type);
5394 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5395 						    new_phi_result);
5396 	  gimple_set_lhs (epilog_stmt, tmp_dest);
5397 	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5398 	  gimple_set_lhs (epilog_stmt, new_temp);
5399 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5400 
5401 	  epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5402 					     new_temp);
5403 	}
5404       else
5405 	{
5406 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5407 						    new_phi_result);
5408 	  gimple_set_lhs (epilog_stmt, new_scalar_dest);
5409 	}
5410 
5411       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5412       gimple_set_lhs (epilog_stmt, new_temp);
5413       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5414 
5415       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5416 	   == INTEGER_INDUC_COND_REDUCTION)
5417 	  && !operand_equal_p (initial_def, induc_val, 0))
5418 	{
5419 	  /* Earlier we set the initial value to be a vector of induc_val
5420 	     values.  Check the result and if it is induc_val then replace
5421 	     it with the original initial value, unless induc_val is
5422 	     the same as initial_def already.  */
5423 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5424 				  induc_val);
5425 
5426 	  tmp = make_ssa_name (new_scalar_dest);
5427 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5428 					     initial_def, new_temp);
5429 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5430 	  new_temp = tmp;
5431 	}
5432 
5433       scalar_results.safe_push (new_temp);
5434     }
5435   else if (direct_slp_reduc)
5436     {
5437       /* Here we create one vector for each of the GROUP_SIZE results,
5438 	 with the elements for other SLP statements replaced with the
5439 	 neutral value.  We can then do a normal reduction on each vector.  */
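
      /* For a group of two reductions A and B whose vector lanes are laid
	 out as {a0, b0, a1, b1}, the loop below builds the lane masks
	 {1, 0, 1, 0} and {0, 1, 0, 1}, replaces the unselected lanes with
	 the neutral (or initial) value, and reduces each masked vector
	 separately.  */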
5440 
5441       /* Enforced by vectorizable_reduction.  */
5442       gcc_assert (new_phis.length () == 1);
5443       gcc_assert (pow2p_hwi (group_size));
5444 
5445       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5446       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5447       gimple_seq seq = NULL;
5448 
5449       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5450 	 and the same element size as VECTYPE.  */
5451       tree index = build_index_vector (vectype, 0, 1);
5452       tree index_type = TREE_TYPE (index);
5453       tree index_elt_type = TREE_TYPE (index_type);
5454       tree mask_type = build_same_sized_truth_vector_type (index_type);
5455 
5456       /* Create a vector that, for each element, identifies which of
5457 	 the GROUP_SIZE results should use it.  */
5458       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5459       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5460 			    build_vector_from_val (index_type, index_mask));
5461 
5462       /* Get a neutral vector value.  This is simply a splat of the neutral
5463 	 scalar value if we have one, otherwise the initial scalar value
5464 	 is itself a neutral value.  */
5465       tree vector_identity = NULL_TREE;
5466       if (neutral_op)
5467 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5468 							neutral_op);
5469       for (unsigned int i = 0; i < group_size; ++i)
5470 	{
5471 	  /* If there's no universal neutral value, we can use the
5472 	     initial scalar value from the original PHI.  This is used
5473 	     for MIN and MAX reduction, for example.  */
5474 	  if (!neutral_op)
5475 	    {
5476 	      tree scalar_value
5477 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5478 					 loop_preheader_edge (loop));
5479 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5480 							      scalar_value);
5481 	    }
5482 
5483 	  /* Calculate the equivalent of:
5484 
5485 	     sel[j] = (index[j] == i);
5486 
5487 	     which selects the elements of NEW_PHI_RESULT that should
5488 	     be included in the result.  */
5489 	  tree compare_val = build_int_cst (index_elt_type, i);
5490 	  compare_val = build_vector_from_val (index_type, compare_val);
5491 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5492 				   index, compare_val);
5493 
5494 	  /* Calculate the equivalent of:
5495 
5496 	     vec = sel ? new_phi_result : vector_identity;
5497 
5498 	     VEC is now suitable for a full vector reduction.  */
5499 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5500 				   sel, new_phi_result, vector_identity);
5501 
5502 	  /* Do the reduction and convert it to the appropriate type.  */
5503 	  gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5504 	  tree scalar = make_ssa_name (TREE_TYPE (vectype));
5505 	  gimple_call_set_lhs (call, scalar);
5506 	  gimple_seq_add_stmt (&seq, call);
5507 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5508 	  scalar_results.safe_push (scalar);
5509 	}
5510       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5511     }
5512   else
5513     {
5514       bool reduce_with_shift;
5515       tree vec_temp;
5516 
5517       /* COND reductions all do the final reduction with MAX_EXPR
5518 	 or MIN_EXPR.  */
5519       if (code == COND_EXPR)
5520 	{
5521 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5522 	      == INTEGER_INDUC_COND_REDUCTION)
5523 	    code = induc_code;
5524 	  else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5525 		   == CONST_COND_REDUCTION)
5526 	    code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5527 	  else
5528 	    code = MAX_EXPR;
5529 	}
5530 
5531       /* See if the target wants to do the final (shift) reduction
5532 	 in a vector mode of smaller size and first reduce upper/lower
5533 	 halves against each other.  */
5534       enum machine_mode mode1 = mode;
5535       tree vectype1 = vectype;
5536       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5537       unsigned sz1 = sz;
5538       if (!slp_reduc
5539 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5540 	sz1 = GET_MODE_SIZE (mode1).to_constant ();
5541 
5542       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5543       reduce_with_shift = have_whole_vector_shift (mode1);
5544       if (!VECTOR_MODE_P (mode1))
5545 	reduce_with_shift = false;
5546       else
5547 	{
5548 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5549 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5550 	    reduce_with_shift = false;
5551 	}
5552 
5553       /* First reduce the vector to the desired vector size we should
5554 	 do shift reduction on by combining upper and lower halves.  */
5555       new_temp = new_phi_result;
5556       while (sz > sz1)
5557 	{
5558 	  gcc_assert (!slp_reduc);
5559 	  sz /= 2;
5560 	  vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5561 
5562 	  /* The target has to make sure we support lowpart/highpart
5563 	     extraction, either via direct vector extract or through
5564 	     integer mode punning.  */
5565 	  tree dst1, dst2;
5566 	  if (convert_optab_handler (vec_extract_optab,
5567 				     TYPE_MODE (TREE_TYPE (new_temp)),
5568 				     TYPE_MODE (vectype1))
5569 	      != CODE_FOR_nothing)
5570 	    {
5571 	      /* Extract sub-vectors directly once vec_extract becomes
5572 		 a conversion optab.  */
5573 	      dst1 = make_ssa_name (vectype1);
5574 	      epilog_stmt
5575 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5576 					 build3 (BIT_FIELD_REF, vectype1,
5577 						 new_temp, TYPE_SIZE (vectype1),
5578 						 bitsize_int (0)));
5579 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5580 	      dst2 =  make_ssa_name (vectype1);
5581 	      epilog_stmt
5582 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5583 					 build3 (BIT_FIELD_REF, vectype1,
5584 						 new_temp, TYPE_SIZE (vectype1),
5585 						 bitsize_int (sz * BITS_PER_UNIT)));
5586 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5587 	    }
5588 	  else
5589 	    {
5590 	      /* Extract via punning to appropriately sized integer mode
5591 		 vector.  */
5592 	      tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5593 							    1);
5594 	      tree etype = build_vector_type (eltype, 2);
5595 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5596 						 TYPE_MODE (etype),
5597 						 TYPE_MODE (eltype))
5598 			  != CODE_FOR_nothing);
5599 	      tree tem = make_ssa_name (etype);
5600 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5601 						 build1 (VIEW_CONVERT_EXPR,
5602 							 etype, new_temp));
5603 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5604 	      new_temp = tem;
5605 	      tem = make_ssa_name (eltype);
5606 	      epilog_stmt
5607 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5608 					 build3 (BIT_FIELD_REF, eltype,
5609 						 new_temp, TYPE_SIZE (eltype),
5610 						 bitsize_int (0)));
5611 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5612 	      dst1 = make_ssa_name (vectype1);
5613 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5614 						 build1 (VIEW_CONVERT_EXPR,
5615 							 vectype1, tem));
5616 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617 	      tem = make_ssa_name (eltype);
5618 	      epilog_stmt
5619 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5620 					 build3 (BIT_FIELD_REF, eltype,
5621 						 new_temp, TYPE_SIZE (eltype),
5622 						 bitsize_int (sz * BITS_PER_UNIT)));
5623 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5624 	      dst2 =  make_ssa_name (vectype1);
5625 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5626 						 build1 (VIEW_CONVERT_EXPR,
5627 							 vectype1, tem));
5628 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5629 	    }
5630 
5631 	  new_temp = make_ssa_name (vectype1);
5632 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5633 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5634 	}
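      /* E.g. when the target splits a 128-bit V4SI reduction in half, the
	 loop above extracts the two 64-bit halves as V2SI vectors DST1 and
	 DST2 and combines them with CODE, leaving a V2SI value in NEW_TEMP
	 for the shift-based or scalar reduction below.  */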
5635 
5636       if (reduce_with_shift && !slp_reduc)
5637 	{
5638 	  int element_bitsize = tree_to_uhwi (bitsize);
5639 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5640 	     for variable-length vectors and also requires direct target support
5641 	     for loop reductions.  */
5642 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5643 	  int nelements = vec_size_in_bits / element_bitsize;
5644 	  vec_perm_builder sel;
5645 	  vec_perm_indices indices;
5646 
5647           int elt_offset;
5648 
5649           tree zero_vec = build_zero_cst (vectype1);
5650           /* Case 2: Create:
5651              for (offset = nelements/2; offset >= 1; offset/=2)
5652                 {
5653                   Create:  va' = vec_shift <va, offset>
5654                   Create:  va = vop <va, va'>
5655                 }  */
5656 
5657           tree rhs;
5658 
5659           if (dump_enabled_p ())
5660             dump_printf_loc (MSG_NOTE, vect_location,
5661 			     "Reduce using vector shifts\n");
5662 
5663 	  mode1 = TYPE_MODE (vectype1);
5664           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5665           for (elt_offset = nelements / 2;
5666                elt_offset >= 1;
5667                elt_offset /= 2)
5668             {
5669 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5670 	      indices.new_vector (sel, 2, nelements);
5671 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5672 	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5673 						 new_temp, zero_vec, mask);
5674               new_name = make_ssa_name (vec_dest, epilog_stmt);
5675               gimple_assign_set_lhs (epilog_stmt, new_name);
5676               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5677 
5678 	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5679 						 new_temp);
5680               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5681               gimple_assign_set_lhs (epilog_stmt, new_temp);
5682               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5683             }
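	  /* For a four-element vector {x0, x1, x2, x3} and a PLUS reduction
	     the loop above computes {x0+x2, x1+x3, ...} with the first
	     shift/add pair and then x0+x2+x1+x3 in element 0 with the
	     second, so only element 0 needs to be extracted below.  */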
5684 
5685 	  /* 2.4  Extract the final scalar result.  Create:
5686 	     s_out3 = extract_field <v_out2, bitpos>  */
5687 
5688 	  if (dump_enabled_p ())
5689 	    dump_printf_loc (MSG_NOTE, vect_location,
5690 			     "extract scalar result\n");
5691 
5692 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5693 			bitsize, bitsize_zero_node);
5694 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5695 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5696 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5697 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5698 	  scalar_results.safe_push (new_temp);
5699         }
5700       else
5701         {
5702           /* Case 3: Create:
5703              s = extract_field <v_out2, 0>
5704              for (offset = element_size;
5705                   offset < vector_size;
5706                   offset += element_size;)
5707                {
5708                  Create:  s' = extract_field <v_out2, offset>
5709                  Create:  s = op <s, s'>  // For non SLP cases
5710                }  */
5711 
5712           if (dump_enabled_p ())
5713             dump_printf_loc (MSG_NOTE, vect_location,
5714 			     "Reduce using scalar code.\n");
5715 
5716 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5717 	  int element_bitsize = tree_to_uhwi (bitsize);
5718           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5719             {
5720               int bit_offset;
5721               if (gimple_code (new_phi) == GIMPLE_PHI)
5722                 vec_temp = PHI_RESULT (new_phi);
5723               else
5724                 vec_temp = gimple_assign_lhs (new_phi);
5725               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5726 				 bitsize_zero_node);
5727               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5728               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5729               gimple_assign_set_lhs (epilog_stmt, new_temp);
5730               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5731 
5732               /* In SLP we don't need to apply the reduction operation, so we just
5733                  collect s' values in SCALAR_RESULTS.  */
5734               if (slp_reduc)
5735                 scalar_results.safe_push (new_temp);
5736 
5737               for (bit_offset = element_bitsize;
5738                    bit_offset < vec_size_in_bits;
5739                    bit_offset += element_bitsize)
5740                 {
5741                   tree bitpos = bitsize_int (bit_offset);
5742                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5743                                      bitsize, bitpos);
5744 
5745                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5746                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5747                   gimple_assign_set_lhs (epilog_stmt, new_name);
5748                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5749 
5750                   if (slp_reduc)
5751                     {
5752                       /* In SLP we don't need to apply the reduction operation, so
5753                          we just collect s' values in SCALAR_RESULTS.  */
5754                       new_temp = new_name;
5755                       scalar_results.safe_push (new_name);
5756                     }
5757                   else
5758                     {
5759 		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5760 							 new_name, new_temp);
5761                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5762                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5763                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5764                     }
5765                 }
5766             }
5767 
5768           /* The only case where we need to reduce scalar results in SLP is
5769              unrolling.  If the size of SCALAR_RESULTS is greater than
5770              GROUP_SIZE, we reduce them by combining elements modulo
5771              GROUP_SIZE.  */
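          /* E.g. for GROUP_SIZE two and SCALAR_RESULTS {a0, b0, a1, b1}
             the loop below leaves {a0 CODE a1, b0 CODE b1} in the first
             two slots, one combined result per reduction in the group.  */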
5772           if (slp_reduc)
5773             {
5774               tree res, first_res, new_res;
5775 	      gimple *new_stmt;
5776 
5777               /* Reduce multiple scalar results in case of SLP unrolling.  */
5778               for (j = group_size; scalar_results.iterate (j, &res);
5779                    j++)
5780                 {
5781                   first_res = scalar_results[j % group_size];
5782 		  new_stmt = gimple_build_assign (new_scalar_dest, code,
5783 						  first_res, res);
5784                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5785                   gimple_assign_set_lhs (new_stmt, new_res);
5786                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5787                   scalar_results[j % group_size] = new_res;
5788                 }
5789             }
5790           else
5791             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5792             scalar_results.safe_push (new_temp);
5793         }
5794 
5795       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5796 	   == INTEGER_INDUC_COND_REDUCTION)
5797 	  && !operand_equal_p (initial_def, induc_val, 0))
5798 	{
5799 	  /* Earlier we set the initial value to be a vector of induc_val
5800 	     values.  Check the result and if it is induc_val then replace
5801 	     it with the original initial value, unless induc_val is
5802 	     the same as initial_def already.  */
5803 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5804 				  induc_val);
5805 
5806 	  tree tmp = make_ssa_name (new_scalar_dest);
5807 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5808 					     initial_def, new_temp);
5809 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5810 	  scalar_results[0] = tmp;
5811 	}
5812     }
5813 
5814 vect_finalize_reduction:
5815 
5816   if (double_reduc)
5817     loop = loop->inner;
5818 
5819   /* 2.5 Adjust the final result by the initial value of the reduction
5820 	 variable. (When such adjustment is not needed, then
5821 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
5822 	 new_temp = loop_exit_def + adjustment_def  */
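  /* For illustration (a sketch, not a description of every caller): for a
     sum reduction whose vector PHI was seeded with the neutral value 0 in
     every lane, the scalar initial value INIT was kept out of the loop and
     ADJUSTMENT_DEF is INIT, so the statement emitted here is
         new_temp = loop_exit_def + INIT  */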
5823 
5824   if (adjustment_def)
5825     {
5826       gcc_assert (!slp_reduc);
5827       if (nested_in_vect_loop)
5828 	{
5829           new_phi = new_phis[0];
5830 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5831 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5832 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
5833 	}
5834       else
5835 	{
5836           new_temp = scalar_results[0];
5837 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5838 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
5839 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5840 	}
5841 
5842       epilog_stmt = gimple_build_assign (new_dest, expr);
5843       new_temp = make_ssa_name (new_dest, epilog_stmt);
5844       gimple_assign_set_lhs (epilog_stmt, new_temp);
5845       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5846       if (nested_in_vect_loop)
5847         {
5848           set_vinfo_for_stmt (epilog_stmt,
5849                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5850           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5851                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5852 
5853           if (!double_reduc)
5854             scalar_results.quick_push (new_temp);
5855           else
5856             scalar_results[0] = new_temp;
5857         }
5858       else
5859         scalar_results[0] = new_temp;
5860 
5861       new_phis[0] = epilog_stmt;
5862     }
5863 
5864   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5865           phis with new adjusted scalar results, i.e., replace use <s_out0>
5866           with use <s_out4>.
5867 
5868      Transform:
5869         loop_exit:
5870           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5871           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5872           v_out2 = reduce <v_out1>
5873           s_out3 = extract_field <v_out2, 0>
5874           s_out4 = adjust_result <s_out3>
5875           use <s_out0>
5876           use <s_out0>
5877 
5878      into:
5879 
5880         loop_exit:
5881           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5882           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5883           v_out2 = reduce <v_out1>
5884           s_out3 = extract_field <v_out2, 0>
5885           s_out4 = adjust_result <s_out3>
5886           use <s_out4>
5887           use <s_out4> */
5888 
5889 
5890   /* In an SLP reduction chain we reduce the vector results into one vector
5891      if necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS
5892      of the last stmt in the reduction chain, since we are looking for the
5893      loop exit phi node.  */
5894   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5895     {
5896       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5897       /* Handle reduction patterns.  */
5898       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5899 	dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5900 
5901       scalar_dest = gimple_assign_lhs (dest_stmt);
5902       group_size = 1;
5903     }
5904 
5905   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5906      case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
5907      need to match SCALAR_RESULTS with the corresponding statements.  The first
5908      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5909      the first vector stmt, etc.
5910      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
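  /* A worked instance for illustration: with GROUP_SIZE == 4 and two
     entries in NEW_PHIS, RATIO is 2; scalar_results[0..1] are matched
     with new_phis[0] and scalar_results[2..3] with new_phis[1] in the
     loop below.  */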
5911   if (group_size > new_phis.length ())
5912     {
5913       ratio = group_size / new_phis.length ();
5914       gcc_assert (!(group_size % new_phis.length ()));
5915     }
5916   else
5917     ratio = 1;
5918 
5919   for (k = 0; k < group_size; k++)
5920     {
5921       if (k % ratio == 0)
5922         {
5923           epilog_stmt = new_phis[k / ratio];
5924           reduction_phi = reduction_phis[k / ratio];
5925 	  if (double_reduc)
5926 	    inner_phi = inner_phis[k / ratio];
5927         }
5928 
5929       if (slp_reduc)
5930         {
5931 	  gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5932 
5933           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5934           /* SLP statements can't participate in patterns.  */
5935           gcc_assert (!orig_stmt);
5936           scalar_dest = gimple_assign_lhs (current_stmt);
5937         }
5938 
5939       phis.create (3);
5940       /* Find the loop-closed-use at the loop exit of the original scalar
5941          result.  (The reduction result is expected to have two immediate uses -
5942          one at the latch block, and one at the loop exit).  */
5943       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5944         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5945 	    && !is_gimple_debug (USE_STMT (use_p)))
5946           phis.safe_push (USE_STMT (use_p));
5947 
5948       /* While we expect to have found an exit_phi because of loop-closed-ssa
5949          form we can end up without one if the scalar cycle is dead.  */
5950 
5951       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5952         {
5953           if (outer_loop)
5954             {
5955               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5956               gphi *vect_phi;
5957 
5958               /* FORNOW. Currently not supporting the case that an inner-loop
5959                  reduction is not used in the outer-loop (but only outside the
5960                  outer-loop), unless it is double reduction.  */
5961               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5962                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5963                           || double_reduc);
5964 
5965 	      if (double_reduc)
5966 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5967 	      else
5968 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5969               if (!double_reduc
5970                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5971                       != vect_double_reduction_def)
5972                 continue;
5973 
5974               /* Handle double reduction:
5975 
5976                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5977                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5978                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5979                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5980 
5981                  At that point the regular reduction (stmt2 and stmt3) is
5982                  already vectorized, as well as the exit phi node, stmt4.
5983                  Here we vectorize the phi node of double reduction, stmt1, and
5984                  update all relevant statements.  */
5985 
5986               /* Go through all the uses of s2 to find double reduction phi
5987                  node, i.e., stmt1 above.  */
5988               orig_name = PHI_RESULT (exit_phi);
5989               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5990                 {
5991                   stmt_vec_info use_stmt_vinfo;
5992                   stmt_vec_info new_phi_vinfo;
5993                   tree vect_phi_init, preheader_arg, vect_phi_res;
5994                   basic_block bb = gimple_bb (use_stmt);
5995 		  gimple *use;
5996 
5997                   /* Check that USE_STMT is really double reduction phi
5998                      node.  */
5999                   if (gimple_code (use_stmt) != GIMPLE_PHI
6000                       || gimple_phi_num_args (use_stmt) != 2
6001                       || bb->loop_father != outer_loop)
6002                     continue;
6003                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6004                   if (!use_stmt_vinfo
6005                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6006                           != vect_double_reduction_def)
6007 		    continue;
6008 
6009                   /* Create vector phi node for double reduction:
6010                      vs1 = phi <vs0, vs2>
6011                      vs1 was created previously in this function by a call to
6012                        vect_get_vec_def_for_operand and is stored in
6013                        vec_initial_def;
6014                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6015                      vs0 is created here.  */
6016 
6017                   /* Create vector phi node.  */
6018                   vect_phi = create_phi_node (vec_initial_def, bb);
6019                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6020                                     loop_vec_info_for_loop (outer_loop));
6021                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6022 
6023                   /* Create vs0 - initial def of the double reduction phi.  */
6024                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6025                                              loop_preheader_edge (outer_loop));
6026                   vect_phi_init = get_initial_def_for_reduction
6027 		    (stmt, preheader_arg, NULL);
6028 
6029                   /* Update phi node arguments with vs0 and vs2.  */
6030                   add_phi_arg (vect_phi, vect_phi_init,
6031                                loop_preheader_edge (outer_loop),
6032                                UNKNOWN_LOCATION);
6033                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6034                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6035                   if (dump_enabled_p ())
6036                     {
6037                       dump_printf_loc (MSG_NOTE, vect_location,
6038 				       "created double reduction phi node: ");
6039                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6040                     }
6041 
6042                   vect_phi_res = PHI_RESULT (vect_phi);
6043 
6044                   /* Replace the use, i.e., set the correct vs1 in the regular
6045                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6046                      loop is redundant.  */
6047                   use = reduction_phi;
6048                   for (j = 0; j < ncopies; j++)
6049                     {
6050                       edge pr_edge = loop_preheader_edge (loop);
6051                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6052                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6053                     }
6054                 }
6055             }
6056         }
6057 
6058       phis.release ();
6059       if (nested_in_vect_loop)
6060         {
6061           if (double_reduc)
6062             loop = outer_loop;
6063           else
6064             continue;
6065         }
6066 
6067       phis.create (3);
6068       /* Find the loop-closed-use at the loop exit of the original scalar
6069          result.  (The reduction result is expected to have two immediate uses,
6070          one at the latch block, and one at the loop exit).  For double
6071          reductions we are looking for exit phis of the outer loop.  */
6072       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6073         {
6074           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6075 	    {
6076 	      if (!is_gimple_debug (USE_STMT (use_p)))
6077 		phis.safe_push (USE_STMT (use_p));
6078 	    }
6079           else
6080             {
6081               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6082                 {
6083                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6084 
6085                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6086                     {
6087                       if (!flow_bb_inside_loop_p (loop,
6088                                              gimple_bb (USE_STMT (phi_use_p)))
6089 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
6090                         phis.safe_push (USE_STMT (phi_use_p));
6091                     }
6092                 }
6093             }
6094         }
6095 
6096       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6097         {
6098           /* Replace the uses:  */
6099           orig_name = PHI_RESULT (exit_phi);
6100           scalar_result = scalar_results[k];
6101           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6102             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6103               SET_USE (use_p, scalar_result);
6104         }
6105 
6106       phis.release ();
6107     }
6108 }
6109 
6110 /* Return a vector of type VECTYPE that is equal to the vector select
6111    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6112    before GSI.  */
6113 
6114 static tree
6115 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6116 		     tree vec, tree identity)
6117 {
6118   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6119   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6120 					  mask, vec, identity);
6121   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6122   return cond;
6123 }
6124 
6125 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6126    order, starting with LHS.  Insert the extraction statements before GSI and
6127    associate the new scalar SSA names with variable SCALAR_DEST.
6128    Return the SSA name for the result.  */
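/* A sketch of the generated sequence (the SSA names are illustrative),
   e.g. for CODE == PLUS_EXPR on a vector { a0, a1, ... }:

       s_0 = BIT_FIELD_REF <vector_rhs, bitsize, 0>;
       lhs_1 = lhs + s_0;
       s_1 = BIT_FIELD_REF <vector_rhs, bitsize, bitsize>;
       lhs_2 = lhs_1 + s_1;
       ...

   and the last lhs_N is returned.  */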
6129 
6130 static tree
6131 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6132 		       tree_code code, tree lhs, tree vector_rhs)
6133 {
6134   tree vectype = TREE_TYPE (vector_rhs);
6135   tree scalar_type = TREE_TYPE (vectype);
6136   tree bitsize = TYPE_SIZE (scalar_type);
6137   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6138   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6139 
6140   for (unsigned HOST_WIDE_INT bit_offset = 0;
6141        bit_offset < vec_size_in_bits;
6142        bit_offset += element_bitsize)
6143     {
6144       tree bitpos = bitsize_int (bit_offset);
6145       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6146 			 bitsize, bitpos);
6147 
6148       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6149       rhs = make_ssa_name (scalar_dest, stmt);
6150       gimple_assign_set_lhs (stmt, rhs);
6151       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6152 
6153       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6154       tree new_name = make_ssa_name (scalar_dest, stmt);
6155       gimple_assign_set_lhs (stmt, new_name);
6156       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6157       lhs = new_name;
6158     }
6159   return lhs;
6160 }
6161 
6162 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6163    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6164    statement.  CODE is the operation performed by STMT and OPS are
6165    its scalar operands.  REDUC_INDEX is the index of the operand in
6166    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6167    implements in-order reduction, or IFN_LAST if we should open-code it.
6168    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6169    that should be used to control the operation in a fully-masked loop.  */
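/* For intuition only (a sketch, not the exact IL): an in-order reduction
   of the elements { a0, a1, a2, a3 } starting from INIT computes

       res = (((INIT op a0) op a1) op a2) op a3

   preserving the scalar evaluation order, either through one REDUC_FN
   call per vector operand (e.g. IFN_FOLD_LEFT_PLUS) or, when REDUC_FN
   is IFN_LAST, by open-coding the chain via vect_expand_fold_left.  */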
6170 
6171 static bool
6172 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6173 			       gimple **vec_stmt, slp_tree slp_node,
6174 			       gimple *reduc_def_stmt,
6175 			       tree_code code, internal_fn reduc_fn,
6176 			       tree ops[3], tree vectype_in,
6177 			       int reduc_index, vec_loop_masks *masks)
6178 {
6179   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6180   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6181   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6182   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6183   gimple *new_stmt = NULL;
6184 
6185   int ncopies;
6186   if (slp_node)
6187     ncopies = 1;
6188   else
6189     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6190 
6191   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6192   gcc_assert (ncopies == 1);
6193   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6194   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6195   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6196 	      == FOLD_LEFT_REDUCTION);
6197 
6198   if (slp_node)
6199     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6200 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
6201 
6202   tree op0 = ops[1 - reduc_index];
6203 
6204   int group_size = 1;
6205   gimple *scalar_dest_def;
6206   auto_vec<tree> vec_oprnds0;
6207   if (slp_node)
6208     {
6209       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6210       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6211       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6212     }
6213   else
6214     {
6215       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6216       vec_oprnds0.create (1);
6217       vec_oprnds0.quick_push (loop_vec_def0);
6218       scalar_dest_def = stmt;
6219     }
6220 
6221   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6222   tree scalar_type = TREE_TYPE (scalar_dest);
6223   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6224 
6225   int vec_num = vec_oprnds0.length ();
6226   gcc_assert (vec_num == 1 || slp_node);
6227   tree vec_elem_type = TREE_TYPE (vectype_out);
6228   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6229 
6230   tree vector_identity = NULL_TREE;
6231   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6232     vector_identity = build_zero_cst (vectype_out);
6233 
6234   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6235   int i;
6236   tree def0;
6237   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6238     {
6239       tree mask = NULL_TREE;
6240       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6241 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6242 
6243       /* Handle MINUS by adding the negative.  */
6244       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6245 	{
6246 	  tree negated = make_ssa_name (vectype_out);
6247 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6248 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6249 	  def0 = negated;
6250 	}
6251 
6252       if (mask)
6253 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6254 				    vector_identity);
6255 
6256       /* On the first iteration the input is simply the scalar phi
6257 	 result, and for subsequent iterations it is the output of
6258 	 the preceding operation.  */
6259       if (reduc_fn != IFN_LAST)
6260 	{
6261 	  new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6262 	  /* For chained SLP reductions the output of the previous reduction
6263 	     operation serves as the input of the next. For the final statement
6264 	     the output cannot be a temporary - we reuse the original
6265 	     scalar destination of the last statement.  */
6266 	  if (i != vec_num - 1)
6267 	    {
6268 	      gimple_set_lhs (new_stmt, scalar_dest_var);
6269 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6270 	      gimple_set_lhs (new_stmt, reduc_var);
6271 	    }
6272 	}
6273       else
6274 	{
6275 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6276 					     reduc_var, def0);
6277 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6278 	  /* Remove the statement, so that we can use the same code paths
6279 	     as for statements that we've just created.  */
6280 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6281 	  gsi_remove (&tmp_gsi, true);
6282 	}
6283 
6284       if (i == vec_num - 1)
6285 	{
6286 	  gimple_set_lhs (new_stmt, scalar_dest);
6287 	  vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6288 	}
6289       else
6290 	vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6291 
6292       if (slp_node)
6293 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6294     }
6295 
6296   if (!slp_node)
6297     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6298 
6299   return true;
6300 }
6301 
6302 /* Function is_nonwrapping_integer_induction.
6303 
6304    Check if STMT (which is part of loop LOOP) is an integer induction
6305    that both increments and does not cause overflow.  */
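/* Informally, and only as a restatement of the check below: with constant
   BASE and STEP and at most NI executions of the statement, the induction
   can reach BASE + STEP * NI; we return true only if that value still fits
   in the precision of the PHI result type, or if overflow is undefined for
   that type anyway.  */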
6306 
6307 static bool
6308 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6309 {
6310   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6311   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6312   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6313   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6314   widest_int ni, max_loop_value, lhs_max;
6315   bool overflow = false;
6316 
6317   /* Make sure the loop is integer based.  */
6318   if (TREE_CODE (base) != INTEGER_CST
6319       || TREE_CODE (step) != INTEGER_CST)
6320     return false;
6321 
6322   /* Check that the max size of the loop will not wrap.  */
6323 
6324   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6325     return true;
6326 
6327   if (! max_stmt_executions (loop, &ni))
6328     return false;
6329 
6330   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6331 			    &overflow);
6332   if (overflow)
6333     return false;
6334 
6335   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6336 			    TYPE_SIGN (lhs_type), &overflow);
6337   if (overflow)
6338     return false;
6339 
6340   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6341 	  <= TYPE_PRECISION (lhs_type));
6342 }
6343 
6344 /* Function vectorizable_reduction.
6345 
6346    Check if STMT performs a reduction operation that can be vectorized.
6347    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6348    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6349    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6350 
6351    This function also handles reduction idioms (patterns) that have been
6352    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6353    of this form:
6354      X = pattern_expr (arg0, arg1, ..., X)
6355    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6356    sequence that had been detected and replaced by the pattern-stmt (STMT).
6357 
6358    This function also handles reduction of condition expressions, for example:
6359      for (int i = 0; i < N; i++)
6360        if (a[i] < value)
6361 	 last = a[i];
6362    This is handled by vectorising the loop and creating an additional vector
6363    containing the loop indexes for which "a[i] < value" was true.  In the
6364    function epilogue this is reduced to a single max value and then used to
6365    index into the vector of results.
6366 
6367    In some cases of reduction patterns, the type of the reduction variable X is
6368    different than the type of the other arguments of STMT.
6369    In such cases, the vectype that is used when transforming STMT into a vector
6370    stmt is different than the vectype that is used to determine the
6371    vectorization factor, because it consists of a different number of elements
6372    than the actual number of elements that are being operated upon in parallel.
6373 
6374    For example, consider an accumulation of shorts into an int accumulator.
6375    On some targets it's possible to vectorize this pattern operating on 8
6376    shorts at a time (hence, the vectype for purposes of determining the
6377    vectorization factor should be V8HI); on the other hand, the vectype that
6378    is used to create the vector form is actually V4SI (the type of the result).
6379 
6380    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6381    indicates what is the actual level of parallelism (V8HI in the example), so
6382    that the right vectorization factor would be derived.  This vectype
6383    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6384    be used to create the vectorized stmt.  The right vectype for the vectorized
6385    stmt is obtained from the type of the result X:
6386         get_vectype_for_scalar_type (TREE_TYPE (X))
6387 
6388    This means that, contrary to "regular" reductions (or "regular" stmts in
6389    general), the following equation:
6390       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6391    does *NOT* necessarily hold for reduction patterns.  */
6392 
6393 bool
6394 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6395 			gimple **vec_stmt, slp_tree slp_node,
6396 			slp_instance slp_node_instance)
6397 {
6398   tree vec_dest;
6399   tree scalar_dest;
6400   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6401   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6402   tree vectype_in = NULL_TREE;
6403   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6404   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6405   enum tree_code code, orig_code;
6406   internal_fn reduc_fn;
6407   machine_mode vec_mode;
6408   int op_type;
6409   optab optab;
6410   tree new_temp = NULL_TREE;
6411   gimple *def_stmt;
6412   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6413   gimple *cond_reduc_def_stmt = NULL;
6414   enum tree_code cond_reduc_op_code = ERROR_MARK;
6415   tree scalar_type;
6416   bool is_simple_use;
6417   gimple *orig_stmt;
6418   stmt_vec_info orig_stmt_info = NULL;
6419   int i;
6420   int ncopies;
6421   int epilog_copies;
6422   stmt_vec_info prev_stmt_info, prev_phi_info;
6423   bool single_defuse_cycle = false;
6424   gimple *new_stmt = NULL;
6425   int j;
6426   tree ops[3];
6427   enum vect_def_type dts[3];
6428   bool nested_cycle = false, found_nested_cycle_def = false;
6429   bool double_reduc = false;
6430   basic_block def_bb;
6431   struct loop * def_stmt_loop, *outer_loop = NULL;
6432   tree def_arg;
6433   gimple *def_arg_stmt;
6434   auto_vec<tree> vec_oprnds0;
6435   auto_vec<tree> vec_oprnds1;
6436   auto_vec<tree> vec_oprnds2;
6437   auto_vec<tree> vect_defs;
6438   auto_vec<gimple *> phis;
6439   int vec_num;
6440   tree def0, tem;
6441   bool first_p = true;
6442   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6443   tree cond_reduc_val = NULL_TREE;
6444 
6445   /* Make sure it was already recognized as a reduction computation.  */
6446   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6447       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6448     return false;
6449 
6450   if (nested_in_vect_loop_p (loop, stmt))
6451     {
6452       outer_loop = loop;
6453       loop = loop->inner;
6454       nested_cycle = true;
6455     }
6456 
6457   /* In case of reduction chain we switch to the first stmt in the chain, but
6458      we don't update STMT_INFO, since only the last stmt is marked as reduction
6459      and has reduction properties.  */
6460   if (GROUP_FIRST_ELEMENT (stmt_info)
6461       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6462     {
6463       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6464       first_p = false;
6465     }
6466 
6467   if (gimple_code (stmt) == GIMPLE_PHI)
6468     {
6469       /* Analysis is fully done on the reduction stmt invocation.  */
6470       if (! vec_stmt)
6471 	{
6472 	  if (slp_node)
6473 	    slp_node_instance->reduc_phis = slp_node;
6474 
6475 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6476 	  return true;
6477 	}
6478 
6479       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6480 	/* Leave the scalar phi in place.  Note that checking
6481 	   STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6482 	   for reductions involving a single statement.  */
6483 	return true;
6484 
6485       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6486       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6487 	reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6488 
6489       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6490 	  == EXTRACT_LAST_REDUCTION)
6491 	/* Leave the scalar phi in place.  */
6492 	return true;
6493 
6494       gcc_assert (is_gimple_assign (reduc_stmt));
6495       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6496 	{
6497 	  tree op = gimple_op (reduc_stmt, k);
6498 	  if (op == gimple_phi_result (stmt))
6499 	    continue;
6500 	  if (k == 1
6501 	      && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6502 	    continue;
6503 	  if (!vectype_in
6504 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6505 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6506 	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6507 	  break;
6508 	}
6509       gcc_assert (vectype_in);
6510 
6511       if (slp_node)
6512 	ncopies = 1;
6513       else
6514 	ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6515 
6516       use_operand_p use_p;
6517       gimple *use_stmt;
6518       if (ncopies > 1
6519 	  && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6520 	      <= vect_used_only_live)
6521 	  && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6522 	  && (use_stmt == reduc_stmt
6523 	      || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6524 		  == reduc_stmt)))
6525 	single_defuse_cycle = true;
6526 
6527       /* Create the destination vector  */
6528       scalar_dest = gimple_assign_lhs (reduc_stmt);
6529       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6530 
6531       if (slp_node)
6532 	/* The size vect_schedule_slp_instance computes is off for us.  */
6533 	vec_num = vect_get_num_vectors
6534 	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6535 	   * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6536 	   vectype_in);
6537       else
6538 	vec_num = 1;
6539 
6540       /* Generate the reduction PHIs upfront.  */
6541       prev_phi_info = NULL;
6542       for (j = 0; j < ncopies; j++)
6543 	{
6544 	  if (j == 0 || !single_defuse_cycle)
6545 	    {
6546 	      for (i = 0; i < vec_num; i++)
6547 		{
6548 		  /* Create the reduction-phi that defines the reduction
6549 		     operand.  */
6550 		  gimple *new_phi = create_phi_node (vec_dest, loop->header);
6551 		  set_vinfo_for_stmt (new_phi,
6552 				      new_stmt_vec_info (new_phi, loop_vinfo));
6553 
6554 		  if (slp_node)
6555 		    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6556 		  else
6557 		    {
6558 		      if (j == 0)
6559 			STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6560 		      else
6561 			STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6562 		      prev_phi_info = vinfo_for_stmt (new_phi);
6563 		    }
6564 		}
6565 	    }
6566 	}
6567 
6568       return true;
6569     }
6570 
6571   /* 1. Is vectorizable reduction?  */
6572   /* Not supportable if the reduction variable is used in the loop, unless
6573      it's a reduction chain.  */
6574   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6575       && !GROUP_FIRST_ELEMENT (stmt_info))
6576     return false;
6577 
6578   /* Reductions that are not used even in an enclosing outer-loop
6579      are expected to be "live" (used out of the loop).  */
6580   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6581       && !STMT_VINFO_LIVE_P (stmt_info))
6582     return false;
6583 
6584   /* 2. Has this been recognized as a reduction pattern?
6585 
6586      Check if STMT represents a pattern that has been recognized
6587      in earlier analysis stages.  For stmts that represent a pattern,
6588      the STMT_VINFO_RELATED_STMT field records the last stmt in
6589      the original sequence that constitutes the pattern.  */
6590 
6591   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6592   if (orig_stmt)
6593     {
6594       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6595       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6596       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6597     }
6598 
6599   /* 3. Check the operands of the operation.  The first operands are defined
6600         inside the loop body. The last operand is the reduction variable,
6601         which is defined by the loop-header-phi.  */
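  /* For instance (the names are illustrative): given the reduction statement
         sum_2 = _tmp + sum_1;
     with the loop-header phi  sum_1 = PHI <init, sum_2>,  the operand sum_1
     is the reduction variable and _tmp is defined inside the loop body.  */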
6602 
6603   gcc_assert (is_gimple_assign (stmt));
6604 
6605   /* Flatten RHS.  */
6606   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6607     {
6608     case GIMPLE_BINARY_RHS:
6609       code = gimple_assign_rhs_code (stmt);
6610       op_type = TREE_CODE_LENGTH (code);
6611       gcc_assert (op_type == binary_op);
6612       ops[0] = gimple_assign_rhs1 (stmt);
6613       ops[1] = gimple_assign_rhs2 (stmt);
6614       break;
6615 
6616     case GIMPLE_TERNARY_RHS:
6617       code = gimple_assign_rhs_code (stmt);
6618       op_type = TREE_CODE_LENGTH (code);
6619       gcc_assert (op_type == ternary_op);
6620       ops[0] = gimple_assign_rhs1 (stmt);
6621       ops[1] = gimple_assign_rhs2 (stmt);
6622       ops[2] = gimple_assign_rhs3 (stmt);
6623       break;
6624 
6625     case GIMPLE_UNARY_RHS:
6626       return false;
6627 
6628     default:
6629       gcc_unreachable ();
6630     }
6631 
6632   if (code == COND_EXPR && slp_node)
6633     return false;
6634 
6635   scalar_dest = gimple_assign_lhs (stmt);
6636   scalar_type = TREE_TYPE (scalar_dest);
6637   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6638       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6639     return false;
6640 
6641   /* Do not try to vectorize bit-precision reductions.  */
6642   if (!type_has_mode_precision_p (scalar_type))
6643     return false;
6644 
6645   /* All uses but the last are expected to be defined in the loop.
6646      The last use is the reduction variable.  In case of nested cycle this
6647      assumption is not true: we use reduc_index to record the index of the
6648      reduction variable.  */
6649   gimple *reduc_def_stmt = NULL;
6650   int reduc_index = -1;
6651   for (i = 0; i < op_type; i++)
6652     {
6653       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6654       if (i == 0 && code == COND_EXPR)
6655         continue;
6656 
6657       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6658 					  &def_stmt, &dts[i], &tem);
6659       dt = dts[i];
6660       gcc_assert (is_simple_use);
6661       if (dt == vect_reduction_def)
6662 	{
6663           reduc_def_stmt = def_stmt;
6664 	  reduc_index = i;
6665 	  continue;
6666 	}
6667       else if (tem)
6668 	{
6669 	  /* To properly compute ncopies we are interested in the widest
6670 	     input type in case we're looking at a widening accumulation.  */
6671 	  if (!vectype_in
6672 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6673 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6674 	    vectype_in = tem;
6675 	}
6676 
6677       if (dt != vect_internal_def
6678 	  && dt != vect_external_def
6679 	  && dt != vect_constant_def
6680 	  && dt != vect_induction_def
6681           && !(dt == vect_nested_cycle && nested_cycle))
6682 	return false;
6683 
6684       if (dt == vect_nested_cycle)
6685         {
6686           found_nested_cycle_def = true;
6687           reduc_def_stmt = def_stmt;
6688           reduc_index = i;
6689         }
6690 
6691       if (i == 1 && code == COND_EXPR)
6692 	{
6693 	  /* Record how value of COND_EXPR is defined.  */
6694 	  if (dt == vect_constant_def)
6695 	    {
6696 	      cond_reduc_dt = dt;
6697 	      cond_reduc_val = ops[i];
6698 	    }
6699 	  if (dt == vect_induction_def
6700 	      && def_stmt != NULL
6701 	      && is_nonwrapping_integer_induction (def_stmt, loop))
6702 	    {
6703 	      cond_reduc_dt = dt;
6704 	      cond_reduc_def_stmt = def_stmt;
6705 	    }
6706 	}
6707     }
6708 
6709   if (!vectype_in)
6710     vectype_in = vectype_out;
6711 
6712   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6713      directly used in stmt.  */
6714   if (reduc_index == -1)
6715     {
6716       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6717 	{
6718 	  if (dump_enabled_p ())
6719 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6720 			     "in-order reduction chain without SLP.\n");
6721 	  return false;
6722 	}
6723 
6724       if (orig_stmt)
6725 	reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6726       else
6727 	reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6728     }
6729 
6730   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6731     return false;
6732 
6733   if (!(reduc_index == -1
6734 	|| dts[reduc_index] == vect_reduction_def
6735 	|| dts[reduc_index] == vect_nested_cycle
6736 	|| ((dts[reduc_index] == vect_internal_def
6737 	     || dts[reduc_index] == vect_external_def
6738 	     || dts[reduc_index] == vect_constant_def
6739 	     || dts[reduc_index] == vect_induction_def)
6740 	    && nested_cycle && found_nested_cycle_def)))
6741     {
6742       /* For pattern recognized stmts, orig_stmt might be a reduction,
6743 	 but some helper statements for the pattern might not, or
6744 	 might be COND_EXPRs with reduction uses in the condition.  */
6745       gcc_assert (orig_stmt);
6746       return false;
6747     }
6748 
6749   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6750   enum vect_reduction_type v_reduc_type
6751     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6752   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6753 
6754   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6755   /* If we have a condition reduction, see if we can simplify it further.  */
6756   if (v_reduc_type == COND_REDUCTION)
6757     {
6758       /* TODO: We can't yet handle reduction chains, since we need to treat
6759 	 each COND_EXPR in the chain specially, not just the last one.
6760 	 E.g. for:
6761 
6762 	    x_1 = PHI <x_3, ...>
6763 	    x_2 = a_2 ? ... : x_1;
6764 	    x_3 = a_3 ? ... : x_2;
6765 
6766 	 we're interested in the last element in x_3 for which a_2 || a_3
6767 	 is true, whereas the current reduction chain handling would
6768 	 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6769 	 as a reduction operation.  */
6770       if (reduc_index == -1)
6771 	{
6772 	  if (dump_enabled_p ())
6773 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6774 			     "conditional reduction chains not supported\n");
6775 	  return false;
6776 	}
6777 
6778       /* vect_is_simple_reduction ensured that operand 2 is the
6779 	 loop-carried operand.  */
6780       gcc_assert (reduc_index == 2);
6781 
6782       /* Loop peeling modifies the initial value of the reduction PHI, which
6783 	 can make the reduction stmt to be transformed differ from the
6784 	 original stmt that was analyzed.  We therefore need to record the
6785 	 reduction code for a CONST_COND_REDUCTION type reduction at the
6786 	 analysis stage, so that it can be used directly at transform stage.  */
6787       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6788 	  || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6789 	{
6790 	  /* Also set the reduction type to CONST_COND_REDUCTION.  */
6791 	  gcc_assert (cond_reduc_dt == vect_constant_def);
6792 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6793 	}
6794       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6795 					       vectype_in, OPTIMIZE_FOR_SPEED))
6796 	{
6797 	  if (dump_enabled_p ())
6798 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 			     "optimizing condition reduction with"
6800 			     " FOLD_EXTRACT_LAST.\n");
6801 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6802 	}
6803       else if (cond_reduc_dt == vect_induction_def)
6804 	{
6805 	  stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6806 	  tree base
6807 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6808 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6809 
6810 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6811 		      && TREE_CODE (step) == INTEGER_CST);
6812 	  cond_reduc_val = NULL_TREE;
6813 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6814 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6815 	    ;
6816 	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6817 	     above base; punt if base is the minimum value of the type for
6818 	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6819 	  else if (tree_int_cst_sgn (step) == -1)
6820 	    {
6821 	      cond_reduc_op_code = MIN_EXPR;
6822 	      if (tree_int_cst_sgn (base) == -1)
6823 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6824 	      else if (tree_int_cst_lt (base,
6825 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6826 		cond_reduc_val
6827 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6828 	    }
6829 	  else
6830 	    {
6831 	      cond_reduc_op_code = MAX_EXPR;
6832 	      if (tree_int_cst_sgn (base) == 1)
6833 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6834 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6835 					base))
6836 		cond_reduc_val
6837 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6838 	    }
6839 	  if (cond_reduc_val)
6840 	    {
6841 	      if (dump_enabled_p ())
6842 		dump_printf_loc (MSG_NOTE, vect_location,
6843 				 "condition expression based on "
6844 				 "integer induction.\n");
6845 	      STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6846 		= INTEGER_INDUC_COND_REDUCTION;
6847 	    }
6848 	}
6849       else if (cond_reduc_dt == vect_constant_def)
6850 	{
6851 	  enum vect_def_type cond_initial_dt;
6852 	  gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6853 	  tree cond_initial_val
6854 	    = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6855 
6856 	  gcc_assert (cond_reduc_val != NULL_TREE);
6857 	  vect_is_simple_use (cond_initial_val, loop_vinfo,
6858 			      &def_stmt, &cond_initial_dt);
6859 	  if (cond_initial_dt == vect_constant_def
6860 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6861 				     TREE_TYPE (cond_reduc_val)))
6862 	    {
6863 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6864 				    cond_initial_val, cond_reduc_val);
6865 	      if (e && (integer_onep (e) || integer_zerop (e)))
6866 		{
6867 		  if (dump_enabled_p ())
6868 		    dump_printf_loc (MSG_NOTE, vect_location,
6869 				     "condition expression based on "
6870 				     "compile time constant.\n");
6871 		  /* Record reduction code at analysis stage.  */
6872 		  STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6873 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6874 		  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6875 		    = CONST_COND_REDUCTION;
6876 		}
6877 	    }
6878 	}
6879     }
6880 
6881   if (orig_stmt)
6882     gcc_assert (tmp == orig_stmt
6883 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6884   else
6885     /* We changed STMT to be the first stmt in reduction chain, hence we
6886        check that in this case the first element in the chain is STMT.  */
6887     gcc_assert (stmt == tmp
6888 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6889 
6890   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6891     return false;
6892 
6893   if (slp_node)
6894     ncopies = 1;
6895   else
6896     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6897 
6898   gcc_assert (ncopies >= 1);
6899 
6900   vec_mode = TYPE_MODE (vectype_in);
6901   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6902 
6903   if (code == COND_EXPR)
6904     {
6905       /* Only call during the analysis stage, otherwise we'll lose
6906 	 STMT_VINFO_TYPE.  */
6907       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6908 						ops[reduc_index], 0, NULL))
6909         {
6910           if (dump_enabled_p ())
6911 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6912 			     "unsupported condition in reduction\n");
6913 	  return false;
6914         }
6915     }
6916   else
6917     {
6918       /* 4. Supportable by target?  */
6919 
6920       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6921 	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
6922 	{
6923 	  /* Shifts and rotates are only supported by vectorizable_shifts,
6924 	     not vectorizable_reduction.  */
6925           if (dump_enabled_p ())
6926 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6927 			     "unsupported shift or rotation.\n");
6928 	  return false;
6929 	}
6930 
6931       /* 4.1. check support for the operation in the loop  */
6932       optab = optab_for_tree_code (code, vectype_in, optab_default);
6933       if (!optab)
6934         {
6935           if (dump_enabled_p ())
6936 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6937 			     "no optab.\n");
6938 
6939           return false;
6940         }
6941 
6942       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6943         {
6944           if (dump_enabled_p ())
6945             dump_printf (MSG_NOTE, "op not supported by target.\n");
6946 
6947 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6948 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6949             return false;
6950 
6951           if (dump_enabled_p ())
6952   	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6953         }
6954 
6955       /* Worthwhile without SIMD support?  */
6956       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6957 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6958         {
6959           if (dump_enabled_p ())
6960 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6961 			     "not worthwhile without SIMD support.\n");
6962 
6963           return false;
6964         }
6965     }
6966 
6967   /* 4.2. Check support for the epilog operation.
6968 
6969           If STMT represents a reduction pattern, then the type of the
6970           reduction variable may be different than the type of the rest
6971           of the arguments.  For example, consider the case of accumulation
6972           of shorts into an int accumulator.  The original code:
6973                         S1: int_a = (int) short_a;
6974           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6975 
6976           was replaced with:
6977                         STMT: int_acc = widen_sum <short_a, int_acc>
6978 
6979           This means that:
6980           1. The tree-code that is used to create the vector operation in the
6981              epilog code (that reduces the partial results) is not the
6982              tree-code of STMT, but is rather the tree-code of the original
6983              stmt from the pattern that STMT is replacing.  I.e, in the example
6984              above we want to use 'widen_sum' in the loop, but 'plus' in the
6985              epilog.
6986           2. The type (mode) we use to check available target support
6987              for the vector operation to be created in the *epilog*, is
6988              determined by the type of the reduction variable (in the example
6989              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6990              However the type (mode) we use to check available target support
6991              for the vector operation to be created *inside the loop*, is
6992              determined by the type of the other arguments to STMT (in the
6993              example we'd check this: optab_handler (widen_sum_optab,
6994 	     vect_short_mode)).
6995 
6996           This is contrary to "regular" reductions, in which the types of all
6997           the arguments are the same as the type of the reduction variable.
6998           For "regular" reductions we can therefore use the same vector type
6999           (and also the same tree-code) when generating the epilog code and
7000           when generating the code inside the loop.  */
7001 
7002   vect_reduction_type reduction_type
7003     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7004   if (orig_stmt
7005       && (reduction_type == TREE_CODE_REDUCTION
7006 	  || reduction_type == FOLD_LEFT_REDUCTION))
7007     {
7008       /* This is a reduction pattern: get the vectype from the type of the
7009          reduction variable, and get the tree-code from orig_stmt.  */
7010       orig_code = gimple_assign_rhs_code (orig_stmt);
7011       gcc_assert (vectype_out);
7012       vec_mode = TYPE_MODE (vectype_out);
7013     }
7014   else
7015     {
7016       /* Regular reduction: the same vectype and tree-code that are used for
7017          the vector code inside the loop can also be used for the epilog code.  */
7018       orig_code = code;
7019 
7020       if (code == MINUS_EXPR)
7021 	orig_code = PLUS_EXPR;
7022 
7023       /* For simple condition reductions, replace with the actual expression
7024 	 we want to base our reduction around.  */
7025       if (reduction_type == CONST_COND_REDUCTION)
7026 	{
7027 	  orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7028 	  gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7029 	}
7030       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7031 	orig_code = cond_reduc_op_code;
7032     }
7033 
7034   if (nested_cycle)
7035     {
7036       def_bb = gimple_bb (reduc_def_stmt);
7037       def_stmt_loop = def_bb->loop_father;
7038       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7039                                        loop_preheader_edge (def_stmt_loop));
7040       if (TREE_CODE (def_arg) == SSA_NAME
7041           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7042           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7043           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7044           && vinfo_for_stmt (def_arg_stmt)
7045           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7046               == vect_double_reduction_def)
7047         double_reduc = true;
7048     }
7049 
7050   reduc_fn = IFN_LAST;
7051 
7052   if (reduction_type == TREE_CODE_REDUCTION
7053       || reduction_type == FOLD_LEFT_REDUCTION
7054       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7055       || reduction_type == CONST_COND_REDUCTION)
7056     {
7057       if (reduction_type == FOLD_LEFT_REDUCTION
7058 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
7059 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7060 	{
7061 	  if (reduc_fn != IFN_LAST
7062 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7063 						  OPTIMIZE_FOR_SPEED))
7064 	    {
7065 	      if (dump_enabled_p ())
7066 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7067 				 "reduc op not supported by target.\n");
7068 
7069 	      reduc_fn = IFN_LAST;
7070 	    }
7071 	}
7072       else
7073 	{
7074 	  if (!nested_cycle || double_reduc)
7075 	    {
7076 	      if (dump_enabled_p ())
7077 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7078 				 "no reduc code for scalar code.\n");
7079 
7080 	      return false;
7081 	    }
7082 	}
7083     }
7084   else if (reduction_type == COND_REDUCTION)
7085     {
7086       int scalar_precision
7087 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7088       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7089       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7090 						nunits_out);
7091 
7092       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7093 					  OPTIMIZE_FOR_SPEED))
7094 	reduc_fn = IFN_REDUC_MAX;
7095     }
7096 
7097   if (reduction_type != EXTRACT_LAST_REDUCTION
7098       && reduc_fn == IFN_LAST
7099       && !nunits_out.is_constant ())
7100     {
7101       if (dump_enabled_p ())
7102 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 			 "missing target support for reduction on"
7104 			 " variable-length vectors.\n");
7105       return false;
7106     }
7107 
7108   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7109       && ncopies > 1)
7110     {
7111       if (dump_enabled_p ())
7112 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7113 			 "multiple types in double reduction or condition "
7114 			 "reduction.\n");
7115       return false;
7116     }
7117 
7118   /* For SLP reductions, see if there is a neutral value we can use.  */
7119   tree neutral_op = NULL_TREE;
7120   if (slp_node)
7121     neutral_op
7122       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7123 				      GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7124 
7125   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7126     {
7127       /* We can't support in-order reductions of code such as this:
7128 
7129 	   for (int i = 0; i < n1; ++i)
7130 	     for (int j = 0; j < n2; ++j)
7131 	       l += a[j];
7132 
7133 	 since GCC effectively transforms the loop when vectorizing:
7134 
7135 	   for (int i = 0; i < n1 / VF; ++i)
7136 	     for (int j = 0; j < n2; ++j)
7137 	       for (int k = 0; k < VF; ++k)
7138 		 l += a[j];
7139 
7140 	 which is a reassociation of the original operation.  */
7141       if (dump_enabled_p ())
7142 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143 			 "in-order double reduction not supported.\n");
7144 
7145       return false;
7146     }
7147 
7148   if (reduction_type == FOLD_LEFT_REDUCTION
7149       && slp_node
7150       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7151     {
7152       /* We cannot use in-order reductions in this case because there is
7153 	 an implicit reassociation of the operations involved.  */
7154       if (dump_enabled_p ())
7155 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7156 			 "in-order unchained SLP reductions not supported.\n");
7157       return false;
7158     }
7159 
7160   /* For double reductions, and for SLP reductions with a neutral value,
7161      we construct a variable-length initial vector by loading a vector
7162      full of the neutral value and then shift-and-inserting the start
7163      values into the low-numbered elements.  */
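  /* Rough illustration only (exact lane placement is target-specific):
     with neutral value N and start values i0 and i1 we would build
     { N, N, ..., N } and then apply IFN_VEC_SHL_INSERT twice to end up
     with something like { N, ..., N, i0, i1 }, i.e. the start values
     occupy the low-numbered elements.  */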
7164   if ((double_reduc || neutral_op)
7165       && !nunits_out.is_constant ()
7166       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7167 					  vectype_out, OPTIMIZE_FOR_SPEED))
7168     {
7169       if (dump_enabled_p ())
7170 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7171 			 "reduction on variable-length vectors requires"
7172 			 " target support for a vector-shift-and-insert"
7173 			 " operation.\n");
7174       return false;
7175     }
7176 
7177   /* Check extra constraints for variable-length unchained SLP reductions.  */
7178   if (STMT_SLP_TYPE (stmt_info)
7179       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7180       && !nunits_out.is_constant ())
7181     {
7182       /* We checked above that we could build the initial vector when
7183 	 there's a neutral element value.  Check here for the case in
7184 	 which each SLP statement has its own initial value and in which
7185 	 that value needs to be repeated for every instance of the
7186 	 statement within the initial vector.  */
7187       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7188       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7189       if (!neutral_op
7190 	  && !can_duplicate_and_interleave_p (group_size, elt_mode))
7191 	{
7192 	  if (dump_enabled_p ())
7193 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7194 			     "unsupported form of SLP reduction for"
7195 			     " variable-length vectors: cannot build"
7196 			     " initial vector.\n");
7197 	  return false;
7198 	}
7199       /* The epilogue code relies on the number of elements being a multiple
7200 	 of the group size.  The duplicate-and-interleave approach to setting
7201 	 up the initial vector does too.  */
7202       if (!multiple_p (nunits_out, group_size))
7203 	{
7204 	  if (dump_enabled_p ())
7205 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7206 			     "unsupported form of SLP reduction for"
7207 			     " variable-length vectors: the vector size"
7208 			     " is not a multiple of the number of results.\n");
7209 	  return false;
7210 	}
7211     }
7212 
7213   /* In case of widening multiplication by a constant, we update the type
7214      of the constant to be the type of the other operand.  We check that the
7215      constant fits the type in the pattern recognition pass.  */
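  /* Illustrative example (hypothetical source code): in a dot-product
     pattern such as one derived from "sum += (int) b[i] * 4" with b[]
     of type short, the constant 4 still has type int at this point, so
     it is folded below to a short constant to match the type of the
     other multiplication operand.  */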
7216   if (code == DOT_PROD_EXPR
7217       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7218     {
7219       if (TREE_CODE (ops[0]) == INTEGER_CST)
7220         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7221       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7222         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7223       else
7224         {
7225           if (dump_enabled_p ())
7226 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7227 			     "invalid types in dot-prod\n");
7228 
7229           return false;
7230         }
7231     }
7232 
7233   if (reduction_type == COND_REDUCTION)
7234     {
7235       widest_int ni;
7236 
7237       if (! max_loop_iterations (loop, &ni))
7238 	{
7239 	  if (dump_enabled_p ())
7240 	    dump_printf_loc (MSG_NOTE, vect_location,
7241 			     "loop count not known, cannot create cond "
7242 			     "reduction.\n");
7243 	  return false;
7244 	}
7245       /* Convert backedges to iterations.  */
7246       ni += 1;
7247 
7248       /* The additional index will be the same type as the condition.  Check
7249 	 that the loop iteration count fits into this type less one (because
7250 	 we'll use up the zero slot for when there are no matches).  */
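      /* Worked example (illustrative): if the index vector uses an
	 unsigned char element, max_index is 255, so only loops with at
	 most 254 iterations are accepted; anything larger is rejected
	 by the check below.  */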
7251       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7252       if (wi::geu_p (ni, wi::to_widest (max_index)))
7253 	{
7254 	  if (dump_enabled_p ())
7255 	    dump_printf_loc (MSG_NOTE, vect_location,
7256 			     "loop size is greater than data size.\n");
7257 	  return false;
7258 	}
7259     }
7260 
7261   /* In case the vectorization factor (VF) is bigger than the number
7262      of elements that we can fit in a vectype (nunits), we have to generate
7263      more than one vector stmt - i.e - we need to "unroll" the
7264      vector stmt by a factor VF/nunits.  For more details see documentation
7265      in vectorizable_operation.  */
7266 
7267   /* If the reduction is used in an outer loop we need to generate
7268      VF intermediate results, like so (e.g. for ncopies=2):
7269 	r0 = phi (init, r0)
7270 	r1 = phi (init, r1)
7271 	r0 = x0 + r0;
7272         r1 = x1 + r1;
7273     (i.e. we generate VF results in 2 registers).
7274     In this case we have a separate def-use cycle for each copy, and therefore
7275     for each copy we get the vector def for the reduction variable from the
7276     respective phi node created for this copy.
7277 
7278     Otherwise (the reduction is unused in the loop nest), we can combine
7279     together intermediate results, like so (e.g. for ncopies=2):
7280 	r = phi (init, r)
7281 	r = x0 + r;
7282 	r = x1 + r;
7283    (i.e. we generate VF/2 results in a single register).
7284    In this case for each copy we get the vector def for the reduction variable
7285    from the vectorized reduction operation generated in the previous iteration.
7286 
7287    This only works when we see both the reduction PHI and its only consumer
7288    in vectorizable_reduction and there are no intermediate stmts
7289    participating.  */
7290   use_operand_p use_p;
7291   gimple *use_stmt;
7292   if (ncopies > 1
7293       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7294       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7295       && (use_stmt == stmt
7296 	  || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7297     {
7298       single_defuse_cycle = true;
7299       epilog_copies = 1;
7300     }
7301   else
7302     epilog_copies = ncopies;
7303 
7304   /* If the reduction stmt is one of the patterns that have lane
7305      reduction embedded, we cannot handle the case of ! single_defuse_cycle.  */
7306   if ((ncopies > 1
7307        && ! single_defuse_cycle)
7308       && (code == DOT_PROD_EXPR
7309 	  || code == WIDEN_SUM_EXPR
7310 	  || code == SAD_EXPR))
7311     {
7312       if (dump_enabled_p ())
7313 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7314 			 "multi def-use cycle not possible for lane-reducing "
7315 			 "reduction operation\n");
7316       return false;
7317     }
7318 
7319   if (slp_node)
7320     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7321   else
7322     vec_num = 1;
7323 
7324   internal_fn cond_fn = get_conditional_internal_fn (code);
7325   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7326 
7327   if (!vec_stmt) /* transformation not required.  */
7328     {
7329       if (first_p)
7330 	vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7331       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7332 	{
7333 	  if (reduction_type != FOLD_LEFT_REDUCTION
7334 	      && (cond_fn == IFN_LAST
7335 		  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7336 						      OPTIMIZE_FOR_SPEED)))
7337 	    {
7338 	      if (dump_enabled_p ())
7339 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7340 				 "can't use a fully-masked loop because no"
7341 				 " conditional operation is available.\n");
7342 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7343 	    }
7344 	  else if (reduc_index == -1)
7345 	    {
7346 	      if (dump_enabled_p ())
7347 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7348 				 "can't use a fully-masked loop for chained"
7349 				 " reductions.\n");
7350 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7351 	    }
7352 	  else
7353 	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7354 				   vectype_in);
7355 	}
7356       if (dump_enabled_p ()
7357 	  && reduction_type == FOLD_LEFT_REDUCTION)
7358 	dump_printf_loc (MSG_NOTE, vect_location,
7359 			 "using an in-order (fold-left) reduction.\n");
7360       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7361       return true;
7362     }
7363 
7364   /* Transform.  */
7365 
7366   if (dump_enabled_p ())
7367     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7368 
7369   /* FORNOW: Multiple types are not supported for condition.  */
7370   if (code == COND_EXPR)
7371     gcc_assert (ncopies == 1);
7372 
7373   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7374 
7375   if (reduction_type == FOLD_LEFT_REDUCTION)
7376     return vectorize_fold_left_reduction
7377       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7378        reduc_fn, ops, vectype_in, reduc_index, masks);
7379 
7380   if (reduction_type == EXTRACT_LAST_REDUCTION)
7381     {
7382       gcc_assert (!slp_node);
7383       return vectorizable_condition (stmt, gsi, vec_stmt,
7384 				     NULL, reduc_index, NULL);
7385     }
7386 
7387   /* Create the destination vector  */
7388   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7389 
7390   prev_stmt_info = NULL;
7391   prev_phi_info = NULL;
7392   if (!slp_node)
7393     {
7394       vec_oprnds0.create (1);
7395       vec_oprnds1.create (1);
7396       if (op_type == ternary_op)
7397         vec_oprnds2.create (1);
7398     }
7399 
7400   phis.create (vec_num);
7401   vect_defs.create (vec_num);
7402   if (!slp_node)
7403     vect_defs.quick_push (NULL_TREE);
7404 
7405   if (slp_node)
7406     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7407   else
7408     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7409 
7410   for (j = 0; j < ncopies; j++)
7411     {
7412       if (code == COND_EXPR)
7413         {
7414           gcc_assert (!slp_node);
7415           vectorizable_condition (stmt, gsi, vec_stmt,
7416                                   PHI_RESULT (phis[0]),
7417                                   reduc_index, NULL);
7418           /* Multiple types are not supported for condition.  */
7419           break;
7420         }
7421 
7422       /* Handle uses.  */
7423       if (j == 0)
7424         {
7425 	  if (slp_node)
7426 	    {
7427 	      /* Get vec defs for all the operands except the reduction index,
7428 		 ensuring the ordering of the ops in the vector is kept.  */
7429 	      auto_vec<tree, 3> slp_ops;
7430 	      auto_vec<vec<tree>, 3> vec_defs;
7431 
7432 	      slp_ops.quick_push (ops[0]);
7433 	      slp_ops.quick_push (ops[1]);
7434 	      if (op_type == ternary_op)
7435 		slp_ops.quick_push (ops[2]);
7436 
7437 	      vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7438 
7439 	      vec_oprnds0.safe_splice (vec_defs[0]);
7440 	      vec_defs[0].release ();
7441 	      vec_oprnds1.safe_splice (vec_defs[1]);
7442 	      vec_defs[1].release ();
7443 	      if (op_type == ternary_op)
7444 		{
7445 		  vec_oprnds2.safe_splice (vec_defs[2]);
7446 		  vec_defs[2].release ();
7447 		}
7448 	    }
7449           else
7450 	    {
7451               vec_oprnds0.quick_push
7452 		(vect_get_vec_def_for_operand (ops[0], stmt));
7453               vec_oprnds1.quick_push
7454 		(vect_get_vec_def_for_operand (ops[1], stmt));
7455               if (op_type == ternary_op)
7456 		vec_oprnds2.quick_push
7457 		  (vect_get_vec_def_for_operand (ops[2], stmt));
7458 	    }
7459         }
7460       else
7461         {
7462           if (!slp_node)
7463             {
7464 	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7465 
7466 	      if (single_defuse_cycle && reduc_index == 0)
7467 		vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7468 	      else
7469 		vec_oprnds0[0]
7470 		  = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7471 	      if (single_defuse_cycle && reduc_index == 1)
7472 		vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7473 	      else
7474 		vec_oprnds1[0]
7475 		  = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7476 	      if (op_type == ternary_op)
7477 		{
7478 		  if (single_defuse_cycle && reduc_index == 2)
7479 		    vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7480 		  else
7481 		    vec_oprnds2[0]
7482 		      = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7483 		}
7484             }
7485         }
7486 
7487       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7488         {
7489 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7490 	  if (masked_loop_p)
7491 	    {
7492 	      /* Make sure that the reduction accumulator is vop[0].  */
7493 	      if (reduc_index == 1)
7494 		{
7495 		  gcc_assert (commutative_tree_code (code));
7496 		  std::swap (vop[0], vop[1]);
7497 		}
7498 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7499 					      vectype_in, i * ncopies + j);
7500 	      gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7501 							vop[0], vop[1]);
7502 	      new_temp = make_ssa_name (vec_dest, call);
7503 	      gimple_call_set_lhs (call, new_temp);
7504 	      gimple_call_set_nothrow (call, true);
7505 	      new_stmt = call;
7506 	    }
7507 	  else
7508 	    {
7509 	      if (op_type == ternary_op)
7510 		vop[2] = vec_oprnds2[i];
7511 
7512 	      new_temp = make_ssa_name (vec_dest, new_stmt);
7513 	      new_stmt = gimple_build_assign (new_temp, code,
7514 					      vop[0], vop[1], vop[2]);
7515 	    }
7516 	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
7517 
7518           if (slp_node)
7519             {
7520               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7521               vect_defs.quick_push (new_temp);
7522             }
7523           else
7524             vect_defs[0] = new_temp;
7525         }
7526 
7527       if (slp_node)
7528         continue;
7529 
7530       if (j == 0)
7531 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7532       else
7533 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7534 
7535       prev_stmt_info = vinfo_for_stmt (new_stmt);
7536     }
7537 
7538   /* Finalize the reduction-phi (set its arguments) and create the
7539      epilog reduction code.  */
7540   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7541     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7542 
7543   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7544 				    epilog_copies, reduc_fn, phis,
7545 				    double_reduc, slp_node, slp_node_instance,
7546 				    cond_reduc_val, cond_reduc_op_code,
7547 				    neutral_op);
7548 
7549   return true;
7550 }
7551 
7552 /* Function vect_min_worthwhile_factor.
7553 
7554    For a loop where we could vectorize the operation indicated by CODE,
7555    return the minimum vectorization factor that makes it worthwhile
7556    to use generic vectors.  */
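/* Illustrative reading of the table below (not part of the original
   comment): decomposing a PLUS_EXPR into scalar operations is only
   considered worthwhile for a vectorization factor of at least 4,
   bitwise operations already pay off at a factor of 2, and any other
   code is never considered worthwhile (INT_MAX).  */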
7557 static unsigned int
7558 vect_min_worthwhile_factor (enum tree_code code)
7559 {
7560   switch (code)
7561     {
7562     case PLUS_EXPR:
7563     case MINUS_EXPR:
7564     case NEGATE_EXPR:
7565       return 4;
7566 
7567     case BIT_AND_EXPR:
7568     case BIT_IOR_EXPR:
7569     case BIT_XOR_EXPR:
7570     case BIT_NOT_EXPR:
7571       return 2;
7572 
7573     default:
7574       return INT_MAX;
7575     }
7576 }
7577 
7578 /* Return true if VINFO indicates we are doing loop vectorization and if
7579    it is worth decomposing CODE operations into scalar operations for
7580    that loop's vectorization factor.  */
7581 
7582 bool
7583 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7584 {
7585   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7586   unsigned HOST_WIDE_INT value;
7587   return (loop_vinfo
7588 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7589 	  && value >= vect_min_worthwhile_factor (code));
7590 }
7591 
7592 /* Function vectorizable_induction
7593 
7594    Check if PHI performs an induction computation that can be vectorized.
7595    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7596    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7597    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7598 
7599 bool
7600 vectorizable_induction (gimple *phi,
7601 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7602 			gimple **vec_stmt, slp_tree slp_node)
7603 {
7604   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7605   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7606   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7607   unsigned ncopies;
7608   bool nested_in_vect_loop = false;
7609   struct loop *iv_loop;
7610   tree vec_def;
7611   edge pe = loop_preheader_edge (loop);
7612   basic_block new_bb;
7613   tree new_vec, vec_init, vec_step, t;
7614   tree new_name;
7615   gimple *new_stmt;
7616   gphi *induction_phi;
7617   tree induc_def, vec_dest;
7618   tree init_expr, step_expr;
7619   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7620   unsigned i;
7621   tree expr;
7622   gimple_seq stmts;
7623   imm_use_iterator imm_iter;
7624   use_operand_p use_p;
7625   gimple *exit_phi;
7626   edge latch_e;
7627   tree loop_arg;
7628   gimple_stmt_iterator si;
7629   basic_block bb = gimple_bb (phi);
7630 
7631   if (gimple_code (phi) != GIMPLE_PHI)
7632     return false;
7633 
7634   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7635     return false;
7636 
7637   /* Make sure it was recognized as induction computation.  */
7638   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7639     return false;
7640 
7641   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7642   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7643 
7644   if (slp_node)
7645     ncopies = 1;
7646   else
7647     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7648   gcc_assert (ncopies >= 1);
7649 
7650   /* FORNOW. These restrictions should be relaxed.  */
7651   if (nested_in_vect_loop_p (loop, phi))
7652     {
7653       imm_use_iterator imm_iter;
7654       use_operand_p use_p;
7655       gimple *exit_phi;
7656       edge latch_e;
7657       tree loop_arg;
7658 
7659       if (ncopies > 1)
7660 	{
7661 	  if (dump_enabled_p ())
7662 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7663 			     "multiple types in nested loop.\n");
7664 	  return false;
7665 	}
7666 
7667       /* FORNOW: outer loop induction with SLP not supported.  */
7668       if (STMT_SLP_TYPE (stmt_info))
7669 	return false;
7670 
7671       exit_phi = NULL;
7672       latch_e = loop_latch_edge (loop->inner);
7673       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7674       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7675 	{
7676 	  gimple *use_stmt = USE_STMT (use_p);
7677 	  if (is_gimple_debug (use_stmt))
7678 	    continue;
7679 
7680 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7681 	    {
7682 	      exit_phi = use_stmt;
7683 	      break;
7684 	    }
7685 	}
7686       if (exit_phi)
7687 	{
7688 	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7689 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7690 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7691 	    {
7692 	      if (dump_enabled_p ())
7693 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7694 				 "inner-loop induction only used outside "
7695 				 "of the outer vectorized loop.\n");
7696 	      return false;
7697 	    }
7698 	}
7699 
7700       nested_in_vect_loop = true;
7701       iv_loop = loop->inner;
7702     }
7703   else
7704     iv_loop = loop;
7705   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7706 
7707   if (slp_node && !nunits.is_constant ())
7708     {
7709       /* The current SLP code creates the initial value element-by-element.  */
7710       if (dump_enabled_p ())
7711 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7712 			 "SLP induction not supported for variable-length"
7713 			 " vectors.\n");
7714       return false;
7715     }
7716 
7717   if (!vec_stmt) /* transformation not required.  */
7718     {
7719       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7720       if (dump_enabled_p ())
7721         dump_printf_loc (MSG_NOTE, vect_location,
7722                          "=== vectorizable_induction ===\n");
7723       vect_model_induction_cost (stmt_info, ncopies);
7724       return true;
7725     }
7726 
7727   /* Transform.  */
7728 
7729   /* Compute a vector variable, initialized with the first VF values of
7730      the induction variable.  E.g., for an iv with IV_PHI='X' and
7731      evolution S, for a vector of 4 units, we want to compute:
7732      [X, X + S, X + 2*S, X + 3*S].  */
7733 
7734   if (dump_enabled_p ())
7735     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7736 
7737   latch_e = loop_latch_edge (iv_loop);
7738   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7739 
7740   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7741   gcc_assert (step_expr != NULL_TREE);
7742 
7743   pe = loop_preheader_edge (iv_loop);
7744   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7745 				     loop_preheader_edge (iv_loop));
7746 
7747   stmts = NULL;
7748   if (!nested_in_vect_loop)
7749     {
7750       /* Convert the initial value to the desired type.  */
7751       tree new_type = TREE_TYPE (vectype);
7752       init_expr = gimple_convert (&stmts, new_type, init_expr);
7753 
7754       /* If we are using the loop mask to "peel" for alignment then we need
7755 	 to adjust the start value here.  */
7756       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7757       if (skip_niters != NULL_TREE)
7758 	{
7759 	  if (FLOAT_TYPE_P (vectype))
7760 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7761 					skip_niters);
7762 	  else
7763 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7764 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7765 					 skip_niters, step_expr);
7766 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7767 				    init_expr, skip_step);
7768 	}
7769     }
7770 
7771   /* Convert the step to the desired type.  */
7772   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7773 
7774   if (stmts)
7775     {
7776       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7777       gcc_assert (!new_bb);
7778     }
7779 
7780   /* Find the first insertion point in the BB.  */
7781   si = gsi_after_labels (bb);
7782 
7783   /* For SLP induction we have to generate several IVs as for example
7784      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7785      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7786      [VF*S, VF*S, VF*S, VF*S] for all.  */
7787   if (slp_node)
7788     {
7789       /* Enforced above.  */
7790       unsigned int const_nunits = nunits.to_constant ();
7791 
7792       /* Generate [VF*S, VF*S, ... ].  */
7793       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7794 	{
7795 	  expr = build_int_cst (integer_type_node, vf);
7796 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7797 	}
7798       else
7799 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7800       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7801 			      expr, step_expr);
7802       if (! CONSTANT_CLASS_P (new_name))
7803 	new_name = vect_init_vector (phi, new_name,
7804 				     TREE_TYPE (step_expr), NULL);
7805       new_vec = build_vector_from_val (vectype, new_name);
7806       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7807 
7808       /* Now generate the IVs.  */
7809       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7810       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7811       unsigned elts = const_nunits * nvects;
7812       unsigned nivs = least_common_multiple (group_size,
7813 					     const_nunits) / const_nunits;
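      /* Worked example (illustrative): with group_size == 3 and
	 const_nunits == 4, least_common_multiple (3, 4) == 12, so
	 nivs == 3 distinct IVs are built here and any further copies
	 are derived from them below by adding vec_step.  */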
7814       gcc_assert (elts % group_size == 0);
7815       tree elt = init_expr;
7816       unsigned ivn;
7817       for (ivn = 0; ivn < nivs; ++ivn)
7818 	{
7819 	  tree_vector_builder elts (vectype, const_nunits, 1);
7820 	  stmts = NULL;
7821 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7822 	    {
7823 	      if (ivn*const_nunits + eltn >= group_size
7824 		  && (ivn * const_nunits + eltn) % group_size == 0)
7825 		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7826 				    elt, step_expr);
7827 	      elts.quick_push (elt);
7828 	    }
7829 	  vec_init = gimple_build_vector (&stmts, &elts);
7830 	  if (stmts)
7831 	    {
7832 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7833 	      gcc_assert (!new_bb);
7834 	    }
7835 
7836 	  /* Create the induction-phi that defines the induction-operand.  */
7837 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7838 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7839 	  set_vinfo_for_stmt (induction_phi,
7840 			      new_stmt_vec_info (induction_phi, loop_vinfo));
7841 	  induc_def = PHI_RESULT (induction_phi);
7842 
7843 	  /* Create the iv update inside the loop  */
7844 	  vec_def = make_ssa_name (vec_dest);
7845 	  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7846 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7847 	  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7848 
7849 	  /* Set the arguments of the phi node:  */
7850 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7851 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7852 		       UNKNOWN_LOCATION);
7853 
7854 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7855 	}
7856 
7857       /* Re-use IVs when we can.  */
7858       if (ivn < nvects)
7859 	{
7860 	  unsigned vfp
7861 	    = least_common_multiple (group_size, const_nunits) / group_size;
7862 	  /* Generate [VF'*S, VF'*S, ... ].  */
7863 	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7864 	    {
7865 	      expr = build_int_cst (integer_type_node, vfp);
7866 	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7867 	    }
7868 	  else
7869 	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7870 	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7871 				  expr, step_expr);
7872 	  if (! CONSTANT_CLASS_P (new_name))
7873 	    new_name = vect_init_vector (phi, new_name,
7874 					 TREE_TYPE (step_expr), NULL);
7875 	  new_vec = build_vector_from_val (vectype, new_name);
7876 	  vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7877 	  for (; ivn < nvects; ++ivn)
7878 	    {
7879 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7880 	      tree def;
7881 	      if (gimple_code (iv) == GIMPLE_PHI)
7882 		def = gimple_phi_result (iv);
7883 	      else
7884 		def = gimple_assign_lhs (iv);
7885 	      new_stmt = gimple_build_assign (make_ssa_name (vectype),
7886 					      PLUS_EXPR,
7887 					      def, vec_step);
7888 	      if (gimple_code (iv) == GIMPLE_PHI)
7889 		gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7890 	      else
7891 		{
7892 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7893 		  gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7894 		}
7895 	      set_vinfo_for_stmt (new_stmt,
7896 				  new_stmt_vec_info (new_stmt, loop_vinfo));
7897 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7898 	    }
7899 	}
7900 
7901       return true;
7902     }
7903 
7904   /* Create the vector that holds the initial_value of the induction.  */
7905   if (nested_in_vect_loop)
7906     {
7907       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7908 	 been created during vectorization of previous stmts.  We obtain it
7909 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7910       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7911       /* If the initial value is not of proper type, convert it.  */
7912       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7913 	{
7914 	  new_stmt
7915 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7916 							  vect_simple_var,
7917 							  "vec_iv_"),
7918 				   VIEW_CONVERT_EXPR,
7919 				   build1 (VIEW_CONVERT_EXPR, vectype,
7920 					   vec_init));
7921 	  vec_init = gimple_assign_lhs (new_stmt);
7922 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7923 						 new_stmt);
7924 	  gcc_assert (!new_bb);
7925 	  set_vinfo_for_stmt (new_stmt,
7926 			      new_stmt_vec_info (new_stmt, loop_vinfo));
7927 	}
7928     }
7929   else
7930     {
7931       /* iv_loop is the loop to be vectorized. Create:
7932 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7933       stmts = NULL;
7934       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7935 
7936       unsigned HOST_WIDE_INT const_nunits;
7937       if (nunits.is_constant (&const_nunits))
7938 	{
7939 	  tree_vector_builder elts (vectype, const_nunits, 1);
7940 	  elts.quick_push (new_name);
7941 	  for (i = 1; i < const_nunits; i++)
7942 	    {
7943 	      /* Create: new_name_i = new_name + step_expr  */
7944 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7945 				       new_name, step_expr);
7946 	      elts.quick_push (new_name);
7947 	    }
7948 	  /* Create a vector from [new_name_0, new_name_1, ...,
7949 	     new_name_nunits-1]  */
7950 	  vec_init = gimple_build_vector (&stmts, &elts);
7951 	}
7952       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7953 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7954 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7955 				 new_name, step_expr);
7956       else
7957 	{
7958 	  /* Build:
7959 	        [base, base, base, ...]
7960 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7961 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7962 	  gcc_assert (flag_associative_math);
7963 	  tree index = build_index_vector (vectype, 0, 1);
7964 	  tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7965 							new_name);
7966 	  tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7967 							step_expr);
7968 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7969 	  vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7970 				   vec_init, step_vec);
7971 	  vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7972 				   vec_init, base_vec);
7973 	}
7974 
7975       if (stmts)
7976 	{
7977 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7978 	  gcc_assert (!new_bb);
7979 	}
7980     }
7981 
7982 
7983   /* Create the vector that holds the step of the induction.  */
7984   if (nested_in_vect_loop)
7985     /* iv_loop is nested in the loop to be vectorized. Generate:
7986        vec_step = [S, S, S, S]  */
7987     new_name = step_expr;
7988   else
7989     {
7990       /* iv_loop is the loop to be vectorized. Generate:
7991 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7992       gimple_seq seq = NULL;
7993       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7994 	{
7995 	  expr = build_int_cst (integer_type_node, vf);
7996 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7997 	}
7998       else
7999 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
8000       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8001 			       expr, step_expr);
8002       if (seq)
8003 	{
8004 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8005 	  gcc_assert (!new_bb);
8006 	}
8007     }
8008 
8009   t = unshare_expr (new_name);
8010   gcc_assert (CONSTANT_CLASS_P (new_name)
8011 	      || TREE_CODE (new_name) == SSA_NAME);
8012   new_vec = build_vector_from_val (vectype, t);
8013   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8014 
8015 
8016   /* Create the following def-use cycle:
8017      loop prolog:
8018          vec_init = ...
8019 	 vec_step = ...
8020      loop:
8021          vec_iv = PHI <vec_init, vec_loop>
8022          ...
8023          STMT
8024          ...
8025          vec_loop = vec_iv + vec_step;  */
8026 
8027   /* Create the induction-phi that defines the induction-operand.  */
8028   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8029   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8030   set_vinfo_for_stmt (induction_phi,
8031 		      new_stmt_vec_info (induction_phi, loop_vinfo));
8032   induc_def = PHI_RESULT (induction_phi);
8033 
8034   /* Create the iv update inside the loop  */
8035   vec_def = make_ssa_name (vec_dest);
8036   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8037   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8038   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8039 
8040   /* Set the arguments of the phi node:  */
8041   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8042   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8043 	       UNKNOWN_LOCATION);
8044 
8045   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8046 
8047   /* In case the vectorization factor (VF) is bigger than the number
8048      of elements that we can fit in a vectype (nunits), we have to generate
8049      more than one vector stmt - i.e - we need to "unroll" the
8050      vector stmt by a factor VF/nunits.  For more details see documentation
8051      in vectorizable_operation.  */
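  /* Illustrative note (assumed figures): with nunits == 4, step S and
     ncopies == 2, the code below builds a step vector
     { 4*S, 4*S, 4*S, 4*S } and one extra IV copy offset by it, so the
     two vector IVs together cover eight consecutive induction values.  */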
8052 
8053   if (ncopies > 1)
8054     {
8055       gimple_seq seq = NULL;
8056       stmt_vec_info prev_stmt_vinfo;
8057       /* FORNOW. This restriction should be relaxed.  */
8058       gcc_assert (!nested_in_vect_loop);
8059 
8060       /* Create the vector that holds the step of the induction.  */
8061       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8062 	{
8063 	  expr = build_int_cst (integer_type_node, nunits);
8064 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8065 	}
8066       else
8067 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8068       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8069 			       expr, step_expr);
8070       if (seq)
8071 	{
8072 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8073 	  gcc_assert (!new_bb);
8074 	}
8075 
8076       t = unshare_expr (new_name);
8077       gcc_assert (CONSTANT_CLASS_P (new_name)
8078 		  || TREE_CODE (new_name) == SSA_NAME);
8079       new_vec = build_vector_from_val (vectype, t);
8080       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8081 
8082       vec_def = induc_def;
8083       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8084       for (i = 1; i < ncopies; i++)
8085 	{
8086 	  /* vec_i = vec_prev + vec_step  */
8087 	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8088 					  vec_def, vec_step);
8089 	  vec_def = make_ssa_name (vec_dest, new_stmt);
8090 	  gimple_assign_set_lhs (new_stmt, vec_def);
8091 
8092 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8093 	  set_vinfo_for_stmt (new_stmt,
8094 			      new_stmt_vec_info (new_stmt, loop_vinfo));
8095 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8096 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8097 	}
8098     }
8099 
8100   if (nested_in_vect_loop)
8101     {
8102       /* Find the loop-closed exit-phi of the induction, and record
8103          the final vector of induction results:  */
8104       exit_phi = NULL;
8105       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8106         {
8107 	  gimple *use_stmt = USE_STMT (use_p);
8108 	  if (is_gimple_debug (use_stmt))
8109 	    continue;
8110 
8111 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8112 	    {
8113 	      exit_phi = use_stmt;
8114 	      break;
8115 	    }
8116         }
8117       if (exit_phi)
8118 	{
8119 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8120 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
8121 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
8122 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8123 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
8124 
8125 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8126 	  if (dump_enabled_p ())
8127 	    {
8128 	      dump_printf_loc (MSG_NOTE, vect_location,
8129 			       "vector of inductions after inner-loop:");
8130 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8131 	    }
8132 	}
8133     }
8134 
8135 
8136   if (dump_enabled_p ())
8137     {
8138       dump_printf_loc (MSG_NOTE, vect_location,
8139 		       "transform induction: created def-use cycle: ");
8140       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8141       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8142 			SSA_NAME_DEF_STMT (vec_def), 0);
8143     }
8144 
8145   return true;
8146 }
8147 
8148 /* Function vectorizable_live_operation.
8149 
8150    STMT computes a value that is used outside the loop.  Check if
8151    it can be supported.  */
8152 
8153 bool
8154 vectorizable_live_operation (gimple *stmt,
8155 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8156 			     slp_tree slp_node, int slp_index,
8157 			     gimple **vec_stmt)
8158 {
8159   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8160   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8162   imm_use_iterator imm_iter;
8163   tree lhs, lhs_type, bitsize, vec_bitsize;
8164   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8165   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8166   int ncopies;
8167   gimple *use_stmt;
8168   auto_vec<tree> vec_oprnds;
8169   int vec_entry = 0;
8170   poly_uint64 vec_index = 0;
8171 
8172   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8173 
8174   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8175     return false;
8176 
8177   /* FORNOW.  CHECKME.  */
8178   if (nested_in_vect_loop_p (loop, stmt))
8179     return false;
8180 
8181   /* If STMT is not relevant and it is a simple assignment and its inputs are
8182      invariant then it can remain in place, unvectorized.  The original last
8183      scalar value that it computes will be used.  */
8184   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8185     {
8186       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8187       if (dump_enabled_p ())
8188 	dump_printf_loc (MSG_NOTE, vect_location,
8189 			 "statement is simple and uses invariant.  Leaving in "
8190 			 "place.\n");
8191       return true;
8192     }
8193 
8194   if (slp_node)
8195     ncopies = 1;
8196   else
8197     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8198 
8199   if (slp_node)
8200     {
8201       gcc_assert (slp_index >= 0);
8202 
8203       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8204       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8205 
8206       /* Get the last occurrence of the scalar index from the concatenation of
8207 	 all the slp vectors. Calculate which slp vector it is and the index
8208 	 within.  */
8209       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8210 
8211       /* Calculate which vector contains the result, and which lane of
8212 	 that vector we need.  */
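      /* Worked example (illustrative): with num_vec == 2, nunits == 4,
	 num_scalar == 3 and slp_index == 1, pos == 2 * 4 - 3 + 1 == 6,
	 so the live value is lane 2 (vec_index) of vector 1 (vec_entry).  */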
8213       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8214 	{
8215 	  if (dump_enabled_p ())
8216 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8217 			     "Cannot determine which vector holds the"
8218 			     " final result.\n");
8219 	  return false;
8220 	}
8221     }
8222 
8223   if (!vec_stmt)
8224     {
8225       /* No transformation required.  */
8226       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8227 	{
8228 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8229 					       OPTIMIZE_FOR_SPEED))
8230 	    {
8231 	      if (dump_enabled_p ())
8232 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8233 				 "can't use a fully-masked loop because "
8234 				 "the target doesn't support extract last "
8235 				 "reduction.\n");
8236 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8237 	    }
8238 	  else if (slp_node)
8239 	    {
8240 	      if (dump_enabled_p ())
8241 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8242 				 "can't use a fully-masked loop because an "
8243 				 "SLP statement is live after the loop.\n");
8244 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8245 	    }
8246 	  else if (ncopies > 1)
8247 	    {
8248 	      if (dump_enabled_p ())
8249 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8250 				 "can't use a fully-masked loop because"
8251 				 " ncopies is greater than 1.\n");
8252 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8253 	    }
8254 	  else
8255 	    {
8256 	      gcc_assert (ncopies == 1 && !slp_node);
8257 	      vect_record_loop_mask (loop_vinfo,
8258 				     &LOOP_VINFO_MASKS (loop_vinfo),
8259 				     1, vectype);
8260 	    }
8261 	}
8262       return true;
8263     }
8264 
8265   /* If stmt has a related stmt, then use that for getting the lhs.  */
8266   if (is_pattern_stmt_p (stmt_info))
8267     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8268 
8269   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8270 	: gimple_get_lhs (stmt);
8271   lhs_type = TREE_TYPE (lhs);
8272 
8273   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8274 	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8275 	     : TYPE_SIZE (TREE_TYPE (vectype)));
8276   vec_bitsize = TYPE_SIZE (vectype);
8277 
8278   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8279   tree vec_lhs, bitstart;
8280   if (slp_node)
8281     {
8282       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8283 
8284       /* Get the correct slp vectorized stmt.  */
8285       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8286       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8287 	vec_lhs = gimple_phi_result (phi);
8288       else
8289 	vec_lhs = gimple_get_lhs (vec_stmt);
8290 
8291       /* Get entry to use.  */
8292       bitstart = bitsize_int (vec_index);
8293       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8294     }
8295   else
8296     {
8297       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8298       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8299       gcc_checking_assert (ncopies == 1
8300 			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8301 
8302       /* For multiple copies, get the last copy.  */
8303       for (int i = 1; i < ncopies; ++i)
8304 	vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8305 						  vec_lhs);
8306 
8307       /* Get the last lane in the vector.  */
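      /* Illustrative example (assuming 128-bit vectors of 32-bit
	 elements): bitsize is 32 and vec_bitsize is 128, so bitstart
	 becomes 96 and the BIT_FIELD_REF built later reads the last
	 lane.  */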
8308       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8309     }
8310 
8311   gimple_seq stmts = NULL;
8312   tree new_tree;
8313   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8314     {
8315       /* Emit:
8316 
8317 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8318 
8319 	 where VEC_LHS is the vectorized live-out result and MASK is
8320 	 the loop mask for the final iteration.  */
8321       gcc_assert (ncopies == 1 && !slp_node);
8322       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8323       tree scalar_res = make_ssa_name (scalar_type);
8324       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8325 				      1, vectype, 0);
8326       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8327 						    2, mask, vec_lhs);
8328       gimple_call_set_lhs (new_stmt, scalar_res);
8329       gimple_seq_add_stmt (&stmts, new_stmt);
8330 
8331       /* Convert the extracted vector element to the required scalar type.  */
8332       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8333     }
8334   else
8335     {
8336       tree bftype = TREE_TYPE (vectype);
8337       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8338 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8339       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8340       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8341 				       &stmts, true, NULL_TREE);
8342     }
8343 
8344   if (stmts)
8345     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8346 
8347   /* Replace use of lhs with newly computed result.  If the use stmt is a
8348      single arg PHI, just replace all uses of the PHI result.  This is
8349      necessary because the lcssa PHI defining lhs may precede the new stmt.  */
8350   use_operand_p use_p;
8351   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8352     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8353 	&& !is_gimple_debug (use_stmt))
8354     {
8355       if (gimple_code (use_stmt) == GIMPLE_PHI
8356 	  && gimple_phi_num_args (use_stmt) == 1)
8357 	{
8358 	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8359 	}
8360       else
8361 	{
8362 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8363 	    SET_USE (use_p, new_tree);
8364 	}
8365       update_stmt (use_stmt);
8366     }
8367 
8368   return true;
8369 }
8370 
8371 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8372 
8373 static void
8374 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8375 {
8376   ssa_op_iter op_iter;
8377   imm_use_iterator imm_iter;
8378   def_operand_p def_p;
8379   gimple *ustmt;
8380 
8381   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8382     {
8383       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8384 	{
8385 	  basic_block bb;
8386 
8387 	  if (!is_gimple_debug (ustmt))
8388 	    continue;
8389 
8390 	  bb = gimple_bb (ustmt);
8391 
8392 	  if (!flow_bb_inside_loop_p (loop, bb))
8393 	    {
8394 	      if (gimple_debug_bind_p (ustmt))
8395 		{
8396 		  if (dump_enabled_p ())
8397 		    dump_printf_loc (MSG_NOTE, vect_location,
8398                                      "killing debug use\n");
8399 
8400 		  gimple_debug_bind_reset_value (ustmt);
8401 		  update_stmt (ustmt);
8402 		}
8403 	      else
8404 		gcc_unreachable ();
8405 	    }
8406 	}
8407     }
8408 }
8409 
8410 /* Given loop represented by LOOP_VINFO, return true if computation of
8411    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8412    otherwise.  */
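/* Illustrative note (not in the original comment): if NITERS has a
   32-bit unsigned type, NITERSM1 + 1 only wraps when NITERSM1 is
   0xffffffff, i.e. when the loop runs exactly 2^32 times; the checks
   below exclude that case either from the constant values or from the
   recorded upper bound on the iteration count.  */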
8413 
8414 static bool
8415 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8416 {
8417   /* Constant case.  */
8418   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8419     {
8420       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8421       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8422 
8423       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8424       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8425       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8426 	return true;
8427     }
8428 
8429   widest_int max;
8430   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8431   /* Check the upper bound of loop niters.  */
8432   if (get_max_loop_iterations (loop, &max))
8433     {
8434       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8435       signop sgn = TYPE_SIGN (type);
8436       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8437       if (max < type_max)
8438 	return true;
8439     }
8440   return false;
8441 }
8442 
8443 /* Return a mask type with half the number of elements as TYPE.  */
8444 
8445 tree
8446 vect_halve_mask_nunits (tree type)
8447 {
8448   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8449   return build_truth_vector_type (nunits, current_vector_size);
8450 }
8451 
8452 /* Return a mask type with twice as many elements as TYPE.  */
8453 
8454 tree
8455 vect_double_mask_nunits (tree type)
8456 {
8457   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8458   return build_truth_vector_type (nunits, current_vector_size);
8459 }
8460 
8461 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8462    contain a sequence of NVECTORS masks that each control a vector of type
8463    VECTYPE.  */
8464 
8465 void
8466 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8467 		       unsigned int nvectors, tree vectype)
8468 {
8469   gcc_assert (nvectors != 0);
8470   if (masks->length () < nvectors)
8471     masks->safe_grow_cleared (nvectors);
8472   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8473   /* The number of scalars per iteration and the number of vectors are
8474      both compile-time constants.  */
8475   unsigned int nscalars_per_iter
8476     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8477 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
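  /* Worked example (illustrative): for an rgroup with nvectors == 2,
     16 elements per vector and a vectorization factor of 16, this gives
     nscalars_per_iter == 2 * 16 / 16 == 2, i.e. the masks in this
     rgroup control two scalars per original iteration.  */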
8478   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8479     {
8480       rgm->max_nscalars_per_iter = nscalars_per_iter;
8481       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8482     }
8483 }
8484 
8485 /* Given a complete set of masks MASKS, extract mask number INDEX
8486    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8487    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8488 
8489    See the comment above vec_loop_masks for more details about the mask
8490    arrangement.  */
8491 
8492 tree
8493 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8494 		    unsigned int nvectors, tree vectype, unsigned int index)
8495 {
8496   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8497   tree mask_type = rgm->mask_type;
8498 
8499   /* Populate the rgroup's mask array, if this is the first time we've
8500      used it.  */
8501   if (rgm->masks.is_empty ())
8502     {
8503       rgm->masks.safe_grow_cleared (nvectors);
8504       for (unsigned int i = 0; i < nvectors; ++i)
8505 	{
8506 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8507 	  /* Provide a dummy definition until the real one is available.  */
8508 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8509 	  rgm->masks[i] = mask;
8510 	}
8511     }
8512 
8513   tree mask = rgm->masks[index];
8514   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8515 		TYPE_VECTOR_SUBPARTS (vectype)))
8516     {
8517       /* A loop mask for data type X can be reused for data type Y
8518 	 if X has N times more elements than Y and if Y's elements
8519 	 are N times bigger than X's.  In this case each sequence
8520 	 of N elements in the loop mask will be all-zero or all-one.
8521 	 We can then view-convert the mask so that each sequence of
8522 	 N elements is replaced by a single element.  */
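      /* Illustrative example (assumed element counts): a mask built for
	 8 16-bit elements can be reused for a vector of 4 32-bit
	 elements; each pair of mask elements is all-zero or all-one, so
	 the VIEW_CONVERT_EXPR below reinterprets it as a 4-element mask.  */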
8523       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8524 			      TYPE_VECTOR_SUBPARTS (vectype)));
8525       gimple_seq seq = NULL;
8526       mask_type = build_same_sized_truth_vector_type (vectype);
8527       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8528       if (seq)
8529 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8530     }
8531   return mask;
8532 }
8533 
8534 /* Scale profiling counters by estimation for LOOP which is vectorized
8535    by factor VF.  */
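/* Illustrative note (rough figures, not from the original comment): if
   the scalar loop was estimated to run about 16 times and VF is 4,
   niter_for_unrolled_loop returns roughly 4, so the exit probability
   below becomes about 1/5 and the body counts are scaled to match.  */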
8536 
8537 static void
8538 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8539 {
8540   edge preheader = loop_preheader_edge (loop);
8541   /* Reduce loop iterations by the vectorization factor.  */
8542   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8543   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8544 
8545   if (freq_h.nonzero_p ())
8546     {
8547       profile_probability p;
8548 
8549       /* Avoid dropping loop body profile counter to 0 because of zero count
8550 	 in loop's preheader.  */
8551       if (!(freq_e == profile_count::zero ()))
8552         freq_e = freq_e.force_nonzero ();
8553       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8554       scale_loop_frequencies (loop, p);
8555     }
8556 
8557   edge exit_e = single_exit (loop);
8558   exit_e->probability = profile_probability::always ()
8559 				 .apply_scale (1, new_est_niter + 1);
8560 
8561   edge exit_l = single_pred_edge (loop->latch);
8562   profile_probability prob = exit_l->probability;
8563   exit_l->probability = exit_e->probability.invert ();
8564   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8565     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8566 }
8567 
8568 /* Function vect_transform_loop.
8569 
8570    The analysis phase has determined that the loop is vectorizable.
8571    Vectorize the loop - create vectorized stmts to replace the scalar
8572    stmts in the loop, and update the loop exit condition.
8573    Returns scalar epilogue loop if any.  */
8574 
8575 struct loop *
8576 vect_transform_loop (loop_vec_info loop_vinfo)
8577 {
8578   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8579   struct loop *epilogue = NULL;
8580   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8581   int nbbs = loop->num_nodes;
8582   int i;
8583   tree niters_vector = NULL_TREE;
8584   tree step_vector = NULL_TREE;
8585   tree niters_vector_mult_vf = NULL_TREE;
8586   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8587   unsigned int lowest_vf = constant_lower_bound (vf);
8588   bool grouped_store;
8589   bool slp_scheduled = false;
8590   gimple *stmt, *pattern_stmt;
8591   gimple_seq pattern_def_seq = NULL;
8592   gimple_stmt_iterator pattern_def_si = gsi_none ();
8593   bool transform_pattern_stmt = false;
8594   bool check_profitability = false;
8595   unsigned int th;
8596 
8597   if (dump_enabled_p ())
8598     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8599 
8600   /* Use the more conservative vectorization threshold.  If the number
8601      of iterations is constant, assume the cost check has been performed
8602      by our caller.  If the threshold makes all loops that run at least
8603      the (estimated) vectorization factor number of times profitable,
8604      checking is pointless, too.  */
8605   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8606   if (th >= vect_vf_for_cost (loop_vinfo)
8607       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8608     {
8609       if (dump_enabled_p ())
8610 	dump_printf_loc (MSG_NOTE, vect_location,
8611 			 "Profitability threshold is %d loop iterations.\n",
8612                          th);
8613       check_profitability = true;
8614     }
8615 
8616   /* Make sure there exists a single-predecessor exit bb.  Do this before
8617      versioning.   */
8618   edge e = single_exit (loop);
8619   if (! single_pred_p (e->dest))
8620     {
8621       split_loop_exit_edge (e);
8622       if (dump_enabled_p ())
8623 	dump_printf (MSG_NOTE, "split exit edge\n");
8624     }
8625 
8626   /* Version the loop first, if required, so the profitability check
8627      comes first.  */
8628 
8629   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8630     {
8631       poly_uint64 versioning_threshold
8632 	= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8633       if (check_profitability
8634 	  && ordered_p (poly_uint64 (th), versioning_threshold))
8635 	{
8636 	  versioning_threshold = ordered_max (poly_uint64 (th),
8637 					      versioning_threshold);
8638 	  check_profitability = false;
8639 	}
8640       vect_loop_versioning (loop_vinfo, th, check_profitability,
8641 			    versioning_threshold);
8642       check_profitability = false;
8643     }
8644 
8645   /* Make sure there exists a single-predecessor exit bb also on the
8646      scalar loop copy.  Do this after versioning but before peeling
8647      so CFG structure is fine for both scalar and if-converted loop
8648      to make slpeel_duplicate_current_defs_from_edges face matched
8649      loop closed PHI nodes on the exit.  */
8650   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8651     {
8652       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8653       if (! single_pred_p (e->dest))
8654 	{
8655 	  split_loop_exit_edge (e);
8656 	  if (dump_enabled_p ())
8657 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8658 	}
8659     }
8660 
8661   tree niters = vect_build_loop_niters (loop_vinfo);
8662   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8663   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8664   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8665   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8666 			      &step_vector, &niters_vector_mult_vf, th,
8667 			      check_profitability, niters_no_overflow);
8668 
8669   if (niters_vector == NULL_TREE)
8670     {
8671       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8672 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8673 	  && known_eq (lowest_vf, vf))
8674 	{
8675 	  niters_vector
8676 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8677 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8678 	  step_vector = build_one_cst (TREE_TYPE (niters));
8679 	}
8680       else
8681 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8682 				     &step_vector, niters_no_overflow);
8683     }
8684 
8685   /* 1) Make sure the loop header has exactly two entries
8686      2) Make sure we have a preheader basic block.  */
8687 
8688   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8689 
8690   split_edge (loop_preheader_edge (loop));
8691 
8692   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8693       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8694     /* This will deal with any possible peeling.  */
8695     vect_prepare_for_masked_peels (loop_vinfo);
8696 
8697   /* FORNOW: the vectorizer supports only loops whose body consists
8698      of one basic block (header + empty latch).  When the vectorizer
8699      supports more involved loop forms, the order in which the BBs are
8700      traversed will need to be reconsidered.  */
8701 
8702   for (i = 0; i < nbbs; i++)
8703     {
8704       basic_block bb = bbs[i];
8705       stmt_vec_info stmt_info;
8706 
8707       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8708 	   gsi_next (&si))
8709         {
8710 	  gphi *phi = si.phi ();
8711 	  if (dump_enabled_p ())
8712 	    {
8713 	      dump_printf_loc (MSG_NOTE, vect_location,
8714                                "------>vectorizing phi: ");
8715 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8716 	    }
8717 	  stmt_info = vinfo_for_stmt (phi);
8718 	  if (!stmt_info)
8719 	    continue;
8720 
8721 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8722 	    vect_loop_kill_debug_uses (loop, phi);
8723 
8724 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8725 	      && !STMT_VINFO_LIVE_P (stmt_info))
8726 	    continue;
8727 
8728 	  if (STMT_VINFO_VECTYPE (stmt_info)
8729 	      && (maybe_ne
8730 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8731 	      && dump_enabled_p ())
8732 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8733 
8734 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8735 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8736 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8737 	      && ! PURE_SLP_STMT (stmt_info))
8738 	    {
8739 	      if (dump_enabled_p ())
8740 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8741 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8742 	    }
8743 	}
8744 
8745       pattern_stmt = NULL;
8746       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8747 	   !gsi_end_p (si) || transform_pattern_stmt;)
8748 	{
8749 	  bool is_store;
8750 
8751           if (transform_pattern_stmt)
8752 	    stmt = pattern_stmt;
8753           else
8754 	    {
8755 	      stmt = gsi_stmt (si);
8756 	      /* During vectorization remove existing clobber stmts.  */
8757 	      if (gimple_clobber_p (stmt))
8758 		{
8759 		  unlink_stmt_vdef (stmt);
8760 		  gsi_remove (&si, true);
8761 		  release_defs (stmt);
8762 		  continue;
8763 		}
8764 	    }
8765 
8766 	  if (dump_enabled_p ())
8767 	    {
8768 	      dump_printf_loc (MSG_NOTE, vect_location,
8769 			       "------>vectorizing statement: ");
8770 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8771 	    }
8772 
8773 	  stmt_info = vinfo_for_stmt (stmt);
8774 
8775 	  /* vector stmts created in the outer-loop during vectorization of
8776 	     stmts in an inner-loop may not have a stmt_info, and do not
8777 	     need to be vectorized.  */
8778 	  if (!stmt_info)
8779 	    {
8780 	      gsi_next (&si);
8781 	      continue;
8782 	    }
8783 
8784 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8785 	    vect_loop_kill_debug_uses (loop, stmt);
8786 
8787 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8788 	      && !STMT_VINFO_LIVE_P (stmt_info))
8789             {
8790               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8791                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8792                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8793                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8794                 {
8795                   stmt = pattern_stmt;
8796                   stmt_info = vinfo_for_stmt (stmt);
8797                 }
8798               else
8799 	        {
8800    	          gsi_next (&si);
8801 	          continue;
8802                 }
8803 	    }
8804           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8805                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8806                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8807                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8808             transform_pattern_stmt = true;
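	  /* E.g. for a stmt recognized as part of a pattern (say a widening
	     multiply), what gets vectorized is the related pattern stmt and,
	     below, any stmts in its pattern def sequence, rather than the
	     original scalar stmt itself.  */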
8809 
8810 	  /* If pattern statement has def stmts, vectorize them too.  */
8811 	  if (is_pattern_stmt_p (stmt_info))
8812 	    {
8813 	      if (pattern_def_seq == NULL)
8814 		{
8815 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8816 		  pattern_def_si = gsi_start (pattern_def_seq);
8817 		}
8818 	      else if (!gsi_end_p (pattern_def_si))
8819 		gsi_next (&pattern_def_si);
8820 	      if (pattern_def_seq != NULL)
8821 		{
8822 		  gimple *pattern_def_stmt = NULL;
8823 		  stmt_vec_info pattern_def_stmt_info = NULL;
8824 
8825 		  while (!gsi_end_p (pattern_def_si))
8826 		    {
8827 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
8828 		      pattern_def_stmt_info
8829 			= vinfo_for_stmt (pattern_def_stmt);
8830 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8831 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8832 			break;
8833 		      gsi_next (&pattern_def_si);
8834 		    }
8835 
8836 		  if (!gsi_end_p (pattern_def_si))
8837 		    {
8838 		      if (dump_enabled_p ())
8839 			{
8840 			  dump_printf_loc (MSG_NOTE, vect_location,
8841 					   "==> vectorizing pattern def "
8842 					   "stmt: ");
8843 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8844 					    pattern_def_stmt, 0);
8845 			}
8846 
8847 		      stmt = pattern_def_stmt;
8848 		      stmt_info = pattern_def_stmt_info;
8849 		    }
8850 		  else
8851 		    {
8852 		      pattern_def_si = gsi_none ();
8853 		      transform_pattern_stmt = false;
8854 		    }
8855 		}
8856 	      else
8857 		transform_pattern_stmt = false;
8858             }
8859 
8860 	  if (STMT_VINFO_VECTYPE (stmt_info))
8861 	    {
8862 	      poly_uint64 nunits
8863 		= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8864 	      if (!STMT_SLP_TYPE (stmt_info)
8865 		  && maybe_ne (nunits, vf)
8866 		  && dump_enabled_p ())
8867 		  /* For SLP, VF is set according to the unrolling factor, not
8868 		     the vector size, so this diagnostic is not valid for SLP.  */
8869 		dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8870 	    }
8871 
8872 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
8873 	     reached.  */
8874 	  if (STMT_SLP_TYPE (stmt_info))
8875 	    {
8876 	      if (!slp_scheduled)
8877 		{
8878 		  slp_scheduled = true;
8879 
8880 		  if (dump_enabled_p ())
8881 		    dump_printf_loc (MSG_NOTE, vect_location,
8882 				     "=== scheduling SLP instances ===\n");
8883 
8884 		  vect_schedule_slp (loop_vinfo);
8885 		}
8886 
8887 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8888 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8889 		{
8890 		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8891 		    {
8892 		      pattern_def_seq = NULL;
8893 		      gsi_next (&si);
8894 		    }
8895 		  continue;
8896 		}
8897 	    }
8898 
8899 	  /* -------- vectorize statement ------------ */
8900 	  if (dump_enabled_p ())
8901 	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8902 
8903 	  grouped_store = false;
8904 	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8905           if (is_store)
8906             {
8907 	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8908 		{
8909 		  /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8910 		     interleaving chain was completed - free all the stores in
8911 		     the chain.  */
8912 		  gsi_next (&si);
8913 		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8914 		}
8915 	      else
8916 		{
8917 		  /* Free the attached stmt_vec_info and remove the stmt.  */
8918 		  gimple *store = gsi_stmt (si);
8919 		  free_stmt_vec_info (store);
8920 		  unlink_stmt_vdef (store);
8921 		  gsi_remove (&si, true);
8922 		  release_defs (store);
8923 		}
8924 
8925 	      /* Stores can only appear at the end of pattern statements.  */
8926 	      gcc_assert (!transform_pattern_stmt);
8927 	      pattern_def_seq = NULL;
8928 	    }
8929 	  else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8930 	    {
8931 	      pattern_def_seq = NULL;
8932 	      gsi_next (&si);
8933 	    }
8934 	}		        /* stmts in BB */
8935 
8936       /* Stub out scalar statements that must not survive vectorization.
8937 	 Doing this here helps with grouped statements, or statements that
8938 	 are involved in patterns.  */
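      /* E.g. a scalar  lhs_1 = MASK_LOAD (ptr, align, mask)  whose result
	 is not a vector (and is no longer needed after vectorization) is
	 replaced by  lhs_1 = 0  below.  */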
8939       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8940 	   !gsi_end_p (gsi); gsi_next (&gsi))
8941 	{
8942 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8943 	  if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8944 	    {
8945 	      tree lhs = gimple_get_lhs (call);
8946 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8947 		{
8948 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
8949 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
8950 		  gsi_replace (&gsi, new_stmt, true);
8951 		}
8952 	    }
8953 	}
8954     }				/* BBs in loop */
8955 
8956   /* The vectorization factor is always > 1, so if we use an IV increment
8957      of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8958   if (integer_onep (step_vector))
8959     niters_no_overflow = true;
8960   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8961 			   niters_vector_mult_vf, !niters_no_overflow);
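  /* The exit test of the vector loop now uses an IV that runs from 0 to
     NITERS_VECTOR in steps of STEP_VECTOR (step 1 for ordinary loops; for
     fully-masked loops the IV counts scalar iterations and steps by the
     full vectorization factor) instead of the original scalar test.  */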
8962 
8963   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8964   scale_profile_for_vect_loop (loop, assumed_vf);
8965 
8966   /* True if the final iteration might not handle a full vector's
8967      worth of scalar iterations.  */
8968   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8969   /* The minimum number of iterations performed by the epilogue.  This
8970      is 1 when peeling for gaps because we always need a final scalar
8971      iteration.  */
8972   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8973   /* +1 to convert latch counts to loop iteration counts,
8974      -min_epilogue_iters to remove iterations that cannot be performed
8975        by the vector code.  */
8976   int bias_for_lowest = 1 - min_epilogue_iters;
8977   int bias_for_assumed = bias_for_lowest;
8978   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8979   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8980     {
8981       /* When the amount of peeling is known at compile time, the first
8982 	 iteration will have exactly alignment_npeels active elements.
8983 	 In the worst case it will have at least one.  */
8984       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8985       bias_for_lowest += lowest_vf - min_first_active;
8986       bias_for_assumed += assumed_vf - min_first_active;
8987     }
8988   /* In these calculations the "- 1" converts loop iteration counts
8989      back to latch counts.  */
8990   if (loop->any_upper_bound)
8991     loop->nb_iterations_upper_bound
8992       = (final_iter_may_be_partial
8993 	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8994 			  lowest_vf) - 1
8995 	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8996 			   lowest_vf) - 1);
8997   if (loop->any_likely_upper_bound)
8998     loop->nb_iterations_likely_upper_bound
8999       = (final_iter_may_be_partial
9000 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9001 			  + bias_for_lowest, lowest_vf) - 1
9002 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9003 			   + bias_for_lowest, lowest_vf) - 1);
9004   if (loop->any_estimate)
9005     loop->nb_iterations_estimate
9006       = (final_iter_may_be_partial
9007 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9008 			  assumed_vf) - 1
9009 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9010 			   assumed_vf) - 1);
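  /* For instance, if the scalar loop's latch bound was 99 (at most 100
     iterations), VF is 4, no epilogue iterations are required and the loop
     is not fully masked, then BIAS_FOR_LOWEST is 1 and the vector loop's
     new latch bound becomes (99 + 1) / 4 - 1 = 24.  */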
9011 
9012   if (dump_enabled_p ())
9013     {
9014       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9015 	{
9016 	  dump_printf_loc (MSG_NOTE, vect_location,
9017 			   "LOOP VECTORIZED\n");
9018 	  if (loop->inner)
9019 	    dump_printf_loc (MSG_NOTE, vect_location,
9020 			     "OUTER LOOP VECTORIZED\n");
9021 	  dump_printf (MSG_NOTE, "\n");
9022 	}
9023       else
9024 	{
9025 	  dump_printf_loc (MSG_NOTE, vect_location,
9026 			   "LOOP EPILOGUE VECTORIZED (VS=");
9027 	  dump_dec (MSG_NOTE, current_vector_size);
9028 	  dump_printf (MSG_NOTE, ")\n");
9029 	}
9030     }
9031 
9032   /* Free SLP instances here because otherwise stmt reference counting
9033      won't work.  */
9034   slp_instance instance;
9035   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9036     vect_free_slp_instance (instance);
9037   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9038   /* Clear the safelen field since its value is no longer valid after
9039      vectorization: the vectorized loop can have loop-carried dependences.  */
9040   loop->safelen = 0;
9041 
9042   /* Don't vectorize epilogue for epilogue.  */
9043   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9044     epilogue = NULL;
9045 
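  /* Likewise when unmasked epilogue vectorization has been disabled with
     --param vect-epilogues-nomask=0.  */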
9046   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9047     epilogue = NULL;
9048 
9049   if (epilogue)
9050     {
9051       auto_vector_sizes vector_sizes;
9052       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9053       unsigned int next_size = 0;
9054 
9055       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9056 	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9057 	  && known_eq (vf, lowest_vf))
9058 	{
9059 	  unsigned int eiters
9060 	    = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9061 	       - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9062 	  eiters = eiters % lowest_vf;
9063 	  epilogue->nb_iterations_upper_bound = eiters - 1;
9064 
9065 	  unsigned int ratio;
9066 	  while (next_size < vector_sizes.length ()
9067 		 && !(constant_multiple_p (current_vector_size,
9068 					   vector_sizes[next_size], &ratio)
9069 		      && eiters >= lowest_vf / ratio))
9070 	    next_size += 1;
9071 	}
9072       else
9073 	while (next_size < vector_sizes.length ()
9074 	       && maybe_lt (current_vector_size, vector_sizes[next_size]))
9075 	  next_size += 1;
9076 
9077       if (next_size == vector_sizes.length ())
9078 	epilogue = NULL;
9079     }
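  /* E.g. on a target providing both 32-byte and 16-byte vectors, a main
     loop vectorized with 32-byte vectors can have its epilogue re-analyzed
     with the next smaller (16-byte) size when enough scalar iterations may
     remain; if no suitable smaller size exists, the epilogue is dropped.  */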
9080 
9081   if (epilogue)
9082     {
9083       epilogue->force_vectorize = loop->force_vectorize;
9084       epilogue->safelen = loop->safelen;
9085       epilogue->dont_vectorize = false;
9086 
9087       /* We may need to if-convert the epilogue to vectorize it.  */
9088       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9089 	tree_if_conversion (epilogue);
9090     }
9091 
9092   return epilogue;
9093 }
9094 
9095 /* The code below performs a simple optimization: it reverts if-conversion
9096    for masked stores, i.e. if the mask of a store is all zeros, the store
9097    is skipped, and so are the producers of the stored values where possible.
9098    For example,
9099      for (i=0; i<n; i++)
9100        if (c[i])
9101 	{
9102 	  p1[i] += 1;
9103 	  p2[i] = p3[i] + 2;
9104 	}
9105    this transformation will produce the following semi-hammock:
9106 
9107    if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9108      {
9109        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9110        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9111        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9112        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9113        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9114        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9115      }
9116 */
9117 
9118 void
9119 optimize_mask_stores (struct loop *loop)
9120 {
9121   basic_block *bbs = get_loop_body (loop);
9122   unsigned nbbs = loop->num_nodes;
9123   unsigned i;
9124   basic_block bb;
9125   struct loop *bb_loop;
9126   gimple_stmt_iterator gsi;
9127   gimple *stmt;
9128   auto_vec<gimple *> worklist;
9129 
9130   vect_location = find_loop_location (loop);
9131   /* Pick up all masked stores in loop if any.  */
9132   for (i = 0; i < nbbs; i++)
9133     {
9134       bb = bbs[i];
9135       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9136 	   gsi_next (&gsi))
9137 	{
9138 	  stmt = gsi_stmt (gsi);
9139 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9140 	    worklist.safe_push (stmt);
9141 	}
9142     }
9143 
9144   free (bbs);
9145   if (worklist.is_empty ())
9146     return;
9147 
9148   /* Loop has masked stores.  */
9149   while (!worklist.is_empty ())
9150     {
9151       gimple *last, *last_store;
9152       edge e, efalse;
9153       tree mask;
9154       basic_block store_bb, join_bb;
9155       gimple_stmt_iterator gsi_to;
9156       tree vdef, new_vdef;
9157       gphi *phi;
9158       tree vectype;
9159       tree zero;
9160 
9161       last = worklist.pop ();
9162       mask = gimple_call_arg (last, 2);
9163       bb = gimple_bb (last);
9164       /* Create then_bb and an if-then structure in the CFG; then_bb
9165 	 belongs to the same loop as if_bb.  That loop can differ from LOOP
9166 	 when a two-level loop nest is vectorized and the mask_store belongs
9167 	 to the inner loop.  */
9168       e = split_block (bb, last);
9169       bb_loop = bb->loop_father;
9170       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9171       join_bb = e->dest;
9172       store_bb = create_empty_bb (bb);
9173       add_bb_to_loop (store_bb, bb_loop);
9174       e->flags = EDGE_TRUE_VALUE;
9175       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9176       /* Put STORE_BB on the unlikely path.  */
9177       efalse->probability = profile_probability::unlikely ();
9178       store_bb->count = efalse->count ();
9179       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9180       if (dom_info_available_p (CDI_DOMINATORS))
9181 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9182       if (dump_enabled_p ())
9183 	dump_printf_loc (MSG_NOTE, vect_location,
9184 			 "Create new block %d to sink mask stores.\n",
9185 			 store_bb->index);
9186       /* Create vector comparison with boolean result.  */
9187       vectype = TREE_TYPE (mask);
9188       zero = build_zero_cst (vectype);
9189       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9190       gsi = gsi_last_bb (bb);
9191       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
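      /* The emitted guard is  if (mask == { 0, ... })  with the true edge
	 going straight to JOIN_BB, so STORE_BB is entered only when at
	 least one mask element is set.  */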
9192       /* Create new PHI node for vdef of the last masked store:
9193 	 .MEM_2 = VDEF <.MEM_1>
9194 	 will be converted to
9195 	 .MEM_3 = VDEF <.MEM_1>
9196 	 and new PHI node will be created in join bb
9197 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
9198       */
9199       vdef = gimple_vdef (last);
9200       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9201       gimple_set_vdef (last, new_vdef);
9202       phi = create_phi_node (vdef, join_bb);
9203       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9204 
9205       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9206       while (true)
9207 	{
9208 	  gimple_stmt_iterator gsi_from;
9209 	  gimple *stmt1 = NULL;
9210 
9211 	  /* Move masked store to STORE_BB.  */
9212 	  last_store = last;
9213 	  gsi = gsi_for_stmt (last);
9214 	  gsi_from = gsi;
9215 	  /* Shift GSI to the previous stmt for further traversal.  */
9216 	  gsi_prev (&gsi);
9217 	  gsi_to = gsi_start_bb (store_bb);
9218 	  gsi_move_before (&gsi_from, &gsi_to);
9219 	  /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9220 	  gsi_to = gsi_start_bb (store_bb);
9221 	  if (dump_enabled_p ())
9222 	    {
9223 	      dump_printf_loc (MSG_NOTE, vect_location,
9224 			       "Move stmt to created bb\n");
9225 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9226 	    }
9227 	  /* Move all stored value producers if possible.  */
9228 	  while (!gsi_end_p (gsi))
9229 	    {
9230 	      tree lhs;
9231 	      imm_use_iterator imm_iter;
9232 	      use_operand_p use_p;
9233 	      bool res;
9234 
9235 	      /* Skip debug statements.  */
9236 	      if (is_gimple_debug (gsi_stmt (gsi)))
9237 		{
9238 		  gsi_prev (&gsi);
9239 		  continue;
9240 		}
9241 	      stmt1 = gsi_stmt (gsi);
9242 	      /* Do not consider statements writing to memory or having
9243 		 a volatile operand.  */
9244 	      if (gimple_vdef (stmt1)
9245 		  || gimple_has_volatile_ops (stmt1))
9246 		break;
9247 	      gsi_from = gsi;
9248 	      gsi_prev (&gsi);
9249 	      lhs = gimple_get_lhs (stmt1);
9250 	      if (!lhs)
9251 		break;
9252 
9253 	      /* LHS of vectorized stmt must be SSA_NAME.  */
9254 	      if (TREE_CODE (lhs) != SSA_NAME)
9255 		break;
9256 
9257 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9258 		{
9259 		  /* Remove dead scalar statement.  */
9260 		  if (has_zero_uses (lhs))
9261 		    {
9262 		      gsi_remove (&gsi_from, true);
9263 		      continue;
9264 		    }
9265 		}
9266 
9267 	      /* Check that LHS does not have uses outside of STORE_BB.  */
9268 	      res = true;
9269 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9270 		{
9271 		  gimple *use_stmt;
9272 		  use_stmt = USE_STMT (use_p);
9273 		  if (is_gimple_debug (use_stmt))
9274 		    continue;
9275 		  if (gimple_bb (use_stmt) != store_bb)
9276 		    {
9277 		      res = false;
9278 		      break;
9279 		    }
9280 		}
9281 	      if (!res)
9282 		break;
9283 
9284 	      if (gimple_vuse (stmt1)
9285 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
9286 		break;
9287 
9288 	      /* Can move STMT1 to STORE_BB.  */
9289 	      if (dump_enabled_p ())
9290 		{
9291 		  dump_printf_loc (MSG_NOTE, vect_location,
9292 				   "Move stmt to created bb\n");
9293 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9294 		}
9295 	      gsi_move_before (&gsi_from, &gsi_to);
9296 	      /* Shift GSI_TO for further insertion.  */
9297 	      gsi_prev (&gsi_to);
9298 	    }
9299 	  /* Put other masked stores with the same mask to STORE_BB.  */
9300 	  if (worklist.is_empty ()
9301 	      || gimple_call_arg (worklist.last (), 2) != mask
9302 	      || worklist.last () != stmt1)
9303 	    break;
9304 	  last = worklist.pop ();
9305 	}
9306       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9307     }
9308 }
9309