/* Loop Vectorization
   Copyright (C) 2003-2016 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "cfgloop.h"
#include "params.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N];  short b[N]; short c[N];   int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

        The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   successfully passed the analysis phase.
        Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
        The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

        During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
        The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts that use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

        For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

        Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
        Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support several vector sizes will, for now, need to
   specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
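
/* As an illustrative sketch (not a quote from any particular caller), the
   target-support check described above for a V8HImode addition amounts to:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support - the stmt cannot be vectorized ...  */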
148 
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
178 {
179   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
181   unsigned nbbs = loop->num_nodes;
182   unsigned int vectorization_factor = 0;
183   tree scalar_type;
184   gphi *phi;
185   tree vectype;
186   unsigned int nunits;
187   stmt_vec_info stmt_info;
188   unsigned i;
189   HOST_WIDE_INT dummy;
190   gimple *stmt, *pattern_stmt = NULL;
191   gimple_seq pattern_def_seq = NULL;
192   gimple_stmt_iterator pattern_def_si = gsi_none ();
193   bool analyze_pattern_stmt = false;
194   bool bool_result;
195   auto_vec<stmt_vec_info> mask_producers;
196 
197   if (dump_enabled_p ())
198     dump_printf_loc (MSG_NOTE, vect_location,
199                      "=== vect_determine_vectorization_factor ===\n");
200 
201   for (i = 0; i < nbbs; i++)
202     {
203       basic_block bb = bbs[i];
204 
205       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
206 	   gsi_next (&si))
207 	{
208 	  phi = si.phi ();
209 	  stmt_info = vinfo_for_stmt (phi);
210 	  if (dump_enabled_p ())
211 	    {
212 	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
213 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
214 	      dump_printf (MSG_NOTE, "\n");
215 	    }
216 
217 	  gcc_assert (stmt_info);
218 
219 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
220 	      || STMT_VINFO_LIVE_P (stmt_info))
221             {
222 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
223               scalar_type = TREE_TYPE (PHI_RESULT (phi));
224 
225 	      if (dump_enabled_p ())
226 		{
227 		  dump_printf_loc (MSG_NOTE, vect_location,
228                                    "get vectype for scalar type:  ");
229 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
230                   dump_printf (MSG_NOTE, "\n");
231 		}
232 
233 	      vectype = get_vectype_for_scalar_type (scalar_type);
234 	      if (!vectype)
235 		{
236 		  if (dump_enabled_p ())
237 		    {
238 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
239                                        "not vectorized: unsupported "
240                                        "data-type ");
241 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
242                                          scalar_type);
243                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
244 		    }
245 		  return false;
246 		}
247 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
248 
249 	      if (dump_enabled_p ())
250 		{
251 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
252 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
253                   dump_printf (MSG_NOTE, "\n");
254 		}
255 
256 	      nunits = TYPE_VECTOR_SUBPARTS (vectype);
257 	      if (dump_enabled_p ())
258 		dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
259                                  nunits);
260 
261 	      if (!vectorization_factor
262 		  || (nunits > vectorization_factor))
263 		vectorization_factor = nunits;
264 	    }
265 	}
266 
267       for (gimple_stmt_iterator si = gsi_start_bb (bb);
268 	   !gsi_end_p (si) || analyze_pattern_stmt;)
269         {
270           tree vf_vectype;
271 
272           if (analyze_pattern_stmt)
273 	    stmt = pattern_stmt;
274           else
275             stmt = gsi_stmt (si);
276 
277           stmt_info = vinfo_for_stmt (stmt);
278 
279 	  if (dump_enabled_p ())
280 	    {
281 	      dump_printf_loc (MSG_NOTE, vect_location,
282                                "==> examining statement: ");
283 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
284               dump_printf (MSG_NOTE, "\n");
285 	    }
286 
287 	  gcc_assert (stmt_info);
288 
289 	  /* Skip stmts which do not need to be vectorized.  */
290 	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
291 	       && !STMT_VINFO_LIVE_P (stmt_info))
292 	      || gimple_clobber_p (stmt))
293             {
294               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
295                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
296                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
297                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
298                 {
299                   stmt = pattern_stmt;
300                   stmt_info = vinfo_for_stmt (pattern_stmt);
301                   if (dump_enabled_p ())
302                     {
303                       dump_printf_loc (MSG_NOTE, vect_location,
304                                        "==> examining pattern statement: ");
305                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
306                       dump_printf (MSG_NOTE, "\n");
307                     }
308                 }
309               else
310 	        {
311 	          if (dump_enabled_p ())
312 	            dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
313                   gsi_next (&si);
314 	          continue;
315                 }
316 	    }
317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
321             analyze_pattern_stmt = true;
322 
323 	  /* If a pattern statement has def stmts, analyze them too.  */
324 	  if (is_pattern_stmt_p (stmt_info))
325 	    {
326 	      if (pattern_def_seq == NULL)
327 		{
328 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
329 		  pattern_def_si = gsi_start (pattern_def_seq);
330 		}
331 	      else if (!gsi_end_p (pattern_def_si))
332 		gsi_next (&pattern_def_si);
333 	      if (pattern_def_seq != NULL)
334 		{
335 		  gimple *pattern_def_stmt = NULL;
336 		  stmt_vec_info pattern_def_stmt_info = NULL;
337 
338 		  while (!gsi_end_p (pattern_def_si))
339 		    {
340 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
341 		      pattern_def_stmt_info
342 			= vinfo_for_stmt (pattern_def_stmt);
343 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
344 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
345 			break;
346 		      gsi_next (&pattern_def_si);
347 		    }
348 
349 		  if (!gsi_end_p (pattern_def_si))
350 		    {
351 		      if (dump_enabled_p ())
352 			{
353 			  dump_printf_loc (MSG_NOTE, vect_location,
354                                            "==> examining pattern def stmt: ");
355 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
356                                             pattern_def_stmt, 0);
357                           dump_printf (MSG_NOTE, "\n");
358 			}
359 
360 		      stmt = pattern_def_stmt;
361 		      stmt_info = pattern_def_stmt_info;
362 		    }
363 		  else
364 		    {
365 		      pattern_def_si = gsi_none ();
366 		      analyze_pattern_stmt = false;
367 		    }
368 		}
369 	      else
370 		analyze_pattern_stmt = false;
371 	    }
372 
373 	  if (gimple_get_lhs (stmt) == NULL_TREE
374 	      /* MASK_STORE has no lhs, but is ok.  */
375 	      && (!is_gimple_call (stmt)
376 		  || !gimple_call_internal_p (stmt)
377 		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
378 	    {
379 	      if (is_gimple_call (stmt))
380 		{
		  /* Ignore calls with no lhs.  These must be calls to
		     #pragma omp simd functions, and the vectorization factor
		     they really need can't be determined until
		     vectorizable_simd_clone_call.  */
385 		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
386 		    {
387 		      pattern_def_seq = NULL;
388 		      gsi_next (&si);
389 		    }
390 		  continue;
391 		}
392 	      if (dump_enabled_p ())
393 		{
394 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395                                    "not vectorized: irregular stmt.");
396 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
397                                     0);
398                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
399 		}
400 	      return false;
401 	    }
402 
403 	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
404 	    {
405 	      if (dump_enabled_p ())
406 	        {
407 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408                                    "not vectorized: vector stmt in loop:");
409 	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
410                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
411 	        }
412 	      return false;
413 	    }
414 
415 	  bool_result = false;
416 
417 	  if (STMT_VINFO_VECTYPE (stmt_info))
418 	    {
	      /* The only cases in which a vectype has already been set are
		 stmts that contain a dataref, or "pattern stmts" (stmts
		 generated by the vectorizer to represent/replace a certain
		 idiom).  */
423 	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 			  || is_pattern_stmt_p (stmt_info)
425 			  || !gsi_end_p (pattern_def_si));
426 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
427 	    }
428 	  else
429 	    {
430 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 	      if (is_gimple_call (stmt)
432 		  && gimple_call_internal_p (stmt)
433 		  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
434 		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
435 	      else
436 		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
437 
438 	      /* Bool ops don't participate in vectorization factor
439 		 computation.  For comparison use compared types to
440 		 compute a factor.  */
441 	      if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
442 		  && is_gimple_assign (stmt)
443 		  && gimple_assign_rhs_code (stmt) != COND_EXPR)
444 		{
445 		  if (STMT_VINFO_RELEVANT_P (stmt_info)
446 		      || STMT_VINFO_LIVE_P (stmt_info))
447 		    mask_producers.safe_push (stmt_info);
448 		  bool_result = true;
449 
450 		  if (gimple_code (stmt) == GIMPLE_ASSIGN
451 		      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
452 			 == tcc_comparison
453 		      && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
454 			 != BOOLEAN_TYPE)
455 		    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
456 		  else
457 		    {
458 		      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
459 			{
460 			  pattern_def_seq = NULL;
461 			  gsi_next (&si);
462 			}
463 		      continue;
464 		    }
465 		}
466 
467 	      if (dump_enabled_p ())
468 		{
469 		  dump_printf_loc (MSG_NOTE, vect_location,
470                                    "get vectype for scalar type:  ");
471 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
472                   dump_printf (MSG_NOTE, "\n");
473 		}
474 	      vectype = get_vectype_for_scalar_type (scalar_type);
475 	      if (!vectype)
476 		{
477 		  if (dump_enabled_p ())
478 		    {
479 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480                                        "not vectorized: unsupported "
481                                        "data-type ");
482 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
483                                          scalar_type);
484                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
485 		    }
486 		  return false;
487 		}
488 
489 	      if (!bool_result)
490 		STMT_VINFO_VECTYPE (stmt_info) = vectype;
491 
492 	      if (dump_enabled_p ())
493 		{
494 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
495 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
496                   dump_printf (MSG_NOTE, "\n");
497 		}
498             }
499 
	  /* Don't try to compute the VF out of scalar types if the stmt
	     produces a boolean vector.  Use the result vectype instead.  */
502 	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
503 	    vf_vectype = vectype;
504 	  else
505 	    {
506 	      /* The vectorization factor is according to the smallest
507 		 scalar type (or the largest vector size, but we only
508 		 support one vector size per loop).  */
509 	      if (!bool_result)
510 		scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
511 							     &dummy);
512 	      if (dump_enabled_p ())
513 		{
514 		  dump_printf_loc (MSG_NOTE, vect_location,
515 				   "get vectype for scalar type:  ");
516 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
517 		  dump_printf (MSG_NOTE, "\n");
518 		}
519 	      vf_vectype = get_vectype_for_scalar_type (scalar_type);
520 	    }
521 	  if (!vf_vectype)
522 	    {
523 	      if (dump_enabled_p ())
524 		{
525 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
526                                    "not vectorized: unsupported data-type ");
527 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
528                                      scalar_type);
529                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 		}
531 	      return false;
532 	    }
533 
534 	  if ((GET_MODE_SIZE (TYPE_MODE (vectype))
535 	       != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
536 	    {
537 	      if (dump_enabled_p ())
538 		{
539 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540                                    "not vectorized: different sized vector "
541                                    "types in statement, ");
542 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543                                      vectype);
544 		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
545 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
546                                      vf_vectype);
547                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
548 		}
549 	      return false;
550 	    }
551 
552 	  if (dump_enabled_p ())
553 	    {
554 	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
555 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
556               dump_printf (MSG_NOTE, "\n");
557 	    }
558 
559 	  nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
560 	  if (dump_enabled_p ())
561 	    dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
562 	  if (!vectorization_factor
563 	      || (nunits > vectorization_factor))
564 	    vectorization_factor = nunits;
565 
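	  /* Advance to the next scalar stmt only if we are not in the middle
	     of analyzing a pattern stmt or its pattern def sequence.  */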
566 	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
567 	    {
568 	      pattern_def_seq = NULL;
569 	      gsi_next (&si);
570 	    }
571         }
572     }
573 
574   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
575   if (dump_enabled_p ())
576     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
577                      vectorization_factor);
578   if (vectorization_factor <= 1)
579     {
580       if (dump_enabled_p ())
581         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
582                          "not vectorized: unsupported data-type\n");
583       return false;
584     }
585   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
586 
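  /* Now that the vectorization factor is final, compute a suitable mask
     (boolean vector) type for each mask-producing stmt whose vectype
     computation was deferred above.  */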
587   for (i = 0; i < mask_producers.length (); i++)
588     {
589       tree mask_type = NULL;
590 
591       stmt = STMT_VINFO_STMT (mask_producers[i]);
592 
593       if (gimple_code (stmt) == GIMPLE_ASSIGN
594 	  && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
595 	  && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
596 	{
597 	  scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
598 	  mask_type = get_mask_type_for_scalar_type (scalar_type);
599 
600 	  if (!mask_type)
601 	    {
602 	      if (dump_enabled_p ())
603 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
604 				 "not vectorized: unsupported mask\n");
605 	      return false;
606 	    }
607 	}
608       else
609 	{
610 	  tree rhs;
611 	  ssa_op_iter iter;
612 	  gimple *def_stmt;
613 	  enum vect_def_type dt;
614 
615 	  FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
616 	    {
617 	      if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
618 				       &def_stmt, &dt, &vectype))
619 		{
620 		  if (dump_enabled_p ())
621 		    {
622 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
623 				       "not vectorized: can't compute mask type "
624 				       "for statement, ");
625 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
626 					0);
627 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
628 		    }
629 		  return false;
630 		}
631 
	      /* No vectype probably means an external definition.
		 Allow it in case there is another operand which
		 allows us to determine the mask type.  */
635 	      if (!vectype)
636 		continue;
637 
638 	      if (!mask_type)
639 		mask_type = vectype;
640 	      else if (TYPE_VECTOR_SUBPARTS (mask_type)
641 		       != TYPE_VECTOR_SUBPARTS (vectype))
642 		{
643 		  if (dump_enabled_p ())
644 		    {
645 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: different sized mask "
647 				       "types in statement, ");
648 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
649 					 mask_type);
650 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
651 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 					 vectype);
653 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
654 		    }
655 		  return false;
656 		}
657 	      else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
658 		       != VECTOR_BOOLEAN_TYPE_P (vectype))
659 		{
660 		  if (dump_enabled_p ())
661 		    {
662 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 				       "not vectorized: mixed mask and "
664 				       "nonmask vector types in statement, ");
665 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
666 					 mask_type);
667 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
668 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 					 vectype);
670 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
671 		    }
672 		  return false;
673 		}
674 	    }
675 
	  /* We may compare a boolean value loaded as a vector of integers.
	     Fix mask_type in such a case.  */
678 	  if (mask_type
679 	      && !VECTOR_BOOLEAN_TYPE_P (mask_type)
680 	      && gimple_code (stmt) == GIMPLE_ASSIGN
681 	      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
682 	    mask_type = build_same_sized_truth_vector_type (mask_type);
683 	}
684 
685       /* No mask_type should mean loop invariant predicate.
686 	 This is probably a subject for optimization in
687 	 if-conversion.  */
688       if (!mask_type)
689 	{
690 	  if (dump_enabled_p ())
691 	    {
692 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
693 			       "not vectorized: can't compute mask type "
694 			       "for statement, ");
695 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
696 				0);
697 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
698 	    }
699 	  return false;
700 	}
701 
702       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
703     }
704 
705   return true;
706 }
707 
708 
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
717 {
718   tree init_expr;
719   tree step_expr;
720   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
721   basic_block bb;
722 
723   /* When there is no evolution in this loop, the evolution function
724      is not "simple".  */
725   if (evolution_part == NULL_TREE)
726     return false;
727 
728   /* When the evolution is a polynomial of degree >= 2
729      the evolution function is not "simple".  */
730   if (tree_is_chrec (evolution_part))
731     return false;
732 
733   step_expr = evolution_part;
734   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
735 
736   if (dump_enabled_p ())
737     {
738       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
739       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
740       dump_printf (MSG_NOTE, ",  init: ");
741       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
742       dump_printf (MSG_NOTE, "\n");
743     }
744 
745   *init = init_expr;
746   *step = step_expr;
747 
748   if (TREE_CODE (step_expr) != INTEGER_CST
749       && (TREE_CODE (step_expr) != SSA_NAME
750 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
751 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
752 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
753 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
754 		  || !flag_associative_math)))
755       && (TREE_CODE (step_expr) != REAL_CST
756 	  || !flag_associative_math))
757     {
758       if (dump_enabled_p ())
759         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
760                          "step unknown.\n");
761       return false;
762     }
763 
764   return true;
765 }
766 
767 /* Function vect_analyze_scalar_cycles_1.
768 
769    Examine the cross iteration def-use cycles of scalar variables
770    in LOOP.  LOOP_VINFO represents the loop that is now being
771    considered for vectorization (can be LOOP, or an outer-loop
772    enclosing LOOP).  */
773 
774 static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
776 {
777   basic_block bb = loop->header;
778   tree init, step;
779   auto_vec<gimple *, 64> worklist;
780   gphi_iterator gsi;
781   bool double_reduc;
782 
783   if (dump_enabled_p ())
784     dump_printf_loc (MSG_NOTE, vect_location,
785                      "=== vect_analyze_scalar_cycles ===\n");
786 
787   /* First - identify all inductions.  Reduction detection assumes that all the
788      inductions have been identified, therefore, this order must not be
789      changed.  */
790   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
791     {
792       gphi *phi = gsi.phi ();
793       tree access_fn = NULL;
794       tree def = PHI_RESULT (phi);
795       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
796 
797       if (dump_enabled_p ())
798 	{
799 	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
800 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
801           dump_printf (MSG_NOTE, "\n");
802 	}
803 
804       /* Skip virtual phi's.  The data dependences that are associated with
805          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
806       if (virtual_operand_p (def))
807 	continue;
808 
809       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
810 
811       /* Analyze the evolution function.  */
812       access_fn = analyze_scalar_evolution (loop, def);
813       if (access_fn)
814 	{
815 	  STRIP_NOPS (access_fn);
816 	  if (dump_enabled_p ())
817 	    {
818 	      dump_printf_loc (MSG_NOTE, vect_location,
819                                "Access function of PHI: ");
820 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
821               dump_printf (MSG_NOTE, "\n");
822 	    }
823 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
824 	    = initial_condition_in_loop_num (access_fn, loop->num);
825 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
826 	    = evolution_part_in_loop_num (access_fn, loop->num);
827 	}
828 
829       if (!access_fn
830 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
831 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
832 	      && TREE_CODE (step) != INTEGER_CST))
833 	{
834 	  worklist.safe_push (phi);
835 	  continue;
836 	}
837 
838       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
839 		  != NULL_TREE);
840       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
841 
842       if (dump_enabled_p ())
843 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
844       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
845     }
846 
847 
848   /* Second - identify all reductions and nested cycles.  */
849   while (worklist.length () > 0)
850     {
851       gimple *phi = worklist.pop ();
852       tree def = PHI_RESULT (phi);
853       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
854       gimple *reduc_stmt;
855       bool nested_cycle;
856 
857       if (dump_enabled_p ())
858         {
859           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
861           dump_printf (MSG_NOTE, "\n");
862         }
863 
864       gcc_assert (!virtual_operand_p (def)
865 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
866 
867       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
868       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
869 						&double_reduc, false);
870       if (reduc_stmt)
871         {
872           if (double_reduc)
873             {
874               if (dump_enabled_p ())
875                 dump_printf_loc (MSG_NOTE, vect_location,
876 				 "Detected double reduction.\n");
877 
878               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
879               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
880                                                     vect_double_reduction_def;
881             }
882           else
883             {
884               if (nested_cycle)
885                 {
886                   if (dump_enabled_p ())
887                     dump_printf_loc (MSG_NOTE, vect_location,
888 				     "Detected vectorizable nested cycle.\n");
889 
890                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
891                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
892                                                              vect_nested_cycle;
893                 }
894               else
895                 {
896                   if (dump_enabled_p ())
897                     dump_printf_loc (MSG_NOTE, vect_location,
898 				     "Detected reduction.\n");
899 
900                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
901                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
902                                                            vect_reduction_def;
903                   /* Store the reduction cycles for possible vectorization in
904                      loop-aware SLP.  */
905                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
906                 }
907             }
908         }
909       else
910         if (dump_enabled_p ())
911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 			   "Unknown def-use cycle pattern.\n");
913     }
914 }
915 
916 
917 /* Function vect_analyze_scalar_cycles.
918 
919    Examine the cross iteration def-use cycles of scalar variables, by
920    analyzing the loop-header PHIs of scalar variables.  Classify each
921    cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
924    Examples for scalar cycles:
925 
926    Example1: reduction:
927 
928               loop1:
929               for (i=0; i<N; i++)
930                  sum += a[i];
931 
932    Example2: induction:
933 
934               loop2:
935               for (i=0; i<N; i++)
936                  a[i] = i;  */
937 
938 static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
940 {
941   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
942 
943   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
944 
945   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946      Reductions in such inner-loop therefore have different properties than
947      the reductions in the nest that gets vectorized:
948      1. When vectorized, they are executed in the same order as in the original
949         scalar loop, so we can't change the order of computation when
950         vectorizing them.
951      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952         current checks are too strict.  */
953 
954   if (loop->inner)
955     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
956 }
957 
958 /* Transfer group and reduction information from STMT to its pattern stmt.  */
959 
960 static void
vect_fixup_reduc_chain (gimple *stmt)
962 {
963   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964   gimple *stmtp;
965   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 	      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
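  /* Walk the chain, redirecting each element's group links from the
     original stmts to the corresponding pattern stmts.  */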
968   do
969     {
970       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973       if (stmt)
974 	GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 	  = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
976     }
977   while (stmt);
978   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
979 }
980 
981 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
982 
983 static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
985 {
986   gimple *first;
987   unsigned i;
988 
989   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
991       {
992 	gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 	while (next)
994 	  {
995 	    if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 	      break;
997 	    next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
998 	  }
	/* If not all stmts in the chain are patterns, try to handle
	   the chain without patterns.  */
1001 	if (! next)
1002 	  {
1003 	    vect_fixup_reduc_chain (first);
1004 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 	      = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006 	  }
1007       }
1008 }
1009 
1010 /* Function vect_get_loop_niters.
1011 
   Determine the number of iterations the loop executes and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.
1015 
1016    Return the loop exit condition.  */
1017 
1018 
1019 static gcond *
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
		      tree *number_of_iterationsm1)
1022 {
1023   tree niters;
1024 
1025   if (dump_enabled_p ())
1026     dump_printf_loc (MSG_NOTE, vect_location,
1027 		     "=== get_loop_niters ===\n");
1028 
1029   niters = number_of_latch_executions (loop);
1030   *number_of_iterationsm1 = niters;
1031 
1032   /* We want the number of loop header executions which is the number
1033      of latch executions plus one.
1034      ???  For UINT_MAX latch executions this number overflows to zero
1035      for loops like do { n++; } while (n != 0);  */
1036   if (niters && !chrec_contains_undetermined (niters))
1037     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1038 			  build_int_cst (TREE_TYPE (niters), 1));
1039   *number_of_iterations = niters;
1040 
1041   return get_loop_exit_condition (loop);
1042 }
1043 
1044 
1045 /* Function bb_in_loop_p
1046 
1047    Used as predicate for dfs order traversal of the loop bbs.  */
1048 
1049 static bool
bb_in_loop_p (const_basic_block bb, const void *data)
1051 {
1052   const struct loop *const loop = (const struct loop *)data;
1053   if (flow_bb_inside_loop_p (loop, bb))
1054     return true;
1055   return false;
1056 }
1057 
1058 
1059 /* Function new_loop_vec_info.
1060 
1061    Create and initialize a new loop_vec_info struct for LOOP, as well as
1062    stmt_vec_info structs for all the stmts in LOOP.  */
1063 
1064 static loop_vec_info
new_loop_vec_info (struct loop *loop)
1066 {
1067   loop_vec_info res;
1068   basic_block *bbs;
1069   gimple_stmt_iterator si;
1070   unsigned int i, nbbs;
1071 
1072   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1073   res->kind = vec_info::loop;
1074   LOOP_VINFO_LOOP (res) = loop;
1075 
1076   bbs = get_loop_body (loop);
1077 
1078   /* Create/Update stmt_info for all stmts in the loop.  */
1079   for (i = 0; i < loop->num_nodes; i++)
1080     {
1081       basic_block bb = bbs[i];
1082 
1083       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1084 	{
1085 	  gimple *phi = gsi_stmt (si);
1086 	  gimple_set_uid (phi, 0);
1087 	  set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1088 	}
1089 
1090       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1091 	{
1092 	  gimple *stmt = gsi_stmt (si);
1093 	  gimple_set_uid (stmt, 0);
1094 	  set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1095 	}
1096     }
1097 
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as a reversed postorder traversal, so we are safe.  */
1102 
1103    free (bbs);
1104    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1105    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1106                               bbs, loop->num_nodes, loop);
1107    gcc_assert (nbbs == loop->num_nodes);
1108 
1109   LOOP_VINFO_BBS (res) = bbs;
1110   LOOP_VINFO_NITERSM1 (res) = NULL;
1111   LOOP_VINFO_NITERS (res) = NULL;
1112   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1113   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1114   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1115   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1116   LOOP_VINFO_VECT_FACTOR (res) = 0;
1117   LOOP_VINFO_LOOP_NEST (res) = vNULL;
1118   LOOP_VINFO_DATAREFS (res) = vNULL;
1119   LOOP_VINFO_DDRS (res) = vNULL;
1120   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1121   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1122   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1123   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1124   LOOP_VINFO_REDUCTIONS (res) = vNULL;
1125   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1126   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1127   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1128   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1129   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1130   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1131   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1132 
1133   return res;
1134 }
1135 
1136 
1137 /* Function destroy_loop_vec_info.
1138 
1139    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1140    stmts in the loop.  */
1141 
1142 void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1144 {
1145   struct loop *loop;
1146   basic_block *bbs;
1147   int nbbs;
1148   gimple_stmt_iterator si;
1149   int j;
1150   vec<slp_instance> slp_instances;
1151   slp_instance instance;
1152   bool swapped;
1153 
1154   if (!loop_vinfo)
1155     return;
1156 
1157   loop = LOOP_VINFO_LOOP (loop_vinfo);
1158 
1159   bbs = LOOP_VINFO_BBS (loop_vinfo);
1160   nbbs = clean_stmts ? loop->num_nodes : 0;
1161   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1162 
1163   for (j = 0; j < nbbs; j++)
1164     {
1165       basic_block bb = bbs[j];
1166       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1167         free_stmt_vec_info (gsi_stmt (si));
1168 
1169       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1170         {
1171 	  gimple *stmt = gsi_stmt (si);
1172 
1173 	  /* We may have broken canonical form by moving a constant
1174 	     into RHS1 of a commutative op.  Fix such occurrences.  */
1175 	  if (swapped && is_gimple_assign (stmt))
1176 	    {
1177 	      enum tree_code code = gimple_assign_rhs_code (stmt);
1178 
1179 	      if ((code == PLUS_EXPR
1180 		   || code == POINTER_PLUS_EXPR
1181 		   || code == MULT_EXPR)
1182 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1183 		swap_ssa_operands (stmt,
1184 				   gimple_assign_rhs1_ptr (stmt),
1185 				   gimple_assign_rhs2_ptr (stmt));
1186 	    }
1187 
1188 	  /* Free stmt_vec_info.  */
1189 	  free_stmt_vec_info (stmt);
1190           gsi_next (&si);
1191         }
1192     }
1193 
1194   free (LOOP_VINFO_BBS (loop_vinfo));
1195   vect_destroy_datarefs (loop_vinfo);
1196   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1197   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1198   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1199   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1200   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1201   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1202   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1203     vect_free_slp_instance (instance);
1204 
1205   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1206   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1207   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1208   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1209 
1210   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1211   loop_vinfo->scalar_cost_vec.release ();
1212 
1213   free (loop_vinfo);
1214   loop->aux = NULL;
1215 }
1216 
1217 
1218 /* Calculate the cost of one scalar iteration of the loop.  */
1219 static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1221 {
1222   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1223   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1224   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1225   int innerloop_iters, i;
1226 
1227   /* Count statements in scalar loop.  Using this as scalar cost for a single
1228      iteration for now.
1229 
1230      TODO: Add outer loop support.
1231 
1232      TODO: Consider assigning different costs to different scalar
1233      statements.  */
1234 
1235   /* FORNOW.  */
1236   innerloop_iters = 1;
1237   if (loop->inner)
1238     innerloop_iters = 50; /* FIXME */
1239 
1240   for (i = 0; i < nbbs; i++)
1241     {
1242       gimple_stmt_iterator si;
1243       basic_block bb = bbs[i];
1244 
1245       if (bb->loop_father == loop->inner)
1246         factor = innerloop_iters;
1247       else
1248         factor = 1;
1249 
1250       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1251         {
1252 	  gimple *stmt = gsi_stmt (si);
1253           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1254 
1255           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1256             continue;
1257 
1258           /* Skip stmts that are not vectorized inside the loop.  */
1259           if (stmt_info
1260               && !STMT_VINFO_RELEVANT_P (stmt_info)
1261               && (!STMT_VINFO_LIVE_P (stmt_info)
1262                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1263 	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1264             continue;
1265 
1266 	  vect_cost_for_stmt kind;
1267           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1268             {
1269               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1270                kind = scalar_load;
1271              else
1272                kind = scalar_store;
1273             }
1274           else
1275             kind = scalar_stmt;
1276 
1277 	  scalar_single_iter_cost
1278 	    += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1279 				 factor, kind, NULL, 0, vect_prologue);
1280         }
1281     }
1282   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1283     = scalar_single_iter_cost;
1284 }
1285 
1286 
1287 /* Function vect_analyze_loop_form_1.
1288 
1289    Verify that certain CFG restrictions hold, including:
1290    - the loop has a pre-header
1291    - the loop has a single entry and exit
1292    - the loop exit condition is simple enough, and the number of iterations
1293      can be analyzed (a countable loop).  */
1294 
1295 bool
vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
			  tree *number_of_iterationsm1,
			  tree *number_of_iterations, gcond **inner_loop_cond)
1299 {
1300   if (dump_enabled_p ())
1301     dump_printf_loc (MSG_NOTE, vect_location,
1302 		     "=== vect_analyze_loop_form ===\n");
1303 
1304   /* Different restrictions apply when we are considering an inner-most loop,
1305      vs. an outer (nested) loop.
1306      (FORNOW. May want to relax some of these restrictions in the future).  */
1307 
1308   if (!loop->inner)
1309     {
1310       /* Inner-most loop.  We currently require that the number of BBs is
1311 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1312 	 look like this:
1313 
1314                         (pre-header)
1315                            |
1316                           header <--------+
1317                            | |            |
1318                            | +--> latch --+
1319                            |
1320                         (exit-bb)  */
1321 
1322       if (loop->num_nodes != 2)
1323         {
1324           if (dump_enabled_p ())
1325             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 			     "not vectorized: control flow in loop.\n");
1327           return false;
1328         }
1329 
1330       if (empty_block_p (loop->header))
1331 	{
1332 	  if (dump_enabled_p ())
1333 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1334 			     "not vectorized: empty loop.\n");
1335 	  return false;
1336 	}
1337     }
1338   else
1339     {
1340       struct loop *innerloop = loop->inner;
1341       edge entryedge;
1342 
1343       /* Nested loop. We currently require that the loop is doubly-nested,
1344 	 contains a single inner loop, and the number of BBs is exactly 5.
1345 	 Vectorizable outer-loops look like this:
1346 
1347 			(pre-header)
1348 			   |
1349 			  header <---+
1350 			   |         |
1351 		          inner-loop |
1352 			   |         |
1353 			  tail ------+
1354 			   |
1355 		        (exit-bb)
1356 
1357 	 The inner-loop has the properties expected of inner-most loops
1358 	 as described above.  */
1359 
1360       if ((loop->inner)->inner || (loop->inner)->next)
1361 	{
1362 	  if (dump_enabled_p ())
1363 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1364 			     "not vectorized: multiple nested loops.\n");
1365 	  return false;
1366 	}
1367 
1368       if (loop->num_nodes != 5)
1369         {
1370 	  if (dump_enabled_p ())
1371 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372 			     "not vectorized: control flow in loop.\n");
1373 	  return false;
1374         }
1375 
1376       entryedge = loop_preheader_edge (innerloop);
1377       if (entryedge->src != loop->header
1378 	  || !single_exit (innerloop)
1379 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1380 	{
1381 	  if (dump_enabled_p ())
1382 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383 			     "not vectorized: unsupported outerloop form.\n");
1384 	  return false;
1385 	}
1386 
1387       /* Analyze the inner-loop.  */
1388       tree inner_niterm1, inner_niter;
1389       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1390 				      &inner_niterm1, &inner_niter, NULL))
1391 	{
1392 	  if (dump_enabled_p ())
1393             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1394 			     "not vectorized: Bad inner loop.\n");
1395 	  return false;
1396 	}
1397 
1398       if (!expr_invariant_in_loop_p (loop, inner_niter))
1399 	{
1400 	  if (dump_enabled_p ())
1401 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402 			     "not vectorized: inner-loop count not"
1403                              " invariant.\n");
1404 	  return false;
1405 	}
1406 
1407       if (dump_enabled_p ())
1408         dump_printf_loc (MSG_NOTE, vect_location,
1409 			 "Considering outer-loop vectorization.\n");
1410     }
1411 
1412   if (!single_exit (loop)
1413       || EDGE_COUNT (loop->header->preds) != 2)
1414     {
1415       if (dump_enabled_p ())
1416         {
1417           if (!single_exit (loop))
1418 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 			     "not vectorized: multiple exits.\n");
1420           else if (EDGE_COUNT (loop->header->preds) != 2)
1421 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1422 			     "not vectorized: too many incoming edges.\n");
1423         }
1424       return false;
1425     }
1426 
  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
1431   if (!empty_block_p (loop->latch)
1432       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1433     {
1434       if (dump_enabled_p ())
1435 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 			 "not vectorized: latch block not empty.\n");
1437       return false;
1438     }
1439 
1440   /* Make sure there exists a single-predecessor exit bb:  */
1441   if (!single_pred_p (single_exit (loop)->dest))
1442     {
1443       edge e = single_exit (loop);
1444       if (!(e->flags & EDGE_ABNORMAL))
1445 	{
1446 	  split_loop_exit_edge (e);
1447 	  if (dump_enabled_p ())
1448 	    dump_printf (MSG_NOTE, "split exit edge.\n");
1449 	}
1450       else
1451 	{
1452 	  if (dump_enabled_p ())
1453 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1454 			     "not vectorized: abnormal loop exit edge.\n");
1455 	  return false;
1456 	}
1457     }
1458 
1459   *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1460 				     number_of_iterationsm1);
1461   if (!*loop_cond)
1462     {
1463       if (dump_enabled_p ())
1464 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 			 "not vectorized: complicated exit condition.\n");
1466       return false;
1467     }
1468 
1469   if (!*number_of_iterations
1470       || chrec_contains_undetermined (*number_of_iterations))
1471     {
1472       if (dump_enabled_p ())
1473 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 			 "not vectorized: number of iterations cannot be "
1475 			 "computed.\n");
1476       return false;
1477     }
1478 
1479   if (integer_zerop (*number_of_iterations))
1480     {
1481       if (dump_enabled_p ())
1482 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1483 			 "not vectorized: number of iterations = 0.\n");
1484       return false;
1485     }
1486 
1487   return true;
1488 }
1489 
1490 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1491 
1492 loop_vec_info
vect_analyze_loop_form (struct loop *loop)
1494 {
1495   tree number_of_iterations, number_of_iterationsm1;
1496   gcond *loop_cond, *inner_loop_cond = NULL;
1497 
1498   if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1499 				  &number_of_iterations, &inner_loop_cond))
1500     return NULL;
1501 
1502   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1503   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1504   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1505   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1506 
1507   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1508     {
1509       if (dump_enabled_p ())
1510         {
1511           dump_printf_loc (MSG_NOTE, vect_location,
1512 			   "Symbolic number of iterations is ");
1513 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1514           dump_printf (MSG_NOTE, "\n");
1515         }
1516     }
1517 
1518   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1519   if (inner_loop_cond)
1520     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1521       = loop_exit_ctrl_vec_info_type;
1522 
1523   gcc_assert (!loop->aux);
1524   loop->aux = loop_vinfo;
1525   return loop_vinfo;
1526 }
1527 
1528 
1529 
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */
1532 
1533 static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1535 {
1536   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1537   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1538   int nbbs = loop->num_nodes;
1539   unsigned int vectorization_factor;
1540   int i;
1541 
1542   if (dump_enabled_p ())
1543     dump_printf_loc (MSG_NOTE, vect_location,
1544 		     "=== vect_update_vf_for_slp ===\n");
1545 
1546   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1547   gcc_assert (vectorization_factor != 0);
1548 
1549   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1550      the vectorization factor of the loop is the unrolling factor required
1551      by the SLP instances.  If that unrolling factor is 1, we say that we
1552      perform pure SLP on the loop - cross-iteration parallelism is not
1553      exploited.  */
1554   bool only_slp_in_loop = true;
1555   for (i = 0; i < nbbs; i++)
1556     {
1557       basic_block bb = bbs[i];
1558       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1559 	   gsi_next (&si))
1560 	{
1561 	  gimple *stmt = gsi_stmt (si);
1562 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
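	  /* If this statement was replaced by a pattern statement, analyze
	     the pattern statement instead.  */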
1563 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1564 	      && STMT_VINFO_RELATED_STMT (stmt_info))
1565 	    {
1566 	      stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1567 	      stmt_info = vinfo_for_stmt (stmt);
1568 	    }
1569 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1570 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1571 	      && !PURE_SLP_STMT (stmt_info))
1572 	    /* STMT needs both SLP and loop-based vectorization.  */
1573 	    only_slp_in_loop = false;
1574 	}
1575     }
1576 
1577   if (only_slp_in_loop)
1578     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1579   else
1580     vectorization_factor
1581       = least_common_multiple (vectorization_factor,
1582 			       LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1583 
1584   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1585   if (dump_enabled_p ())
1586     dump_printf_loc (MSG_NOTE, vect_location,
1587 		     "Updating vectorization factor to %d\n",
1588 		     vectorization_factor);
1589 }
1590 
1591 /* Function vect_analyze_loop_operations.
1592 
1593    Scan the loop stmts and make sure they are all vectorizable.  */
1594 
1595 static bool
1596 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1597 {
1598   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1599   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1600   int nbbs = loop->num_nodes;
1601   int i;
1602   stmt_vec_info stmt_info;
1603   bool need_to_vectorize = false;
1604   bool ok;
1605 
1606   if (dump_enabled_p ())
1607     dump_printf_loc (MSG_NOTE, vect_location,
1608 		     "=== vect_analyze_loop_operations ===\n");
1609 
1610   for (i = 0; i < nbbs; i++)
1611     {
1612       basic_block bb = bbs[i];
1613 
1614       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1615 	   gsi_next (&si))
1616         {
1617           gphi *phi = si.phi ();
1618           ok = true;
1619 
1620           stmt_info = vinfo_for_stmt (phi);
1621           if (dump_enabled_p ())
1622             {
1623               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1624               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1625               dump_printf (MSG_NOTE, "\n");
1626             }
1627 	  if (virtual_operand_p (gimple_phi_result (phi)))
1628 	    continue;
1629 
1630           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1631              (i.e., a phi in the tail of the outer-loop).  */
1632           if (! is_loop_header_bb_p (bb))
1633             {
1634               /* FORNOW: we currently don't support the case that these phis
1635                  are not used in the outer loop (unless it is a double reduction,
1636                  i.e., this phi is a vect_double_reduction_def), because this
1637                  case would require us to actually do something here.  */
1638               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1639                    || STMT_VINFO_LIVE_P (stmt_info))
1640                   && STMT_VINFO_DEF_TYPE (stmt_info)
1641                      != vect_double_reduction_def)
1642                 {
1643                   if (dump_enabled_p ())
1644 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 				     "Unsupported loop-closed phi in "
1646 				     "outer-loop.\n");
1647                   return false;
1648                 }
1649 
1650               /* If PHI is used in the outer loop, we check that its operand
1651                  is defined in the inner loop.  */
1652               if (STMT_VINFO_RELEVANT_P (stmt_info))
1653                 {
1654                   tree phi_op;
1655 		  gimple *op_def_stmt;
1656 
1657                   if (gimple_phi_num_args (phi) != 1)
1658                     return false;
1659 
1660                   phi_op = PHI_ARG_DEF (phi, 0);
1661                   if (TREE_CODE (phi_op) != SSA_NAME)
1662                     return false;
1663 
1664                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1665 		  if (gimple_nop_p (op_def_stmt)
1666 		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1667 		      || !vinfo_for_stmt (op_def_stmt))
1668                     return false;
1669 
1670                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1671                         != vect_used_in_outer
1672                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1673                            != vect_used_in_outer_by_reduction)
1674                     return false;
1675                 }
1676 
1677               continue;
1678             }
1679 
1680           gcc_assert (stmt_info);
1681 
1682           if (STMT_VINFO_LIVE_P (stmt_info))
1683             {
1684               /* FORNOW: not yet supported.  */
1685               if (dump_enabled_p ())
1686 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687 				 "not vectorized: value used after loop.\n");
1688               return false;
1689             }
1690 
1691           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1692               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1693             {
1694               /* A scalar-dependence cycle that we don't support.  */
1695               if (dump_enabled_p ())
1696 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 				 "not vectorized: scalar dependence cycle.\n");
1698               return false;
1699             }
1700 
1701           if (STMT_VINFO_RELEVANT_P (stmt_info))
1702             {
1703               need_to_vectorize = true;
1704               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1705                 ok = vectorizable_induction (phi, NULL, NULL);
1706             }
1707 
1708           if (!ok)
1709             {
1710               if (dump_enabled_p ())
1711                 {
1712 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1713 				   "not vectorized: relevant phi not "
1714 				   "supported: ");
1715                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1716                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1717                 }
1718 	      return false;
1719             }
1720         }
1721 
1722       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1723 	   gsi_next (&si))
1724         {
1725 	  gimple *stmt = gsi_stmt (si);
1726 	  if (!gimple_clobber_p (stmt)
1727 	      && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1728 	    return false;
1729         }
1730     } /* bbs */
1731 
1732   /* All operations in the loop are either irrelevant (deal with loop
1733      control, or dead), or only used outside the loop and can be moved
1734      out of the loop (e.g. invariants, inductions).  The loop can be
1735      optimized away by scalar optimizations.  We're better off not
1736      touching this loop.  */
1737   if (!need_to_vectorize)
1738     {
1739       if (dump_enabled_p ())
1740         dump_printf_loc (MSG_NOTE, vect_location,
1741 			 "All the computation can be taken out of the loop.\n");
1742       if (dump_enabled_p ())
1743 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 			 "not vectorized: redundant loop. no profit to "
1745 			 "vectorize.\n");
1746       return false;
1747     }
1748 
1749   return true;
1750 }
1751 
1752 
1753 /* Function vect_analyze_loop_2.
1754 
1755    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1756    for it.  The different analyses will record information in the
1757    loop_vec_info struct.  */
1758 static bool
1759 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1760 {
1761   bool ok;
1762   int max_vf = MAX_VECTORIZATION_FACTOR;
1763   int min_vf = 2;
1764   unsigned int n_stmts = 0;
1765 
1766   /* The first group of checks is independent of the vector size.  */
1767   fatal = true;
1768 
1769   /* Find all data references in the loop (which correspond to vdefs/vuses)
1770      and analyze their evolution in the loop.  */
1771 
1772   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1773 
1774   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1775   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1776     {
1777       if (dump_enabled_p ())
1778 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1779 			 "not vectorized: loop nest containing two "
1780 			 "or more consecutive inner loops cannot be "
1781 			 "vectorized\n");
1782       return false;
1783     }
1784 
1785   for (unsigned i = 0; i < loop->num_nodes; i++)
1786     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1787 	 !gsi_end_p (gsi); gsi_next (&gsi))
1788       {
1789 	gimple *stmt = gsi_stmt (gsi);
1790 	if (is_gimple_debug (stmt))
1791 	  continue;
1792 	++n_stmts;
1793 	if (!find_data_references_in_stmt (loop, stmt,
1794 					   &LOOP_VINFO_DATAREFS (loop_vinfo)))
1795 	  {
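	    /* A call in a loop with a safelen (e.g. from #pragma omp declare
	       simd) may still be vectorizable via its SIMD clones even though
	       its data references cannot be analyzed; detect that case below
	       and skip the call instead of failing.  */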
1796 	    if (is_gimple_call (stmt) && loop->safelen)
1797 	      {
1798 		tree fndecl = gimple_call_fndecl (stmt), op;
1799 		if (fndecl != NULL_TREE)
1800 		  {
1801 		    cgraph_node *node = cgraph_node::get (fndecl);
1802 		    if (node != NULL && node->simd_clones != NULL)
1803 		      {
1804 			unsigned int j, n = gimple_call_num_args (stmt);
1805 			for (j = 0; j < n; j++)
1806 			  {
1807 			    op = gimple_call_arg (stmt, j);
1808 			    if (DECL_P (op)
1809 				|| (REFERENCE_CLASS_P (op)
1810 				    && get_base_address (op)))
1811 			      break;
1812 			  }
1813 			op = gimple_call_lhs (stmt);
1814 			/* Ignore #pragma omp declare simd functions
1815 			   if they don't have data references in the
1816 			   call stmt itself.  */
1817 			if (j == n
1818 			    && !(op
1819 				 && (DECL_P (op)
1820 				     || (REFERENCE_CLASS_P (op)
1821 					 && get_base_address (op)))))
1822 			  continue;
1823 		      }
1824 		  }
1825 	      }
1826 	    if (dump_enabled_p ())
1827 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 			       "not vectorized: loop contains function "
1829 			       "calls or data references that cannot "
1830 			       "be analyzed\n");
1831 	    return false;
1832 	  }
1833       }
1834 
1835   /* Analyze the data references and also adjust the minimal
1836      vectorization factor according to the loads and stores.  */
1837 
1838   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1839   if (!ok)
1840     {
1841       if (dump_enabled_p ())
1842 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 			 "bad data references.\n");
1844       return false;
1845     }
1846 
1847   /* Classify all cross-iteration scalar data-flow cycles.
1848      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1849   vect_analyze_scalar_cycles (loop_vinfo);
1850 
1851   vect_pattern_recog (loop_vinfo);
1852 
1853   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1854 
1855   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1856      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1857 
1858   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1859   if (!ok)
1860     {
1861       if (dump_enabled_p ())
1862 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 			 "bad data access.\n");
1864       return false;
1865     }
1866 
1867   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1868 
1869   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1870   if (!ok)
1871     {
1872       if (dump_enabled_p ())
1873 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 			 "unexpected pattern.\n");
1875       return false;
1876     }
1877 
1878   /* The rest of the analysis below depends on the vector size, so failures are no longer fatal.  */
1879   fatal = false;
1880 
1881   /* Analyze data dependences between the data-refs in the loop
1882      and adjust the maximum vectorization factor according to
1883      the dependences.
1884      FORNOW: fail at the first data dependence that we encounter.  */
1885 
1886   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1887   if (!ok
1888       || max_vf < min_vf)
1889     {
1890       if (dump_enabled_p ())
1891 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 			     "bad data dependence.\n");
1893       return false;
1894     }
1895 
1896   ok = vect_determine_vectorization_factor (loop_vinfo);
1897   if (!ok)
1898     {
1899       if (dump_enabled_p ())
1900 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901 			 "can't determine vectorization factor.\n");
1902       return false;
1903     }
1904   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1905     {
1906       if (dump_enabled_p ())
1907 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 			 "bad data dependence.\n");
1909       return false;
1910     }
1911 
1912   /* Compute the scalar iteration cost.  */
1913   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1914 
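  /* Remember the vectorization factor computed so far; it is restored if we
     abandon SLP below and redo the analysis.  */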
1915   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1916   HOST_WIDE_INT estimated_niter;
1917   unsigned th;
1918   int min_scalar_loop_bound;
1919 
1920   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1921   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1922   if (!ok)
1923     return false;
1924 
1925   /* If there are any SLP instances mark them as pure_slp.  */
1926   bool slp = vect_make_slp_decision (loop_vinfo);
1927   if (slp)
1928     {
1929       /* Find stmts that need to be both vectorized and SLPed.  */
1930       vect_detect_hybrid_slp (loop_vinfo);
1931 
1932       /* Update the vectorization factor based on the SLP decision.  */
1933       vect_update_vf_for_slp (loop_vinfo);
1934     }
1935 
1936   /* This is the point where we can re-start analysis with SLP forced off.  */
1937 start_over:
1938 
1939   /* Now the vectorization factor is final.  */
1940   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1941   gcc_assert (vectorization_factor != 0);
1942 
1943   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1944     dump_printf_loc (MSG_NOTE, vect_location,
1945 		     "vectorization_factor = %d, niters = "
1946 		     HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1947 		     LOOP_VINFO_INT_NITERS (loop_vinfo));
1948 
1949   HOST_WIDE_INT max_niter
1950     = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1951   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1952        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1953       || (max_niter != -1
1954 	  && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1955     {
1956       if (dump_enabled_p ())
1957 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958 			 "not vectorized: iteration count smaller than "
1959 			 "vectorization factor.\n");
1960       return false;
1961     }
1962 
1963   /* Analyze the alignment of the data-refs in the loop.
1964      Fail if a data reference is found that cannot be vectorized.  */
1965 
1966   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1967   if (!ok)
1968     {
1969       if (dump_enabled_p ())
1970 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 			 "bad data alignment.\n");
1972       return false;
1973     }
1974 
1975   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976      It is important to call pruning after vect_analyze_data_ref_accesses,
1977      since we use grouping information gathered by interleaving analysis.  */
1978   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1979   if (!ok)
1980     {
1981       if (dump_enabled_p ())
1982 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1983 			 "number of versioning for alias "
1984 			 "run-time tests exceeds %d "
1985 			 "(--param vect-max-version-for-alias-checks)\n",
1986 			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1987       return false;
1988     }
1989 
1990   /* This pass will decide on using loop versioning and/or loop peeling in
1991      order to enhance the alignment of data references in the loop.  */
1992   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1993   if (!ok)
1994     {
1995       if (dump_enabled_p ())
1996 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1997 			 "bad data alignment.\n");
1998       return false;
1999     }
2000 
2001   if (slp)
2002     {
2003       /* Analyze operations in the SLP instances.  Note this may
2004 	 remove unsupported SLP instances which makes the above
2005 	 SLP kind detection invalid.  */
2006       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2007       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2008 				   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2009       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2010 	goto again;
2011     }
2012 
2013   /* Scan all the remaining operations in the loop that are not subject
2014      to SLP and make sure they are vectorizable.  */
2015   ok = vect_analyze_loop_operations (loop_vinfo);
2016   if (!ok)
2017     {
2018       if (dump_enabled_p ())
2019 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 			 "bad operation or unsupported loop bound.\n");
2021       return false;
2022     }
2023 
2024   /* Analyze cost.  Decide if it is worthwhile to vectorize.  */
2025   int min_profitable_estimate, min_profitable_iters;
2026   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2027 				      &min_profitable_estimate);
2028 
2029   if (min_profitable_iters < 0)
2030     {
2031       if (dump_enabled_p ())
2032 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 			 "not vectorized: vectorization not profitable.\n");
2034       if (dump_enabled_p ())
2035 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 			 "not vectorized: vector version will never be "
2037 			 "profitable.\n");
2038       goto again;
2039     }
2040 
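  /* The min-vect-loop-bound parameter is expressed in vector iterations;
     scale it by the vectorization factor to get a scalar iteration bound.  */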
2041   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2042 			    * vectorization_factor) - 1);
2043 
2044   /* Use the cost model only if it is more conservative than the
2045      user-specified threshold.  */
2046   th = (unsigned) min_scalar_loop_bound;
2047   if (min_profitable_iters
2048       && (!min_scalar_loop_bound
2049           || min_profitable_iters > min_scalar_loop_bound))
2050     th = (unsigned) min_profitable_iters;
2051 
2052   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2053 
2054   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2055       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2056     {
2057       if (dump_enabled_p ())
2058 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 			 "not vectorized: vectorization not profitable.\n");
2060       if (dump_enabled_p ())
2061         dump_printf_loc (MSG_NOTE, vect_location,
2062 			 "not vectorized: iteration count smaller than user "
2063 			 "specified loop bound parameter or minimum profitable "
2064 			 "iterations (whichever is more conservative).\n");
2065       goto again;
2066     }
2067 
2068   estimated_niter
2069     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2070   if (estimated_niter != -1
2071       && ((unsigned HOST_WIDE_INT) estimated_niter
2072           <= MAX (th, (unsigned)min_profitable_estimate)))
2073     {
2074       if (dump_enabled_p ())
2075 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 			 "not vectorized: estimated iteration count too "
2077                          "small.\n");
2078       if (dump_enabled_p ())
2079         dump_printf_loc (MSG_NOTE, vect_location,
2080 			 "not vectorized: estimated iteration count smaller "
2081                          "than specified loop bound parameter or minimum "
2082                          "profitable iterations (whichever is more "
2083                          "conservative).\n");
2084       goto again;
2085     }
2086 
2087   /* Decide whether we need to create an epilogue loop to handle
2088      remaining scalar iterations.  */
2089   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2090         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2091        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2092 
2093   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2094       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2095     {
2096       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2097 		   - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2098 	  < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2099 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2100     }
2101   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2102 	   || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2103 	       < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2104                /* In case of versioning, check if the maximum number of
2105                   iterations is greater than th.  If they are identical,
2106                   the epilogue is unnecessary.  */
2107 	       && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2108 	            && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2109                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2110     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2111 
2112   /* If an epilogue loop is required make sure we can create one.  */
2113   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2114       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2115     {
2116       if (dump_enabled_p ())
2117         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2118       if (!vect_can_advance_ivs_p (loop_vinfo)
2119 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2120 					   single_exit (LOOP_VINFO_LOOP
2121 							 (loop_vinfo))))
2122         {
2123           if (dump_enabled_p ())
2124 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125 			     "not vectorized: can't create required "
2126 			     "epilog loop\n");
2127           goto again;
2128         }
2129     }
2130 
2131   gcc_assert (vectorization_factor
2132 	      == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2133 
2134   /* Ok to vectorize!  */
2135   return true;
2136 
2137 again:
2138   /* Try again with SLP forced off but if we didn't do any SLP there is
2139      no point in re-trying.  */
2140   if (!slp)
2141     return false;
2142 
2143   /* If there are reduction chains re-trying will fail anyway.  */
2144   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2145     return false;
2146 
2147   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2148      via interleaving or lane instructions.  */
2149   slp_instance instance;
2150   slp_tree node;
2151   unsigned i, j;
2152   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2153     {
2154       stmt_vec_info vinfo;
2155       vinfo = vinfo_for_stmt
2156 	  (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2157       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2158 	continue;
2159       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2160       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2161       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2162       if (! vect_store_lanes_supported (vectype, size)
2163 	  && ! vect_grouped_store_supported (vectype, size))
2164 	return false;
2165       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2166 	{
2167 	  vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2168 	  vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2169 	  size = STMT_VINFO_GROUP_SIZE (vinfo);
2170 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2171 	  if (! vect_load_lanes_supported (vectype, size)
2172 	      && ! vect_grouped_load_supported (vectype, size))
2173 	    return false;
2174 	}
2175     }
2176 
2177   if (dump_enabled_p ())
2178     dump_printf_loc (MSG_NOTE, vect_location,
2179 		     "re-trying with SLP disabled\n");
2180 
2181   /* Roll back state appropriately.  No SLP this time.  */
2182   slp = false;
2183   /* Restore the vectorization factor to what it was without SLP.  */
2184   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2185   /* Free the SLP instances.  */
2186   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2187     vect_free_slp_instance (instance);
2188   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2189   /* Reset SLP type to loop_vect on all stmts.  */
2190   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2191     {
2192       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2193       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2194 	   !gsi_end_p (si); gsi_next (&si))
2195 	{
2196 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2197 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2198 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2199 	    {
2200 	      stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2201 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2202 	      for (gimple_stmt_iterator pi
2203 		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2204 		   !gsi_end_p (pi); gsi_next (&pi))
2205 		{
2206 		  gimple *pstmt = gsi_stmt (pi);
2207 		  STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2208 		}
2209 	    }
2210 	}
2211     }
2212   /* Free optimized alias test DDRS.  */
2213   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2214   /* Reset target cost data.  */
2215   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2216   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2217     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2218   /* Reset assorted flags.  */
2219   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2220   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2221   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2222 
2223   goto start_over;
2224 }
2225 
2226 /* Function vect_analyze_loop.
2227 
2228    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2229    for it.  The different analyses will record information in the
2230    loop_vec_info struct.  */
2231 loop_vec_info
2232 vect_analyze_loop (struct loop *loop)
2233 {
2234   loop_vec_info loop_vinfo;
2235   unsigned int vector_sizes;
2236 
2237   /* Autodetect first vector size we try.  */
2238   current_vector_size = 0;
2239   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
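  /* VECTOR_SIZES is a bitmask of the vector sizes (in bytes) that the target
     wants the vectorizer to try.  */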
2240 
2241   if (dump_enabled_p ())
2242     dump_printf_loc (MSG_NOTE, vect_location,
2243 		     "===== analyze_loop_nest =====\n");
2244 
2245   if (loop_outer (loop)
2246       && loop_vec_info_for_loop (loop_outer (loop))
2247       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2248     {
2249       if (dump_enabled_p ())
2250 	dump_printf_loc (MSG_NOTE, vect_location,
2251 			 "outer-loop already vectorized.\n");
2252       return NULL;
2253     }
2254 
2255   while (1)
2256     {
2257       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2258       loop_vinfo = vect_analyze_loop_form (loop);
2259       if (!loop_vinfo)
2260 	{
2261 	  if (dump_enabled_p ())
2262 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 			     "bad loop form.\n");
2264 	  return NULL;
2265 	}
2266 
2267       bool fatal = false;
2268       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2269 	{
2270 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2271 
2272 	  return loop_vinfo;
2273 	}
2274 
2275       destroy_loop_vec_info (loop_vinfo, true);
2276 
2277       vector_sizes &= ~current_vector_size;
2278       if (fatal
2279 	  || vector_sizes == 0
2280 	  || current_vector_size == 0)
2281 	return NULL;
2282 
2283       /* Try the next biggest vector size.  */
2284       current_vector_size = 1 << floor_log2 (vector_sizes);
2285       if (dump_enabled_p ())
2286 	dump_printf_loc (MSG_NOTE, vect_location,
2287 			 "***** Re-trying analysis with "
2288 			 "vector size %d\n", current_vector_size);
2289     }
2290 }
2291 
2292 
2293 /* Function reduction_code_for_scalar_code
2294 
2295    Input:
2296    CODE - tree_code of the reduction operation.
2297 
2298    Output:
2299    REDUC_CODE - the corresponding tree-code to be used to reduce the
2300       vector of partial results into a single scalar result, or ERROR_MARK
2301       if the operation is a supported reduction operation, but does not have
2302       such a tree-code.
2303 
2304    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2305 
2306 static bool
2307 reduction_code_for_scalar_code (enum tree_code code,
2308                                 enum tree_code *reduc_code)
2309 {
2310   switch (code)
2311     {
2312       case MAX_EXPR:
2313         *reduc_code = REDUC_MAX_EXPR;
2314         return true;
2315 
2316       case MIN_EXPR:
2317         *reduc_code = REDUC_MIN_EXPR;
2318         return true;
2319 
2320       case PLUS_EXPR:
2321         *reduc_code = REDUC_PLUS_EXPR;
2322         return true;
2323 
2324       case MULT_EXPR:
2325       case MINUS_EXPR:
2326       case BIT_IOR_EXPR:
2327       case BIT_XOR_EXPR:
2328       case BIT_AND_EXPR:
2329         *reduc_code = ERROR_MARK;
2330         return true;
2331 
2332       default:
2333        return false;
2334     }
2335 }
2336 
2337 
2338 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2339    STMT is printed with a message MSG. */
2340 
2341 static void
2342 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2343 {
2344   dump_printf_loc (msg_type, vect_location, "%s", msg);
2345   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2346   dump_printf (msg_type, "\n");
2347 }
2348 
2349 
2350 /* Detect SLP reduction of the form:
2351 
2352    #a1 = phi <a5, a0>
2353    a2 = operation (a1)
2354    a3 = operation (a2)
2355    a4 = operation (a3)
2356    a5 = operation (a4)
2357 
2358    #a = phi <a5>
2359 
2360    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2361    FIRST_STMT is the first reduction stmt in the chain
2362    (a2 = operation (a1)).
2363 
2364    Return TRUE if a reduction chain was detected.  */
2365 
2366 static bool
2367 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2368 		       gimple *first_stmt)
2369 {
2370   struct loop *loop = (gimple_bb (phi))->loop_father;
2371   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2372   enum tree_code code;
2373   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2374   stmt_vec_info use_stmt_info, current_stmt_info;
2375   tree lhs;
2376   imm_use_iterator imm_iter;
2377   use_operand_p use_p;
2378   int nloop_uses, size = 0, n_out_of_loop_uses;
2379   bool found = false;
2380 
2381   if (loop != vect_loop)
2382     return false;
2383 
2384   lhs = PHI_RESULT (phi);
2385   code = gimple_assign_rhs_code (first_stmt);
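  /* Follow the use-def chain from the PHI result through the single in-loop
     use of each statement until we arrive back at the reduction PHI.  */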
2386   while (1)
2387     {
2388       nloop_uses = 0;
2389       n_out_of_loop_uses = 0;
2390       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2391         {
2392 	  gimple *use_stmt = USE_STMT (use_p);
2393 	  if (is_gimple_debug (use_stmt))
2394 	    continue;
2395 
2396           /* Check if we got back to the reduction phi.  */
2397 	  if (use_stmt == phi)
2398             {
2399 	      loop_use_stmt = use_stmt;
2400               found = true;
2401               break;
2402             }
2403 
2404           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2405             {
2406 	      loop_use_stmt = use_stmt;
2407 	      nloop_uses++;
2408             }
2409            else
2410              n_out_of_loop_uses++;
2411 
2412            /* There can be either a single use in the loop or two uses in
2413               phi nodes.  */
2414            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2415              return false;
2416         }
2417 
2418       if (found)
2419         break;
2420 
2421       /* We reached a statement with no loop uses.  */
2422       if (nloop_uses == 0)
2423 	return false;
2424 
2425       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2426       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2427         return false;
2428 
2429       if (!is_gimple_assign (loop_use_stmt)
2430 	  || code != gimple_assign_rhs_code (loop_use_stmt)
2431 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2432         return false;
2433 
2434       /* Insert USE_STMT into reduction chain.  */
2435       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2436       if (current_stmt)
2437         {
2438           current_stmt_info = vinfo_for_stmt (current_stmt);
2439 	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2440           GROUP_FIRST_ELEMENT (use_stmt_info)
2441             = GROUP_FIRST_ELEMENT (current_stmt_info);
2442         }
2443       else
2444 	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2445 
2446       lhs = gimple_assign_lhs (loop_use_stmt);
2447       current_stmt = loop_use_stmt;
2448       size++;
2449    }
2450 
2451   if (!found || loop_use_stmt != phi || size < 2)
2452     return false;
2453 
2454   /* Swap the operands, if needed, to make the reduction operand be the second
2455      operand.  */
2456   lhs = PHI_RESULT (phi);
2457   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2458   while (next_stmt)
2459     {
2460       if (gimple_assign_rhs2 (next_stmt) == lhs)
2461 	{
2462 	  tree op = gimple_assign_rhs1 (next_stmt);
2463 	  gimple *def_stmt = NULL;
2464 
2465           if (TREE_CODE (op) == SSA_NAME)
2466             def_stmt = SSA_NAME_DEF_STMT (op);
2467 
2468 	  /* Check that the other def is either defined in the loop
2469 	     ("vect_internal_def"), or it's an induction (defined by a
2470 	     loop-header phi-node).  */
2471           if (def_stmt
2472               && gimple_bb (def_stmt)
2473 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2474               && (is_gimple_assign (def_stmt)
2475                   || is_gimple_call (def_stmt)
2476                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2477                            == vect_induction_def
2478                   || (gimple_code (def_stmt) == GIMPLE_PHI
2479                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2480                                   == vect_internal_def
2481                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2482 	    {
2483 	      lhs = gimple_assign_lhs (next_stmt);
2484 	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2485  	      continue;
2486 	    }
2487 
2488 	  return false;
2489 	}
2490       else
2491 	{
2492           tree op = gimple_assign_rhs2 (next_stmt);
2493 	  gimple *def_stmt = NULL;
2494 
2495           if (TREE_CODE (op) == SSA_NAME)
2496             def_stmt = SSA_NAME_DEF_STMT (op);
2497 
2498           /* Check that the other def is either defined in the loop
2499             ("vect_internal_def"), or it's an induction (defined by a
2500             loop-header phi-node).  */
2501           if (def_stmt
2502               && gimple_bb (def_stmt)
2503 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2504               && (is_gimple_assign (def_stmt)
2505                   || is_gimple_call (def_stmt)
2506                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2507                               == vect_induction_def
2508                   || (gimple_code (def_stmt) == GIMPLE_PHI
2509                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2510                                   == vect_internal_def
2511                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2512   	    {
2513 	      if (dump_enabled_p ())
2514 		{
2515 		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2516 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2517                   dump_printf (MSG_NOTE, "\n");
2518 		}
2519 
2520 	      swap_ssa_operands (next_stmt,
2521 	 		         gimple_assign_rhs1_ptr (next_stmt),
2522                                  gimple_assign_rhs2_ptr (next_stmt));
2523 	      update_stmt (next_stmt);
2524 
2525 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2526 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2527 	    }
2528 	  else
2529 	    return false;
2530         }
2531 
2532       lhs = gimple_assign_lhs (next_stmt);
2533       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2534     }
2535 
2536   /* Save the chain for further analysis in SLP detection.  */
2537   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2538   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2539   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2540 
2541   return true;
2542 }
2543 
2544 
2545 /* Function vect_is_simple_reduction
2546 
2547    (1) Detect a cross-iteration def-use cycle that represents a simple
2548    reduction computation.  We look for the following pattern:
2549 
2550    loop_header:
2551      a1 = phi < a0, a2 >
2552      a3 = ...
2553      a2 = operation (a3, a1)
2554 
2555    or
2556 
2557    a3 = ...
2558    loop_header:
2559      a1 = phi < a0, a2 >
2560      a2 = operation (a3, a1)
2561 
2562    such that:
2563    1. operation is commutative and associative and it is safe to
2564       change the order of the computation (if CHECK_REDUCTION is true)
2565    2. no uses for a2 in the loop (a2 is used out of the loop)
2566    3. no uses of a1 in the loop besides the reduction operation
2567    4. no uses of a1 outside the loop.
2568 
2569    Conditions 1,4 are tested here.
2570    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2571 
2572    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2573    nested cycles, if CHECK_REDUCTION is false.
2574 
2575    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2576    reductions:
2577 
2578      a1 = phi < a0, a2 >
2579      inner loop (def of a3)
2580      a2 = phi < a3 >
2581 
2582    (4) Detect condition expressions, i.e.:
2583      for (int i = 0; i < N; i++)
2584        if (a[i] < val)
2585 	ret_val = a[i];
2586 
2587 */
2588 
2589 static gimple *
2590 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2591 			  bool check_reduction, bool *double_reduc,
2592 			  bool need_wrapping_integral_overflow,
2593 			  enum vect_reduction_type *v_reduc_type)
2594 {
2595   struct loop *loop = (gimple_bb (phi))->loop_father;
2596   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2597   edge latch_e = loop_latch_edge (loop);
2598   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2599   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2600   enum tree_code orig_code, code;
2601   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2602   tree type;
2603   int nloop_uses;
2604   tree name;
2605   imm_use_iterator imm_iter;
2606   use_operand_p use_p;
2607   bool phi_def;
2608 
2609   *double_reduc = false;
2610   *v_reduc_type = TREE_CODE_REDUCTION;
2611 
2612   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2613      otherwise, we assume outer loop vectorization.  */
2614   gcc_assert ((check_reduction && loop == vect_loop)
2615               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2616 
2617   name = PHI_RESULT (phi);
2618   /* ???  If there are no uses of the PHI result the inner loop reduction
2619      won't be detected as possibly double-reduction by vectorizable_reduction
2620      because that tries to walk the PHI arg from the preheader edge which
2621      can be constant.  See PR60382.  */
2622   if (has_zero_uses (name))
2623     return NULL;
2624   nloop_uses = 0;
2625   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2626     {
2627       gimple *use_stmt = USE_STMT (use_p);
2628       if (is_gimple_debug (use_stmt))
2629 	continue;
2630 
2631       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2632         {
2633           if (dump_enabled_p ())
2634 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2635 			     "intermediate value used outside loop.\n");
2636 
2637           return NULL;
2638         }
2639 
2640       nloop_uses++;
2641       if (nloop_uses > 1)
2642         {
2643           if (dump_enabled_p ())
2644 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2645 			     "reduction used in loop.\n");
2646           return NULL;
2647         }
2648 
2649       phi_use_stmt = use_stmt;
2650     }
2651 
2652   if (TREE_CODE (loop_arg) != SSA_NAME)
2653     {
2654       if (dump_enabled_p ())
2655 	{
2656 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2657 			   "reduction: not ssa_name: ");
2658 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2659           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2660 	}
2661       return NULL;
2662     }
2663 
2664   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2665   if (!def_stmt)
2666     {
2667       if (dump_enabled_p ())
2668 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2669 			 "reduction: no def_stmt.\n");
2670       return NULL;
2671     }
2672 
2673   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2674     {
2675       if (dump_enabled_p ())
2676         {
2677           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2678           dump_printf (MSG_NOTE, "\n");
2679         }
2680       return NULL;
2681     }
2682 
2683   if (is_gimple_assign (def_stmt))
2684     {
2685       name = gimple_assign_lhs (def_stmt);
2686       phi_def = false;
2687     }
2688   else
2689     {
2690       name = PHI_RESULT (def_stmt);
2691       phi_def = true;
2692     }
2693 
2694   nloop_uses = 0;
2695   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2696     {
2697       gimple *use_stmt = USE_STMT (use_p);
2698       if (is_gimple_debug (use_stmt))
2699 	continue;
2700       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2701 	nloop_uses++;
2702       if (nloop_uses > 1)
2703 	{
2704 	  if (dump_enabled_p ())
2705 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2706 			     "reduction used in loop.\n");
2707 	  return NULL;
2708 	}
2709     }
2710 
2711   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2712      defined in the inner loop.  */
2713   if (phi_def)
2714     {
2715       op1 = PHI_ARG_DEF (def_stmt, 0);
2716 
2717       if (gimple_phi_num_args (def_stmt) != 1
2718           || TREE_CODE (op1) != SSA_NAME)
2719         {
2720           if (dump_enabled_p ())
2721 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 			     "unsupported phi node definition.\n");
2723 
2724           return NULL;
2725         }
2726 
2727       def1 = SSA_NAME_DEF_STMT (op1);
2728       if (gimple_bb (def1)
2729 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2730           && loop->inner
2731           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2732           && is_gimple_assign (def1)
2733 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2734         {
2735           if (dump_enabled_p ())
2736             report_vect_op (MSG_NOTE, def_stmt,
2737 			    "detected double reduction: ");
2738 
2739           *double_reduc = true;
2740           return def_stmt;
2741         }
2742 
2743       return NULL;
2744     }
2745 
2746   code = orig_code = gimple_assign_rhs_code (def_stmt);
2747 
2748   /* We can handle "res -= x[i]", which is non-associative, by
2749      simply rewriting this into "res += -x[i]".  Avoid changing the
2750      gimple instruction for the first simple tests and only do this
2751      if we're allowed to change code at all.  */
2752   if (code == MINUS_EXPR
2753       && (op1 = gimple_assign_rhs1 (def_stmt))
2754       && TREE_CODE (op1) == SSA_NAME
2755       && SSA_NAME_DEF_STMT (op1) == phi)
2756     code = PLUS_EXPR;
2757 
2758   if (code == COND_EXPR)
2759     {
2760       if (check_reduction)
2761 	*v_reduc_type = COND_REDUCTION;
2762     }
2763   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2764     {
2765       if (dump_enabled_p ())
2766 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2767 			"reduction: not commutative/associative: ");
2768       return NULL;
2769     }
2770 
2771   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2772     {
2773       if (code != COND_EXPR)
2774         {
2775 	  if (dump_enabled_p ())
2776 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2777 			    "reduction: not binary operation: ");
2778 
2779           return NULL;
2780         }
2781 
2782       op3 = gimple_assign_rhs1 (def_stmt);
2783       if (COMPARISON_CLASS_P (op3))
2784         {
2785           op4 = TREE_OPERAND (op3, 1);
2786           op3 = TREE_OPERAND (op3, 0);
2787         }
2788 
2789       op1 = gimple_assign_rhs2 (def_stmt);
2790       op2 = gimple_assign_rhs3 (def_stmt);
2791 
2792       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2793         {
2794           if (dump_enabled_p ())
2795             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2796 			    "reduction: uses not ssa_names: ");
2797 
2798           return NULL;
2799         }
2800     }
2801   else
2802     {
2803       op1 = gimple_assign_rhs1 (def_stmt);
2804       op2 = gimple_assign_rhs2 (def_stmt);
2805 
2806       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2807         {
2808           if (dump_enabled_p ())
2809 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2810 			    "reduction: uses not ssa_names: ");
2811 
2812           return NULL;
2813         }
2814    }
2815 
2816   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2817   if ((TREE_CODE (op1) == SSA_NAME
2818        && !types_compatible_p (type,TREE_TYPE (op1)))
2819       || (TREE_CODE (op2) == SSA_NAME
2820           && !types_compatible_p (type, TREE_TYPE (op2)))
2821       || (op3 && TREE_CODE (op3) == SSA_NAME
2822           && !types_compatible_p (type, TREE_TYPE (op3)))
2823       || (op4 && TREE_CODE (op4) == SSA_NAME
2824           && !types_compatible_p (type, TREE_TYPE (op4))))
2825     {
2826       if (dump_enabled_p ())
2827         {
2828           dump_printf_loc (MSG_NOTE, vect_location,
2829 			   "reduction: multiple types: operation type: ");
2830           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2831           dump_printf (MSG_NOTE, ", operands types: ");
2832           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2833 			     TREE_TYPE (op1));
2834           dump_printf (MSG_NOTE, ",");
2835           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2836 			     TREE_TYPE (op2));
2837           if (op3)
2838             {
2839               dump_printf (MSG_NOTE, ",");
2840               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2841 				 TREE_TYPE (op3));
2842             }
2843 
2844           if (op4)
2845             {
2846               dump_printf (MSG_NOTE, ",");
2847               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2848 				 TREE_TYPE (op4));
2849             }
2850           dump_printf (MSG_NOTE, "\n");
2851         }
2852 
2853       return NULL;
2854     }
2855 
2856   /* Check that it's ok to change the order of the computation.
2857      Generally, when vectorizing a reduction we change the order of the
2858      computation.  This may change the behavior of the program in some
2859      cases, so we need to check that this is ok.  One exception is when
2860      vectorizing an outer-loop: the inner-loop is executed sequentially,
2861      and therefore vectorizing reductions in the inner-loop during
2862      outer-loop vectorization is safe.  */
2863 
2864   if (*v_reduc_type != COND_REDUCTION
2865       && check_reduction)
2866     {
2867       /* CHECKME: check for !flag_finite_math_only too?  */
2868       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2869 	{
2870 	  /* Changing the order of operations changes the semantics.  */
2871 	  if (dump_enabled_p ())
2872 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2873 			"reduction: unsafe fp math optimization: ");
2874 	  return NULL;
2875 	}
2876       else if (INTEGRAL_TYPE_P (type))
2877 	{
2878 	  if (!operation_no_trapping_overflow (type, code))
2879 	    {
2880 	      /* Changing the order of operations changes the semantics.  */
2881 	      if (dump_enabled_p ())
2882 		report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2883 				"reduction: unsafe int math optimization"
2884 				" (overflow traps): ");
2885 	      return NULL;
2886 	    }
2887 	  if (need_wrapping_integral_overflow
2888 	      && !TYPE_OVERFLOW_WRAPS (type)
2889 	      && operation_can_overflow (code))
2890 	    {
2891 	      /* Changing the order of operations changes the semantics.  */
2892 	      if (dump_enabled_p ())
2893 		report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2894 				"reduction: unsafe int math optimization"
2895 				" (overflow doesn't wrap): ");
2896 	      return NULL;
2897 	    }
2898 	}
2899       else if (SAT_FIXED_POINT_TYPE_P (type))
2900 	{
2901 	  /* Changing the order of operations changes the semantics.  */
2902 	  if (dump_enabled_p ())
2903 	  report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2904 			  "reduction: unsafe fixed-point math optimization: ");
2905 	  return NULL;
2906 	}
2907     }
2908 
2909   /* Reduction is safe. We're dealing with one of the following:
2910      1) integer arithmetic and no trapv
2911      2) floating point arithmetic, and special flags permit this optimization
2912      3) nested cycle (i.e., outer loop vectorization).  */
2913   if (TREE_CODE (op1) == SSA_NAME)
2914     def1 = SSA_NAME_DEF_STMT (op1);
2915 
2916   if (TREE_CODE (op2) == SSA_NAME)
2917     def2 = SSA_NAME_DEF_STMT (op2);
2918 
2919   if (code != COND_EXPR
2920       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2921     {
2922       if (dump_enabled_p ())
2923 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2924       return NULL;
2925     }
2926 
2927   /* Check that one def is the reduction def, defined by PHI,
2928      the other def is either defined in the loop ("vect_internal_def"),
2929      or it's an induction (defined by a loop-header phi-node).  */
2930 
2931   if (def2 && def2 == phi
2932       && (code == COND_EXPR
2933 	  || !def1 || gimple_nop_p (def1)
2934 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2935           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2936               && (is_gimple_assign (def1)
2937 		  || is_gimple_call (def1)
2938   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2939                       == vect_induction_def
2940    	          || (gimple_code (def1) == GIMPLE_PHI
2941 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2942                           == vect_internal_def
2943  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
2944     {
2945       if (dump_enabled_p ())
2946 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2947       return def_stmt;
2948     }
2949 
2950   if (def1 && def1 == phi
2951       && (code == COND_EXPR
2952 	  || !def2 || gimple_nop_p (def2)
2953 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2954           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2955  	      && (is_gimple_assign (def2)
2956 		  || is_gimple_call (def2)
2957 	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2958                       == vect_induction_def
2959  	          || (gimple_code (def2) == GIMPLE_PHI
2960 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2961                           == vect_internal_def
2962 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
2963     {
2964       if (check_reduction
2965 	  && orig_code != MINUS_EXPR)
2966         {
2967 	  if (code == COND_EXPR)
2968 	    {
2969 	      /* No currently known use where this case would be useful.  */
2970 	      if (dump_enabled_p ())
2971 		report_vect_op (MSG_NOTE, def_stmt,
2972 				"detected reduction: cannot currently swap "
2973 				"operands for cond_expr");
2974 	      return NULL;
2975 	    }
2976 
2977           /* Swap operands (just for simplicity - so that the rest of the code
2978 	     can assume that the reduction variable is always the last (second)
2979 	     argument).  */
2980           if (dump_enabled_p ())
2981 	    report_vect_op (MSG_NOTE, def_stmt,
2982 	  	            "detected reduction: need to swap operands: ");
2983 
2984           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2985  			     gimple_assign_rhs2_ptr (def_stmt));
2986 
2987 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2988 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2989         }
2990       else
2991         {
2992           if (dump_enabled_p ())
2993             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2994         }
2995 
2996       return def_stmt;
2997     }
2998 
2999   /* Try to find SLP reduction chain.  */
3000   if (check_reduction && code != COND_EXPR
3001       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3002     {
3003       if (dump_enabled_p ())
3004         report_vect_op (MSG_NOTE, def_stmt,
3005 			"reduction: detected reduction chain: ");
3006 
3007       return def_stmt;
3008     }
3009 
3010   if (dump_enabled_p ())
3011     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3012 		    "reduction: unknown pattern: ");
3013 
3014   return NULL;
3015 }
3016 
3017 /* Wrapper around vect_is_simple_reduction, which will modify code
3018    in-place if it enables detection of more reductions.  Arguments
3019    are as for that function.  */
3020 
3021 gimple *
3022 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3023 			     bool check_reduction, bool *double_reduc,
3024 			     bool need_wrapping_integral_overflow)
3025 {
3026   enum vect_reduction_type v_reduc_type;
3027   return vect_is_simple_reduction (loop_info, phi, check_reduction,
3028 				   double_reduc,
3029 				   need_wrapping_integral_overflow,
3030 				   &v_reduc_type);
3031 }
3032 
3033 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3034 int
3035 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3036                              int *peel_iters_epilogue,
3037                              stmt_vector_for_cost *scalar_cost_vec,
3038 			     stmt_vector_for_cost *prologue_cost_vec,
3039 			     stmt_vector_for_cost *epilogue_cost_vec)
3040 {
3041   int retval = 0;
3042   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3043 
3044   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3045     {
3046       *peel_iters_epilogue = vf/2;
3047       if (dump_enabled_p ())
3048         dump_printf_loc (MSG_NOTE, vect_location,
3049 			 "cost model: epilogue peel iters set to vf/2 "
3050 			 "because loop iterations are unknown .\n");
3051 
3052       /* If peeled iterations are known but the number of scalar loop
3053          iterations is unknown, count a taken branch per peeled loop.  */
3054       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3055 				 NULL, 0, vect_prologue);
3056       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3057 				 NULL, 0, vect_epilogue);
3058     }
3059   else
3060     {
3061       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3062       peel_iters_prologue = niters < peel_iters_prologue ?
3063                             niters : peel_iters_prologue;
3064       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3065       /* If we need to peel for gaps, but no peeling is required, we have to
3066 	 peel VF iterations.  */
3067       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3068         *peel_iters_epilogue = vf;
3069     }
3070 
3071   stmt_info_for_cost *si;
3072   int j;
3073   if (peel_iters_prologue)
3074     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3075       retval += record_stmt_cost (prologue_cost_vec,
3076 				  si->count * peel_iters_prologue,
3077 				  si->kind, NULL, si->misalign,
3078 				  vect_prologue);
3079   if (*peel_iters_epilogue)
3080     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3081       retval += record_stmt_cost (epilogue_cost_vec,
3082 				  si->count * *peel_iters_epilogue,
3083 				  si->kind, NULL, si->misalign,
3084 				  vect_epilogue);
3085 
3086   return retval;
3087 }
3088 
3089 /* Function vect_estimate_min_profitable_iters
3090 
3091    Return the number of iterations required for the vector version of the
3092    loop to be profitable relative to the cost of the scalar version of the
3093    loop.  */
3094 
3095 static void
3096 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3097 				    int *ret_min_profitable_niters,
3098 				    int *ret_min_profitable_estimate)
3099 {
3100   int min_profitable_iters;
3101   int min_profitable_estimate;
3102   int peel_iters_prologue;
3103   int peel_iters_epilogue;
3104   unsigned vec_inside_cost = 0;
3105   int vec_outside_cost = 0;
3106   unsigned vec_prologue_cost = 0;
3107   unsigned vec_epilogue_cost = 0;
3108   int scalar_single_iter_cost = 0;
3109   int scalar_outside_cost = 0;
3110   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3111   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3112   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3113 
3114   /* Cost model disabled.  */
3115   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3116     {
3117       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3118       *ret_min_profitable_niters = 0;
3119       *ret_min_profitable_estimate = 0;
3120       return;
3121     }
3122 
3123   /* Requires loop versioning tests to handle misalignment.  */
3124   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3125     {
3126       /*  FIXME: Make cost depend on complexity of individual check.  */
3127       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3128       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3129 			    vect_prologue);
3130       dump_printf (MSG_NOTE,
3131                    "cost model: Adding cost of checks for loop "
3132                    "versioning to treat misalignment.\n");
3133     }
3134 
3135   /* Requires loop versioning with alias checks.  */
3136   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3137     {
3138       /*  FIXME: Make cost depend on complexity of individual check.  */
3139       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3140       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3141 			    vect_prologue);
3142       dump_printf (MSG_NOTE,
3143                    "cost model: Adding cost of checks for loop "
3144                    "versioning for aliasing.\n");
3145     }
3146 
3147   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3148       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3149     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3150 			  vect_prologue);
3151 
3152   /* Count statements in scalar loop.  Using this as scalar cost for a single
3153      iteration for now.
3154 
3155      TODO: Add outer loop support.
3156 
3157      TODO: Consider assigning different costs to different scalar
3158      statements.  */
3159 
3160   scalar_single_iter_cost
3161     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3162 
3163   /* Add additional cost for the peeled instructions in prologue and epilogue
3164      loop.
3165 
3166      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3167      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3168 
3169      TODO: Build an expression that represents peel_iters for prologue and
3170      epilogue to be used in a run-time test.  */
3171 
3172   if (npeel < 0)
3173     {
3174       peel_iters_prologue = vf/2;
3175       dump_printf (MSG_NOTE, "cost model: "
3176                    "prologue peel iters set to vf/2.\n");
3177 
3178       /* If peeling for alignment is unknown, loop bound of main loop becomes
3179          unknown.  */
3180       peel_iters_epilogue = vf/2;
3181       dump_printf (MSG_NOTE, "cost model: "
3182                    "epilogue peel iters set to vf/2 because "
3183                    "peeling for alignment is unknown.\n");
3184 
3185       /* If peeled iterations are unknown, count a taken branch and a not taken
3186          branch per peeled loop. Even if scalar loop iterations are known,
3187          vector iterations are not known since peeled prologue iterations are
3188          not known. Hence guards remain the same.  */
3189       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3190 			    NULL, 0, vect_prologue);
3191       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3192 			    NULL, 0, vect_prologue);
3193       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3194 			    NULL, 0, vect_epilogue);
3195       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3196 			    NULL, 0, vect_epilogue);
3197       stmt_info_for_cost *si;
3198       int j;
3199       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3200 	{
3201 	  struct _stmt_vec_info *stmt_info
3202 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3203 	  (void) add_stmt_cost (target_cost_data,
3204 				si->count * peel_iters_prologue,
3205 				si->kind, stmt_info, si->misalign,
3206 				vect_prologue);
3207 	  (void) add_stmt_cost (target_cost_data,
3208 				si->count * peel_iters_epilogue,
3209 				si->kind, stmt_info, si->misalign,
3210 				vect_epilogue);
3211 	}
3212     }
3213   else
3214     {
3215       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3216       stmt_info_for_cost *si;
3217       int j;
3218       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3219 
3220       prologue_cost_vec.create (2);
3221       epilogue_cost_vec.create (2);
3222       peel_iters_prologue = npeel;
3223 
3224       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3225 					  &peel_iters_epilogue,
3226 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3227 					    (loop_vinfo),
3228 					  &prologue_cost_vec,
3229 					  &epilogue_cost_vec);
3230 
3231       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3232 	{
3233 	  struct _stmt_vec_info *stmt_info
3234 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3235 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3236 				si->misalign, vect_prologue);
3237 	}
3238 
3239       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3240 	{
3241 	  struct _stmt_vec_info *stmt_info
3242 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3243 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3244 				si->misalign, vect_epilogue);
3245 	}
3246 
3247       prologue_cost_vec.release ();
3248       epilogue_cost_vec.release ();
3249     }
3250 
3251   /* FORNOW: The scalar outside cost is incremented in one of the
3252      following ways:
3253 
3254      1. The vectorizer checks for alignment and aliasing and generates
3255      a condition that allows dynamic vectorization.  A cost model
3256      check is ANDED with the versioning condition.  Hence scalar code
3257      path now has the added cost of the versioning check.
3258 
3259        if (cost > th & versioning_check)
3260          jmp to vector code
3261 
3262      Hence run-time scalar is incremented by not-taken branch cost.
3263 
3264      2. The vectorizer then checks if a prologue is required.  If the
3265      cost model check was not done before during versioning, it has to
3266      be done before the prologue check.
3267 
3268        if (cost <= th)
3269          prologue = scalar_iters
3270        if (prologue == 0)
3271          jmp to vector code
3272        else
3273          execute prologue
3274        if (prologue == num_iters)
3275 	 go to exit
3276 
3277      Hence the run-time scalar cost is incremented by a taken branch,
3278      plus a not-taken branch, plus a taken branch cost.
3279 
3280      3. The vectorizer then checks if an epilogue is required.  If the
3281      cost model check was not done before during prologue check, it
3282      has to be done with the epilogue check.
3283 
3284        if (prologue == 0)
3285          jmp to vector code
3286        else
3287          execute prologue
3288        if (prologue == num_iters)
3289 	 go to exit
3290        vector code:
3291          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3292            jmp to epilogue
3293 
3294      Hence the run-time scalar cost should be incremented by 2 taken
3295      branches.
3296 
3297      TODO: The back end may reorder the BBs differently and reverse
3298      conditions/branch directions.  Change the estimates below to
3299      something more reasonable.  */
3300 
3301   /* If the number of iterations is known and we do not do versioning, we can
3302      decide whether to vectorize at compile time.  Hence the scalar version
3303      does not carry cost model guard costs.  */
3304   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3305       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3306       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3307     {
3308       /* Cost model check occurs at versioning.  */
3309       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3310           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3311 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3312       else
3313 	{
3314 	  /* Cost model check occurs at prologue generation.  */
3315 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3316 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3317 	      + vect_get_stmt_cost (cond_branch_not_taken);
3318 	  /* Cost model check occurs at epilogue generation.  */
3319 	  else
3320 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3321 	}
3322     }
3323 
3324   /* Complete the target-specific cost calculations.  */
3325   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3326 	       &vec_inside_cost, &vec_epilogue_cost);
3327 
3328   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3329 
3330   if (dump_enabled_p ())
3331     {
3332       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3333       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3334                    vec_inside_cost);
3335       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3336                    vec_prologue_cost);
3337       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3338                    vec_epilogue_cost);
3339       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3340                    scalar_single_iter_cost);
3341       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3342                    scalar_outside_cost);
3343       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3344                    vec_outside_cost);
3345       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3346                    peel_iters_prologue);
3347       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3348                    peel_iters_epilogue);
3349     }
3350 
3351   /* Calculate number of iterations required to make the vector version
3352      profitable, relative to the loop bodies only.  The following condition
3353      must hold true:
3354      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3355      where
3356      SIC = scalar iteration cost, VIC = vector iteration cost,
3357      VOC = vector outside cost, VF = vectorization factor,
3358      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3359      SOC = scalar outside cost for run time cost model check.  */
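  /* A worked example with invented costs (no particular target is implied):
     take SIC = 4, VIC = 8, VF = 4, VOC = 20 and SOC = PL_ITERS = EP_ITERS = 0.
     The division below gives (20 - 0) * 4 / (4 * 4 - 8) = 10 iterations;
     since both sides of the follow-up check are equal at that value
     (160 vs. 160), the result is then bumped to 11.  */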
3360 
3361   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3362     {
3363       if (vec_outside_cost <= 0)
3364         min_profitable_iters = 1;
3365       else
3366         {
3367           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3368 				  - vec_inside_cost * peel_iters_prologue
3369                                   - vec_inside_cost * peel_iters_epilogue)
3370                                  / ((scalar_single_iter_cost * vf)
3371                                     - vec_inside_cost);
3372 
3373           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3374               <= (((int) vec_inside_cost * min_profitable_iters)
3375                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3376             min_profitable_iters++;
3377         }
3378     }
3379   /* vector version will never be profitable.  */
3380   else
3381     {
3382       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3383 	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3384 		    "did not happen for a simd loop");
3385 
3386       if (dump_enabled_p ())
3387         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3388 			 "cost model: the vector iteration cost = %d "
3389 			 "divided by the scalar iteration cost = %d "
3390 			 "is greater than or equal to the vectorization factor = %d"
3391                          ".\n",
3392 			 vec_inside_cost, scalar_single_iter_cost, vf);
3393       *ret_min_profitable_niters = -1;
3394       *ret_min_profitable_estimate = -1;
3395       return;
3396     }
3397 
3398   dump_printf (MSG_NOTE,
3399 	       "  Calculated minimum iters for profitability: %d\n",
3400 	       min_profitable_iters);
3401 
3402   min_profitable_iters =
3403 	min_profitable_iters < vf ? vf : min_profitable_iters;
3404 
3405   /* Because the condition we create is:
3406      if (niters <= min_profitable_iters)
3407        then skip the vectorized loop.  */
3408   min_profitable_iters--;
3409 
3410   if (dump_enabled_p ())
3411     dump_printf_loc (MSG_NOTE, vect_location,
3412                      "  Runtime profitability threshold = %d\n",
3413                      min_profitable_iters);
3414 
3415   *ret_min_profitable_niters = min_profitable_iters;
3416 
3417   /* Calculate number of iterations required to make the vector version
3418      profitable, relative to the loop bodies only.
3419 
3420      Non-vectorized variant is SIC * niters and it must win over vector
3421      variant on the expected loop trip count.  The following condition must hold true:
3422      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
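  /* Note that, compared with the runtime threshold computed above, SOC has
     moved to the vector side of the inequality, so the division below uses
     (VOC + SOC) where the previous one used (VOC - SOC): when deciding
     against the estimated trip count, the cost of the runtime guard itself
     counts as overhead of the vectorized variant.  */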
3423 
3424   if (vec_outside_cost <= 0)
3425     min_profitable_estimate = 1;
3426   else
3427     {
3428       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3429 				 - vec_inside_cost * peel_iters_prologue
3430 				 - vec_inside_cost * peel_iters_epilogue)
3431 				 / ((scalar_single_iter_cost * vf)
3432 				   - vec_inside_cost);
3433     }
3434   min_profitable_estimate--;
3435   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3436   if (dump_enabled_p ())
3437     dump_printf_loc (MSG_NOTE, vect_location,
3438 		     "  Static estimate profitability threshold = %d\n",
3439 		     min_profitable_estimate);
3440 
3441   *ret_min_profitable_estimate = min_profitable_estimate;
3442 }
3443 
3444 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3445    vector elements (not bits) for a vector of mode MODE.  */
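/* For example (an illustrative case, not from the original comment): for a
   4-element vector mode and OFFSET 2 the mask built below is {2, 3, 4, 5},
   i.e. the two high elements of the first vec_perm operand followed by the
   two low elements of the second.  */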
3446 static void
3447 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3448 			      unsigned char *sel)
3449 {
3450   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3451 
3452   for (i = 0; i < nelt; i++)
3453     sel[i] = (i + offset) & (2*nelt - 1);
3454 }
3455 
3456 /* Checks whether the target supports whole-vector shifts for vectors of mode
3457    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3458    it supports vec_perm_const with masks for all necessary shift amounts.  */
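/* For instance, with an 8-element mode the loop below asks about element
   shifts of 4, 2 and 1, i.e. the log2 (nelt) halving steps used when a
   vector is reduced by repeated whole-vector shifts.  */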
3459 static bool
3460 have_whole_vector_shift (enum machine_mode mode)
3461 {
3462   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3463     return true;
3464 
3465   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3466     return false;
3467 
3468   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3469   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3470 
3471   for (i = nelt/2; i >= 1; i/=2)
3472     {
3473       calc_vec_perm_mask_for_shift (mode, i, sel);
3474       if (!can_vec_perm_p (mode, false, sel))
3475 	return false;
3476     }
3477   return true;
3478 }
3479 
3480 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3481 
3482 static tree
3483 get_reduction_op (gimple *stmt, int reduc_index)
3484 {
3485   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3486     {
3487     case GIMPLE_SINGLE_RHS:
3488       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3489 		  == ternary_op);
3490       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3491     case GIMPLE_UNARY_RHS:
3492       return gimple_assign_rhs1 (stmt);
3493     case GIMPLE_BINARY_RHS:
3494       return (reduc_index
3495 	      ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3496     case GIMPLE_TERNARY_RHS:
3497       return gimple_op (stmt, reduc_index + 1);
3498     default:
3499       gcc_unreachable ();
3500     }
3501 }
3502 
3503 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3504    functions. Design better to avoid maintenance issues.  */
3505 
3506 /* Function vect_model_reduction_cost.
3507 
3508    Models cost for a reduction operation, including the vector ops
3509    generated within the strip-mine loop, the initial definition before
3510    the loop, and the epilogue code that must be generated.  */
3511 
3512 static bool
3513 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3514 			   int ncopies, int reduc_index)
3515 {
3516   int prologue_cost = 0, epilogue_cost = 0;
3517   enum tree_code code;
3518   optab optab;
3519   tree vectype;
3520   gimple *stmt, *orig_stmt;
3521   tree reduction_op;
3522   machine_mode mode;
3523   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3524   struct loop *loop = NULL;
3525   void *target_cost_data;
3526 
3527   if (loop_vinfo)
3528     {
3529       loop = LOOP_VINFO_LOOP (loop_vinfo);
3530       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3531     }
3532   else
3533     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3534 
3535   /* Condition reductions generate two reductions in the loop.  */
3536   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3537     ncopies *= 2;
3538 
3539   /* Cost of reduction op inside loop.  */
3540   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3541 					stmt_info, 0, vect_body);
3542   stmt = STMT_VINFO_STMT (stmt_info);
3543 
3544   reduction_op = get_reduction_op (stmt, reduc_index);
3545 
3546   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3547   if (!vectype)
3548     {
3549       if (dump_enabled_p ())
3550         {
3551 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3552 			   "unsupported data-type ");
3553           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3554 			     TREE_TYPE (reduction_op));
3555           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3556         }
3557       return false;
3558     }
3559 
3560   mode = TYPE_MODE (vectype);
3561   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3562 
3563   if (!orig_stmt)
3564     orig_stmt = STMT_VINFO_STMT (stmt_info);
3565 
3566   code = gimple_assign_rhs_code (orig_stmt);
3567 
3568   /* Add in cost for initial definition.
3569      For cond reduction we have four vectors: initial index, step, initial
3570      result of the data reduction, initial value of the index reduction.  */
3571   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3572 		       == COND_REDUCTION ? 4 : 1;
3573   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3574 				  scalar_to_vec, stmt_info, 0,
3575 				  vect_prologue);
3576 
3577   /* Determine cost of epilogue code.
3578 
3579      We have a reduction operator that will reduce the vector in one statement.
3580      Also requires scalar extract.  */
3581 
3582   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3583     {
3584       if (reduc_code != ERROR_MARK)
3585 	{
3586 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3587 	    {
3588 	      /* An EQ stmt and a COND_EXPR stmt.  */
3589 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
3590 					      vector_stmt, stmt_info, 0,
3591 					      vect_epilogue);
3592 	      /* Reduction of the max index and a reduction of the found
3593 		 values.  */
3594 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
3595 					      vec_to_scalar, stmt_info, 0,
3596 					      vect_epilogue);
3597 	      /* A broadcast of the max value.  */
3598 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
3599 					      scalar_to_vec, stmt_info, 0,
3600 					      vect_epilogue);
3601 	    }
3602 	  else
3603 	    {
3604 	      epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3605 					      stmt_info, 0, vect_epilogue);
3606 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
3607 					      vec_to_scalar, stmt_info, 0,
3608 					      vect_epilogue);
3609 	    }
3610 	}
3611       else
3612 	{
3613 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3614 	  tree bitsize =
3615 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3616 	  int element_bitsize = tree_to_uhwi (bitsize);
3617 	  int nelements = vec_size_in_bits / element_bitsize;
3618 
3619 	  optab = optab_for_tree_code (code, vectype, optab_default);
3620 
3621 	  /* We have a whole vector shift available.  */
3622 	  if (VECTOR_MODE_P (mode)
3623 	      && optab_handler (optab, mode) != CODE_FOR_nothing
3624 	      && have_whole_vector_shift (mode))
3625 	    {
3626 	      /* Final reduction via vector shifts and the reduction operator.
3627 		 Also requires scalar extract.  */
3628 	      epilogue_cost += add_stmt_cost (target_cost_data,
3629 					      exact_log2 (nelements) * 2,
3630 					      vector_stmt, stmt_info, 0,
3631 					      vect_epilogue);
3632 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
3633 					      vec_to_scalar, stmt_info, 0,
3634 					      vect_epilogue);
3635 	    }
3636 	  else
3637 	    /* Use extracts and reduction op for final reduction.  For N
3638 	       elements, we have N extracts and N-1 reduction ops.  */
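	    /* E.g. for a 4-element vector this is 4 extracts plus 3
	       reduction ops, i.e. 7 statements costed as vector_stmt.  */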
3639 	    epilogue_cost += add_stmt_cost (target_cost_data,
3640 					    nelements + nelements - 1,
3641 					    vector_stmt, stmt_info, 0,
3642 					    vect_epilogue);
3643 	}
3644     }
3645 
3646   if (dump_enabled_p ())
3647     dump_printf (MSG_NOTE,
3648                  "vect_model_reduction_cost: inside_cost = %d, "
3649                  "prologue_cost = %d, epilogue_cost = %d.\n", inside_cost,
3650                  prologue_cost, epilogue_cost);
3651 
3652   return true;
3653 }
3654 
3655 
3656 /* Function vect_model_induction_cost.
3657 
3658    Models cost for induction operations.  */
3659 
3660 static void
3661 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3662 {
3663   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3664   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3665   unsigned inside_cost, prologue_cost;
3666 
3667   /* loop cost for vec_loop.  */
3668   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3669 			       stmt_info, 0, vect_body);
3670 
3671   /* prologue cost for vec_init and vec_step.  */
3672   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3673 				 stmt_info, 0, vect_prologue);
3674 
3675   if (dump_enabled_p ())
3676     dump_printf_loc (MSG_NOTE, vect_location,
3677                      "vect_model_induction_cost: inside_cost = %d, "
3678                      "prologue_cost = %d.\n", inside_cost, prologue_cost);
3679 }
3680 
3681 
3682 /* Function get_initial_def_for_induction
3683 
3684    Input:
3685    STMT - a stmt that performs an induction operation in the loop.
3686    IV_PHI - the initial value of the induction variable
3687 
3688    Output:
3689    Return a vector variable, initialized with the first VF values of
3690    the induction variable.  E.g., for an iv with IV_PHI='X' and
3691    evolution S, for a vector of 4 units, we want to return:
3692    [X, X + S, X + 2*S, X + 3*S].  */
3693 
3694 static tree
3695 get_initial_def_for_induction (gimple *iv_phi)
3696 {
3697   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3698   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3699   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3700   tree vectype;
3701   int nunits;
3702   edge pe = loop_preheader_edge (loop);
3703   struct loop *iv_loop;
3704   basic_block new_bb;
3705   tree new_vec, vec_init, vec_step, t;
3706   tree new_name;
3707   gimple *new_stmt;
3708   gphi *induction_phi;
3709   tree induc_def, vec_def, vec_dest;
3710   tree init_expr, step_expr;
3711   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3712   int i;
3713   int ncopies;
3714   tree expr;
3715   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3716   bool nested_in_vect_loop = false;
3717   gimple_seq stmts;
3718   imm_use_iterator imm_iter;
3719   use_operand_p use_p;
3720   gimple *exit_phi;
3721   edge latch_e;
3722   tree loop_arg;
3723   gimple_stmt_iterator si;
3724   basic_block bb = gimple_bb (iv_phi);
3725   tree stepvectype;
3726   tree resvectype;
3727 
3728   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3729   if (nested_in_vect_loop_p (loop, iv_phi))
3730     {
3731       nested_in_vect_loop = true;
3732       iv_loop = loop->inner;
3733     }
3734   else
3735     iv_loop = loop;
3736   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3737 
3738   latch_e = loop_latch_edge (iv_loop);
3739   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3740 
3741   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3742   gcc_assert (step_expr != NULL_TREE);
3743 
3744   pe = loop_preheader_edge (iv_loop);
3745   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3746 				     loop_preheader_edge (iv_loop));
3747 
3748   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3749   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3750   gcc_assert (vectype);
3751   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3752   ncopies = vf / nunits;
3753 
3754   gcc_assert (phi_info);
3755   gcc_assert (ncopies >= 1);
3756 
3757   /* Convert the step to the desired type.  */
3758   stmts = NULL;
3759   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3760   if (stmts)
3761     {
3762       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3763       gcc_assert (!new_bb);
3764     }
3765 
3766   /* Find the first insertion point in the BB.  */
3767   si = gsi_after_labels (bb);
3768 
3769   /* Create the vector that holds the initial_value of the induction.  */
3770   if (nested_in_vect_loop)
3771     {
3772       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3773 	 been created during vectorization of previous stmts.  We obtain it
3774 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3775       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3776       /* If the initial value is not of proper type, convert it.  */
3777       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3778 	{
3779 	  new_stmt
3780 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
3781 							  vect_simple_var,
3782 							  "vec_iv_"),
3783 				   VIEW_CONVERT_EXPR,
3784 				   build1 (VIEW_CONVERT_EXPR, vectype,
3785 					   vec_init));
3786 	  vec_init = gimple_assign_lhs (new_stmt);
3787 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3788 						 new_stmt);
3789 	  gcc_assert (!new_bb);
3790 	  set_vinfo_for_stmt (new_stmt,
3791 			      new_stmt_vec_info (new_stmt, loop_vinfo));
3792 	}
3793     }
3794   else
3795     {
3796       vec<constructor_elt, va_gc> *v;
3797 
3798       /* iv_loop is the loop to be vectorized. Create:
3799 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3800       stmts = NULL;
3801       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3802 
3803       vec_alloc (v, nunits);
3804       bool constant_p = is_gimple_min_invariant (new_name);
3805       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3806       for (i = 1; i < nunits; i++)
3807 	{
3808 	  /* Create: new_name_i = new_name + step_expr  */
3809 	  new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3810 				   new_name, step_expr);
3811 	  if (!is_gimple_min_invariant (new_name))
3812 	    constant_p = false;
3813 	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3814 	}
3815       if (stmts)
3816 	{
3817 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3818 	  gcc_assert (!new_bb);
3819 	}
3820 
3821       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3822       if (constant_p)
3823 	new_vec = build_vector_from_ctor (vectype, v);
3824       else
3825 	new_vec = build_constructor (vectype, v);
3826       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3827     }
3828 
3829 
3830   /* Create the vector that holds the step of the induction.  */
3831   if (nested_in_vect_loop)
3832     /* iv_loop is nested in the loop to be vectorized. Generate:
3833        vec_step = [S, S, S, S]  */
3834     new_name = step_expr;
3835   else
3836     {
3837       /* iv_loop is the loop to be vectorized. Generate:
3838 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3839       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3840 	{
3841 	  expr = build_int_cst (integer_type_node, vf);
3842 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
3843 	}
3844       else
3845 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
3846       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3847 			      expr, step_expr);
3848       if (TREE_CODE (step_expr) == SSA_NAME)
3849 	new_name = vect_init_vector (iv_phi, new_name,
3850 				     TREE_TYPE (step_expr), NULL);
3851     }
3852 
3853   t = unshare_expr (new_name);
3854   gcc_assert (CONSTANT_CLASS_P (new_name)
3855 	      || TREE_CODE (new_name) == SSA_NAME);
3856   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3857   gcc_assert (stepvectype);
3858   new_vec = build_vector_from_val (stepvectype, t);
3859   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3860 
3861 
3862   /* Create the following def-use cycle:
3863      loop prolog:
3864          vec_init = ...
3865 	 vec_step = ...
3866      loop:
3867          vec_iv = PHI <vec_init, vec_loop>
3868          ...
3869          STMT
3870          ...
3871          vec_loop = vec_iv + vec_step;  */
3872 
3873   /* Create the induction-phi that defines the induction-operand.  */
3874   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3875   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3876   set_vinfo_for_stmt (induction_phi,
3877 		      new_stmt_vec_info (induction_phi, loop_vinfo));
3878   induc_def = PHI_RESULT (induction_phi);
3879 
3880   /* Create the iv update inside the loop  */
3881   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3882   vec_def = make_ssa_name (vec_dest, new_stmt);
3883   gimple_assign_set_lhs (new_stmt, vec_def);
3884   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3885   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3886 
3887   /* Set the arguments of the phi node:  */
3888   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3889   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3890 	       UNKNOWN_LOCATION);
3891 
3892 
3893   /* In case the vectorization factor (VF) is bigger than the number
3894      of elements that we can fit in a vectype (nunits), we have to generate
3895      more than one vector stmt, i.e., we need to "unroll" the
3896      vector stmt by a factor of VF/nunits.  For more details see documentation
3897      in vectorizable_operation.  */
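  /* As an illustration (values invented for the example): with VF = 8 and a
     4-element vectype, ncopies is 2.  The first vector IV holds
     [X, X + S, X + 2*S, X + 3*S]; the code below builds a step vector
     [4*S, 4*S, 4*S, 4*S] (nunits * S) and adds it to produce the second copy
     [X + 4*S, ..., X + 7*S].  */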
3898 
3899   if (ncopies > 1)
3900     {
3901       stmt_vec_info prev_stmt_vinfo;
3902       /* FORNOW. This restriction should be relaxed.  */
3903       gcc_assert (!nested_in_vect_loop);
3904 
3905       /* Create the vector that holds the step of the induction.  */
3906       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3907 	{
3908 	  expr = build_int_cst (integer_type_node, nunits);
3909 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
3910 	}
3911       else
3912 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3913       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3914 			      expr, step_expr);
3915       if (TREE_CODE (step_expr) == SSA_NAME)
3916 	new_name = vect_init_vector (iv_phi, new_name,
3917 				     TREE_TYPE (step_expr), NULL);
3918       t = unshare_expr (new_name);
3919       gcc_assert (CONSTANT_CLASS_P (new_name)
3920 		  || TREE_CODE (new_name) == SSA_NAME);
3921       new_vec = build_vector_from_val (stepvectype, t);
3922       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3923 
3924       vec_def = induc_def;
3925       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3926       for (i = 1; i < ncopies; i++)
3927 	{
3928 	  /* vec_i = vec_prev + vec_step  */
3929 	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3930 					  vec_def, vec_step);
3931 	  vec_def = make_ssa_name (vec_dest, new_stmt);
3932 	  gimple_assign_set_lhs (new_stmt, vec_def);
3933 
3934 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3935 	  if (!useless_type_conversion_p (resvectype, vectype))
3936 	    {
3937 	      new_stmt
3938 		= gimple_build_assign
3939 			(vect_get_new_vect_var (resvectype, vect_simple_var,
3940 						"vec_iv_"),
3941 			 VIEW_CONVERT_EXPR,
3942 			 build1 (VIEW_CONVERT_EXPR, resvectype,
3943 				 gimple_assign_lhs (new_stmt)));
3944 	      gimple_assign_set_lhs (new_stmt,
3945 				     make_ssa_name
3946 				       (gimple_assign_lhs (new_stmt), new_stmt));
3947 	      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3948 	    }
3949 	  set_vinfo_for_stmt (new_stmt,
3950 			      new_stmt_vec_info (new_stmt, loop_vinfo));
3951 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3952 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3953 	}
3954     }
3955 
3956   if (nested_in_vect_loop)
3957     {
3958       /* Find the loop-closed exit-phi of the induction, and record
3959          the final vector of induction results:  */
3960       exit_phi = NULL;
3961       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3962         {
3963 	  gimple *use_stmt = USE_STMT (use_p);
3964 	  if (is_gimple_debug (use_stmt))
3965 	    continue;
3966 
3967 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3968 	    {
3969 	      exit_phi = use_stmt;
3970 	      break;
3971 	    }
3972         }
3973       if (exit_phi)
3974 	{
3975 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3976 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
3977 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
3978 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3979 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
3980 
3981 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3982 	  if (dump_enabled_p ())
3983 	    {
3984 	      dump_printf_loc (MSG_NOTE, vect_location,
3985 			       "vector of inductions after inner-loop:");
3986 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3987               dump_printf (MSG_NOTE, "\n");
3988 	    }
3989 	}
3990     }
3991 
3992 
3993   if (dump_enabled_p ())
3994     {
3995       dump_printf_loc (MSG_NOTE, vect_location,
3996 		       "transform induction: created def-use cycle: ");
3997       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3998       dump_printf (MSG_NOTE, "\n");
3999       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4000 			SSA_NAME_DEF_STMT (vec_def), 0);
4001       dump_printf (MSG_NOTE, "\n");
4002     }
4003 
4004   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4005   if (!useless_type_conversion_p (resvectype, vectype))
4006     {
4007       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4008 							     vect_simple_var,
4009 							     "vec_iv_"),
4010 				      VIEW_CONVERT_EXPR,
4011 				      build1 (VIEW_CONVERT_EXPR, resvectype,
4012 					      induc_def));
4013       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4014       gimple_assign_set_lhs (new_stmt, induc_def);
4015       si = gsi_after_labels (bb);
4016       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4017       set_vinfo_for_stmt (new_stmt,
4018 			  new_stmt_vec_info (new_stmt, loop_vinfo));
4019       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4020 	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4021     }
4022 
4023   return induc_def;
4024 }
4025 
4026 
4027 /* Function get_initial_def_for_reduction
4028 
4029    Input:
4030    STMT - a stmt that performs a reduction operation in the loop.
4031    INIT_VAL - the initial value of the reduction variable
4032 
4033    Output:
4034    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4035         of the reduction (used for adjusting the epilog - see below).
4036    Return a vector variable, initialized according to the operation that STMT
4037         performs. This vector will be used as the initial value of the
4038         vector of partial results.
4039 
4040    Option1 (adjust in epilog): Initialize the vector as follows:
4041      add/bit or/xor:    [0,0,...,0,0]
4042      mult/bit and:      [1,1,...,1,1]
4043      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4044    and when necessary (e.g. add/mult case) let the caller know
4045    that it needs to adjust the result by init_val.
4046 
4047    Option2: Initialize the vector as follows:
4048      add/bit or/xor:    [init_val,0,0,...,0]
4049      mult/bit and:      [init_val,1,1,...,1]
4050      min/max/cond_expr: [init_val,init_val,...,init_val]
4051    and no adjustments are needed.
4052 
4053    For example, for the following code:
4054 
4055    s = init_val;
4056    for (i=0;i<n;i++)
4057      s = s + a[i];
4058 
4059    STMT is 's = s + a[i]', and the reduction variable is 's'.
4060    For a vector of 4 units, we want to return either [0,0,0,init_val],
4061    or [0,0,0,0] and let the caller know that it needs to adjust
4062    the result at the end by 'init_val'.
4063 
4064    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4065    is not NULL, because this way the initialization vector is simpler (the
4066    same element in all entries), and Option2 otherwise.
4067 
4068    A cost model should help decide between these two schemes.  */
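/* As a further illustration (not part of the original example): for a product
   reduction 's = s * a[i]' with a 4-element vectype, Option1 returns
   [1,1,1,1] and lets the caller know that init_val still has to be folded
   into the final result (for a product, by multiplying), while Option2
   returns [init_val,1,1,1] and needs no adjustment.  */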
4069 
4070 tree
4071 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4072                                tree *adjustment_def)
4073 {
4074   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4075   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4076   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4077   tree scalar_type = TREE_TYPE (init_val);
4078   tree vectype = get_vectype_for_scalar_type (scalar_type);
4079   int nunits;
4080   enum tree_code code = gimple_assign_rhs_code (stmt);
4081   tree def_for_init;
4082   tree init_def;
4083   tree *elts;
4084   int i;
4085   bool nested_in_vect_loop = false;
4086   REAL_VALUE_TYPE real_init_val = dconst0;
4087   int int_init_val = 0;
4088   gimple *def_stmt = NULL;
4089   gimple_seq stmts = NULL;
4090 
4091   gcc_assert (vectype);
4092   nunits = TYPE_VECTOR_SUBPARTS (vectype);
4093 
4094   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4095 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4096 
4097   if (nested_in_vect_loop_p (loop, stmt))
4098     nested_in_vect_loop = true;
4099   else
4100     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4101 
4102   /* In case of double reduction we only create a vector variable to be put
4103      in the reduction phi node.  The actual statement creation is done in
4104      vect_create_epilog_for_reduction.  */
4105   if (adjustment_def && nested_in_vect_loop
4106       && TREE_CODE (init_val) == SSA_NAME
4107       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4108       && gimple_code (def_stmt) == GIMPLE_PHI
4109       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4110       && vinfo_for_stmt (def_stmt)
4111       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4112           == vect_double_reduction_def)
4113     {
4114       *adjustment_def = NULL;
4115       return vect_create_destination_var (init_val, vectype);
4116     }
4117 
4118   /* In case of a nested reduction do not use an adjustment def, as
4119      that case is not handled correctly by the epilogue generation
4120      if ncopies is not one.  */
4121   if (adjustment_def && nested_in_vect_loop)
4122     {
4123       *adjustment_def = NULL;
4124       return vect_get_vec_def_for_operand (init_val, stmt);
4125     }
4126 
4127   switch (code)
4128     {
4129       case WIDEN_SUM_EXPR:
4130       case DOT_PROD_EXPR:
4131       case SAD_EXPR:
4132       case PLUS_EXPR:
4133       case MINUS_EXPR:
4134       case BIT_IOR_EXPR:
4135       case BIT_XOR_EXPR:
4136       case MULT_EXPR:
4137       case BIT_AND_EXPR:
4138         /* ADJUSTMENT_DEF is NULL when called from
4139            vect_create_epilog_for_reduction to vectorize double reduction.  */
4140         if (adjustment_def)
4141 	  *adjustment_def = init_val;
4142 
4143         if (code == MULT_EXPR)
4144           {
4145             real_init_val = dconst1;
4146             int_init_val = 1;
4147           }
4148 
4149         if (code == BIT_AND_EXPR)
4150           int_init_val = -1;
4151 
4152         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4153           def_for_init = build_real (scalar_type, real_init_val);
4154         else
4155           def_for_init = build_int_cst (scalar_type, int_init_val);
4156 
4157         /* Fill all elements except the first with '0' or '1'.  */
4158 	elts = XALLOCAVEC (tree, nunits);
4159         for (i = nunits - 2; i >= 0; --i)
4160 	  elts[i + 1] = def_for_init;
4161 
4162         /* Option1: the first element is '0' or '1' as well.  */
4163         if (adjustment_def)
4164           {
4165 	    elts[0] = def_for_init;
4166             init_def = build_vector (vectype, elts);
4167             break;
4168           }
4169 
4170         /* Option2: the first element is INIT_VAL.  */
4171 	elts[0] = init_val;
4172         if (TREE_CONSTANT (init_val))
4173           init_def = build_vector (vectype, elts);
4174         else
4175 	  {
4176 	    vec<constructor_elt, va_gc> *v;
4177 	    vec_alloc (v, nunits);
4178 	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4179 	    for (i = 1; i < nunits; ++i)
4180 	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4181 	    init_def = build_constructor (vectype, v);
4182 	  }
4183 
4184         break;
4185 
4186       case MIN_EXPR:
4187       case MAX_EXPR:
4188       case COND_EXPR:
4189 	if (adjustment_def)
4190           {
4191 	    *adjustment_def = NULL_TREE;
4192 	    if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4193 	      {
4194 		init_def = vect_get_vec_def_for_operand (init_val, stmt);
4195 		break;
4196 	      }
4197 	  }
4198 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4199 	if (! gimple_seq_empty_p (stmts))
4200 	  gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4201 	init_def = build_vector_from_val (vectype, init_val);
4202 	break;
4203 
4204       default:
4205         gcc_unreachable ();
4206     }
4207 
4208   return init_def;
4209 }
4210 
4211 /* Function vect_create_epilog_for_reduction
4212 
4213    Create code at the loop-epilog to finalize the result of a reduction
4214    computation.
4215 
4216    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4217      reduction statements.
4218    STMT is the scalar reduction stmt that is being vectorized.
4219    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4220      number of elements that we can fit in a vectype (nunits).  In this case
4221      we have to generate more than one vector stmt - i.e - we need to "unroll"
4222      the vector stmt by a factor VF/nunits.  For more details see documentation
4223      in vectorizable_operation.
4224    REDUC_CODE is the tree-code for the epilog reduction.
4225    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4226      computation.
4227    REDUC_INDEX is the index of the operand in the right hand side of the
4228      statement that is defined by REDUCTION_PHI.
4229    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4230    SLP_NODE is an SLP node containing a group of reduction statements. The
4231      first one in this group is STMT.
4232    INDUCTION_INDEX is the index of the loop for condition reductions.
4233      Otherwise it is undefined.
4234 
4235    This function:
4236    1. Creates the reduction def-use cycles: sets the arguments for
4237       REDUCTION_PHIS:
4238       The loop-entry argument is the vectorized initial-value of the reduction.
4239       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4240       sums.
4241    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4242       by applying the operation specified by REDUC_CODE if available, or by
4243       other means (whole-vector shifts or a scalar loop).
4244       The function also creates a new phi node at the loop exit to preserve
4245       loop-closed form, as illustrated below.
4246 
4247      The flow at the entry to this function:
4248 
4249         loop:
4250           vec_def = phi <null, null>            # REDUCTION_PHI
4251           VECT_DEF = vector_stmt                # vectorized form of STMT
4252           s_loop = scalar_stmt                  # (scalar) STMT
4253         loop_exit:
4254           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4255           use <s_out0>
4256           use <s_out0>
4257 
4258      The above is transformed by this function into:
4259 
4260         loop:
4261           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4262           VECT_DEF = vector_stmt                # vectorized form of STMT
4263           s_loop = scalar_stmt                  # (scalar) STMT
4264         loop_exit:
4265           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4266           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4267           v_out2 = reduce <v_out1>
4268           s_out3 = extract_field <v_out2, 0>
4269           s_out4 = adjust_result <s_out3>
4270           use <s_out4>
4271           use <s_out4>
4272 */
4273 
4274 static void
4275 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4276 				  int ncopies, enum tree_code reduc_code,
4277 				  vec<gimple *> reduction_phis,
4278                                   int reduc_index, bool double_reduc,
4279 				  slp_tree slp_node, tree induction_index)
4280 {
4281   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4282   stmt_vec_info prev_phi_info;
4283   tree vectype;
4284   machine_mode mode;
4285   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4286   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4287   basic_block exit_bb;
4288   tree scalar_dest;
4289   tree scalar_type;
4290   gimple *new_phi = NULL, *phi;
4291   gimple_stmt_iterator exit_gsi;
4292   tree vec_dest;
4293   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4294   gimple *epilog_stmt = NULL;
4295   enum tree_code code = gimple_assign_rhs_code (stmt);
4296   gimple *exit_phi;
4297   tree bitsize;
4298   tree adjustment_def = NULL;
4299   tree vec_initial_def = NULL;
4300   tree reduction_op, expr, def, initial_def = NULL;
4301   tree orig_name, scalar_result;
4302   imm_use_iterator imm_iter, phi_imm_iter;
4303   use_operand_p use_p, phi_use_p;
4304   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4305   bool nested_in_vect_loop = false;
4306   auto_vec<gimple *> new_phis;
4307   auto_vec<gimple *> inner_phis;
4308   enum vect_def_type dt = vect_unknown_def_type;
4309   int j, i;
4310   auto_vec<tree> scalar_results;
4311   unsigned int group_size = 1, k, ratio;
4312   auto_vec<tree> vec_initial_defs;
4313   auto_vec<gimple *> phis;
4314   bool slp_reduc = false;
4315   tree new_phi_result;
4316   gimple *inner_phi = NULL;
4317 
4318   if (slp_node)
4319     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4320 
4321   if (nested_in_vect_loop_p (loop, stmt))
4322     {
4323       outer_loop = loop;
4324       loop = loop->inner;
4325       nested_in_vect_loop = true;
4326       gcc_assert (!slp_node);
4327     }
4328 
4329   reduction_op = get_reduction_op (stmt, reduc_index);
4330 
4331   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4332   gcc_assert (vectype);
4333   mode = TYPE_MODE (vectype);
4334 
4335   /* 1. Create the reduction def-use cycle:
4336      Set the arguments of REDUCTION_PHIS, i.e., transform
4337 
4338         loop:
4339           vec_def = phi <null, null>            # REDUCTION_PHI
4340           VECT_DEF = vector_stmt                # vectorized form of STMT
4341           ...
4342 
4343      into:
4344 
4345         loop:
4346           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4347           VECT_DEF = vector_stmt                # vectorized form of STMT
4348           ...
4349 
4350      (in case of SLP, do it for all the phis). */
4351 
4352   /* Get the loop-entry arguments.  */
4353   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4354   if (slp_node)
4355     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4356                        NULL, slp_node, reduc_index);
4357   else
4358     {
4359       /* Get at the scalar def before the loop, that defines the initial value
4360 	 of the reduction variable.  */
4361       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4362       initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4363 					   loop_preheader_edge (loop));
4364       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4365       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4366 						       &adjustment_def);
4367       vec_initial_defs.create (1);
4368       vec_initial_defs.quick_push (vec_initial_def);
4369     }
4370 
4371   /* Set phi nodes arguments.  */
4372   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4373     {
4374       tree vec_init_def, def;
4375       gimple_seq stmts;
4376       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4377 					   true, NULL_TREE);
4378       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4379       def = vect_defs[i];
4380       for (j = 0; j < ncopies; j++)
4381         {
4382 	  if (j != 0)
4383 	    {
4384 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4385 	      if (nested_in_vect_loop)
4386 		vec_init_def
4387 		  = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4388 						    vec_init_def);
4389 	    }
4390 
4391 	  /* Set the loop-entry arg of the reduction-phi.  */
4392 
4393 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4394 	      == INTEGER_INDUC_COND_REDUCTION)
4395 	    {
4396 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4397 		 initial values from interfering with the reduction op.  */
4398 	      gcc_assert (ncopies == 1);
4399 	      gcc_assert (i == 0);
4400 
4401 	      tree vec_init_def_type = TREE_TYPE (vec_init_def);
4402 	      tree zero_vec = build_zero_cst (vec_init_def_type);
4403 
4404 	      add_phi_arg (as_a <gphi *> (phi), zero_vec,
4405 			   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4406 	    }
4407 	  else
4408 	    add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4409 			 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4410 
4411           /* Set the loop-latch arg for the reduction-phi.  */
4412           if (j > 0)
4413             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4414 
4415           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4416 		       UNKNOWN_LOCATION);
4417 
4418           if (dump_enabled_p ())
4419             {
4420               dump_printf_loc (MSG_NOTE, vect_location,
4421 			       "transform reduction: created def-use cycle: ");
4422               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4423               dump_printf (MSG_NOTE, "\n");
4424               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4425               dump_printf (MSG_NOTE, "\n");
4426             }
4427         }
4428     }
4429 
4430   /* 2. Create epilog code.
4431         The reduction epilog code operates across the elements of the vector
4432         of partial results computed by the vectorized loop.
4433         The reduction epilog code consists of:
4434 
4435         step 1: compute the scalar result in a vector (v_out2)
4436         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4437         step 3: adjust the scalar result (s_out3) if needed.
4438 
4439         Step 1 can be accomplished using one of the following three schemes:
4440           (scheme 1) using reduc_code, if available.
4441           (scheme 2) using whole-vector shifts, if available.
4442           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4443                      combined.
4444 
4445           The overall epilog code looks like this:
4446 
4447           s_out0 = phi <s_loop>         # original EXIT_PHI
4448           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4449           v_out2 = reduce <v_out1>              # step 1
4450           s_out3 = extract_field <v_out2, 0>    # step 2
4451           s_out4 = adjust_result <s_out3>       # step 3
4452 
4453           (step 3 is optional, and steps 1 and 2 may be combined).
4454           Lastly, the uses of s_out0 are replaced by s_out4.  */
4455 
4456 
4457   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4458          v_out1 = phi <VECT_DEF>
4459          Store them in NEW_PHIS.  */
4460 
4461   exit_bb = single_exit (loop)->dest;
4462   prev_phi_info = NULL;
4463   new_phis.create (vect_defs.length ());
4464   FOR_EACH_VEC_ELT (vect_defs, i, def)
4465     {
4466       for (j = 0; j < ncopies; j++)
4467         {
4468 	  tree new_def = copy_ssa_name (def);
4469           phi = create_phi_node (new_def, exit_bb);
4470           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4471           if (j == 0)
4472             new_phis.quick_push (phi);
4473           else
4474 	    {
4475 	      def = vect_get_vec_def_for_stmt_copy (dt, def);
4476 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4477 	    }
4478 
4479           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4480           prev_phi_info = vinfo_for_stmt (phi);
4481         }
4482     }
4483 
4484   /* The epilogue is created for the outer-loop, i.e., for the loop being
4485      vectorized.  Create exit phis for the outer loop.  */
4486   if (double_reduc)
4487     {
4488       loop = outer_loop;
4489       exit_bb = single_exit (loop)->dest;
4490       inner_phis.create (vect_defs.length ());
4491       FOR_EACH_VEC_ELT (new_phis, i, phi)
4492 	{
4493 	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
4494 	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
4495 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4496 			   PHI_RESULT (phi));
4497 	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4498 							    loop_vinfo));
4499 	  inner_phis.quick_push (phi);
4500 	  new_phis[i] = outer_phi;
4501 	  prev_phi_info = vinfo_for_stmt (outer_phi);
4502           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4503             {
4504 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4505 	      new_result = copy_ssa_name (PHI_RESULT (phi));
4506 	      outer_phi = create_phi_node (new_result, exit_bb);
4507 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4508 			       PHI_RESULT (phi));
4509 	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4510 								loop_vinfo));
4511 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4512 	      prev_phi_info = vinfo_for_stmt (outer_phi);
4513 	    }
4514 	}
4515     }
4516 
4517   exit_gsi = gsi_after_labels (exit_bb);
4518 
4519   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4520          (i.e. when reduc_code is not available) and in the final adjustment
4521 	 code (if needed).  Also get the original scalar reduction variable as
4522          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4523          represents a reduction pattern), the tree-code and scalar-def are
4524          taken from the original stmt that the pattern-stmt (STMT) replaces.
4525          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4526          are taken from STMT.  */
4527 
4528   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4529   if (!orig_stmt)
4530     {
4531       /* Regular reduction  */
4532       orig_stmt = stmt;
4533     }
4534   else
4535     {
4536       /* Reduction pattern  */
4537       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4538       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4539       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4540     }
4541 
4542   code = gimple_assign_rhs_code (orig_stmt);
4543   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4544      partial results are added and not subtracted.  */
4545   if (code == MINUS_EXPR)
4546     code = PLUS_EXPR;
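  /* Illustrative sketch (not from the original sources), assuming a V4SI
     accumulator for the scalar loop "s = init; for (...) s -= a[i];":
     the vectorized body computes v = v - va with v starting as
     {init, 0, 0, 0}, so after the loop lane 0 holds init - (a[0] + a[4] + ...)
     and the remaining lanes hold the negated partial sums of the other
     elements.  Adding the lanes with PLUS in the epilog therefore yields
     init - sum, the desired scalar result.  */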
4547 
4548   scalar_dest = gimple_assign_lhs (orig_stmt);
4549   scalar_type = TREE_TYPE (scalar_dest);
4550   scalar_results.create (group_size);
4551   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4552   bitsize = TYPE_SIZE (scalar_type);
4553 
4554   /* In case this is a reduction in an inner-loop while vectorizing an outer
4555      loop - we don't need to extract a single scalar result at the end of the
4556      inner-loop (unless it is a double reduction, i.e., the reduction result is
4557      used outside the outer-loop).  The final vector of partial results will be used
4558      in the vectorized outer-loop, or reduced to a scalar result at the end of
4559      the outer-loop.  */
4560   if (nested_in_vect_loop && !double_reduc)
4561     goto vect_finalize_reduction;
4562 
4563   /* SLP reduction without reduction chain, e.g.,
4564      # a1 = phi <a2, a0>
4565      # b1 = phi <b2, b0>
4566      a2 = operation (a1)
4567      b2 = operation (b1)  */
4568   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4569 
4570   /* In case of reduction chain, e.g.,
4571      # a1 = phi <a3, a0>
4572      a2 = operation (a1)
4573      a3 = operation (a2),
4574 
4575      we may end up with more than one vector result.  Here we reduce them to
4576      one vector.  */
4577   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4578     {
4579       tree first_vect = PHI_RESULT (new_phis[0]);
4580       tree tmp;
4581       gassign *new_vec_stmt = NULL;
4582 
4583       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4584       for (k = 1; k < new_phis.length (); k++)
4585         {
4586 	  gimple *next_phi = new_phis[k];
4587           tree second_vect = PHI_RESULT (next_phi);
4588 
4589           tmp = build2 (code, vectype,  first_vect, second_vect);
4590           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4591           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4592           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4593           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4594         }
4595 
4596       new_phi_result = first_vect;
4597       if (new_vec_stmt)
4598         {
4599           new_phis.truncate (0);
4600           new_phis.safe_push (new_vec_stmt);
4601         }
4602     }
4603   else
4604     new_phi_result = PHI_RESULT (new_phis[0]);
4605 
4606   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4607     {
4608       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4609 	 various data values where the condition matched and another vector
4610 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
4611 	 need to extract the last matching index (which will be the index with
4612 	 highest value) and use this to index into the data vector.
4613 	 For the case where there were no matches, the data vector will contain
4614 	 all default values and the index vector will be all zeros.  */
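      /* A hypothetical instance of the scheme below: with NEW_PHI_RESULT
         = {d, a2, d, a7} (d being the default value) and INDUCTION_INDEX
         = {0, 3, 0, 8}, the maximum index is 8; comparing against it keeps
         only the last match, giving {0, 0, 0, a7}, which then reduces to
         the scalar a7.  If the condition never matched, INDUCTION_INDEX is
         all zeros, every lane compares equal, and the all-default data
         vector reduces to d.  */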
4615 
4616       /* Get various versions of the type of the vector of indexes.  */
4617       tree index_vec_type = TREE_TYPE (induction_index);
4618       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4619       tree index_scalar_type = TREE_TYPE (index_vec_type);
4620       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4621 	(index_vec_type);
4622 
4623       /* Get an unsigned integer version of the type of the data vector.  */
4624       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4625       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4626       tree vectype_unsigned = build_vector_type
4627 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4628 
4629       /* First we need to create a vector (ZERO_VEC) of zeros and another
4630 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4631 	 can create using a MAX reduction and then expanding.
4632 	 In the case where the loop never made any matches, the max index will
4633 	 be zero.  */
4634 
4635       /* Vector of {0, 0, 0,...}.  */
4636       tree zero_vec = make_ssa_name (vectype);
4637       tree zero_vec_rhs = build_zero_cst (vectype);
4638       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4639       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4640 
4641       /* Find maximum value from the vector of found indexes.  */
4642       tree max_index = make_ssa_name (index_scalar_type);
4643       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4644 						    induction_index);
4645       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4646 
4647       /* Vector of {max_index, max_index, max_index,...}.  */
4648       tree max_index_vec = make_ssa_name (index_vec_type);
4649       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4650 						      max_index);
4651       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4652 							max_index_vec_rhs);
4653       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4654 
4655       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4656 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4657 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4658 	 otherwise.  Only one value should match, resulting in a vector
4659 	 (VEC_COND) with one data value and the rest zeros.
4660 	 In the case where the loop never made any matches, every index will
4661 	 match, resulting in a vector with all data values (which will all be
4662 	 the default value).  */
4663 
4664       /* Compare the max index vector to the vector of found indexes to find
4665 	 the position of the max value.  */
4666       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4667       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4668 						      induction_index,
4669 						      max_index_vec);
4670       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4671 
4672       /* Use the compare to choose either values from the data vector or
4673 	 zero.  */
4674       tree vec_cond = make_ssa_name (vectype);
4675       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4676 						   vec_compare, new_phi_result,
4677 						   zero_vec);
4678       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4679 
4680       /* Finally we need to extract the data value from the vector (VEC_COND)
4681 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4682 	 reduction, but because this doesn't exist, we can use a MAX reduction
4683 	 instead.  The data value might be signed or a float so we need to cast
4684 	 it first.
4685 	 In the case where the loop never made any matches, the data values are
4686 	 all identical, and so will reduce down correctly.  */
4687 
4688       /* Make the matched data values unsigned.  */
4689       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4690       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4691 				       vec_cond);
4692       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4693 							VIEW_CONVERT_EXPR,
4694 							vec_cond_cast_rhs);
4695       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4696 
4697       /* Reduce down to a scalar value.  */
4698       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4699       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4700 				      optab_default);
4701       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4702 		  != CODE_FOR_nothing);
4703       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4704 						     REDUC_MAX_EXPR,
4705 						     vec_cond_cast);
4706       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4707 
4708       /* Convert the reduced value back to the result type and set as the
4709 	 result.  */
4710       tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4711 				     data_reduc);
4712       epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4713       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4714       gimple_assign_set_lhs (epilog_stmt, new_temp);
4715       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4716       scalar_results.safe_push (new_temp);
4717     }
4718 
4719   /* 2.3 Create the reduction code, using one of the three schemes described
4720          above. In SLP we simply need to extract all the elements from the
4721          vector (without reducing them), so we use scalar shifts.  */
4722   else if (reduc_code != ERROR_MARK && !slp_reduc)
4723     {
4724       tree tmp;
4725       tree vec_elem_type;
4726 
4727       /*** Case 1:  Create:
4728            v_out2 = reduc_expr <v_out1>  */
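      /* For instance (an illustrative sketch): with a V4SI vector of partial
         sums v_out1 = {a0, a1, a2, a3}, a REDUC_PLUS_EXPR-style reduc_code
         yields the single value a0 + a1 + a2 + a3 directly; the result is
         then converted to SCALAR_TYPE below if the vector element type
         differs from it.  */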
4729 
4730       if (dump_enabled_p ())
4731         dump_printf_loc (MSG_NOTE, vect_location,
4732 			 "Reduce using direct vector reduction.\n");
4733 
4734       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4735       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4736 	{
4737           tree tmp_dest =
4738 	      vect_create_destination_var (scalar_dest, vec_elem_type);
4739 	  tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4740 	  epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4741 	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4742 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
4743 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4744 
4745 	  tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4746 	}
4747       else
4748 	tmp = build1 (reduc_code, scalar_type, new_phi_result);
4749 
4750       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4751       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4752       gimple_assign_set_lhs (epilog_stmt, new_temp);
4753       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4754 
4755       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4756 	  == INTEGER_INDUC_COND_REDUCTION)
4757 	{
4758 	  /* Earlier we set the initial value to be zero.  Check the result
4759 	     and if it is zero then replace with the original initial
4760 	     value.  */
4761 	  tree zero = build_zero_cst (scalar_type);
4762 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4763 
4764 	  tmp = make_ssa_name (new_scalar_dest);
4765 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4766 					     initial_def, new_temp);
4767 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4768 	  new_temp = tmp;
4769 	}
4770 
4771       scalar_results.safe_push (new_temp);
4772     }
4773   else
4774     {
4775       bool reduce_with_shift = have_whole_vector_shift (mode);
4776       int element_bitsize = tree_to_uhwi (bitsize);
4777       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4778       tree vec_temp;
4779 
4780       /* Regardless of whether we have a whole vector shift, if we're
4781          emulating the operation via tree-vect-generic, we don't want
4782          to use it.  Only the first round of the reduction is likely
4783          to still be profitable via emulation.  */
4784       /* ??? It might be better to emit a reduction tree code here, so that
4785          tree-vect-generic can expand the first round via bit tricks.  */
4786       if (!VECTOR_MODE_P (mode))
4787         reduce_with_shift = false;
4788       else
4789         {
4790           optab optab = optab_for_tree_code (code, vectype, optab_default);
4791           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4792             reduce_with_shift = false;
4793         }
4794 
4795       if (reduce_with_shift && !slp_reduc)
4796         {
4797           int nelements = vec_size_in_bits / element_bitsize;
4798           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4799 
4800           int elt_offset;
4801 
4802           tree zero_vec = build_zero_cst (vectype);
4803           /*** Case 2: Create:
4804              for (offset = nelements/2; offset >= 1; offset/=2)
4805                 {
4806                   Create:  va' = vec_shift <va, offset>
4807                   Create:  va = vop <va, va'>
4808                 }  */
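          /* A concrete sketch of the scheme above (assuming four elements,
             a PLUS reduction, and "." for don't-care lanes):
                 va  = {a0, a1, a2, a3}
                 offset 2:  va' = {a2, a3, 0, 0}
                            va  = {a0+a2, a1+a3, ., .}
                 offset 1:  va' = {a1+a3, ., ., 0}
                            va  = {a0+a1+a2+a3, ., ., .}
             so the element at bit position 0 holds the full reduction and is
             extracted below with BIT_FIELD_REF.  */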
4809 
4810           tree rhs;
4811 
4812           if (dump_enabled_p ())
4813             dump_printf_loc (MSG_NOTE, vect_location,
4814 			     "Reduce using vector shifts\n");
4815 
4816           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4817           new_temp = new_phi_result;
4818           for (elt_offset = nelements / 2;
4819                elt_offset >= 1;
4820                elt_offset /= 2)
4821             {
4822               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4823               tree mask = vect_gen_perm_mask_any (vectype, sel);
4824 	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4825 						 new_temp, zero_vec, mask);
4826               new_name = make_ssa_name (vec_dest, epilog_stmt);
4827               gimple_assign_set_lhs (epilog_stmt, new_name);
4828               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4829 
4830 	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4831 						 new_temp);
4832               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4833               gimple_assign_set_lhs (epilog_stmt, new_temp);
4834               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4835             }
4836 
4837 	  /* 2.4  Extract the final scalar result.  Create:
4838 	     s_out3 = extract_field <v_out2, bitpos>  */
4839 
4840 	  if (dump_enabled_p ())
4841 	    dump_printf_loc (MSG_NOTE, vect_location,
4842 			     "extract scalar result\n");
4843 
4844 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4845 			bitsize, bitsize_zero_node);
4846 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4847 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4848 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
4849 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4850 	  scalar_results.safe_push (new_temp);
4851         }
4852       else
4853         {
4854           /*** Case 3: Create:
4855              s = extract_field <v_out2, 0>
4856              for (offset = element_size;
4857                   offset < vector_size;
4858                   offset += element_size;)
4859                {
4860                  Create:  s' = extract_field <v_out2, offset>
4861                  Create:  s = op <s, s'>  // For non SLP cases
4862                }  */
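          /* A worked sketch (assuming V4SI and a PLUS reduction): with
             v_out2 = {a0, a1, a2, a3} this emits four BIT_FIELD_REF
             extractions at bit offsets 0, 32, 64 and 96 and, in the
             non-SLP case, chains them as s = ((a0 + a1) + a2) + a3.  */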
4863 
4864           if (dump_enabled_p ())
4865             dump_printf_loc (MSG_NOTE, vect_location,
4866 			     "Reduce using scalar code.\n");
4867 
4868           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4869           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4870             {
4871               int bit_offset;
4872               if (gimple_code (new_phi) == GIMPLE_PHI)
4873                 vec_temp = PHI_RESULT (new_phi);
4874               else
4875                 vec_temp = gimple_assign_lhs (new_phi);
4876               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4877                             bitsize_zero_node);
4878               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4879               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4880               gimple_assign_set_lhs (epilog_stmt, new_temp);
4881               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4882 
4883               /* In SLP we don't need to apply the reduction operation, so we
4884                  just collect the s' values in SCALAR_RESULTS.  */
4885               if (slp_reduc)
4886                 scalar_results.safe_push (new_temp);
4887 
4888               for (bit_offset = element_bitsize;
4889                    bit_offset < vec_size_in_bits;
4890                    bit_offset += element_bitsize)
4891                 {
4892                   tree bitpos = bitsize_int (bit_offset);
4893                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4894                                      bitsize, bitpos);
4895 
4896                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4897                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4898                   gimple_assign_set_lhs (epilog_stmt, new_name);
4899                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4900 
4901                   if (slp_reduc)
4902                     {
4903                       /* In SLP we don't need to apply the reduction operation,
4904                          so we just collect the s' values in SCALAR_RESULTS.  */
4905                       new_temp = new_name;
4906                       scalar_results.safe_push (new_name);
4907                     }
4908                   else
4909                     {
4910 		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4911 							 new_name, new_temp);
4912                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4913                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4914                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4915                     }
4916                 }
4917             }
4918 
4919           /* The only case where we need to reduce scalar results in SLP is
4920              unrolling.  If the size of SCALAR_RESULTS is greater than
4921              GROUP_SIZE, we reduce them by combining elements modulo
4922              GROUP_SIZE.  */
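          /* For example (hypothetical sizes): with GROUP_SIZE == 2 and four
             entries {s0, s1, s2, s3} in SCALAR_RESULTS after unrolling, the
             loop below combines s0 with s2 and s1 with s3, leaving the two
             final results in SCALAR_RESULTS[0] and SCALAR_RESULTS[1].  */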
4923           if (slp_reduc)
4924             {
4925               tree res, first_res, new_res;
4926 	      gimple *new_stmt;
4927 
4928               /* Reduce multiple scalar results in case of SLP unrolling.  */
4929               for (j = group_size; scalar_results.iterate (j, &res);
4930                    j++)
4931                 {
4932                   first_res = scalar_results[j % group_size];
4933 		  new_stmt = gimple_build_assign (new_scalar_dest, code,
4934 						  first_res, res);
4935                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4936                   gimple_assign_set_lhs (new_stmt, new_res);
4937                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4938                   scalar_results[j % group_size] = new_res;
4939                 }
4940             }
4941           else
4942             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4943             scalar_results.safe_push (new_temp);
4944         }
4945     }
4946 
4947 vect_finalize_reduction:
4948 
4949   if (double_reduc)
4950     loop = loop->inner;
4951 
4952   /* 2.5 Adjust the final result by the initial value of the reduction
4953 	 variable. (When such adjustment is not needed, then
4954 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
4955 	 new_temp = loop_exit_def + adjustment_def  */
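  /* Illustrative sketch (hypothetical values): for the scalar loop
     "s = 5; for (...) s += a[i];" the vector accumulator may have been
     initialised to {0,...,0} with ADJUSTMENT_DEF holding 5, so the code
     below adds 5 back to the reduced scalar result (or, in the nested-loop
     case, to the vector of partial results).  */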
4956 
4957   if (adjustment_def)
4958     {
4959       gcc_assert (!slp_reduc);
4960       if (nested_in_vect_loop)
4961 	{
4962           new_phi = new_phis[0];
4963 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4964 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4965 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
4966 	}
4967       else
4968 	{
4969           new_temp = scalar_results[0];
4970 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4971 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
4972 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4973 	}
4974 
4975       epilog_stmt = gimple_build_assign (new_dest, expr);
4976       new_temp = make_ssa_name (new_dest, epilog_stmt);
4977       gimple_assign_set_lhs (epilog_stmt, new_temp);
4978       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4979       if (nested_in_vect_loop)
4980         {
4981           set_vinfo_for_stmt (epilog_stmt,
4982                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
4983           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4984                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4985 
4986           if (!double_reduc)
4987             scalar_results.quick_push (new_temp);
4988           else
4989             scalar_results[0] = new_temp;
4990         }
4991       else
4992         scalar_results[0] = new_temp;
4993 
4994       new_phis[0] = epilog_stmt;
4995     }
4996 
4997   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4998           phis with new adjusted scalar results, i.e., replace use <s_out0>
4999           with use <s_out4>.
5000 
5001      Transform:
5002         loop_exit:
5003           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5004           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5005           v_out2 = reduce <v_out1>
5006           s_out3 = extract_field <v_out2, 0>
5007           s_out4 = adjust_result <s_out3>
5008           use <s_out0>
5009           use <s_out0>
5010 
5011      into:
5012 
5013         loop_exit:
5014           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5015           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5016           v_out2 = reduce <v_out1>
5017           s_out3 = extract_field <v_out2, 0>
5018           s_out4 = adjust_result <s_out3>
5019           use <s_out4>
5020           use <s_out4> */
5021 
5022 
5023   /* In an SLP reduction chain we reduce the vector results into one vector if
5024      necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS of
5025      the last stmt in the reduction chain, since we are looking for the loop
5026      exit phi node.  */
5027   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5028     {
5029       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5030       /* Handle reduction patterns.  */
5031       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5032 	dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5033 
5034       scalar_dest = gimple_assign_lhs (dest_stmt);
5035       group_size = 1;
5036     }
5037 
5038   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5039      case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
5040      need to match SCALAR_RESULTS with corresponding statements.  The first
5041      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5042      the first vector stmt, etc.
5043      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
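  /* For instance (hypothetical sizes): with GROUP_SIZE == 4 and two vector
     stmts in NEW_PHIS, RATIO is 2; scalar results 0 and 1 then belong to the
     first vector stmt and results 2 and 3 to the second.  */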
5044   if (group_size > new_phis.length ())
5045     {
5046       ratio = group_size / new_phis.length ();
5047       gcc_assert (!(group_size % new_phis.length ()));
5048     }
5049   else
5050     ratio = 1;
5051 
5052   for (k = 0; k < group_size; k++)
5053     {
5054       if (k % ratio == 0)
5055         {
5056           epilog_stmt = new_phis[k / ratio];
5057           reduction_phi = reduction_phis[k / ratio];
5058 	  if (double_reduc)
5059 	    inner_phi = inner_phis[k / ratio];
5060         }
5061 
5062       if (slp_reduc)
5063         {
5064 	  gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5065 
5066           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5067           /* SLP statements can't participate in patterns.  */
5068           gcc_assert (!orig_stmt);
5069           scalar_dest = gimple_assign_lhs (current_stmt);
5070         }
5071 
5072       phis.create (3);
5073       /* Find the loop-closed-use at the loop exit of the original scalar
5074          result.  (The reduction result is expected to have two immediate uses -
5075          one at the latch block, and one at the loop exit).  */
5076       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5077         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5078 	    && !is_gimple_debug (USE_STMT (use_p)))
5079           phis.safe_push (USE_STMT (use_p));
5080 
5081       /* While we expect to have found an exit_phi because of loop-closed-ssa
5082          form, we can end up without one if the scalar cycle is dead.  */
5083 
5084       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5085         {
5086           if (outer_loop)
5087             {
5088               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5089               gphi *vect_phi;
5090 
5091               /* FORNOW. Currently not supporting the case that an inner-loop
5092                  reduction is not used in the outer-loop (but only outside the
5093                  outer-loop), unless it is double reduction.  */
5094               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5095                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5096                           || double_reduc);
5097 
5098 	      if (double_reduc)
5099 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5100 	      else
5101 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5102               if (!double_reduc
5103                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5104                       != vect_double_reduction_def)
5105                 continue;
5106 
5107               /* Handle double reduction:
5108 
5109                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5110                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5111                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5112                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5113 
5114                  At that point the regular reduction (stmt2 and stmt3) is
5115                  already vectorized, as well as the exit phi node, stmt4.
5116                  Here we vectorize the phi node of double reduction, stmt1, and
5117                  update all relevant statements.  */
5118 
5119               /* Go through all the uses of s2 to find double reduction phi
5120                  node, i.e., stmt1 above.  */
5121               orig_name = PHI_RESULT (exit_phi);
5122               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5123                 {
5124                   stmt_vec_info use_stmt_vinfo;
5125                   stmt_vec_info new_phi_vinfo;
5126                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5127                   basic_block bb = gimple_bb (use_stmt);
5128 		  gimple *use;
5129 
5130                   /* Check that USE_STMT is really double reduction phi
5131                      node.  */
5132                   if (gimple_code (use_stmt) != GIMPLE_PHI
5133                       || gimple_phi_num_args (use_stmt) != 2
5134                       || bb->loop_father != outer_loop)
5135                     continue;
5136                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5137                   if (!use_stmt_vinfo
5138                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5139                           != vect_double_reduction_def)
5140 		    continue;
5141 
5142                   /* Create vector phi node for double reduction:
5143                      vs1 = phi <vs0, vs2>
5144                      vs1 was created previously in this function by a call to
5145                        vect_get_vec_def_for_operand and is stored in
5146                        vec_initial_def;
5147                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5148                      vs0 is created here.  */
5149 
5150                   /* Create vector phi node.  */
5151                   vect_phi = create_phi_node (vec_initial_def, bb);
5152                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5153                                     loop_vec_info_for_loop (outer_loop));
5154                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5155 
5156                   /* Create vs0 - initial def of the double reduction phi.  */
5157                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5158                                              loop_preheader_edge (outer_loop));
5159                   init_def = get_initial_def_for_reduction (stmt,
5160                                                           preheader_arg, NULL);
5161                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5162                                                     vectype, NULL);
5163 
5164                   /* Update phi node arguments with vs0 and vs2.  */
5165                   add_phi_arg (vect_phi, vect_phi_init,
5166                                loop_preheader_edge (outer_loop),
5167                                UNKNOWN_LOCATION);
5168                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5169                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5170                   if (dump_enabled_p ())
5171                     {
5172                       dump_printf_loc (MSG_NOTE, vect_location,
5173 				       "created double reduction phi node: ");
5174                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5175                       dump_printf (MSG_NOTE, "\n");
5176                     }
5177 
5178                   vect_phi_res = PHI_RESULT (vect_phi);
5179 
5180                   /* Replace the use, i.e., set the correct vs1 in the regular
5181                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5182                      loop is redundant.  */
5183                   use = reduction_phi;
5184                   for (j = 0; j < ncopies; j++)
5185                     {
5186                       edge pr_edge = loop_preheader_edge (loop);
5187                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5188                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5189                     }
5190                 }
5191             }
5192         }
5193 
5194       phis.release ();
5195       if (nested_in_vect_loop)
5196         {
5197           if (double_reduc)
5198             loop = outer_loop;
5199           else
5200             continue;
5201         }
5202 
5203       phis.create (3);
5204       /* Find the loop-closed-use at the loop exit of the original scalar
5205          result.  (The reduction result is expected to have two immediate uses,
5206          one at the latch block, and one at the loop exit).  For double
5207          reductions we are looking for exit phis of the outer loop.  */
5208       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5209         {
5210           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5211 	    {
5212 	      if (!is_gimple_debug (USE_STMT (use_p)))
5213 		phis.safe_push (USE_STMT (use_p));
5214 	    }
5215           else
5216             {
5217               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5218                 {
5219                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5220 
5221                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5222                     {
5223                       if (!flow_bb_inside_loop_p (loop,
5224                                              gimple_bb (USE_STMT (phi_use_p)))
5225 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
5226                         phis.safe_push (USE_STMT (phi_use_p));
5227                     }
5228                 }
5229             }
5230         }
5231 
5232       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5233         {
5234           /* Replace the uses:  */
5235           orig_name = PHI_RESULT (exit_phi);
5236           scalar_result = scalar_results[k];
5237           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5238             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5239               SET_USE (use_p, scalar_result);
5240         }
5241 
5242       phis.release ();
5243     }
5244 }
5245 
5246 
5247 /* Function is_nonwrapping_integer_induction.
5248 
5249    Check if STMT (which is part of loop LOOP) is an induction that both
5250    increments and does not cause overflow.  */
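/* An illustrative case (hypothetical, not from a testcase): for the induction
   "unsigned char i = 0; i++" in a loop known to execute at most 300 times,
   the maximum value 0 + 1 * 300 needs 9 bits, which exceeds the 8-bit type,
   so wrapping is possible and the function returns false; with at most 200
   iterations the value fits and it returns true.  */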
5251 
5252 static bool
5253 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5254 {
5255   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5256   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5257   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5258   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5259   widest_int ni, max_loop_value, lhs_max;
5260   bool overflow = false;
5261 
5262   /* Make sure the loop is integer based.  */
5263   if (TREE_CODE (base) != INTEGER_CST
5264       || TREE_CODE (step) != INTEGER_CST)
5265     return false;
5266 
5267   /* Check that the induction increments.  */
5268   if (tree_int_cst_sgn (step) == -1)
5269     return false;
5270 
5271   /* Check that the max size of the loop will not wrap.  */
5272 
5273   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5274     return true;
5275 
5276   if (! max_stmt_executions (loop, &ni))
5277     return false;
5278 
5279   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5280 			    &overflow);
5281   if (overflow)
5282     return false;
5283 
5284   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5285 			    TYPE_SIGN (lhs_type), &overflow);
5286   if (overflow)
5287     return false;
5288 
5289   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5290 	  <= TYPE_PRECISION (lhs_type));
5291 }
5292 
5293 /* Function vectorizable_reduction.
5294 
5295    Check if STMT performs a reduction operation that can be vectorized.
5296    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5297    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5298    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5299 
5300    This function also handles reduction idioms (patterns) that have been
5301    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5302    of this form:
5303      X = pattern_expr (arg0, arg1, ..., X)
5304    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5305    sequence that had been detected and replaced by the pattern-stmt (STMT).
5306 
5307    This function also handles reduction of condition expressions, for example:
5308      for (int i = 0; i < N; i++)
5309        if (a[i] < value)
5310 	 last = a[i];
5311    This is handled by vectorising the loop and creating an additional vector
5312    containing the loop indexes for which "a[i] < value" was true.  In the
5313    function epilogue this is reduced to a single max value and then used to
5314    index into the vector of results.
5315 
5316    In some cases of reduction patterns, the type of the reduction variable X is
5317    different than the type of the other arguments of STMT.
5318    In such cases, the vectype that is used when transforming STMT into a vector
5319    stmt is different than the vectype that is used to determine the
5320    vectorization factor, because it consists of a different number of elements
5321    than the actual number of elements that are being operated upon in parallel.
5322 
5323    For example, consider an accumulation of shorts into an int accumulator.
5324    On some targets it's possible to vectorize this pattern operating on 8
5325    shorts at a time (hence, the vectype for purposes of determining the
5326    vectorization factor should be V8HI); on the other hand, the vectype that
5327    is used to create the vector form is actually V4SI (the type of the result).
5328 
5329    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5330    indicates what is the actual level of parallelism (V8HI in the example), so
5331    that the right vectorization factor would be derived.  This vectype
5332    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5333    be used to create the vectorized stmt.  The right vectype for the vectorized
5334    stmt is obtained from the type of the result X:
5335         get_vectype_for_scalar_type (TREE_TYPE (X))
5336 
5337    This means that, contrary to "regular" reductions (or "regular" stmts in
5338    general), the following equation:
5339       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5340    does *NOT* necessarily hold for reduction patterns.  */
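/* As a concrete illustration of the pattern case described above (a sketch,
   not taken from a testcase), the shorts-into-int accumulation would come
   from source like:

       short s[N];
       int sum = 0;
       for (int i = 0; i < N; i++)
         sum += s[i];

   where, on a target with 128-bit vectors, the widening-sum pattern operates
   on eight shorts at a time (V8HI, which determines the vectorization
   factor) while the accumulator vector itself has type V4SI.  */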
5341 
5342 bool
5343 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5344 			gimple **vec_stmt, slp_tree slp_node)
5345 {
5346   tree vec_dest;
5347   tree scalar_dest;
5348   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5349   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5350   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5351   tree vectype_in = NULL_TREE;
5352   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5353   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5354   enum tree_code code, orig_code, epilog_reduc_code;
5355   machine_mode vec_mode;
5356   int op_type;
5357   optab optab, reduc_optab;
5358   tree new_temp = NULL_TREE;
5359   gimple *def_stmt;
5360   enum vect_def_type dt;
5361   gphi *new_phi = NULL;
5362   tree scalar_type;
5363   bool is_simple_use;
5364   gimple *orig_stmt;
5365   stmt_vec_info orig_stmt_info;
5366   tree expr = NULL_TREE;
5367   int i;
5368   int ncopies;
5369   int epilog_copies;
5370   stmt_vec_info prev_stmt_info, prev_phi_info;
5371   bool single_defuse_cycle = false;
5372   tree reduc_def = NULL_TREE;
5373   gimple *new_stmt = NULL;
5374   int j;
5375   tree ops[3];
5376   bool nested_cycle = false, found_nested_cycle_def = false;
5377   gimple *reduc_def_stmt = NULL;
5378   bool double_reduc = false, dummy;
5379   basic_block def_bb;
5380   struct loop * def_stmt_loop, *outer_loop = NULL;
5381   tree def_arg;
5382   gimple *def_arg_stmt;
5383   auto_vec<tree> vec_oprnds0;
5384   auto_vec<tree> vec_oprnds1;
5385   auto_vec<tree> vect_defs;
5386   auto_vec<gimple *> phis;
5387   int vec_num;
5388   tree def0, def1, tem, op0, op1 = NULL_TREE;
5389   bool first_p = true;
5390   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5391   gimple *cond_expr_induction_def_stmt = NULL;
5392 
5393   /* In case of reduction chain we switch to the first stmt in the chain, but
5394      we don't update STMT_INFO, since only the last stmt is marked as reduction
5395      and has reduction properties.  */
5396   if (GROUP_FIRST_ELEMENT (stmt_info)
5397       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5398     {
5399       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5400       first_p = false;
5401     }
5402 
5403   if (nested_in_vect_loop_p (loop, stmt))
5404     {
5405       outer_loop = loop;
5406       loop = loop->inner;
5407       nested_cycle = true;
5408     }
5409 
5410   /* 1. Is vectorizable reduction?  */
5411   /* Not supportable if the reduction variable is used in the loop, unless
5412      it's a reduction chain.  */
5413   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5414       && !GROUP_FIRST_ELEMENT (stmt_info))
5415     return false;
5416 
5417   /* Reductions that are not used even in an enclosing outer-loop
5418      are expected to be "live" (used out of the loop).  */
5419   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5420       && !STMT_VINFO_LIVE_P (stmt_info))
5421     return false;
5422 
5423   /* Make sure it was already recognized as a reduction computation.  */
5424   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5425       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5426     return false;
5427 
5428   /* 2. Has this been recognized as a reduction pattern?
5429 
5430      Check if STMT represents a pattern that has been recognized
5431      in earlier analysis stages.  For stmts that represent a pattern,
5432      the STMT_VINFO_RELATED_STMT field records the last stmt in
5433      the original sequence that constitutes the pattern.  */
5434 
5435   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5436   if (orig_stmt)
5437     {
5438       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5439       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5440       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5441     }
5442 
5443   /* 3. Check the operands of the operation.  The first operands are defined
5444         inside the loop body. The last operand is the reduction variable,
5445         which is defined by the loop-header-phi.  */
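  /* E.g. (an illustrative sketch), for "sum += a[i]" the relevant stmts are:

         sum_1 = PHI <init_0 (preheader), sum_3 (latch)>
         ...
         sum_3 = _2 + sum_1;

     where the last operand sum_1 is the reduction variable defined by the
     loop-header phi and _2 (the loaded value) is defined inside the loop
     body.  */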
5446 
5447   gcc_assert (is_gimple_assign (stmt));
5448 
5449   /* Flatten RHS.  */
5450   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5451     {
5452     case GIMPLE_SINGLE_RHS:
5453       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5454       if (op_type == ternary_op)
5455 	{
5456 	  tree rhs = gimple_assign_rhs1 (stmt);
5457 	  ops[0] = TREE_OPERAND (rhs, 0);
5458 	  ops[1] = TREE_OPERAND (rhs, 1);
5459 	  ops[2] = TREE_OPERAND (rhs, 2);
5460 	  code = TREE_CODE (rhs);
5461 	}
5462       else
5463 	return false;
5464       break;
5465 
5466     case GIMPLE_BINARY_RHS:
5467       code = gimple_assign_rhs_code (stmt);
5468       op_type = TREE_CODE_LENGTH (code);
5469       gcc_assert (op_type == binary_op);
5470       ops[0] = gimple_assign_rhs1 (stmt);
5471       ops[1] = gimple_assign_rhs2 (stmt);
5472       break;
5473 
5474     case GIMPLE_TERNARY_RHS:
5475       code = gimple_assign_rhs_code (stmt);
5476       op_type = TREE_CODE_LENGTH (code);
5477       gcc_assert (op_type == ternary_op);
5478       ops[0] = gimple_assign_rhs1 (stmt);
5479       ops[1] = gimple_assign_rhs2 (stmt);
5480       ops[2] = gimple_assign_rhs3 (stmt);
5481       break;
5482 
5483     case GIMPLE_UNARY_RHS:
5484       return false;
5485 
5486     default:
5487       gcc_unreachable ();
5488     }
5489   /* The default is that the reduction variable is the last operand in the statement.  */
5490   int reduc_index = op_type - 1;
5491   if (code == MINUS_EXPR)
5492     reduc_index = 0;
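  /* For instance (a sketch): "s -= a[i]" becomes "s_3 = s_1 - _2", where the
     reduction variable s_1 is the first operand, hence reduc_index 0.  */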
5493 
5494   if (code == COND_EXPR && slp_node)
5495     return false;
5496 
5497   scalar_dest = gimple_assign_lhs (stmt);
5498   scalar_type = TREE_TYPE (scalar_dest);
5499   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5500       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5501     return false;
5502 
5503   /* Do not try to vectorize bit-precision reductions.  */
5504   if ((TYPE_PRECISION (scalar_type)
5505        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5506     return false;
5507 
5508   /* All uses but the last are expected to be defined in the loop.
5509      The last use is the reduction variable.  In case of nested cycle this
5510      assumption is not true: we use reduc_index to record the index of the
5511      reduction variable.  */
5512   for (i = 0; i < op_type; i++)
5513     {
5514       if (i == reduc_index)
5515 	continue;
5516 
5517       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5518       if (i == 0 && code == COND_EXPR)
5519         continue;
5520 
5521       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5522 					  &def_stmt, &dt, &tem);
5523       if (!vectype_in)
5524 	vectype_in = tem;
5525       gcc_assert (is_simple_use);
5526 
5527       if (dt != vect_internal_def
5528 	  && dt != vect_external_def
5529 	  && dt != vect_constant_def
5530 	  && dt != vect_induction_def
5531           && !(dt == vect_nested_cycle && nested_cycle))
5532 	return false;
5533 
5534       if (dt == vect_nested_cycle)
5535         {
5536           found_nested_cycle_def = true;
5537           reduc_def_stmt = def_stmt;
5538           reduc_index = i;
5539         }
5540 
5541       if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5542 	cond_expr_induction_def_stmt = def_stmt;
5543     }
5544 
5545   is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5546 				      &def_stmt, &dt, &tem);
5547   if (!vectype_in)
5548     vectype_in = tem;
5549   gcc_assert (is_simple_use);
5550   if (!found_nested_cycle_def)
5551     reduc_def_stmt = def_stmt;
5552 
5553   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5554     return false;
5555 
5556   if (!(dt == vect_reduction_def
5557 	|| dt == vect_nested_cycle
5558 	|| ((dt == vect_internal_def || dt == vect_external_def
5559 	     || dt == vect_constant_def || dt == vect_induction_def)
5560 	    && nested_cycle && found_nested_cycle_def)))
5561     {
5562       /* For pattern recognized stmts, orig_stmt might be a reduction,
5563 	 but some helper statements for the pattern might not, or
5564 	 might be COND_EXPRs with reduction uses in the condition.  */
5565       gcc_assert (orig_stmt);
5566       return false;
5567     }
5568 
5569   enum vect_reduction_type v_reduc_type;
5570   gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5571 					  !nested_cycle, &dummy, false,
5572 					  &v_reduc_type);
5573 
5574   /* If we have a condition reduction, see if we can simplify it further.  */
5575   if (v_reduc_type == COND_REDUCTION
5576       && cond_expr_induction_def_stmt != NULL
5577       && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5578     {
5579       if (dump_enabled_p ())
5580 	dump_printf_loc (MSG_NOTE, vect_location,
5581 			 "condition expression based on integer induction.\n");
5582       STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5583     }
5584   else
5585    STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5586 
5587   if (orig_stmt)
5588     gcc_assert (tmp == orig_stmt
5589 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5590   else
5591     /* We changed STMT to be the first stmt in the reduction chain, hence
5592        we check that in this case the first element in the chain is STMT.  */
5593     gcc_assert (stmt == tmp
5594 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5595 
5596   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5597     return false;
5598 
5599   if (slp_node || PURE_SLP_STMT (stmt_info))
5600     ncopies = 1;
5601   else
5602     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5603                / TYPE_VECTOR_SUBPARTS (vectype_in));
5604 
5605   gcc_assert (ncopies >= 1);
5606 
5607   vec_mode = TYPE_MODE (vectype_in);
5608 
5609   if (code == COND_EXPR)
5610     {
5611       /* Only call during the analysis stage, otherwise we'll lose
5612 	 STMT_VINFO_TYPE.  */
5613       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5614 						ops[reduc_index], 0, NULL))
5615         {
5616           if (dump_enabled_p ())
5617 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5618 			     "unsupported condition in reduction\n");
5619 	  return false;
5620         }
5621     }
5622   else
5623     {
5624       /* 4. Supportable by target?  */
5625 
5626       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5627 	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
5628 	{
5629 	  /* Shifts and rotates are only supported by vectorizable_shift,
5630 	     not vectorizable_reduction.  */
5631           if (dump_enabled_p ())
5632 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5633 			     "unsupported shift or rotation.\n");
5634 	  return false;
5635 	}
5636 
5637       /* 4.1. check support for the operation in the loop  */
5638       optab = optab_for_tree_code (code, vectype_in, optab_default);
5639       if (!optab)
5640         {
5641           if (dump_enabled_p ())
5642 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643 			     "no optab.\n");
5644 
5645           return false;
5646         }
5647 
5648       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5649         {
5650           if (dump_enabled_p ())
5651             dump_printf (MSG_NOTE, "op not supported by target.\n");
5652 
5653           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5654               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5655 	          < vect_min_worthwhile_factor (code))
5656             return false;
5657 
5658           if (dump_enabled_p ())
5659   	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5660         }
5661 
5662       /* Worthwhile without SIMD support?  */
5663       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5664           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5665    	     < vect_min_worthwhile_factor (code))
5666         {
5667           if (dump_enabled_p ())
5668 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 			     "not worthwhile without SIMD support.\n");
5670 
5671           return false;
5672         }
5673     }
5674 
5675   /* 4.2. Check support for the epilog operation.
5676 
5677           If STMT represents a reduction pattern, then the type of the
5678           reduction variable may be different than the type of the rest
5679           of the arguments.  For example, consider the case of accumulation
5680           of shorts into an int accumulator; The original code:
5681                         S1: int_a = (int) short_a;
5682           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5683 
5684           was replaced with:
5685                         STMT: int_acc = widen_sum <short_a, int_acc>
5686 
5687           This means that:
5688           1. The tree-code that is used to create the vector operation in the
5689              epilog code (that reduces the partial results) is not the
5690              tree-code of STMT, but is rather the tree-code of the original
5691              stmt from the pattern that STMT is replacing.  I.e, in the example
5692              above we want to use 'widen_sum' in the loop, but 'plus' in the
5693              epilog.
5694           2. The type (mode) we use to check available target support
5695              for the vector operation to be created in the *epilog*, is
5696              determined by the type of the reduction variable (in the example
5697              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5698              However the type (mode) we use to check available target support
5699              for the vector operation to be created *inside the loop*, is
5700              determined by the type of the other arguments to STMT (in the
5701              example we'd check this: optab_handler (widen_sum_optab,
5702 	     vect_short_mode)).
5703 
5704           This is contrary to "regular" reductions, in which the types of all
5705           the arguments are the same as the type of the reduction variable.
5706           For "regular" reductions we can therefore use the same vector type
5707           (and also the same tree-code) when generating the epilog code and
5708           when generating the code inside the loop.  */
5709 
5710   if (orig_stmt)
5711     {
5712       /* This is a reduction pattern: get the vectype from the type of the
5713          reduction variable, and get the tree-code from orig_stmt.  */
5714       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5715 		  == TREE_CODE_REDUCTION);
5716       orig_code = gimple_assign_rhs_code (orig_stmt);
5717       gcc_assert (vectype_out);
5718       vec_mode = TYPE_MODE (vectype_out);
5719     }
5720   else
5721     {
5722       /* Regular reduction: the same vectype and tree-code as used for the
5723          vector code inside the loop can also be used for the epilog code.  */
5724       orig_code = code;
5725 
5726       if (code == MINUS_EXPR)
5727 	orig_code = PLUS_EXPR;
5728 
5729       /* For simple condition reductions, replace with the actual expression
5730 	 we want to base our reduction around.  */
5731       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5732 	  == INTEGER_INDUC_COND_REDUCTION)
5733 	orig_code = MAX_EXPR;
5734     }
5735 
5736   if (nested_cycle)
5737     {
5738       def_bb = gimple_bb (reduc_def_stmt);
5739       def_stmt_loop = def_bb->loop_father;
5740       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5741                                        loop_preheader_edge (def_stmt_loop));
5742       if (TREE_CODE (def_arg) == SSA_NAME
5743           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5744           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5745           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5746           && vinfo_for_stmt (def_arg_stmt)
5747           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5748               == vect_double_reduction_def)
5749         double_reduc = true;
5750     }
5751 
5752   epilog_reduc_code = ERROR_MARK;
5753 
5754   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5755       || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5756 		== INTEGER_INDUC_COND_REDUCTION)
5757     {
5758       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5759 	{
5760 	  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5761                                          optab_default);
5762 	  if (!reduc_optab)
5763 	    {
5764 	      if (dump_enabled_p ())
5765 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 				 "no optab for reduction.\n");
5767 
5768 	      epilog_reduc_code = ERROR_MARK;
5769 	    }
5770 	  else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5771 	    {
5772 	      if (dump_enabled_p ())
5773 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 				 "reduc op not supported by target.\n");
5775 
5776 	      epilog_reduc_code = ERROR_MARK;
5777 	    }
5778 
5779 	  /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5780 	     generated in the epilog using multiple expressions.  This does not
5781 	     work for condition reductions.  */
5782 	  if (epilog_reduc_code == ERROR_MARK
5783 	      && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5784 			== INTEGER_INDUC_COND_REDUCTION)
5785 	    {
5786 	      if (dump_enabled_p ())
5787 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 				 "no reduc code for scalar code.\n");
5789 	      return false;
5790 	    }
5791 	}
5792       else
5793 	{
5794 	  if (!nested_cycle || double_reduc)
5795 	    {
5796 	      if (dump_enabled_p ())
5797 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5798 				 "no reduc code for scalar code.\n");
5799 
5800 	      return false;
5801 	    }
5802 	}
5803     }
5804   else
5805     {
5806       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5807       cr_index_scalar_type = make_unsigned_type (scalar_precision);
5808       cr_index_vector_type = build_vector_type
5809 	(cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5810 
5811       epilog_reduc_code = REDUC_MAX_EXPR;
5812       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5813 				   optab_default);
5814       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5815 	  == CODE_FOR_nothing)
5816 	{
5817 	  if (dump_enabled_p ())
5818 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5819 			     "reduc max op not supported by target.\n");
5820 	  return false;
5821 	}
5822     }
5823 
5824   if ((double_reduc
5825        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5826        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5827 		== INTEGER_INDUC_COND_REDUCTION)
5828       && ncopies > 1)
5829     {
5830       if (dump_enabled_p ())
5831 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5832 			 "multiple types in double reduction or condition "
5833 			 "reduction.\n");
5834       return false;
5835     }
5836 
5837   /* In case of widening multiplication by a constant, we update the type
5838      of the constant to be the type of the other operand.  We check that the
5839      constant fits the type in the pattern recognition pass.  */
5840   if (code == DOT_PROD_EXPR
5841       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5842     {
5843       if (TREE_CODE (ops[0]) == INTEGER_CST)
5844         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5845       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5846         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5847       else
5848         {
5849           if (dump_enabled_p ())
5850 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5851 			     "invalid types in dot-prod\n");
5852 
5853           return false;
5854         }
5855     }
5856 
5857   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5858     {
5859       widest_int ni;
5860 
5861       if (! max_loop_iterations (loop, &ni))
5862 	{
5863 	  if (dump_enabled_p ())
5864 	    dump_printf_loc (MSG_NOTE, vect_location,
5865 			     "loop count not known, cannot create cond "
5866 			     "reduction.\n");
5867 	  return false;
5868 	}
5869       /* Convert backedges to iterations.  */
5870       ni += 1;
5871 
5872       /* The additional index will be the same type as the condition.  Check
5873 	 that the loop count fits into this type less one (we use up the
5874 	 zero slot for when there are no matches).  */
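      /* E.g. (a sketch) with an 8-bit unsigned index type at most 254
	 iterations can be handled: 255 is the maximum index value and 0 is
	 reserved for "no match".  */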
5875       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5876       if (wi::geu_p (ni, wi::to_widest (max_index)))
5877 	{
5878 	  if (dump_enabled_p ())
5879 	    dump_printf_loc (MSG_NOTE, vect_location,
5880 			     "loop size is greater than data size.\n");
5881 	  return false;
5882 	}
5883     }
5884 
5885   if (!vec_stmt) /* transformation not required.  */
5886     {
5887       if (first_p
5888 	  && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5889 					 reduc_index))
5890         return false;
5891       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5892       return true;
5893     }
5894 
5895   /** Transform.  **/
5896 
5897   if (dump_enabled_p ())
5898     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5899 
5900   /* FORNOW: Multiple types are not supported for condition.  */
5901   if (code == COND_EXPR)
5902     gcc_assert (ncopies == 1);
5903 
5904   /* Create the destination vector  */
5905   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5906 
5907   /* In case the vectorization factor (VF) is bigger than the number
5908      of elements that we can fit in a vectype (nunits), we have to generate
5909      more than one vector stmt - i.e. - we need to "unroll" the
5910      vector stmt by a factor VF/nunits.  For more details see documentation
5911      in vectorizable_operation.  */
5912 
5913   /* If the reduction is used in an outer loop we need to generate
5914      VF intermediate results, like so (e.g. for ncopies=2):
5915 	r0 = phi (init, r0)
5916 	r1 = phi (init, r1)
5917 	r0 = x0 + r0;
5918         r1 = x1 + r1;
5919     (i.e. we generate VF results in 2 registers).
5920     In this case we have a separate def-use cycle for each copy, and therefore
5921     for each copy we get the vector def for the reduction variable from the
5922     respective phi node created for this copy.
5923 
5924     Otherwise (the reduction is unused in the loop nest), we can combine
5925     together intermediate results, like so (e.g. for ncopies=2):
5926 	r = phi (init, r)
5927 	r = x0 + r;
5928 	r = x1 + r;
5929    (i.e. we generate VF/2 results in a single register).
5930    In this case for each copy we get the vector def for the reduction variable
5931    from the vectorized reduction operation generated in the previous iteration.
5932   */
5933 
5934   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5935     {
5936       single_defuse_cycle = true;
5937       epilog_copies = 1;
5938     }
5939   else
5940     epilog_copies = ncopies;
5941 
5942   prev_stmt_info = NULL;
5943   prev_phi_info = NULL;
5944   if (slp_node)
5945     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5946   else
5947     {
5948       vec_num = 1;
5949       vec_oprnds0.create (1);
5950       if (op_type == ternary_op)
5951         vec_oprnds1.create (1);
5952     }
5953 
5954   phis.create (vec_num);
5955   vect_defs.create (vec_num);
5956   if (!slp_node)
5957     vect_defs.quick_push (NULL_TREE);
5958 
5959   for (j = 0; j < ncopies; j++)
5960     {
5961       if (j == 0 || !single_defuse_cycle)
5962 	{
5963           for (i = 0; i < vec_num; i++)
5964             {
5965               /* Create the reduction-phi that defines the reduction
5966                  operand.  */
5967               new_phi = create_phi_node (vec_dest, loop->header);
5968               set_vinfo_for_stmt (new_phi,
5969                                   new_stmt_vec_info (new_phi, loop_vinfo));
5970                if (j == 0 || slp_node)
5971                  phis.quick_push (new_phi);
5972             }
5973         }
5974 
5975       if (code == COND_EXPR)
5976         {
5977           gcc_assert (!slp_node);
5978           vectorizable_condition (stmt, gsi, vec_stmt,
5979                                   PHI_RESULT (phis[0]),
5980                                   reduc_index, NULL);
5981           /* Multiple types are not supported for condition.  */
5982           break;
5983         }
5984 
5985       /* Handle uses.  */
5986       if (j == 0)
5987         {
5988           op0 = ops[!reduc_index];
5989           if (op_type == ternary_op)
5990             {
5991               if (reduc_index == 0)
5992                 op1 = ops[2];
5993               else
5994                 op1 = ops[1];
5995             }
5996 
5997           if (slp_node)
5998             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5999                                slp_node, -1);
6000           else
6001             {
6002               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6003                                                             stmt);
6004               vec_oprnds0.quick_push (loop_vec_def0);
6005               if (op_type == ternary_op)
6006                {
6007                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6008                  vec_oprnds1.quick_push (loop_vec_def1);
6009                }
6010             }
6011         }
6012       else
6013         {
6014           if (!slp_node)
6015             {
6016               enum vect_def_type dt;
6017 	      gimple *dummy_stmt;
6018 
6019               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6020                                   &dummy_stmt, &dt);
6021               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6022                                                               loop_vec_def0);
6023               vec_oprnds0[0] = loop_vec_def0;
6024               if (op_type == ternary_op)
6025                 {
6026                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6027                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6028                                                                 loop_vec_def1);
6029                   vec_oprnds1[0] = loop_vec_def1;
6030                 }
6031             }
6032 
6033           if (single_defuse_cycle)
6034             reduc_def = gimple_assign_lhs (new_stmt);
6035 
6036           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6037         }
6038 
6039       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6040         {
6041           if (slp_node)
6042             reduc_def = PHI_RESULT (phis[i]);
6043           else
6044             {
6045               if (!single_defuse_cycle || j == 0)
6046                 reduc_def = PHI_RESULT (new_phi);
6047             }
6048 
6049           def1 = ((op_type == ternary_op)
6050                   ? vec_oprnds1[i] : NULL);
6051           if (op_type == binary_op)
6052             {
6053               if (reduc_index == 0)
6054                 expr = build2 (code, vectype_out, reduc_def, def0);
6055               else
6056                 expr = build2 (code, vectype_out, def0, reduc_def);
6057             }
6058           else
6059             {
6060               if (reduc_index == 0)
6061                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6062               else
6063                 {
6064                   if (reduc_index == 1)
6065                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
6066                   else
6067                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
6068                 }
6069             }
6070 
6071           new_stmt = gimple_build_assign (vec_dest, expr);
6072           new_temp = make_ssa_name (vec_dest, new_stmt);
6073           gimple_assign_set_lhs (new_stmt, new_temp);
6074           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6075 
6076           if (slp_node)
6077             {
6078               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6079               vect_defs.quick_push (new_temp);
6080             }
6081           else
6082             vect_defs[0] = new_temp;
6083         }
6084 
6085       if (slp_node)
6086         continue;
6087 
6088       if (j == 0)
6089 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6090       else
6091 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6092 
6093       prev_stmt_info = vinfo_for_stmt (new_stmt);
6094       prev_phi_info = vinfo_for_stmt (new_phi);
6095     }
6096 
6097   tree indx_before_incr, indx_after_incr, cond_name = NULL;
6098 
6099   /* Finalize the reduction-phi (set its arguments) and create the
6100      epilog reduction code.  */
6101   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6102     {
6103       new_temp = gimple_assign_lhs (*vec_stmt);
6104       vect_defs[0] = new_temp;
6105 
6106       /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6107 	 which is updated with the current index of the loop for every match of
6108 	 the original loop's cond_expr (VEC_STMT).  This results in a vector
6109 	 containing the last time the condition passed for that vector lane.
6110 	 The first match will be a 1 to allow 0 to be used for non-matching
6111 	 indexes.  If there are no matches at all then the vector will be all
6112 	 zeroes.  */
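      /* A sketch with 4 lanes: the index induction starts at {1,2,3,4} and
	 steps by 4 each iteration.  If the condition matches in lanes 1 and 3
	 on the first iteration the phi becomes {0,2,0,4}; a match in lane 1
	 on the second iteration (induction value {5,6,7,8}) updates it to
	 {0,6,0,4}; lanes that never match stay 0.  */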
6113       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6114 	{
6115 	  int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6116 	  int k;
6117 
6118 	  gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6119 
6120 	  /* First we create a simple vector induction variable which starts
6121 	     with the values {1,2,3,...} (SERIES_VECT) and increments by the
6122 	     vector size (STEP).  */
6123 
6124 	  /* Create a {1,2,3,...} vector.  */
6125 	  tree *vtemp = XALLOCAVEC (tree, nunits_out);
6126 	  for (k = 0; k < nunits_out; ++k)
6127 	    vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6128 	  tree series_vect = build_vector (cr_index_vector_type, vtemp);
6129 
6130 	  /* Create a vector of the step value.  */
6131 	  tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6132 	  tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6133 
6134 	  /* Create an induction variable.  */
6135 	  gimple_stmt_iterator incr_gsi;
6136 	  bool insert_after;
6137 	  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6138 	  create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6139 		     insert_after, &indx_before_incr, &indx_after_incr);
6140 
6141 	  /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6142 	     filled with zeros (VEC_ZERO).  */
6143 
6144 	  /* Create a vector of 0s.  */
6145 	  tree zero = build_zero_cst (cr_index_scalar_type);
6146 	  tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6147 
6148 	  /* Create a vector phi node.  */
6149 	  tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6150 	  new_phi = create_phi_node (new_phi_tree, loop->header);
6151 	  set_vinfo_for_stmt (new_phi,
6152 			      new_stmt_vec_info (new_phi, loop_vinfo));
6153 	  add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6154 		       UNKNOWN_LOCATION);
6155 
6156 	  /* Now take the condition from the loop's original cond_expr
6157 	     (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6158 	     every match uses values from the induction variable
6159 	     (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6160 	     (NEW_PHI_TREE).
6161 	     Finally, we update the phi (NEW_PHI_TREE) to take the value of
6162 	     the new cond_expr (INDEX_COND_EXPR).  */
6163 
6164 	  /* Turn the condition from vec_stmt into an ssa name.  */
6165 	  gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6166 	  tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6167 	  tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6168 	  gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6169 						       ccompare);
6170 	  gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6171 	  gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6172 	  update_stmt (*vec_stmt);
6173 
6174 	  /* Create a conditional, where the condition is taken from vec_stmt
6175 	     (CCOMPARE_NAME), the then-value is the induction index
6176 	     (INDEX_BEFORE_INCR) and the else-value is the phi (NEW_PHI_TREE).  */
6177 	  tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6178 					 ccompare_name, indx_before_incr,
6179 					 new_phi_tree);
6180 	  cond_name = make_ssa_name (cr_index_vector_type);
6181 	  gimple *index_condition = gimple_build_assign (cond_name,
6182 							 index_cond_expr);
6183 	  gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6184 	  stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6185 							    loop_vinfo);
6186 	  STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6187 	  set_vinfo_for_stmt (index_condition, index_vec_info);
6188 
6189 	  /* Update the phi with the vec cond.  */
6190 	  add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6191 		       UNKNOWN_LOCATION);
6192 	}
6193     }
6194 
6195   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6196                                     epilog_reduc_code, phis, reduc_index,
6197 				    double_reduc, slp_node, cond_name);
6198 
6199   return true;
6200 }
6201 
6202 /* Function vect_min_worthwhile_factor.
6203 
6204    For a loop where we could vectorize the operation indicated by CODE,
6205    return the minimum vectorization factor that makes it worthwhile
6206    to use generic vectors.  */
6207 int
6208 vect_min_worthwhile_factor (enum tree_code code)
6209 {
6210   switch (code)
6211     {
6212     case PLUS_EXPR:
6213     case MINUS_EXPR:
6214     case NEGATE_EXPR:
6215       return 4;
6216 
6217     case BIT_AND_EXPR:
6218     case BIT_IOR_EXPR:
6219     case BIT_XOR_EXPR:
6220     case BIT_NOT_EXPR:
6221       return 2;
6222 
6223     default:
6224       return INT_MAX;
6225     }
6226 }
6227 
6228 
6229 /* Function vectorizable_induction
6230 
6231    Check if PHI performs an induction computation that can be vectorized.
6232    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6233    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6234    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6235 
6236 bool
6237 vectorizable_induction (gimple *phi,
6238 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6239 			gimple **vec_stmt)
6240 {
6241   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6242   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6243   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6244   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6245   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6246   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6247   tree vec_def;
6248 
6249   gcc_assert (ncopies >= 1);
6250   /* FORNOW. These restrictions should be relaxed.  */
6251   if (nested_in_vect_loop_p (loop, phi))
6252     {
6253       imm_use_iterator imm_iter;
6254       use_operand_p use_p;
6255       gimple *exit_phi;
6256       edge latch_e;
6257       tree loop_arg;
6258 
6259       if (ncopies > 1)
6260 	{
6261 	  if (dump_enabled_p ())
6262 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 			     "multiple types in nested loop.\n");
6264 	  return false;
6265 	}
6266 
6267       exit_phi = NULL;
6268       latch_e = loop_latch_edge (loop->inner);
6269       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6270       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6271 	{
6272 	  gimple *use_stmt = USE_STMT (use_p);
6273 	  if (is_gimple_debug (use_stmt))
6274 	    continue;
6275 
6276 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6277 	    {
6278 	      exit_phi = use_stmt;
6279 	      break;
6280 	    }
6281 	}
6282       if (exit_phi)
6283 	{
6284 	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6285 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6286 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6287 	    {
6288 	      if (dump_enabled_p ())
6289 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6290 				 "inner-loop induction only used outside "
6291 				 "of the outer vectorized loop.\n");
6292 	      return false;
6293 	    }
6294 	}
6295     }
6296 
6297   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6298     return false;
6299 
6300   /* FORNOW: SLP not supported.  */
6301   if (STMT_SLP_TYPE (stmt_info))
6302     return false;
6303 
6304   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6305 
6306   if (gimple_code (phi) != GIMPLE_PHI)
6307     return false;
6308 
6309   if (!vec_stmt) /* transformation not required.  */
6310     {
6311       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6312       if (dump_enabled_p ())
6313         dump_printf_loc (MSG_NOTE, vect_location,
6314                          "=== vectorizable_induction ===\n");
6315       vect_model_induction_cost (stmt_info, ncopies);
6316       return true;
6317     }
6318 
6319   /** Transform.  **/
6320 
6321   if (dump_enabled_p ())
6322     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6323 
6324   vec_def = get_initial_def_for_induction (phi);
6325   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6326   return true;
6327 }
6328 
6329 /* Function vectorizable_live_operation.
6330 
6331    STMT computes a value that is used outside the loop.  Check if
6332    it can be supported.  */
6333 
6334 bool
6335 vectorizable_live_operation (gimple *stmt,
6336 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6337 			     gimple **vec_stmt)
6338 {
6339   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6340   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6341   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6342   tree op;
6343   gimple *def_stmt;
6344   ssa_op_iter iter;
6345 
6346   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6347 
6348   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6349     return false;
6350 
6351   if (!is_gimple_assign (stmt))
6352     {
6353       if (gimple_call_internal_p (stmt)
6354 	  && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6355 	  && gimple_call_lhs (stmt)
6356 	  && loop->simduid
6357 	  && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6358 	  && loop->simduid
6359 	     == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6360 	{
6361 	  edge e = single_exit (loop);
6362 	  basic_block merge_bb = e->dest;
6363 	  imm_use_iterator imm_iter;
6364 	  use_operand_p use_p;
6365 	  tree lhs = gimple_call_lhs (stmt);
6366 
6367 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6368 	    {
6369 	      gimple *use_stmt = USE_STMT (use_p);
6370 	      if (gimple_code (use_stmt) == GIMPLE_PHI
6371 		  && gimple_bb (use_stmt) == merge_bb)
6372 		{
6373 		  if (vec_stmt)
6374 		    {
6375 		      tree vfm1
6376 			= build_int_cst (unsigned_type_node,
6377 					 loop_vinfo->vectorization_factor - 1);
6378 		      SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6379 		    }
6380 		  return true;
6381 		}
6382 	    }
6383 	}
6384 
6385       return false;
6386     }
6387 
6388   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6389     return false;
6390 
6391   /* FORNOW. CHECKME. */
6392   if (nested_in_vect_loop_p (loop, stmt))
6393     return false;
6394 
6395   /* FORNOW: support only if all uses are invariant.  This means
6396      that the scalar operations can remain in place, unvectorized.
6397      The original last scalar value that they compute will be used.  */
6398   FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6399     {
6400       enum vect_def_type dt = vect_uninitialized_def;
6401 
6402       if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6403         {
6404           if (dump_enabled_p ())
6405 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6406 			     "use not simple.\n");
6407           return false;
6408         }
6409 
6410       if (dt != vect_external_def && dt != vect_constant_def)
6411         return false;
6412     }
6413 
6414   /* No transformation is required for the cases we currently support.  */
6415   return true;
6416 }
6417 
6418 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
6419 
6420 static void
6421 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6422 {
6423   ssa_op_iter op_iter;
6424   imm_use_iterator imm_iter;
6425   def_operand_p def_p;
6426   gimple *ustmt;
6427 
6428   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6429     {
6430       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6431 	{
6432 	  basic_block bb;
6433 
6434 	  if (!is_gimple_debug (ustmt))
6435 	    continue;
6436 
6437 	  bb = gimple_bb (ustmt);
6438 
6439 	  if (!flow_bb_inside_loop_p (loop, bb))
6440 	    {
6441 	      if (gimple_debug_bind_p (ustmt))
6442 		{
6443 		  if (dump_enabled_p ())
6444 		    dump_printf_loc (MSG_NOTE, vect_location,
6445                                      "killing debug use\n");
6446 
6447 		  gimple_debug_bind_reset_value (ustmt);
6448 		  update_stmt (ustmt);
6449 		}
6450 	      else
6451 		gcc_unreachable ();
6452 	    }
6453 	}
6454     }
6455 }
6456 
6457 
6458 /* This function builds ni_name = number of iterations.  Statements
6459    are emitted on the loop preheader edge.  */
6460 
6461 static tree
6462 vect_build_loop_niters (loop_vec_info loop_vinfo)
6463 {
6464   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6465   if (TREE_CODE (ni) == INTEGER_CST)
6466     return ni;
6467   else
6468     {
6469       tree ni_name, var;
6470       gimple_seq stmts = NULL;
6471       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6472 
6473       var = create_tmp_var (TREE_TYPE (ni), "niters");
6474       ni_name = force_gimple_operand (ni, &stmts, false, var);
6475       if (stmts)
6476 	gsi_insert_seq_on_edge_immediate (pe, stmts);
6477 
6478       return ni_name;
6479     }
6480 }
6481 
6482 
6483 /* This function generates the following statements:
6484 
6485    ni_name = number of iterations loop executes
6486    ratio = ni_name / vf
6487    ratio_mult_vf_name = ratio * vf
6488 
6489    and places them on the loop preheader edge.  */
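
/* For illustration (a sketch): with ni_name = 17 and vf = 4 this computes
   ratio = 4 and ratio_mult_vf_name = 16, leaving one iteration for the
   scalar epilog loop.  */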
6490 
6491 static void
6492 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6493 				 tree ni_name,
6494 				 tree *ratio_mult_vf_name_ptr,
6495 				 tree *ratio_name_ptr)
6496 {
6497   tree ni_minus_gap_name;
6498   tree var;
6499   tree ratio_name;
6500   tree ratio_mult_vf_name;
6501   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6502   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6503   tree log_vf;
6504 
6505   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6506 
6507   /* If an epilogue loop is required because of data accesses with gaps, we
6508      subtract one iteration from the total number of iterations here for
6509      correct calculation of RATIO.  */
6510   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6511     {
6512       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6513 				       ni_name,
6514 			               build_one_cst (TREE_TYPE (ni_name)));
6515       if (!is_gimple_val (ni_minus_gap_name))
6516 	{
6517 	  var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6518 	  gimple *stmts = NULL;
6519           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6520 						    true, var);
6521 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
6522         }
6523     }
6524   else
6525     ni_minus_gap_name = ni_name;
6526 
6527   /* Create: ratio = ni >> log2(vf) */
6528   /* ???  As we have ni == number of latch executions + 1, ni could
6529      have overflown to zero.  So avoid computing ratio based on ni
6530      but compute it using the fact that we know ratio will be at least
6531      one, thus via (ni - vf) >> log2(vf) + 1.  */
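  /* For instance (a sketch): with vf = 4 and ni = 17 this yields
     ((17 - 4) >> 2) + 1 = 4, matching 17 / 4; and even if ni has wrapped
     around to 0 (2^32 iterations with a 32-bit type) it still yields
     ((0 - 4) >> 2) + 1 = 2^30.  */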
6532   ratio_name
6533     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6534 		   fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6535 				fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6536 					     ni_minus_gap_name,
6537 					     build_int_cst
6538 					       (TREE_TYPE (ni_name), vf)),
6539 				log_vf),
6540 		   build_int_cst (TREE_TYPE (ni_name), 1));
6541   if (!is_gimple_val (ratio_name))
6542     {
6543       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6544       gimple *stmts = NULL;
6545       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6546       gsi_insert_seq_on_edge_immediate (pe, stmts);
6547     }
6548   *ratio_name_ptr = ratio_name;
6549 
6550   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6551 
6552   if (ratio_mult_vf_name_ptr)
6553     {
6554       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6555 					ratio_name, log_vf);
6556       if (!is_gimple_val (ratio_mult_vf_name))
6557 	{
6558 	  var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6559 	  gimple *stmts = NULL;
6560 	  ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6561 						     true, var);
6562 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
6563 	}
6564       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6565     }
6566 
6567   return;
6568 }
6569 
6570 
6571 /* Function vect_transform_loop.
6572 
6573    The analysis phase has determined that the loop is vectorizable.
6574    Vectorize the loop - create vectorized stmts to replace the scalar
6575    stmts in the loop, and update the loop exit condition.  */
6576 
6577 void
6578 vect_transform_loop (loop_vec_info loop_vinfo)
6579 {
6580   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6581   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6582   int nbbs = loop->num_nodes;
6583   int i;
6584   tree ratio = NULL;
6585   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6586   bool grouped_store;
6587   bool slp_scheduled = false;
6588   gimple *stmt, *pattern_stmt;
6589   gimple_seq pattern_def_seq = NULL;
6590   gimple_stmt_iterator pattern_def_si = gsi_none ();
6591   bool transform_pattern_stmt = false;
6592   bool check_profitability = false;
6593   int th;
6594   /* Record number of iterations before we started tampering with the profile. */
6595   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6596 
6597   if (dump_enabled_p ())
6598     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6599 
6600   /* If the profile is imprecise, we have a chance to fix it up.  */
6601   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6602     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6603 
6604   /* Use the more conservative vectorization threshold.  If the number
6605      of iterations is constant assume the cost check has been performed
6606      by our caller.  If the threshold makes all loops profitable that
6607      run at least the vectorization factor number of times checking
6608      is pointless, too.  */
6609   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6610   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6611       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6612     {
6613       if (dump_enabled_p ())
6614 	dump_printf_loc (MSG_NOTE, vect_location,
6615 			 "Profitability threshold is %d loop iterations.\n",
6616                          th);
6617       check_profitability = true;
6618     }
6619 
6620   /* Version the loop first, if required, so the profitability check
6621      comes first.  */
6622 
6623   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6624       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6625     {
6626       vect_loop_versioning (loop_vinfo, th, check_profitability);
6627       check_profitability = false;
6628     }
6629 
6630   tree ni_name = vect_build_loop_niters (loop_vinfo);
6631   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6632 
6633   /* Peel the loop if there are data refs with unknown alignment.
6634      Only one data ref with unknown alignment is allowed.  */
6635 
6636   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6637     {
6638       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6639 				     th, check_profitability);
6640       check_profitability = false;
6641       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6642 	 be re-computed.  */
6643       ni_name = NULL_TREE;
6644     }
6645 
6646   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6647      compile time constant), or it is a constant that is not divisible by the
6648      vectorization factor, then an epilog loop needs to be created.
6649      We therefore duplicate the loop: the original loop will be vectorized,
6650      and will compute the first (n/VF) iterations.  The second copy of the loop
6651      will remain scalar and will compute the remaining (n%VF) iterations.
6652      (VF is the vectorization factor).  */
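  /* For example (a sketch): with n = 100 and VF = 8 the vectorized loop
     executes 12 iterations covering the first 96 scalar iterations, and the
     scalar epilog loop executes the remaining 4.  */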
6653 
6654   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6655       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6656     {
6657       tree ratio_mult_vf;
6658       if (!ni_name)
6659 	ni_name = vect_build_loop_niters (loop_vinfo);
6660       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6661 				       &ratio);
6662       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6663 				      th, check_profitability);
6664     }
6665   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6666     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6667 		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6668   else
6669     {
6670       if (!ni_name)
6671 	ni_name = vect_build_loop_niters (loop_vinfo);
6672       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6673     }
6674 
6675   /* 1) Make sure the loop header has exactly two entries
6676      2) Make sure we have a preheader basic block.  */
6677 
6678   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6679 
6680   split_edge (loop_preheader_edge (loop));
6681 
6682   /* FORNOW: the vectorizer supports only loops whose body consists
6683      of one basic block (header + empty latch).  When the vectorizer
6684      supports more involved loop forms, the order in which the BBs are
6685      traversed will need to be reconsidered.  */
6686 
6687   for (i = 0; i < nbbs; i++)
6688     {
6689       basic_block bb = bbs[i];
6690       stmt_vec_info stmt_info;
6691 
6692       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6693 	   gsi_next (&si))
6694         {
6695 	  gphi *phi = si.phi ();
6696 	  if (dump_enabled_p ())
6697 	    {
6698 	      dump_printf_loc (MSG_NOTE, vect_location,
6699                                "------>vectorizing phi: ");
6700 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6701               dump_printf (MSG_NOTE, "\n");
6702 	    }
6703 	  stmt_info = vinfo_for_stmt (phi);
6704 	  if (!stmt_info)
6705 	    continue;
6706 
6707 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6708 	    vect_loop_kill_debug_uses (loop, phi);
6709 
6710 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
6711 	      && !STMT_VINFO_LIVE_P (stmt_info))
6712 	    continue;
6713 
6714 	  if (STMT_VINFO_VECTYPE (stmt_info)
6715 	      && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6716 		  != (unsigned HOST_WIDE_INT) vectorization_factor)
6717 	      && dump_enabled_p ())
6718 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6719 
6720 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6721 	    {
6722 	      if (dump_enabled_p ())
6723 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6724 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6725 	    }
6726 	}
6727 
6728       pattern_stmt = NULL;
6729       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6730 	   !gsi_end_p (si) || transform_pattern_stmt;)
6731 	{
6732 	  bool is_store;
6733 
6734           if (transform_pattern_stmt)
6735 	    stmt = pattern_stmt;
6736           else
6737 	    {
6738 	      stmt = gsi_stmt (si);
6739 	      /* During vectorization remove existing clobber stmts.  */
6740 	      if (gimple_clobber_p (stmt))
6741 		{
6742 		  unlink_stmt_vdef (stmt);
6743 		  gsi_remove (&si, true);
6744 		  release_defs (stmt);
6745 		  continue;
6746 		}
6747 	    }
6748 
6749 	  if (dump_enabled_p ())
6750 	    {
6751 	      dump_printf_loc (MSG_NOTE, vect_location,
6752 			       "------>vectorizing statement: ");
6753 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6754               dump_printf (MSG_NOTE, "\n");
6755 	    }
6756 
6757 	  stmt_info = vinfo_for_stmt (stmt);
6758 
6759 	  /* vector stmts created in the outer-loop during vectorization of
6760 	     stmts in an inner-loop may not have a stmt_info, and do not
6761 	     need to be vectorized.  */
6762 	  if (!stmt_info)
6763 	    {
6764 	      gsi_next (&si);
6765 	      continue;
6766 	    }
6767 
6768 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6769 	    vect_loop_kill_debug_uses (loop, stmt);
6770 
6771 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
6772 	      && !STMT_VINFO_LIVE_P (stmt_info))
6773             {
6774               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6775                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6776                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6777                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6778                 {
6779                   stmt = pattern_stmt;
6780                   stmt_info = vinfo_for_stmt (stmt);
6781                 }
6782               else
6783 	        {
6784    	          gsi_next (&si);
6785 	          continue;
6786                 }
6787 	    }
6788           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6789                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6790                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6791                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6792             transform_pattern_stmt = true;
6793 
6794 	  /* If pattern statement has def stmts, vectorize them too.  */
6795 	  if (is_pattern_stmt_p (stmt_info))
6796 	    {
6797 	      if (pattern_def_seq == NULL)
6798 		{
6799 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6800 		  pattern_def_si = gsi_start (pattern_def_seq);
6801 		}
6802 	      else if (!gsi_end_p (pattern_def_si))
6803 		gsi_next (&pattern_def_si);
6804 	      if (pattern_def_seq != NULL)
6805 		{
6806 		  gimple *pattern_def_stmt = NULL;
6807 		  stmt_vec_info pattern_def_stmt_info = NULL;
6808 
6809 		  while (!gsi_end_p (pattern_def_si))
6810 		    {
6811 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
6812 		      pattern_def_stmt_info
6813 			= vinfo_for_stmt (pattern_def_stmt);
6814 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6815 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6816 			break;
6817 		      gsi_next (&pattern_def_si);
6818 		    }
6819 
6820 		  if (!gsi_end_p (pattern_def_si))
6821 		    {
6822 		      if (dump_enabled_p ())
6823 			{
6824 			  dump_printf_loc (MSG_NOTE, vect_location,
6825 					   "==> vectorizing pattern def "
6826 					   "stmt: ");
6827 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6828 					    pattern_def_stmt, 0);
6829                           dump_printf (MSG_NOTE, "\n");
6830 			}
6831 
6832 		      stmt = pattern_def_stmt;
6833 		      stmt_info = pattern_def_stmt_info;
6834 		    }
6835 		  else
6836 		    {
6837 		      pattern_def_si = gsi_none ();
6838 		      transform_pattern_stmt = false;
6839 		    }
6840 		}
6841 	      else
6842 		transform_pattern_stmt = false;
6843             }
6844 
6845 	  if (STMT_VINFO_VECTYPE (stmt_info))
6846 	    {
6847 	      unsigned int nunits
6848 		= (unsigned int)
6849 		  TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6850 	      if (!STMT_SLP_TYPE (stmt_info)
6851 		  && nunits != (unsigned int) vectorization_factor
6852 		  && dump_enabled_p ())
6853 		  /* For SLP VF is set according to unrolling factor, and not
6854 		     to vector size, hence for SLP this print is not valid.  */
6855 		dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6856 	    }
6857 
6858 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
6859 	     reached.  */
6860 	  if (STMT_SLP_TYPE (stmt_info))
6861 	    {
6862 	      if (!slp_scheduled)
6863 		{
6864 		  slp_scheduled = true;
6865 
6866 		  if (dump_enabled_p ())
6867 		    dump_printf_loc (MSG_NOTE, vect_location,
6868 				     "=== scheduling SLP instances ===\n");
6869 
6870 		  vect_schedule_slp (loop_vinfo);
6871 		}
6872 
6873 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6874 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6875 		{
6876 		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6877 		    {
6878 		      pattern_def_seq = NULL;
6879 		      gsi_next (&si);
6880 		    }
6881 		  continue;
6882 		}
6883 	    }
6884 
6885 	  /* -------- vectorize statement ------------ */
6886 	  if (dump_enabled_p ())
6887 	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6888 
6889 	  grouped_store = false;
6890 	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6891           if (is_store)
6892             {
6893 	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6894 		{
6895 		  /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6896 		     interleaving chain was completed - free all the stores in
6897 		     the chain.  */
6898 		  gsi_next (&si);
6899 		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6900 		}
6901 	      else
6902 		{
6903 		  /* Free the attached stmt_vec_info and remove the stmt.  */
6904 		  gimple *store = gsi_stmt (si);
6905 		  free_stmt_vec_info (store);
6906 		  unlink_stmt_vdef (store);
6907 		  gsi_remove (&si, true);
6908 		  release_defs (store);
6909 		}
6910 
6911 	      /* Stores can only appear at the end of pattern statements.  */
6912 	      gcc_assert (!transform_pattern_stmt);
6913 	      pattern_def_seq = NULL;
6914 	    }
6915 	  else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6916 	    {
6917 	      pattern_def_seq = NULL;
6918 	      gsi_next (&si);
6919 	    }
6920 	}		        /* stmts in BB */
6921     }				/* BBs in loop */
6922 
6923   slpeel_make_loop_iterate_ntimes (loop, ratio);
6924 
6925   /* Reduce loop iterations by the vectorization factor.  */
6926   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6927 		      expected_iterations / vectorization_factor);
6928   loop->nb_iterations_upper_bound
6929     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6930   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6931       && loop->nb_iterations_upper_bound != 0)
6932     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6933   if (loop->any_estimate)
6934     {
6935       loop->nb_iterations_estimate
6936         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6937        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6938 	   && loop->nb_iterations_estimate != 0)
6939 	 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6940     }
6941 
6942   if (dump_enabled_p ())
6943     {
6944       dump_printf_loc (MSG_NOTE, vect_location,
6945 		       "LOOP VECTORIZED\n");
6946       if (loop->inner)
6947 	dump_printf_loc (MSG_NOTE, vect_location,
6948 			 "OUTER LOOP VECTORIZED\n");
6949       dump_printf (MSG_NOTE, "\n");
6950     }
6951 
6952   /* Free SLP instances here because otherwise stmt reference counting
6953      won't work.  */
6954   slp_instance instance;
6955   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
6956     vect_free_slp_instance (instance);
6957   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
6958 }
6959 
6960 /* The code below performs a simple optimization - it reverts if-conversion
6961    for masked stores: if the mask of a store is all zeros, skip the store
6962    and, where possible, the producers of the stored values as well.
6963    For example,
6964      for (i=0; i<n; i++)
6965        if (c[i])
6966 	{
6967 	  p1[i] += 1;
6968 	  p2[i] = p3[i] +2;
6969 	}
6970    this transformation will produce the following semi-hammock:
6971 
6972    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
6973      {
6974        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
6975        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
6976        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
6977        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
6978        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
6979        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
6980      }
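
   i.e. the masked loads/stores, which the vectorizer emitted unconditionally
   in the loop body, are sunk into a block that is entered only when the mask
   has at least one nonzero element; when the mask is all zeros nothing is
   executed.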
6981 */
6982 
6983 void
6984 optimize_mask_stores (struct loop *loop)
6985 {
6986   basic_block *bbs = get_loop_body (loop);
6987   unsigned nbbs = loop->num_nodes;
6988   unsigned i;
6989   basic_block bb;
6990   gimple_stmt_iterator gsi;
6991   gimple *stmt;
6992   auto_vec<gimple *> worklist;
6993 
6994   vect_location = find_loop_location (loop);
6995   /* Pick up all masked stores in loop if any.  */
6996   for (i = 0; i < nbbs; i++)
6997     {
6998       bb = bbs[i];
6999       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7000 	   gsi_next (&gsi))
7001 	{
7002 	  stmt = gsi_stmt (gsi);
7003 	  if (is_gimple_call (stmt)
7004 	      && gimple_call_internal_p (stmt)
7005 	      && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7006 	    worklist.safe_push (stmt);
7007 	}
7008     }
7009 
7010   free (bbs);
7011   if (worklist.is_empty ())
7012     return;
7013 
7014   /* Loop has masked stores.  */
7015   while (!worklist.is_empty ())
7016     {
7017       gimple *last, *last_store;
7018       edge e, efalse;
7019       tree mask;
7020       basic_block store_bb, join_bb;
7021       gimple_stmt_iterator gsi_to;
7022       tree vdef, new_vdef;
7023       gphi *phi;
7024       tree vectype;
7025       tree zero;
7026 
7027       last = worklist.pop ();
7028       mask = gimple_call_arg (last, 2);
7029       bb = gimple_bb (last);
7030       /* Create new bb.  */
7031       e = split_block (bb, last);
7032       join_bb = e->dest;
7033       store_bb = create_empty_bb (bb);
7034       add_bb_to_loop (store_bb, loop);
7035       e->flags = EDGE_TRUE_VALUE;
7036       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7037       /* Put STORE_BB to likely part.  */
7038       efalse->probability = PROB_UNLIKELY;
7039       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7040       make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7041       if (dom_info_available_p (CDI_DOMINATORS))
7042 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7043       if (dump_enabled_p ())
7044 	dump_printf_loc (MSG_NOTE, vect_location,
7045 			 "Create new block %d to sink mask stores.",
7046 			 store_bb->index);
7047       /* Create vector comparison with boolean result.  */
7048       vectype = TREE_TYPE (mask);
7049       zero = build_zero_cst (vectype);
7050       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7051       gsi = gsi_last_bb (bb);
7052       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7053       /* Create new PHI node for vdef of the last masked store:
7054 	 .MEM_2 = VDEF <.MEM_1>
7055 	 will be converted to
7056 	 .MEM.3 = VDEF <.MEM_1>
7057 	 and new PHI node will be created in join bb
7058 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
7059       */
7060       vdef = gimple_vdef (last);
7061       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7062       gimple_set_vdef (last, new_vdef);
7063       phi = create_phi_node (vdef, join_bb);
7064       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7065 
7066       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7067       while (true)
7068 	{
7069 	  gimple_stmt_iterator gsi_from;
7070 	  gimple *stmt1 = NULL;
7071 
7072 	  /* Move masked store to STORE_BB.  */
7073 	  last_store = last;
7074 	  gsi = gsi_for_stmt (last);
7075 	  gsi_from = gsi;
7076 	  /* Shift GSI to the previous stmt for further traversal.  */
7077 	  gsi_prev (&gsi);
7078 	  gsi_to = gsi_start_bb (store_bb);
7079 	  gsi_move_before (&gsi_from, &gsi_to);
7080 	  /* Set up GSI_TO to the non-empty block start.  */
7081 	  gsi_to = gsi_start_bb (store_bb);
7082 	  if (dump_enabled_p ())
7083 	    {
7084 	      dump_printf_loc (MSG_NOTE, vect_location,
7085 			       "Move stmt to created bb\n");
7086 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7087 	    }
7088 	  /* Move all stored value producers if possible.  */
7089 	  while (!gsi_end_p (gsi))
7090 	    {
7091 	      tree lhs;
7092 	      imm_use_iterator imm_iter;
7093 	      use_operand_p use_p;
7094 	      bool res;
7095 
7096 	      /* Skip debug statements.  */
7097 	      if (is_gimple_debug (gsi_stmt (gsi)))
7098 		{
7099 		  gsi_prev (&gsi);
7100 		  continue;
7101 		}
7102 	      stmt1 = gsi_stmt (gsi);
7103 	      /* Do not consider statements writing to memory or having a
7104 		 volatile operand.  */
7105 	      if (gimple_vdef (stmt1)
7106 		  || gimple_has_volatile_ops (stmt1))
7107 		break;
7108 	      gsi_from = gsi;
7109 	      gsi_prev (&gsi);
7110 	      lhs = gimple_get_lhs (stmt1);
7111 	      if (!lhs)
7112 		break;
7113 
7114 	      /* LHS of vectorized stmt must be SSA_NAME.  */
7115 	      if (TREE_CODE (lhs) != SSA_NAME)
7116 		break;
7117 
7118 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7119 		{
7120 		  /* Remove dead scalar statement.  */
7121 		  if (has_zero_uses (lhs))
7122 		    {
7123 		      gsi_remove (&gsi_from, true);
7124 		      continue;
7125 		    }
7126 		}
7127 
7128 	      /* Check that LHS does not have uses outside of STORE_BB.  */
7129 	      res = true;
7130 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7131 		{
7132 		  gimple *use_stmt;
7133 		  use_stmt = USE_STMT (use_p);
7134 		  if (is_gimple_debug (use_stmt))
7135 		    continue;
7136 		  if (gimple_bb (use_stmt) != store_bb)
7137 		    {
7138 		      res = false;
7139 		      break;
7140 		    }
7141 		}
7142 	      if (!res)
7143 		break;
7144 
7145 	      if (gimple_vuse (stmt1)
7146 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
7147 		break;
7148 
7149 	      /* Can move STMT1 to STORE_BB.  */
7150 	      if (dump_enabled_p ())
7151 		{
7152 		  dump_printf_loc (MSG_NOTE, vect_location,
7153 				   "Move stmt to created bb\n");
7154 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7155 		}
7156 	      gsi_move_before (&gsi_from, &gsi_to);
7157 	      /* Shift GSI_TO for further insertion.  */
7158 	      gsi_prev (&gsi_to);
7159 	    }
7160 	  /* Put other masked stores with the same mask to STORE_BB.  */
7161 	  if (worklist.is_empty ()
7162 	      || gimple_call_arg (worklist.last (), 2) != mask
7163 	      || worklist.last () != stmt1)
7164 	    break;
7165 	  last = worklist.pop ();
7166 	}
7167       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7168     }
7169 }
7170