1 /* Loop Vectorization
2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
43 
44 /* Loop Vectorization Pass.
45 
46    This pass tries to vectorize loops.
47 
48    For example, the vectorizer transforms the following simple loop:
49 
50         short a[N]; short b[N]; short c[N]; int i;
51 
52         for (i=0; i<N; i++){
53           a[i] = b[i] + c[i];
54         }
55 
56    as if it had been manually vectorized by rewriting the source code into:
57 
58         typedef int __attribute__((mode(V8HI))) v8hi;
59         short a[N];  short b[N]; short c[N];   int i;
60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61         v8hi va, vb, vc;
62 
63         for (i=0; i<N/8; i++){
64           vb = pb[i];
65           vc = pc[i];
66           va = vb + vc;
67           pa[i] = va;
68         }
69 
70         The main entry to this pass is vectorize_loops(), in which
71    the vectorizer applies a set of analyses on a given set of loops,
72    followed by the actual vectorization transformation for the loops that
73    had successfully passed the analysis phase.
74         Throughout this pass we make a distinction between two types of
75    data: scalars (which are represented by SSA_NAMES), and memory references
76    ("data-refs").  These two types of data require different handling both
77    during analysis and transformation. The types of data-refs that the
78    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80    accesses are required to have a simple (consecutive) access pattern.
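   For instance, in the statement 'a[i] = b[i] + c[i]' above, the loads
   'b[i]', 'c[i]' and the store 'a[i]' are data-refs, whereas the SSA name
   holding the intermediate sum 'b[i] + c[i]' is a scalar.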
81 
82    Analysis phase:
83    ===============
84         The driver for the analysis phase is vect_analyze_loop().
85    It applies a set of analyses, some of which rely on the scalar evolution
86    analyzer (scev) developed by Sebastian Pop.
87 
88         During the analysis phase the vectorizer records some information
89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90    loop, as well as general information about the loop as a whole, which is
91    recorded in a "loop_vec_info" struct attached to each loop.
92 
93    Transformation phase:
94    =====================
95         The loop transformation phase scans all the stmts in the loop, and
96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97    the loop that needs to be vectorized.  It inserts the vector code sequence
98    just before the scalar stmt S, and records a pointer to the vector code
99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100    attached to S).  This pointer will be used for the vectorization of following
101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102    otherwise, we rely on dead code elimination for removing it.
103 
104         For example, say stmt S1 was vectorized into stmt VS1:
105 
106    VS1: vb = px[i];
107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108    S2:  a = b;
109 
110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
113    resulting sequence would be:
114 
115    VS1: vb = px[i];
116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117    VS2: va = vb;
118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
119 
120         Operands that are not SSA_NAMEs are data-refs that appear in
121    load/store operations (like 'x[i]' in S1), and are handled differently.
122 
123    Target modeling:
124    =================
125         Currently the only target-specific information that is used is the
126    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127    Targets that can support different sizes of vectors will, for now, need
128    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
129    flexibility will be added in the future.
130 
131         Since we only vectorize operations whose vector form can be
132    expressed using existing tree codes, to verify that an operation is
133    supported, the vectorizer checks the relevant optab at the relevant
134    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
135    the value found is CODE_FOR_nothing, then there's no target support, and
136    we can't vectorize the stmt.
137 
138    For additional information on this project see:
139    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
140 */
141 
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
143 
144 /* Function vect_determine_vectorization_factor
145 
146    Determine the vectorization factor (VF).  VF is the number of data elements
147    that are operated upon in parallel in a single iteration of the vectorized
148    loop.  For example, when vectorizing a loop that operates on 4-byte elements
149    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150    elements can fit in a single vector register.
151 
152    We currently support vectorization of loops in which all types operated upon
153    are of the same size.  Therefore this function currently sets VF according to
154    the size of the types operated upon, and fails if there are multiple sizes
155    in the loop.
156 
157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
158    original loop:
159         for (i=0; i<N; i++){
160           a[i] = b[i] + c[i];
161         }
162 
163    vectorized loop:
164         for (i=0; i<N; i+=VF){
165           a[i:VF] = b[i:VF] + c[i:VF];
166         }
167 */
168 
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174   int nbbs = loop->num_nodes;
175   gimple_stmt_iterator si;
176   unsigned int vectorization_factor = 0;
177   tree scalar_type;
178   gimple phi;
179   tree vectype;
180   unsigned int nunits;
181   stmt_vec_info stmt_info;
182   int i;
183   HOST_WIDE_INT dummy;
184   gimple stmt, pattern_stmt = NULL;
185   gimple_seq pattern_def_seq = NULL;
186   gimple_stmt_iterator pattern_def_si = gsi_none ();
187   bool analyze_pattern_stmt = false;
188 
189   if (dump_enabled_p ())
190     dump_printf_loc (MSG_NOTE, vect_location,
191                      "=== vect_determine_vectorization_factor ===");
192 
193   for (i = 0; i < nbbs; i++)
194     {
195       basic_block bb = bbs[i];
196 
197       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
198 	{
199 	  phi = gsi_stmt (si);
200 	  stmt_info = vinfo_for_stmt (phi);
201 	  if (dump_enabled_p ())
202 	    {
203 	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
205 	    }
206 
207 	  gcc_assert (stmt_info);
208 
209 	  if (STMT_VINFO_RELEVANT_P (stmt_info))
210             {
211 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212               scalar_type = TREE_TYPE (PHI_RESULT (phi));
213 
214 	      if (dump_enabled_p ())
215 		{
216 		  dump_printf_loc (MSG_NOTE, vect_location,
217                                    "get vectype for scalar type:  ");
218 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
219 		}
220 
221 	      vectype = get_vectype_for_scalar_type (scalar_type);
222 	      if (!vectype)
223 		{
224 		  if (dump_enabled_p ())
225 		    {
226 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227                                        "not vectorized: unsupported "
228                                        "data-type ");
229 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230                                          scalar_type);
231 		    }
232 		  return false;
233 		}
234 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
235 
236 	      if (dump_enabled_p ())
237 		{
238 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
240 		}
241 
242 	      nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 	      if (dump_enabled_p ())
244 		dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
245 
246 	      if (!vectorization_factor
247 		  || (nunits > vectorization_factor))
248 		vectorization_factor = nunits;
249 	    }
250 	}
251 
252       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
253         {
254           tree vf_vectype;
255 
256           if (analyze_pattern_stmt)
257 	    stmt = pattern_stmt;
258           else
259             stmt = gsi_stmt (si);
260 
261           stmt_info = vinfo_for_stmt (stmt);
262 
263 	  if (dump_enabled_p ())
264 	    {
265 	      dump_printf_loc (MSG_NOTE, vect_location,
266                                "==> examining statement: ");
267 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
268 	    }
269 
270 	  gcc_assert (stmt_info);
271 
272 	  /* Skip stmts which do not need to be vectorized.  */
273 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
274 	      && !STMT_VINFO_LIVE_P (stmt_info))
275             {
276               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
277                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
278                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
279                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
280                 {
281                   stmt = pattern_stmt;
282                   stmt_info = vinfo_for_stmt (pattern_stmt);
283                   if (dump_enabled_p ())
284                     {
285                       dump_printf_loc (MSG_NOTE, vect_location,
286                                        "==> examining pattern statement: ");
287                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288                     }
289                 }
290               else
291 	        {
292 	          if (dump_enabled_p ())
293 	            dump_printf_loc (MSG_NOTE, vect_location, "skip.");
294                   gsi_next (&si);
295 	          continue;
296                 }
297 	    }
298           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302             analyze_pattern_stmt = true;
303 
304 	  /* If a pattern statement has def stmts, analyze them too.  */
305 	  if (is_pattern_stmt_p (stmt_info))
306 	    {
307 	      if (pattern_def_seq == NULL)
308 		{
309 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
310 		  pattern_def_si = gsi_start (pattern_def_seq);
311 		}
312 	      else if (!gsi_end_p (pattern_def_si))
313 		gsi_next (&pattern_def_si);
314 	      if (pattern_def_seq != NULL)
315 		{
316 		  gimple pattern_def_stmt = NULL;
317 		  stmt_vec_info pattern_def_stmt_info = NULL;
318 
319 		  while (!gsi_end_p (pattern_def_si))
320 		    {
321 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
322 		      pattern_def_stmt_info
323 			= vinfo_for_stmt (pattern_def_stmt);
324 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
325 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
326 			break;
327 		      gsi_next (&pattern_def_si);
328 		    }
329 
330 		  if (!gsi_end_p (pattern_def_si))
331 		    {
332 		      if (dump_enabled_p ())
333 			{
334 			  dump_printf_loc (MSG_NOTE, vect_location,
335                                            "==> examining pattern def stmt: ");
336 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
337                                             pattern_def_stmt, 0);
338 			}
339 
340 		      stmt = pattern_def_stmt;
341 		      stmt_info = pattern_def_stmt_info;
342 		    }
343 		  else
344 		    {
345 		      pattern_def_si = gsi_none ();
346 		      analyze_pattern_stmt = false;
347 		    }
348 		}
349 	      else
350 		analyze_pattern_stmt = false;
351 	    }
352 
353 	  if (gimple_get_lhs (stmt) == NULL_TREE)
354 	    {
355 	      if (dump_enabled_p ())
356 		{
357 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358                                    "not vectorized: irregular stmt.");
359 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
360                                     0);
361 		}
362 	      return false;
363 	    }
364 
365 	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
366 	    {
367 	      if (dump_enabled_p ())
368 	        {
369 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370                                    "not vectorized: vector stmt in loop:");
371 	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
372 	        }
373 	      return false;
374 	    }
375 
376 	  if (STMT_VINFO_VECTYPE (stmt_info))
377 	    {
378 	      /* The only case when a vectype had been already set is for stmts
379 	         that contain a dataref, or for "pattern-stmts" (stmts
380 		 generated by the vectorizer to represent/replace a certain
381 		 idiom).  */
382 	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
383 			  || is_pattern_stmt_p (stmt_info)
384 			  || !gsi_end_p (pattern_def_si));
385 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
386 	    }
387 	  else
388 	    {
389 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
390 	      scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
391 	      if (dump_enabled_p ())
392 		{
393 		  dump_printf_loc (MSG_NOTE, vect_location,
394                                    "get vectype for scalar type:  ");
395 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
396 		}
397 	      vectype = get_vectype_for_scalar_type (scalar_type);
398 	      if (!vectype)
399 		{
400 		  if (dump_enabled_p ())
401 		    {
402 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403                                        "not vectorized: unsupported "
404                                        "data-type ");
405 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406                                          scalar_type);
407 		    }
408 		  return false;
409 		}
410 
411 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
412             }
413 
414 	  /* The vectorization factor is according to the smallest
415 	     scalar type (or the largest vector size, but we only
416 	     support one vector size per loop).  */
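	  /* Illustrative example (an assumption, not from the original
	     source): in a loop computing
	       int_x[i] = (int) short_y[i] * (int) short_z[i],
	     the smallest scalar type encountered is 'short', so with
	     16-byte vectors the statements involving it contribute
	     nunits = 8 to the VF computation below, even though the int
	     result vectype holds only 4 elements.  */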
417 	  scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
418 						       &dummy);
419 	  if (dump_enabled_p ())
420 	    {
421 	      dump_printf_loc (MSG_NOTE, vect_location,
422                                "get vectype for scalar type:  ");
423 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
424 	    }
425 	  vf_vectype = get_vectype_for_scalar_type (scalar_type);
426 	  if (!vf_vectype)
427 	    {
428 	      if (dump_enabled_p ())
429 		{
430 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431                                    "not vectorized: unsupported data-type ");
432 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
433                                      scalar_type);
434 		}
435 	      return false;
436 	    }
437 
438 	  if ((GET_MODE_SIZE (TYPE_MODE (vectype))
439 	       != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
440 	    {
441 	      if (dump_enabled_p ())
442 		{
443 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444                                    "not vectorized: different sized vector "
445                                    "types in statement, ");
446 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
447                                      vectype);
448 		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
449 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
450                                      vf_vectype);
451 		}
452 	      return false;
453 	    }
454 
455 	  if (dump_enabled_p ())
456 	    {
457 	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
458 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
459 	    }
460 
461 	  nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
462 	  if (dump_enabled_p ())
463 	    dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
464 	  if (!vectorization_factor
465 	      || (nunits > vectorization_factor))
466 	    vectorization_factor = nunits;
467 
468 	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
469 	    {
470 	      pattern_def_seq = NULL;
471 	      gsi_next (&si);
472 	    }
473         }
474     }
475 
476   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
477   if (dump_enabled_p ())
478     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
479                      vectorization_factor);
480   if (vectorization_factor <= 1)
481     {
482       if (dump_enabled_p ())
483         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484                          "not vectorized: unsupported data-type");
485       return false;
486     }
487   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
488 
489   return true;
490 }
491 
492 
493 /* Function vect_is_simple_iv_evolution.
494 
495    FORNOW: A simple evolution of an induction variable in the loop is
496    considered a polynomial evolution with constant step.  */
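/* For illustration (a sketch, assuming the usual scev chrec notation):
   an IV updated as  i = i + 4  in loop 1 has the access function
   {0, +, 4}_1; its evolution part is the INTEGER_CST 4, so it is "simple"
   with init 0 and step 4.  An IV whose step itself evolves, e.g.
   {0, +, {1, +, 1}_1}_1, has a chrec as its evolution part and is
   rejected by this function.  */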
497 
498 static bool
499 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
500                              tree * step)
501 {
502   tree init_expr;
503   tree step_expr;
504   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
505 
506   /* When there is no evolution in this loop, the evolution function
507      is not "simple".  */
508   if (evolution_part == NULL_TREE)
509     return false;
510 
511   /* When the evolution is a polynomial of degree >= 2
512      the evolution function is not "simple".  */
513   if (tree_is_chrec (evolution_part))
514     return false;
515 
516   step_expr = evolution_part;
517   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
518 
519   if (dump_enabled_p ())
520     {
521       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
522       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
523       dump_printf (MSG_NOTE, ",  init: ");
524       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
525     }
526 
527   *init = init_expr;
528   *step = step_expr;
529 
530   if (TREE_CODE (step_expr) != INTEGER_CST)
531     {
532       if (dump_enabled_p ())
533         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534                          "step unknown.");
535       return false;
536     }
537 
538   return true;
539 }
540 
541 /* Function vect_analyze_scalar_cycles_1.
542 
543    Examine the cross iteration def-use cycles of scalar variables
544    in LOOP.  LOOP_VINFO represents the loop that is now being
545    considered for vectorization (can be LOOP, or an outer-loop
546    enclosing LOOP).  */
547 
548 static void
549 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
550 {
551   basic_block bb = loop->header;
552   tree dumy;
553   vec<gimple> worklist;
554   worklist.create (64);
555   gimple_stmt_iterator gsi;
556   bool double_reduc;
557 
558   if (dump_enabled_p ())
559     dump_printf_loc (MSG_NOTE, vect_location,
560                      "=== vect_analyze_scalar_cycles ===");
561 
562   /* First - identify all inductions.  Reduction detection assumes that all the
563      inductions have been identified; therefore, this order must not be
564      changed.  */
565   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
566     {
567       gimple phi = gsi_stmt (gsi);
568       tree access_fn = NULL;
569       tree def = PHI_RESULT (phi);
570       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
571 
572       if (dump_enabled_p ())
573 	{
574 	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
575 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
576 	}
577 
578       /* Skip virtual phis.  The data dependences that are associated with
579          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
580       if (virtual_operand_p (def))
581 	continue;
582 
583       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
584 
585       /* Analyze the evolution function.  */
586       access_fn = analyze_scalar_evolution (loop, def);
587       if (access_fn)
588 	{
589 	  STRIP_NOPS (access_fn);
590 	  if (dump_enabled_p ())
591 	    {
592 	      dump_printf_loc (MSG_NOTE, vect_location,
593                                "Access function of PHI: ");
594 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
595 	    }
596 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
597 	    = evolution_part_in_loop_num (access_fn, loop->num);
598 	}
599 
600       if (!access_fn
601 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
602 	{
603 	  worklist.safe_push (phi);
604 	  continue;
605 	}
606 
607       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
608 
609       if (dump_enabled_p ())
610 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
611       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
612     }
613 
614 
615   /* Second - identify all reductions and nested cycles.  */
616   while (worklist.length () > 0)
617     {
618       gimple phi = worklist.pop ();
619       tree def = PHI_RESULT (phi);
620       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
621       gimple reduc_stmt;
622       bool nested_cycle;
623 
624       if (dump_enabled_p ())
625         {
626           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
627           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
628         }
629 
630       gcc_assert (!virtual_operand_p (def)
631 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
632 
633       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
634       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
635 						&double_reduc);
636       if (reduc_stmt)
637         {
638           if (double_reduc)
639             {
640               if (dump_enabled_p ())
641                 dump_printf_loc (MSG_NOTE, vect_location,
642 				 "Detected double reduction.");
643 
644               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
645               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
646                                                     vect_double_reduction_def;
647             }
648           else
649             {
650               if (nested_cycle)
651                 {
652                   if (dump_enabled_p ())
653                     dump_printf_loc (MSG_NOTE, vect_location,
654 				     "Detected vectorizable nested cycle.");
655 
656                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
657                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
658                                                              vect_nested_cycle;
659                 }
660               else
661                 {
662                   if (dump_enabled_p ())
663                     dump_printf_loc (MSG_NOTE, vect_location,
664 				     "Detected reduction.");
665 
666                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
667                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
668                                                            vect_reduction_def;
669                   /* Store the reduction cycles for possible vectorization in
670                      loop-aware SLP.  */
671                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
672                 }
673             }
674         }
675       else
676         if (dump_enabled_p ())
677           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
678 			   "Unknown def-use cycle pattern.");
679     }
680 
681   worklist.release ();
682 }
683 
684 
685 /* Function vect_analyze_scalar_cycles.
686 
687    Examine the cross iteration def-use cycles of scalar variables, by
688    analyzing the loop-header PHIs of scalar variables.  Classify each
689    cycle as one of the following: invariant, induction, reduction, unknown.
690    We do that for the loop represented by LOOP_VINFO, and also for its
691    inner-loop, if it exists.
692    Examples for scalar cycles:
693 
694    Example1: reduction:
695 
696               loop1:
697               for (i=0; i<N; i++)
698                  sum += a[i];
699 
700    Example2: induction:
701 
702               loop2:
703               for (i=0; i<N; i++)
704                  a[i] = i;  */
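/* A further illustration (an assumption, not one of the original examples):
   when considering outer-loop vectorization of loop1 in

              loop1:
              for (i=0; i<N; i++)
                {
                  s = 0;
                  loop2:
                  for (j=0; j<M; j++)
                     s += a[i][j];
                  b[i] = s;
                }

   the def-use cycle of 's' in loop2 is classified by
   vect_analyze_scalar_cycles_1 as a nested cycle (vect_nested_cycle)
   rather than as a plain reduction of the loop being vectorized.  */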
705 
706 static void
707 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
708 {
709   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
710 
711   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
712 
713   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
714      Reductions in such an inner-loop therefore have different properties than
715      the reductions in the nest that gets vectorized:
716      1. When vectorized, they are executed in the same order as in the original
717         scalar loop, so we can't change the order of computation when
718         vectorizing them.
719      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
720         current checks are too strict.  */
721 
722   if (loop->inner)
723     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
724 }
725 
726 /* Function vect_get_loop_niters.
727 
728    Determine how many iterations the loop executes.
729    If an expression that represents the number of iterations
730    can be constructed, place it in NUMBER_OF_ITERATIONS.
731    Return the loop exit condition.  */
732 
733 static gimple
734 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
735 {
736   tree niters;
737 
738   if (dump_enabled_p ())
739     dump_printf_loc (MSG_NOTE, vect_location,
740 		     "=== get_loop_niters ===");
741   niters = number_of_exit_cond_executions (loop);
742 
743   if (niters != NULL_TREE
744       && niters != chrec_dont_know)
745     {
746       *number_of_iterations = niters;
747 
748       if (dump_enabled_p ())
749         {
750           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
751           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
752         }
753     }
754 
755   return get_loop_exit_condition (loop);
756 }
757 
758 
759 /* Function bb_in_loop_p
760 
761    Used as predicate for dfs order traversal of the loop bbs.  */
762 
763 static bool
764 bb_in_loop_p (const_basic_block bb, const void *data)
765 {
766   const struct loop *const loop = (const struct loop *)data;
767   if (flow_bb_inside_loop_p (loop, bb))
768     return true;
769   return false;
770 }
771 
772 
773 /* Function new_loop_vec_info.
774 
775    Create and initialize a new loop_vec_info struct for LOOP, as well as
776    stmt_vec_info structs for all the stmts in LOOP.  */
777 
778 static loop_vec_info
779 new_loop_vec_info (struct loop *loop)
780 {
781   loop_vec_info res;
782   basic_block *bbs;
783   gimple_stmt_iterator si;
784   unsigned int i, nbbs;
785 
786   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
787   LOOP_VINFO_LOOP (res) = loop;
788 
789   bbs = get_loop_body (loop);
790 
791   /* Create/Update stmt_info for all stmts in the loop.  */
792   for (i = 0; i < loop->num_nodes; i++)
793     {
794       basic_block bb = bbs[i];
795 
796       /* BBs in a nested inner-loop will have been already processed (because
797          we will have called vect_analyze_loop_form for any nested inner-loop).
798          Therefore, for stmts in an inner-loop we just want to update the
799          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
800          loop_info of the outer-loop we are currently considering to vectorize
801          (instead of the loop_info of the inner-loop).
802          For stmts in other BBs we need to create a stmt_info from scratch.  */
803       if (bb->loop_father != loop)
804         {
805           /* Inner-loop bb.  */
806           gcc_assert (loop->inner && bb->loop_father == loop->inner);
807           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
808             {
809               gimple phi = gsi_stmt (si);
810               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
811               loop_vec_info inner_loop_vinfo =
812                 STMT_VINFO_LOOP_VINFO (stmt_info);
813               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
814               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
815             }
816           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
817            {
818               gimple stmt = gsi_stmt (si);
819               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
820               loop_vec_info inner_loop_vinfo =
821                  STMT_VINFO_LOOP_VINFO (stmt_info);
822               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
823               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
824            }
825         }
826       else
827         {
828           /* bb in current nest.  */
829           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
830             {
831               gimple phi = gsi_stmt (si);
832               gimple_set_uid (phi, 0);
833               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
834             }
835 
836           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
837             {
838               gimple stmt = gsi_stmt (si);
839               gimple_set_uid (stmt, 0);
840               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
841             }
842         }
843     }
844 
845   /* CHECKME: We want to visit all BBs before their successors (except for
846      latch blocks, for which this assertion wouldn't hold).  In the simple
847      case of the loop forms we allow, a dfs order of the BBs would be the same
848      as reversed postorder traversal, so we are safe.  */
849 
850    free (bbs);
851    bbs = XCNEWVEC (basic_block, loop->num_nodes);
852    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
853                               bbs, loop->num_nodes, loop);
854    gcc_assert (nbbs == loop->num_nodes);
855 
856   LOOP_VINFO_BBS (res) = bbs;
857   LOOP_VINFO_NITERS (res) = NULL;
858   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
859   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
860   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
861   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
862   LOOP_VINFO_VECT_FACTOR (res) = 0;
863   LOOP_VINFO_LOOP_NEST (res).create (3);
864   LOOP_VINFO_DATAREFS (res).create (10);
865   LOOP_VINFO_DDRS (res).create (10 * 10);
866   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
867   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
868 	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
869   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
870 	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
871   LOOP_VINFO_GROUPED_STORES (res).create (10);
872   LOOP_VINFO_REDUCTIONS (res).create (10);
873   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
874   LOOP_VINFO_SLP_INSTANCES (res).create (10);
875   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
876   LOOP_VINFO_PEELING_HTAB (res) = NULL;
877   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
878   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
879   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
880 
881   return res;
882 }
883 
884 
885 /* Function destroy_loop_vec_info.
886 
887    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
888    stmts in the loop.  */
889 
890 void
891 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
892 {
893   struct loop *loop;
894   basic_block *bbs;
895   int nbbs;
896   gimple_stmt_iterator si;
897   int j;
898   vec<slp_instance> slp_instances;
899   slp_instance instance;
900   bool swapped;
901 
902   if (!loop_vinfo)
903     return;
904 
905   loop = LOOP_VINFO_LOOP (loop_vinfo);
906 
907   bbs = LOOP_VINFO_BBS (loop_vinfo);
908   nbbs = clean_stmts ? loop->num_nodes : 0;
909   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
910 
911   for (j = 0; j < nbbs; j++)
912     {
913       basic_block bb = bbs[j];
914       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
915         free_stmt_vec_info (gsi_stmt (si));
916 
917       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
918         {
919           gimple stmt = gsi_stmt (si);
920 
921 	  /* We may have broken canonical form by moving a constant
922 	     into RHS1 of a commutative op.  Fix such occurrences.  */
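	  /* E.g. (illustrative only): an assignment left as  tmp_1 = 5 + x_2
	     is swapped back to the canonical  tmp_1 = x_2 + 5, with the
	     constant operand last.  */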
923 	  if (swapped && is_gimple_assign (stmt))
924 	    {
925 	      enum tree_code code = gimple_assign_rhs_code (stmt);
926 
927 	      if ((code == PLUS_EXPR
928 		   || code == POINTER_PLUS_EXPR
929 		   || code == MULT_EXPR)
930 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
931 		swap_tree_operands (stmt,
932 				    gimple_assign_rhs1_ptr (stmt),
933 				    gimple_assign_rhs2_ptr (stmt));
934 	    }
935 
936 	  /* Free stmt_vec_info.  */
937 	  free_stmt_vec_info (stmt);
938           gsi_next (&si);
939         }
940     }
941 
942   free (LOOP_VINFO_BBS (loop_vinfo));
943   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
944   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
945   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
946   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
947   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
948   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
949   FOR_EACH_VEC_ELT (slp_instances, j, instance)
950     vect_free_slp_instance (instance);
951 
952   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
953   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
954   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
955   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
956 
957   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
958     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
959 
960   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
961 
962   free (loop_vinfo);
963   loop->aux = NULL;
964 }
965 
966 
967 /* Function vect_analyze_loop_1.
968 
969    Apply a set of analyses on LOOP, and create a loop_vec_info struct
970    for it. The different analyses will record information in the
971    loop_vec_info struct.  This is a subset of the analyses applied in
972    vect_analyze_loop, to be applied on an inner-loop nested in the loop
973    that is now considered for (outer-loop) vectorization.  */
974 
975 static loop_vec_info
976 vect_analyze_loop_1 (struct loop *loop)
977 {
978   loop_vec_info loop_vinfo;
979 
980   if (dump_enabled_p ())
981     dump_printf_loc (MSG_NOTE, vect_location,
982 		     "===== analyze_loop_nest_1 =====");
983 
984   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
985 
986   loop_vinfo = vect_analyze_loop_form (loop);
987   if (!loop_vinfo)
988     {
989       if (dump_enabled_p ())
990         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
991 			 "bad inner-loop form.");
992       return NULL;
993     }
994 
995   return loop_vinfo;
996 }
997 
998 
999 /* Function vect_analyze_loop_form.
1000 
1001    Verify that certain CFG restrictions hold, including:
1002    - the loop has a pre-header
1003    - the loop has a single entry and exit
1004    - the loop exit condition is simple enough, and the number of iterations
1005      can be analyzed (a countable loop).  */
1006 
1007 loop_vec_info
1008 vect_analyze_loop_form (struct loop *loop)
1009 {
1010   loop_vec_info loop_vinfo;
1011   gimple loop_cond;
1012   tree number_of_iterations = NULL;
1013   loop_vec_info inner_loop_vinfo = NULL;
1014 
1015   if (dump_enabled_p ())
1016     dump_printf_loc (MSG_NOTE, vect_location,
1017 		     "=== vect_analyze_loop_form ===");
1018 
1019   /* Different restrictions apply when we are considering an inner-most loop,
1020      vs. an outer (nested) loop.
1021      (FORNOW. May want to relax some of these restrictions in the future).  */
1022 
1023   if (!loop->inner)
1024     {
1025       /* Inner-most loop.  We currently require that the number of BBs is
1026 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1027 	 look like this:
1028 
1029                         (pre-header)
1030                            |
1031                           header <--------+
1032                            | |            |
1033                            | +--> latch --+
1034                            |
1035                         (exit-bb)  */
1036 
1037       if (loop->num_nodes != 2)
1038         {
1039           if (dump_enabled_p ())
1040             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1041 			     "not vectorized: control flow in loop.");
1042           return NULL;
1043         }
1044 
1045       if (empty_block_p (loop->header))
1046         {
1047           if (dump_enabled_p ())
1048             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049 			     "not vectorized: empty loop.");
1050           return NULL;
1051         }
1052     }
1053   else
1054     {
1055       struct loop *innerloop = loop->inner;
1056       edge entryedge;
1057 
1058       /* Nested loop. We currently require that the loop is doubly-nested,
1059 	 contains a single inner loop, and the number of BBs is exactly 5.
1060 	 Vectorizable outer-loops look like this:
1061 
1062 			(pre-header)
1063 			   |
1064 			  header <---+
1065 			   |         |
1066 		          inner-loop |
1067 			   |         |
1068 			  tail ------+
1069 			   |
1070 		        (exit-bb)
1071 
1072 	 The inner-loop has the properties expected of inner-most loops
1073 	 as described above.  */
1074 
1075       if ((loop->inner)->inner || (loop->inner)->next)
1076 	{
1077 	  if (dump_enabled_p ())
1078 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 			     "not vectorized: multiple nested loops.");
1080 	  return NULL;
1081 	}
1082 
1083       /* Analyze the inner-loop.  */
1084       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1085       if (!inner_loop_vinfo)
1086 	{
1087 	  if (dump_enabled_p ())
1088             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089 			     "not vectorized: Bad inner loop.");
1090 	  return NULL;
1091 	}
1092 
1093       if (!expr_invariant_in_loop_p (loop,
1094 					LOOP_VINFO_NITERS (inner_loop_vinfo)))
1095 	{
1096 	  if (dump_enabled_p ())
1097 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098 			     "not vectorized: inner-loop count not invariant.");
1099 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1100 	  return NULL;
1101 	}
1102 
1103       if (loop->num_nodes != 5)
1104         {
1105 	  if (dump_enabled_p ())
1106 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1107 			     "not vectorized: control flow in loop.");
1108 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1109 	  return NULL;
1110         }
1111 
1112       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1113       entryedge = EDGE_PRED (innerloop->header, 0);
1114       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1115 	entryedge = EDGE_PRED (innerloop->header, 1);
1116 
1117       if (entryedge->src != loop->header
1118 	  || !single_exit (innerloop)
1119 	  || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1120 	{
1121 	  if (dump_enabled_p ())
1122 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 			     "not vectorized: unsupported outerloop form.");
1124 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1125 	  return NULL;
1126 	}
1127 
1128       if (dump_enabled_p ())
1129         dump_printf_loc (MSG_NOTE, vect_location,
1130 			 "Considering outer-loop vectorization.");
1131     }
1132 
1133   if (!single_exit (loop)
1134       || EDGE_COUNT (loop->header->preds) != 2)
1135     {
1136       if (dump_enabled_p ())
1137         {
1138           if (!single_exit (loop))
1139 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 			     "not vectorized: multiple exits.");
1141           else if (EDGE_COUNT (loop->header->preds) != 2)
1142 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143 			     "not vectorized: too many incoming edges.");
1144         }
1145       if (inner_loop_vinfo)
1146 	destroy_loop_vec_info (inner_loop_vinfo, true);
1147       return NULL;
1148     }
1149 
1150   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1151      that the loop is represented as a do-while (with a proper if-guard
1152      before the loop if needed), where the loop header contains all the
1153      executable statements, and the latch is empty.  */
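  /* For illustration (a sketch of the assumed shape, not from the original
     source), the expected form is roughly:

       header:  <loop body statements>
                if (i < N) goto latch; else goto exit;
       latch:   goto header;

     i.e. an empty latch with the exit test last in the header.  */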
1154   if (!empty_block_p (loop->latch)
1155       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1156     {
1157       if (dump_enabled_p ())
1158 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 			 "not vectorized: latch block not empty.");
1160       if (inner_loop_vinfo)
1161 	destroy_loop_vec_info (inner_loop_vinfo, true);
1162       return NULL;
1163     }
1164 
1165   /* Make sure there exists a single-predecessor exit bb:  */
1166   if (!single_pred_p (single_exit (loop)->dest))
1167     {
1168       edge e = single_exit (loop);
1169       if (!(e->flags & EDGE_ABNORMAL))
1170 	{
1171 	  split_loop_exit_edge (e);
1172 	  if (dump_enabled_p ())
1173 	    dump_printf (MSG_NOTE, "split exit edge.");
1174 	}
1175       else
1176 	{
1177 	  if (dump_enabled_p ())
1178 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 			     "not vectorized: abnormal loop exit edge.");
1180 	  if (inner_loop_vinfo)
1181 	    destroy_loop_vec_info (inner_loop_vinfo, true);
1182 	  return NULL;
1183 	}
1184     }
1185 
1186   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1187   if (!loop_cond)
1188     {
1189       if (dump_enabled_p ())
1190 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 			 "not vectorized: complicated exit condition.");
1192       if (inner_loop_vinfo)
1193 	destroy_loop_vec_info (inner_loop_vinfo, true);
1194       return NULL;
1195     }
1196 
1197   if (!number_of_iterations)
1198     {
1199       if (dump_enabled_p ())
1200 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1201 			 "not vectorized: number of iterations cannot be "
1202 			 "computed.");
1203       if (inner_loop_vinfo)
1204 	destroy_loop_vec_info (inner_loop_vinfo, true);
1205       return NULL;
1206     }
1207 
1208   if (chrec_contains_undetermined (number_of_iterations))
1209     {
1210       if (dump_enabled_p ())
1211 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1212 			     "Infinite number of iterations.");
1213       if (inner_loop_vinfo)
1214 	destroy_loop_vec_info (inner_loop_vinfo, true);
1215       return NULL;
1216     }
1217 
1218   if (!NITERS_KNOWN_P (number_of_iterations))
1219     {
1220       if (dump_enabled_p ())
1221         {
1222           dump_printf_loc (MSG_NOTE, vect_location,
1223 			   "Symbolic number of iterations is ");
1224 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1225         }
1226     }
1227   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1228     {
1229       if (dump_enabled_p ())
1230 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 			 "not vectorized: number of iterations = 0.");
1232       if (inner_loop_vinfo)
1233         destroy_loop_vec_info (inner_loop_vinfo, true);
1234       return NULL;
1235     }
1236 
1237   loop_vinfo = new_loop_vec_info (loop);
1238   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1239   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1240 
1241   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1242 
1243   /* CHECKME: May want to keep it around it in the future.  */
1244   if (inner_loop_vinfo)
1245     destroy_loop_vec_info (inner_loop_vinfo, false);
1246 
1247   gcc_assert (!loop->aux);
1248   loop->aux = loop_vinfo;
1249   return loop_vinfo;
1250 }
1251 
1252 
1253 /* Function vect_analyze_loop_operations.
1254 
1255    Scan the loop stmts and make sure they are all vectorizable.  */
1256 
1257 static bool
1258 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1259 {
1260   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1261   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1262   int nbbs = loop->num_nodes;
1263   gimple_stmt_iterator si;
1264   unsigned int vectorization_factor = 0;
1265   int i;
1266   gimple phi;
1267   stmt_vec_info stmt_info;
1268   bool need_to_vectorize = false;
1269   int min_profitable_iters;
1270   int min_scalar_loop_bound;
1271   unsigned int th;
1272   bool only_slp_in_loop = true, ok;
1273   HOST_WIDE_INT max_niter;
1274   HOST_WIDE_INT estimated_niter;
1275   int min_profitable_estimate;
1276 
1277   if (dump_enabled_p ())
1278     dump_printf_loc (MSG_NOTE, vect_location,
1279 		     "=== vect_analyze_loop_operations ===");
1280 
1281   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1282   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1283   if (slp)
1284     {
1285       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1286 	 the vectorization factor of the loop is the unrolling factor required by
1287 	 the SLP instances.  If that unrolling factor is 1, we say that we
1288 	 perform pure SLP on the loop - cross-iteration parallelism is not
1289 	 exploited.  */
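      /* Worked example (illustrative only): with a loop VF of 2 and an SLP
	 instance whose unrolling factor is 4, the combined factor computed
	 below is least_common_multiple (2, 4) = 4.  */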
1290       for (i = 0; i < nbbs; i++)
1291 	{
1292 	  basic_block bb = bbs[i];
1293 	  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1294 	    {
1295 	      gimple stmt = gsi_stmt (si);
1296 	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1297 	      gcc_assert (stmt_info);
1298 	      if ((STMT_VINFO_RELEVANT_P (stmt_info)
1299 		   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1300 		  && !PURE_SLP_STMT (stmt_info))
1301 		/* STMT needs both SLP and loop-based vectorization.  */
1302 		only_slp_in_loop = false;
1303 	    }
1304 	}
1305 
1306       if (only_slp_in_loop)
1307 	vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1308       else
1309 	vectorization_factor = least_common_multiple (vectorization_factor,
1310 				LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1311 
1312       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1313       if (dump_enabled_p ())
1314 	dump_printf_loc (MSG_NOTE, vect_location,
1315 			 "Updating vectorization factor to %d ",
1316 			 vectorization_factor);
1317     }
1318 
1319   for (i = 0; i < nbbs; i++)
1320     {
1321       basic_block bb = bbs[i];
1322 
1323       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1324         {
1325           phi = gsi_stmt (si);
1326           ok = true;
1327 
1328           stmt_info = vinfo_for_stmt (phi);
1329           if (dump_enabled_p ())
1330             {
1331               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1332               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1333             }
1334 
1335           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1336              (i.e., a phi in the tail of the outer-loop).  */
1337           if (! is_loop_header_bb_p (bb))
1338             {
1339               /* FORNOW: we currently don't support the case that these phis
1340                  are not used in the outer-loop (unless it is a double reduction,
1341                  i.e., this phi is vect_reduction_def), because this case
1342                  requires us to actually do something here.  */
1343               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1344                    || STMT_VINFO_LIVE_P (stmt_info))
1345                   && STMT_VINFO_DEF_TYPE (stmt_info)
1346                      != vect_double_reduction_def)
1347                 {
1348                   if (dump_enabled_p ())
1349 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350 				     "Unsupported loop-closed phi in "
1351 				     "outer-loop.");
1352                   return false;
1353                 }
1354 
1355               /* If PHI is used in the outer loop, we check that its operand
1356                  is defined in the inner loop.  */
1357               if (STMT_VINFO_RELEVANT_P (stmt_info))
1358                 {
1359                   tree phi_op;
1360                   gimple op_def_stmt;
1361 
1362                   if (gimple_phi_num_args (phi) != 1)
1363                     return false;
1364 
1365                   phi_op = PHI_ARG_DEF (phi, 0);
1366                   if (TREE_CODE (phi_op) != SSA_NAME)
1367                     return false;
1368 
1369                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1370 		  if (!op_def_stmt
1371 		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1372 		      || !vinfo_for_stmt (op_def_stmt))
1373                     return false;
1374 
1375                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1376                         != vect_used_in_outer
1377                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1378                            != vect_used_in_outer_by_reduction)
1379                     return false;
1380                 }
1381 
1382               continue;
1383             }
1384 
1385           gcc_assert (stmt_info);
1386 
1387           if (STMT_VINFO_LIVE_P (stmt_info))
1388             {
1389               /* FORNOW: not yet supported.  */
1390               if (dump_enabled_p ())
1391 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392 				 "not vectorized: value used after loop.");
1393               return false;
1394             }
1395 
1396           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1397               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1398             {
1399               /* A scalar-dependence cycle that we don't support.  */
1400               if (dump_enabled_p ())
1401 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402 				 "not vectorized: scalar dependence cycle.");
1403               return false;
1404             }
1405 
1406           if (STMT_VINFO_RELEVANT_P (stmt_info))
1407             {
1408               need_to_vectorize = true;
1409               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1410                 ok = vectorizable_induction (phi, NULL, NULL);
1411             }
1412 
1413           if (!ok)
1414             {
1415               if (dump_enabled_p ())
1416                 {
1417 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1418 				   "not vectorized: relevant phi not "
1419 				   "supported: ");
1420                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1421                 }
1422 	      return false;
1423             }
1424         }
1425 
1426       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1427         {
1428           gimple stmt = gsi_stmt (si);
1429 	  if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1430 	    return false;
1431         }
1432     } /* bbs */
1433 
1434   /* All operations in the loop are either irrelevant (deal with loop
1435      control, or dead), or only used outside the loop and can be moved
1436      out of the loop (e.g. invariants, inductions).  The loop can be
1437      optimized away by scalar optimizations.  We're better off not
1438      touching this loop.  */
1439   if (!need_to_vectorize)
1440     {
1441       if (dump_enabled_p ())
1442         dump_printf_loc (MSG_NOTE, vect_location,
1443 			 "All the computation can be taken out of the loop.");
1444       if (dump_enabled_p ())
1445 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446 			 "not vectorized: redundant loop. no profit to "
1447 			 "vectorize.");
1448       return false;
1449     }
1450 
1451   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1452     dump_printf_loc (MSG_NOTE, vect_location,
1453 		     "vectorization_factor = %d, niters = "
1454 		     HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1455 		     LOOP_VINFO_INT_NITERS (loop_vinfo));
1456 
1457   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1458        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1459       || ((max_niter = max_stmt_executions_int (loop)) != -1
1460 	  && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1461     {
1462       if (dump_enabled_p ())
1463 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 			 "not vectorized: iteration count too small.");
1465       if (dump_enabled_p ())
1466 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467 			 "not vectorized: iteration count smaller than "
1468 			 "vectorization factor.");
1469       return false;
1470     }
1471 
1472   /* Analyze cost.  Decide if worth while to vectorize.  */
1473 
1474   /* Once VF is set, SLP costs should be updated since the number of created
1475      vector stmts depends on VF.  */
1476   vect_update_slp_costs_according_to_vf (loop_vinfo);
1477 
1478   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1479 				      &min_profitable_estimate);
1480   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1481 
1482   if (min_profitable_iters < 0)
1483     {
1484       if (dump_enabled_p ())
1485 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486 			 "not vectorized: vectorization not profitable.");
1487       if (dump_enabled_p ())
1488 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489 			 "not vectorized: vector version will never be "
1490 			 "profitable.");
1491       return false;
1492     }
1493 
1494   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1495                             * vectorization_factor) - 1);
1496 
1497 
1498   /* Use the cost model only if it is more conservative than user specified
1499      threshold.  */
1500 
1501   th = (unsigned) min_scalar_loop_bound;
1502   if (min_profitable_iters
1503       && (!min_scalar_loop_bound
1504           || min_profitable_iters > min_scalar_loop_bound))
1505     th = (unsigned) min_profitable_iters;
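  /* Worked example (illustrative only): with PARAM_MIN_VECT_LOOP_BOUND = 2
     and vectorization_factor = 4, min_scalar_loop_bound is 2*4 - 1 = 7; if
     the cost model returned min_profitable_iters = 10, TH becomes 10, the
     more conservative of the two bounds.  */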
1506 
1507   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1508       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1509     {
1510       if (dump_enabled_p ())
1511 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1512 			 "not vectorized: vectorization not profitable.");
1513       if (dump_enabled_p ())
1514         dump_printf_loc (MSG_NOTE, vect_location,
1515 			 "not vectorized: iteration count smaller than user "
1516 			 "specified loop bound parameter or minimum profitable "
1517 			 "iterations (whichever is more conservative).");
1518       return false;
1519     }
1520 
1521   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1522       && ((unsigned HOST_WIDE_INT) estimated_niter
1523           <= MAX (th, (unsigned)min_profitable_estimate)))
1524     {
1525       if (dump_enabled_p ())
1526 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1527 			 "not vectorized: estimated iteration count too "
1528                          "small.");
1529       if (dump_enabled_p ())
1530         dump_printf_loc (MSG_NOTE, vect_location,
1531 			 "not vectorized: estimated iteration count smaller "
1532                          "than specified loop bound parameter or minimum "
1533                          "profitable iterations (whichever is more "
1534                          "conservative).");
1535       return false;
1536     }
1537 
1538   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1539       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1540       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1541     {
1542       if (dump_enabled_p ())
1543         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1544       if (!vect_can_advance_ivs_p (loop_vinfo))
1545         {
1546           if (dump_enabled_p ())
1547 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1548 			     "not vectorized: can't create epilog loop 1.");
1549           return false;
1550         }
1551       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1552         {
1553           if (dump_enabled_p ())
1554 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555 			     "not vectorized: can't create epilog loop 2.");
1556           return false;
1557         }
1558     }
1559 
1560   return true;
1561 }
1562 
1563 
1564 /* Function vect_analyze_loop_2.
1565 
1566    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1567    for it.  The different analyses will record information in the
1568    loop_vec_info struct.  */
1569 static bool
1570 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1571 {
1572   bool ok, slp = false;
1573   int max_vf = MAX_VECTORIZATION_FACTOR;
1574   int min_vf = 2;
1575 
1576   /* Find all data references in the loop (which correspond to vdefs/vuses)
1577      and analyze their evolution in the loop.  Also adjust the minimal
1578      vectorization factor according to the loads and stores.
1579 
1580      FORNOW: Handle only simple array references whose
1581      alignment can be forced, and aligned pointer references.  */
1582 
1583   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1584   if (!ok)
1585     {
1586       if (dump_enabled_p ())
1587 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1588 			 "bad data references.");
1589       return false;
1590     }
1591 
1592   /* Classify all cross-iteration scalar data-flow cycles.
1593      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1594 
1595   vect_analyze_scalar_cycles (loop_vinfo);
1596 
1597   vect_pattern_recog (loop_vinfo, NULL);
1598 
1599   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1600 
1601   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1602   if (!ok)
1603     {
1604       if (dump_enabled_p ())
1605 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 			 "unexpected pattern.");
1607       return false;
1608     }
1609 
1610   /* Analyze data dependences between the data-refs in the loop
1611      and adjust the maximum vectorization factor according to
1612      the dependences.
1613      FORNOW: fail at the first data dependence that we encounter.  */
1614 
1615   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1616   if (!ok
1617       || max_vf < min_vf)
1618     {
1619       if (dump_enabled_p ())
1620 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621 			     "bad data dependence.");
1622       return false;
1623     }
1624 
1625   ok = vect_determine_vectorization_factor (loop_vinfo);
1626   if (!ok)
1627     {
1628       if (dump_enabled_p ())
1629 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 			 "can't determine vectorization factor.");
1631       return false;
1632     }
1633   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1634     {
1635       if (dump_enabled_p ())
1636 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637 			 "bad data dependence.");
1638       return false;
1639     }
1640 
1641   /* Analyze the alignment of the data-refs in the loop.
1642      Fail if a data reference is found that cannot be vectorized.  */
1643 
1644   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1645   if (!ok)
1646     {
1647       if (dump_enabled_p ())
1648 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 			 "bad data alignment.");
1650       return false;
1651     }
1652 
1653   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1654      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1655 
1656   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1657   if (!ok)
1658     {
1659       if (dump_enabled_p ())
1660 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661 			 "bad data access.");
1662       return false;
1663     }
1664 
1665   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1666      It is important to call pruning after vect_analyze_data_ref_accesses,
1667      since we use grouping information gathered by interleaving analysis.  */
1668   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1669   if (!ok)
1670     {
1671       if (dump_enabled_p ())
1672 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673 			 "list of versioning-for-alias run-time tests is "
1674 			 "too long.");
1675       return false;
1676     }
1677 
1678   /* This pass will decide on using loop versioning and/or loop peeling in
1679      order to enhance the alignment of data references in the loop.  */
1680 
1681   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1682   if (!ok)
1683     {
1684       if (dump_enabled_p ())
1685 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 			 "bad data alignment.");
1687       return false;
1688     }
1689 
1690   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1691   ok = vect_analyze_slp (loop_vinfo, NULL);
1692   if (ok)
1693     {
1694       /* Decide which possible SLP instances to SLP.  */
1695       slp = vect_make_slp_decision (loop_vinfo);
1696 
1697       /* Find stmts that need to be both vectorized and SLPed.  */
1698       vect_detect_hybrid_slp (loop_vinfo);
1699     }
1700   else
1701     return false;
1702 
1703   /* Scan all the operations in the loop and make sure they are
1704      vectorizable.  */
1705 
1706   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1707   if (!ok)
1708     {
1709       if (dump_enabled_p ())
1710 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1711 			 "bad operation or unsupported loop bound.");
1712       return false;
1713     }
1714 
1715   return true;
1716 }
1717 
1718 /* Function vect_analyze_loop.
1719 
1720    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1721    for it.  The different analyses will record information in the
1722    loop_vec_info struct.  */
1723 loop_vec_info
1724 vect_analyze_loop (struct loop *loop)
1725 {
1726   loop_vec_info loop_vinfo;
1727   unsigned int vector_sizes;
1728 
1729   /* Autodetect first vector size we try.  */
1730   current_vector_size = 0;
1731   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1732 
1733   if (dump_enabled_p ())
1734     dump_printf_loc (MSG_NOTE, vect_location,
1735 		     "===== analyze_loop_nest =====");
1736 
1737   if (loop_outer (loop)
1738       && loop_vec_info_for_loop (loop_outer (loop))
1739       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1740     {
1741       if (dump_enabled_p ())
1742 	dump_printf_loc (MSG_NOTE, vect_location,
1743 			 "outer-loop already vectorized.");
1744       return NULL;
1745     }
1746 
1747   while (1)
1748     {
1749       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1750       loop_vinfo = vect_analyze_loop_form (loop);
1751       if (!loop_vinfo)
1752 	{
1753 	  if (dump_enabled_p ())
1754 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1755 			     "bad loop form.");
1756 	  return NULL;
1757 	}
1758 
1759       if (vect_analyze_loop_2 (loop_vinfo))
1760 	{
1761 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1762 
1763 	  return loop_vinfo;
1764 	}
1765 
1766       destroy_loop_vec_info (loop_vinfo, true);
1767 
1768       vector_sizes &= ~current_vector_size;
1769       if (vector_sizes == 0
1770 	  || current_vector_size == 0)
1771 	return NULL;
1772 
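      /* A hypothetical illustration: if the target advertises 16-byte and
         8-byte vectors (vector_sizes == 16 | 8), the first pass lets the
         target auto-detect a size (current_vector_size starts at 0); after a
         failed analysis the bit for the size just tried, say 16, has been
         cleared above, so the retry below picks the largest remaining size,
         here 8.  */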
1773       /* Try the next biggest vector size.  */
1774       current_vector_size = 1 << floor_log2 (vector_sizes);
1775       if (dump_enabled_p ())
1776 	dump_printf_loc (MSG_NOTE, vect_location,
1777 			 "***** Re-trying analysis with "
1778 			 "vector size %d\n", current_vector_size);
1779     }
1780 }
1781 
1782 
1783 /* Function reduction_code_for_scalar_code
1784 
1785    Input:
1786    CODE - tree_code of a reduction operation.
1787 
1788    Output:
1789    REDUC_CODE - the corresponding tree-code to be used to reduce the
1790       vector of partial results into a single scalar result (which
1791       will also reside in a vector) or ERROR_MARK if the operation is
1792       a supported reduction operation, but does not have such a tree-code.
1793 
1794    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
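
/* For illustration (a hypothetical use): a loop accumulating "s += a[i]"
   maps CODE == PLUS_EXPR to REDUC_PLUS_EXPR, so the vector of partial sums
   can be reduced by a single statement in the epilogue.  For MULT_EXPR no
   such tree-code exists, so *REDUC_CODE becomes ERROR_MARK and the epilogue
   falls back to vector shifts or element extracts
   (see vect_model_reduction_cost).  */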
1795 
1796 static bool
1797 reduction_code_for_scalar_code (enum tree_code code,
1798                                 enum tree_code *reduc_code)
1799 {
1800   switch (code)
1801     {
1802       case MAX_EXPR:
1803         *reduc_code = REDUC_MAX_EXPR;
1804         return true;
1805 
1806       case MIN_EXPR:
1807         *reduc_code = REDUC_MIN_EXPR;
1808         return true;
1809 
1810       case PLUS_EXPR:
1811         *reduc_code = REDUC_PLUS_EXPR;
1812         return true;
1813 
1814       case MULT_EXPR:
1815       case MINUS_EXPR:
1816       case BIT_IOR_EXPR:
1817       case BIT_XOR_EXPR:
1818       case BIT_AND_EXPR:
1819         *reduc_code = ERROR_MARK;
1820         return true;
1821 
1822       default:
1823        return false;
1824     }
1825 }
1826 
1827 
1828 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1829    STMT is printed with a message MSG. */
1830 
1831 static void
1832 report_vect_op (int msg_type, gimple stmt, const char *msg)
1833 {
1834   dump_printf_loc (msg_type, vect_location, "%s", msg);
1835   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1836 }
1837 
1838 
1839 /* Detect SLP reduction of the form:
1840 
1841    #a1 = phi <a5, a0>
1842    a2 = operation (a1)
1843    a3 = operation (a2)
1844    a4 = operation (a3)
1845    a5 = operation (a4)
1846 
1847    #a = phi <a5>
1848 
1849    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1850    FIRST_STMT is the first reduction stmt in the chain
1851    (a2 = operation (a1)).
1852 
1853    Return TRUE if a reduction chain was detected.  */
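
/* For illustration only (hypothetical source code): such a chain typically
   arises from a manually unrolled accumulation, e.g.

       for (i = 0; i < n; i += 4)
         {
           s += x[i];        (a2 = operation (a1))
           s += x[i + 1];    (a3 = operation (a2))
           s += x[i + 2];    (a4 = operation (a3))
           s += x[i + 3];    (a5 = operation (a4))
         }  */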
1854 
1855 static bool
1856 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1857 {
1858   struct loop *loop = (gimple_bb (phi))->loop_father;
1859   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1860   enum tree_code code;
1861   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1862   stmt_vec_info use_stmt_info, current_stmt_info;
1863   tree lhs;
1864   imm_use_iterator imm_iter;
1865   use_operand_p use_p;
1866   int nloop_uses, size = 0, n_out_of_loop_uses;
1867   bool found = false;
1868 
1869   if (loop != vect_loop)
1870     return false;
1871 
1872   lhs = PHI_RESULT (phi);
1873   code = gimple_assign_rhs_code (first_stmt);
1874   while (1)
1875     {
1876       nloop_uses = 0;
1877       n_out_of_loop_uses = 0;
1878       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1879         {
1880 	  gimple use_stmt = USE_STMT (use_p);
1881           if (is_gimple_debug (use_stmt))
1882             continue;
1883 
1886           /* Check if we got back to the reduction phi.  */
1887 	  if (use_stmt == phi)
1888             {
1889 	      loop_use_stmt = use_stmt;
1890               found = true;
1891               break;
1892             }
1893 
1894           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1895             {
1896               if (vinfo_for_stmt (use_stmt)
1897                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1898                 {
1899                   loop_use_stmt = use_stmt;
1900                   nloop_uses++;
1901                 }
1902             }
1903            else
1904              n_out_of_loop_uses++;
1905 
1906            /* There can be either a single use in the loop or two uses in
1907               phi nodes.  */
1908            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1909              return false;
1910         }
1911 
1912       if (found)
1913         break;
1914 
1915       /* We reached a statement with no loop uses.  */
1916       if (nloop_uses == 0)
1917 	return false;
1918 
1919       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1920       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1921         return false;
1922 
1923       if (!is_gimple_assign (loop_use_stmt)
1924 	  || code != gimple_assign_rhs_code (loop_use_stmt)
1925 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1926         return false;
1927 
1928       /* Insert USE_STMT into reduction chain.  */
1929       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1930       if (current_stmt)
1931         {
1932           current_stmt_info = vinfo_for_stmt (current_stmt);
1933 	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1934           GROUP_FIRST_ELEMENT (use_stmt_info)
1935             = GROUP_FIRST_ELEMENT (current_stmt_info);
1936         }
1937       else
1938 	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1939 
1940       lhs = gimple_assign_lhs (loop_use_stmt);
1941       current_stmt = loop_use_stmt;
1942       size++;
1943    }
1944 
1945   if (!found || loop_use_stmt != phi || size < 2)
1946     return false;
1947 
1948   /* Swap the operands, if needed, to make the reduction operand be the second
1949      operand.  */
1950   lhs = PHI_RESULT (phi);
1951   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1952   while (next_stmt)
1953     {
1954       if (gimple_assign_rhs2 (next_stmt) == lhs)
1955 	{
1956 	  tree op = gimple_assign_rhs1 (next_stmt);
1957           gimple def_stmt = NULL;
1958 
1959           if (TREE_CODE (op) == SSA_NAME)
1960             def_stmt = SSA_NAME_DEF_STMT (op);
1961 
1962 	  /* Check that the other def is either defined in the loop
1963 	     ("vect_internal_def"), or it's an induction (defined by a
1964 	     loop-header phi-node).  */
1965           if (def_stmt
1966               && gimple_bb (def_stmt)
1967 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1968               && (is_gimple_assign (def_stmt)
1969                   || is_gimple_call (def_stmt)
1970                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1971                            == vect_induction_def
1972                   || (gimple_code (def_stmt) == GIMPLE_PHI
1973                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1974                                   == vect_internal_def
1975                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1976 	    {
1977 	      lhs = gimple_assign_lhs (next_stmt);
1978 	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1979  	      continue;
1980 	    }
1981 
1982 	  return false;
1983 	}
1984       else
1985 	{
1986           tree op = gimple_assign_rhs2 (next_stmt);
1987           gimple def_stmt = NULL;
1988 
1989           if (TREE_CODE (op) == SSA_NAME)
1990             def_stmt = SSA_NAME_DEF_STMT (op);
1991 
1992           /* Check that the other def is either defined in the loop
1993             ("vect_internal_def"), or it's an induction (defined by a
1994             loop-header phi-node).  */
1995           if (def_stmt
1996               && gimple_bb (def_stmt)
1997 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1998               && (is_gimple_assign (def_stmt)
1999                   || is_gimple_call (def_stmt)
2000                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2001                               == vect_induction_def
2002                   || (gimple_code (def_stmt) == GIMPLE_PHI
2003                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2004                                   == vect_internal_def
2005                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2006   	    {
2007 	      if (dump_enabled_p ())
2008 		{
2009 		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2010 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2011 		}
2012 
2013 	      swap_tree_operands (next_stmt,
2014 	 		          gimple_assign_rhs1_ptr (next_stmt),
2015                                   gimple_assign_rhs2_ptr (next_stmt));
2016 	      update_stmt (next_stmt);
2017 
2018 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2019 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2020 	    }
2021 	  else
2022 	    return false;
2023         }
2024 
2025       lhs = gimple_assign_lhs (next_stmt);
2026       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2027     }
2028 
2029   /* Save the chain for further analysis in SLP detection.  */
2030   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2031   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2032   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2033 
2034   return true;
2035 }
2036 
2037 
2038 /* Function vect_is_simple_reduction_1
2039 
2040    (1) Detect a cross-iteration def-use cycle that represents a simple
2041    reduction computation.  We look for the following pattern:
2042 
2043    loop_header:
2044      a1 = phi < a0, a2 >
2045      a3 = ...
2046      a2 = operation (a3, a1)
2047 
2048    such that:
2049    1. operation is commutative and associative and it is safe to
2050       change the order of the computation (if CHECK_REDUCTION is true)
2051    2. no uses for a2 in the loop (a2 is used out of the loop)
2052    3. no uses of a1 in the loop besides the reduction operation
2053    4. no uses of a1 outside the loop.
2054 
2055    Conditions 1,4 are tested here.
2056    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2057 
2058    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2059    nested cycles, if CHECK_REDUCTION is false.
2060 
2061    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2062    reductions:
2063 
2064      a1 = phi < a0, a2 >
2065      inner loop (def of a3)
2066      a2 = phi < a3 >
2067 
2068    If MODIFY is true it tries also to rework the code in-place to enable
2069    detection of more reduction patterns.  For the time being we rewrite
2070    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2071 */
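
/* For illustration (a hypothetical source loop): the pattern in (1) above
   corresponds to

       s = s0;
       for (i = 0; i < n; i++)
         s = s + x[i];

   where, in SSA form, a1 = phi <a0, a2> is the value of s at the loop
   header, a3 is the loaded x[i], a2 = a3 + a1 is the update, and the only
   use of the result outside the reduction statement is after the loop.  */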
2072 
2073 static gimple
2074 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2075 			    bool check_reduction, bool *double_reduc,
2076 			    bool modify)
2077 {
2078   struct loop *loop = (gimple_bb (phi))->loop_father;
2079   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2080   edge latch_e = loop_latch_edge (loop);
2081   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2082   gimple def_stmt, def1 = NULL, def2 = NULL;
2083   enum tree_code orig_code, code;
2084   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2085   tree type;
2086   int nloop_uses;
2087   tree name;
2088   imm_use_iterator imm_iter;
2089   use_operand_p use_p;
2090   bool phi_def;
2091 
2092   *double_reduc = false;
2093 
2094   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2095      otherwise, we assume outer loop vectorization.  */
2096   gcc_assert ((check_reduction && loop == vect_loop)
2097               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2098 
2099   name = PHI_RESULT (phi);
2100   /* ???  If there are no uses of the PHI result the inner loop reduction
2101      won't be detected as possibly double-reduction by vectorizable_reduction
2102      because that tries to walk the PHI arg from the preheader edge which
2103      can be constant.  See PR60382.  */
2104   if (has_zero_uses (name))
2105     return NULL;
2106   nloop_uses = 0;
2107   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2108     {
2109       gimple use_stmt = USE_STMT (use_p);
2110       if (is_gimple_debug (use_stmt))
2111 	continue;
2112 
2113       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2114         {
2115           if (dump_enabled_p ())
2116 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 			     "intermediate value used outside loop.");
2118 
2119           return NULL;
2120         }
2121 
2122       if (vinfo_for_stmt (use_stmt)
2123 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2124         nloop_uses++;
2125       if (nloop_uses > 1)
2126         {
2127           if (dump_enabled_p ())
2128 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 			     "reduction used in loop.");
2130           return NULL;
2131         }
2132     }
2133 
2134   if (TREE_CODE (loop_arg) != SSA_NAME)
2135     {
2136       if (dump_enabled_p ())
2137 	{
2138 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139 			   "reduction: not ssa_name: ");
2140 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2141 	}
2142       return NULL;
2143     }
2144 
2145   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2146   if (!def_stmt)
2147     {
2148       if (dump_enabled_p ())
2149 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 			 "reduction: no def_stmt.");
2151       return NULL;
2152     }
2153 
2154   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2155     {
2156       if (dump_enabled_p ())
2157         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2158       return NULL;
2159     }
2160 
2161   if (is_gimple_assign (def_stmt))
2162     {
2163       name = gimple_assign_lhs (def_stmt);
2164       phi_def = false;
2165     }
2166   else
2167     {
2168       name = PHI_RESULT (def_stmt);
2169       phi_def = true;
2170     }
2171 
2172   nloop_uses = 0;
2173   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2174     {
2175       gimple use_stmt = USE_STMT (use_p);
2176       if (is_gimple_debug (use_stmt))
2177 	continue;
2178       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2179 	  && vinfo_for_stmt (use_stmt)
2180 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2181 	nloop_uses++;
2182       if (nloop_uses > 1)
2183 	{
2184 	  if (dump_enabled_p ())
2185 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186 			     "reduction used in loop.");
2187 	  return NULL;
2188 	}
2189     }
2190 
2191   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2192      defined in the inner loop.  */
2193   if (phi_def)
2194     {
2195       op1 = PHI_ARG_DEF (def_stmt, 0);
2196 
2197       if (gimple_phi_num_args (def_stmt) != 1
2198           || TREE_CODE (op1) != SSA_NAME)
2199         {
2200           if (dump_enabled_p ())
2201 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202 			     "unsupported phi node definition.");
2203 
2204           return NULL;
2205         }
2206 
2207       def1 = SSA_NAME_DEF_STMT (op1);
2208       if (gimple_bb (def1)
2209 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2210           && loop->inner
2211           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2212           && is_gimple_assign (def1))
2213         {
2214           if (dump_enabled_p ())
2215             report_vect_op (MSG_NOTE, def_stmt,
2216 			    "detected double reduction: ");
2217 
2218           *double_reduc = true;
2219           return def_stmt;
2220         }
2221 
2222       return NULL;
2223     }
2224 
2225   code = orig_code = gimple_assign_rhs_code (def_stmt);
2226 
2227   /* We can handle "res -= x[i]", which is non-associative, by
2228      simply rewriting it into "res += -x[i]".  Avoid changing the
2229      gimple instruction during the first simple checks, and only do this
2230      if we're allowed to change the code at all.  */
2231   if (code == MINUS_EXPR
2232       && modify
2233       && (op1 = gimple_assign_rhs1 (def_stmt))
2234       && TREE_CODE (op1) == SSA_NAME
2235       && SSA_NAME_DEF_STMT (op1) == phi)
2236     code = PLUS_EXPR;
2237 
2238   if (check_reduction
2239       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2240     {
2241       if (dump_enabled_p ())
2242         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2243 			"reduction: not commutative/associative: ");
2244       return NULL;
2245     }
2246 
2247   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2248     {
2249       if (code != COND_EXPR)
2250         {
2251 	  if (dump_enabled_p ())
2252 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2253 			    "reduction: not binary operation: ");
2254 
2255           return NULL;
2256         }
2257 
2258       op3 = gimple_assign_rhs1 (def_stmt);
2259       if (COMPARISON_CLASS_P (op3))
2260         {
2261           op4 = TREE_OPERAND (op3, 1);
2262           op3 = TREE_OPERAND (op3, 0);
2263         }
2264 
2265       op1 = gimple_assign_rhs2 (def_stmt);
2266       op2 = gimple_assign_rhs3 (def_stmt);
2267 
2268       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2269         {
2270           if (dump_enabled_p ())
2271             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2272 			    "reduction: uses not ssa_names: ");
2273 
2274           return NULL;
2275         }
2276     }
2277   else
2278     {
2279       op1 = gimple_assign_rhs1 (def_stmt);
2280       op2 = gimple_assign_rhs2 (def_stmt);
2281 
2282       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2283         {
2284           if (dump_enabled_p ())
2285 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2286 			    "reduction: uses not ssa_names: ");
2287 
2288           return NULL;
2289         }
2290    }
2291 
2292   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2293   if ((TREE_CODE (op1) == SSA_NAME
2294        && !types_compatible_p (type,TREE_TYPE (op1)))
2295       || (TREE_CODE (op2) == SSA_NAME
2296           && !types_compatible_p (type, TREE_TYPE (op2)))
2297       || (op3 && TREE_CODE (op3) == SSA_NAME
2298           && !types_compatible_p (type, TREE_TYPE (op3)))
2299       || (op4 && TREE_CODE (op4) == SSA_NAME
2300           && !types_compatible_p (type, TREE_TYPE (op4))))
2301     {
2302       if (dump_enabled_p ())
2303         {
2304           dump_printf_loc (MSG_NOTE, vect_location,
2305 			   "reduction: multiple types: operation type: ");
2306           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2307           dump_printf (MSG_NOTE, ", operands types: ");
2308           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2309 			     TREE_TYPE (op1));
2310           dump_printf (MSG_NOTE, ",");
2311           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2312 			     TREE_TYPE (op2));
2313           if (op3)
2314             {
2315               dump_printf (MSG_NOTE, ",");
2316               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2317 				 TREE_TYPE (op3));
2318             }
2319 
2320           if (op4)
2321             {
2322               dump_printf (MSG_NOTE, ",");
2323               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2324 				 TREE_TYPE (op4));
2325             }
2326         }
2327 
2328       return NULL;
2329     }
2330 
2331   /* Check that it's ok to change the order of the computation.
2332      Generally, when vectorizing a reduction we change the order of the
2333      computation.  This may change the behavior of the program in some
2334      cases, so we need to check that this is ok.  One exception is when
2335      vectorizing an outer-loop: the inner-loop is executed sequentially,
2336      and therefore vectorizing reductions in the inner-loop during
2337      outer-loop vectorization is safe.  */
2338 
2339   /* CHECKME: check for !flag_finite_math_only too?  */
2340   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2341       && check_reduction)
2342     {
2343       /* Changing the order of operations changes the semantics.  */
2344       if (dump_enabled_p ())
2345 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2346 			"reduction: unsafe fp math optimization: ");
2347       return NULL;
2348     }
2349   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2350 	   && check_reduction)
2351     {
2352       /* Changing the order of operations changes the semantics.  */
2353       if (dump_enabled_p ())
2354 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2355 			"reduction: unsafe int math optimization: ");
2356       return NULL;
2357     }
2358   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2359     {
2360       /* Changing the order of operations changes the semantics.  */
2361       if (dump_enabled_p ())
2362 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2363 			"reduction: unsafe fixed-point math optimization: ");
2364       return NULL;
2365     }
2366 
2367   /* If we detected "res -= x[i]" earlier, rewrite it into
2368      "res += -x[i]" now.  If this turns out to be useless reassoc
2369      will clean it up again.  */
2370   if (orig_code == MINUS_EXPR)
2371     {
2372       tree rhs = gimple_assign_rhs2 (def_stmt);
2373       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2374       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2375 							 rhs, NULL);
2376       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2377       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2378 							  loop_info, NULL));
2379       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2380       gimple_assign_set_rhs2 (def_stmt, negrhs);
2381       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2382       update_stmt (def_stmt);
2383     }
2384 
2385   /* Reduction is safe. We're dealing with one of the following:
2386      1) integer arithmetic and no trapv
2387      2) floating point arithmetic, and special flags permit this optimization
2388      3) nested cycle (i.e., outer loop vectorization).  */
2389   if (TREE_CODE (op1) == SSA_NAME)
2390     def1 = SSA_NAME_DEF_STMT (op1);
2391 
2392   if (TREE_CODE (op2) == SSA_NAME)
2393     def2 = SSA_NAME_DEF_STMT (op2);
2394 
2395   if (code != COND_EXPR
2396       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2397     {
2398       if (dump_enabled_p ())
2399 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2400       return NULL;
2401     }
2402 
2403   /* Check that one def is the reduction def, defined by PHI,
2404      the other def is either defined in the loop ("vect_internal_def"),
2405      or it's an induction (defined by a loop-header phi-node).  */
2406 
2407   if (def2 && def2 == phi
2408       && (code == COND_EXPR
2409 	  || !def1 || gimple_nop_p (def1)
2410           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2411               && (is_gimple_assign (def1)
2412 		  || is_gimple_call (def1)
2413   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2414                       == vect_induction_def
2415    	          || (gimple_code (def1) == GIMPLE_PHI
2416 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2417                           == vect_internal_def
2418  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
2419     {
2420       if (dump_enabled_p ())
2421 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2422       return def_stmt;
2423     }
2424 
2425   if (def1 && def1 == phi
2426       && (code == COND_EXPR
2427 	  || !def2 || gimple_nop_p (def2)
2428           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2429  	      && (is_gimple_assign (def2)
2430 		  || is_gimple_call (def2)
2431 	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2432                       == vect_induction_def
2433  	          || (gimple_code (def2) == GIMPLE_PHI
2434 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2435                           == vect_internal_def
2436 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
2437     {
2438       if (check_reduction)
2439         {
2440           /* Swap operands (just for simplicity - so that the rest of the code
2441 	     can assume that the reduction variable is always the last (second)
2442 	     argument).  */
2443           if (dump_enabled_p ())
2444 	    report_vect_op (MSG_NOTE, def_stmt,
2445 	  	            "detected reduction: need to swap operands: ");
2446 
2447           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2448  			      gimple_assign_rhs2_ptr (def_stmt));
2449 
2450 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2451 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2452         }
2453       else
2454         {
2455           if (dump_enabled_p ())
2456             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2457         }
2458 
2459       return def_stmt;
2460     }
2461 
2462   /* Try to find SLP reduction chain.  */
2463   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2464     {
2465       if (dump_enabled_p ())
2466         report_vect_op (MSG_NOTE, def_stmt,
2467 			"reduction: detected reduction chain: ");
2468 
2469       return def_stmt;
2470     }
2471 
2472   if (dump_enabled_p ())
2473     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2474 		    "reduction: unknown pattern: ");
2475 
2476   return NULL;
2477 }
2478 
2479 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2480    in-place.  Arguments as there.  */
2481 
2482 static gimple
2483 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2484                           bool check_reduction, bool *double_reduc)
2485 {
2486   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2487 				     double_reduc, false);
2488 }
2489 
2490 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2491    in-place if it enables detection of more reductions.  Arguments
2492    as there.  */
2493 
2494 gimple
2495 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2496                           bool check_reduction, bool *double_reduc)
2497 {
2498   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2499 				     double_reduc, true);
2500 }
2501 
2502 /* Calculate the cost of one scalar iteration of the loop.  */
2503 int
2504 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2505 {
2506   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2507   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2508   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2509   int innerloop_iters, i, stmt_cost;
2510 
2511   /* Count statements in scalar loop.  Using this as scalar cost for a single
2512      iteration for now.
2513 
2514      TODO: Add outer loop support.
2515 
2516      TODO: Consider assigning different costs to different scalar
2517      statements.  */
2518 
2519   /* FORNOW.  */
2520   innerloop_iters = 1;
2521   if (loop->inner)
2522     innerloop_iters = 50; /* FIXME */
2523 
2524   for (i = 0; i < nbbs; i++)
2525     {
2526       gimple_stmt_iterator si;
2527       basic_block bb = bbs[i];
2528 
2529       if (bb->loop_father == loop->inner)
2530         factor = innerloop_iters;
2531       else
2532         factor = 1;
2533 
2534       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2535         {
2536           gimple stmt = gsi_stmt (si);
2537           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2538 
2539           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2540             continue;
2541 
2542           /* Skip stmts that are not vectorized inside the loop.  */
2543           if (stmt_info
2544               && !STMT_VINFO_RELEVANT_P (stmt_info)
2545               && (!STMT_VINFO_LIVE_P (stmt_info)
2546                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2547 	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2548             continue;
2549 
2550           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2551             {
2552               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2553                stmt_cost = vect_get_stmt_cost (scalar_load);
2554              else
2555                stmt_cost = vect_get_stmt_cost (scalar_store);
2556             }
2557           else
2558             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2559 
2560           scalar_single_iter_cost += stmt_cost * factor;
2561         }
2562     }
2563   return scalar_single_iter_cost;
2564 }
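
/* A worked example with hypothetical costs: assuming the default cost of 1
   for scalar_load, scalar_store and scalar_stmt, a single-block loop body
   implementing "a[i] = b[i] + c[i]" contains two loads, one addition and one
   store, so the function above returns 1 + 1 + 1 + 1 = 4.  */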
2565 
2566 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2567 int
2568 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2569                              int *peel_iters_epilogue,
2570                              int scalar_single_iter_cost,
2571 			     stmt_vector_for_cost *prologue_cost_vec,
2572 			     stmt_vector_for_cost *epilogue_cost_vec)
2573 {
2574   int retval = 0;
2575   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2576 
2577   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2578     {
2579       *peel_iters_epilogue = vf/2;
2580       if (dump_enabled_p ())
2581         dump_printf_loc (MSG_NOTE, vect_location,
2582 			 "cost model: epilogue peel iters set to vf/2 "
2583 			 "because loop iterations are unknown.");
2584 
2585       /* If peeled iterations are known but the number of scalar loop
2586          iterations is unknown, count a taken branch per peeled loop.  */
2587       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2588 				 NULL, 0, vect_prologue);
2589     }
2590   else
2591     {
2592       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2593       peel_iters_prologue = niters < peel_iters_prologue ?
2594                             niters : peel_iters_prologue;
2595       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2596       /* If we need to peel for gaps, but no peeling is required, we have to
2597 	 peel VF iterations.  */
2598       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2599         *peel_iters_epilogue = vf;
2600     }
2601 
2602   if (peel_iters_prologue)
2603     retval += record_stmt_cost (prologue_cost_vec,
2604 				peel_iters_prologue * scalar_single_iter_cost,
2605 				scalar_stmt, NULL, 0, vect_prologue);
2606   if (*peel_iters_epilogue)
2607     retval += record_stmt_cost (epilogue_cost_vec,
2608 				*peel_iters_epilogue * scalar_single_iter_cost,
2609 				scalar_stmt, NULL, 0, vect_epilogue);
2610   return retval;
2611 }
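
/* A hypothetical example for the known-niters path above: with VF = 4,
   NITERS = 23 and PEEL_ITERS_PROLOGUE = 3, the epilogue gets
   (23 - 3) % 4 = 0 iterations, bumped to 4 when the loop is peeled for
   gaps; each peeled iteration is then costed as SCALAR_SINGLE_ITER_COST
   scalar statements in the prologue resp. epilogue cost vector.  */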
2612 
2613 /* Function vect_estimate_min_profitable_iters
2614 
2615    Return the number of iterations required for the vector version of the
2616    loop to be profitable relative to the cost of the scalar version of the
2617    loop.  */
2618 
2619 static void
2620 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2621 				    int *ret_min_profitable_niters,
2622 				    int *ret_min_profitable_estimate)
2623 {
2624   int min_profitable_iters;
2625   int min_profitable_estimate;
2626   int peel_iters_prologue;
2627   int peel_iters_epilogue;
2628   unsigned vec_inside_cost = 0;
2629   int vec_outside_cost = 0;
2630   unsigned vec_prologue_cost = 0;
2631   unsigned vec_epilogue_cost = 0;
2632   int scalar_single_iter_cost = 0;
2633   int scalar_outside_cost = 0;
2634   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2635   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2636   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2637 
2638   /* Cost model disabled.  */
2639   if (!flag_vect_cost_model)
2640     {
2641       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2642       *ret_min_profitable_niters = 0;
2643       *ret_min_profitable_estimate = 0;
2644       return;
2645     }
2646 
2647   /* Requires loop versioning tests to handle misalignment.  */
2648   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2649     {
2650       /*  FIXME: Make cost depend on complexity of individual check.  */
2651       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2652       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2653 			    vect_prologue);
2654       dump_printf (MSG_NOTE,
2655                    "cost model: Adding cost of checks for loop "
2656                    "versioning to treat misalignment.\n");
2657     }
2658 
2659   /* Requires loop versioning with alias checks.  */
2660   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2661     {
2662       /*  FIXME: Make cost depend on complexity of individual check.  */
2663       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2664       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2665 			    vect_prologue);
2666       dump_printf (MSG_NOTE,
2667                    "cost model: Adding cost of checks for loop "
2668                    "versioning aliasing.\n");
2669     }
2670 
2671   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2672       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2673     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2674 			  vect_prologue);
2675 
2676   /* Count statements in scalar loop.  Using this as scalar cost for a single
2677      iteration for now.
2678 
2679      TODO: Add outer loop support.
2680 
2681      TODO: Consider assigning different costs to different scalar
2682      statements.  */
2683 
2684   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2685 
2686   /* Add additional cost for the peeled instructions in prologue and epilogue
2687      loop.
2688 
2689      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2690      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2691 
2692      TODO: Build an expression that represents peel_iters for prologue and
2693      epilogue to be used in a run-time test.  */
2694 
2695   if (npeel  < 0)
2696     {
2697       peel_iters_prologue = vf/2;
2698       dump_printf (MSG_NOTE, "cost model: "
2699                    "prologue peel iters set to vf/2.");
2700 
2701       /* If peeling for alignment is unknown, loop bound of main loop becomes
2702          unknown.  */
2703       peel_iters_epilogue = vf/2;
2704       dump_printf (MSG_NOTE, "cost model: "
2705                    "epilogue peel iters set to vf/2 because "
2706                    "peeling for alignment is unknown.");
2707 
2708       /* If peeled iterations are unknown, count a taken branch and a not taken
2709          branch per peeled loop. Even if scalar loop iterations are known,
2710          vector iterations are not known since peeled prologue iterations are
2711          not known. Hence guards remain the same.  */
2712       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2713 			    NULL, 0, vect_prologue);
2714       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2715 			    NULL, 0, vect_prologue);
2716       /* FORNOW: Don't attempt to pass individual scalar instructions to
2717 	 the model; just assume linear cost for scalar iterations.  */
2718       (void) add_stmt_cost (target_cost_data,
2719 			    peel_iters_prologue * scalar_single_iter_cost,
2720 			    scalar_stmt, NULL, 0, vect_prologue);
2721       (void) add_stmt_cost (target_cost_data,
2722 			    peel_iters_epilogue * scalar_single_iter_cost,
2723 			    scalar_stmt, NULL, 0, vect_epilogue);
2724     }
2725   else
2726     {
2727       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2728       stmt_info_for_cost *si;
2729       int j;
2730       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2731 
2732       prologue_cost_vec.create (2);
2733       epilogue_cost_vec.create (2);
2734       peel_iters_prologue = npeel;
2735 
2736       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2737 					  &peel_iters_epilogue,
2738 					  scalar_single_iter_cost,
2739 					  &prologue_cost_vec,
2740 					  &epilogue_cost_vec);
2741 
2742       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2743 	{
2744 	  struct _stmt_vec_info *stmt_info
2745 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2746 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2747 				si->misalign, vect_prologue);
2748 	}
2749 
2750       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2751 	{
2752 	  struct _stmt_vec_info *stmt_info
2753 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2754 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2755 				si->misalign, vect_epilogue);
2756 	}
2757 
2758       prologue_cost_vec.release ();
2759       epilogue_cost_vec.release ();
2760     }
2761 
2762   /* FORNOW: The scalar outside cost is incremented in one of the
2763      following ways:
2764 
2765      1. The vectorizer checks for alignment and aliasing and generates
2766      a condition that allows dynamic vectorization.  A cost model
2767      check is ANDED with the versioning condition.  Hence scalar code
2768      path now has the added cost of the versioning check.
2769 
2770        if (cost > th & versioning_check)
2771          jmp to vector code
2772 
2773      Hence run-time scalar is incremented by not-taken branch cost.
2774 
2775      2. The vectorizer then checks if a prologue is required.  If the
2776      cost model check was not done before during versioning, it has to
2777      be done before the prologue check.
2778 
2779        if (cost <= th)
2780          prologue = scalar_iters
2781        if (prologue == 0)
2782          jmp to vector code
2783        else
2784          execute prologue
2785        if (prologue == num_iters)
2786 	 go to exit
2787 
2788      Hence the run-time scalar cost is incremented by a taken branch,
2789      plus a not-taken branch, plus a taken branch cost.
2790 
2791      3. The vectorizer then checks if an epilogue is required.  If the
2792      cost model check was not done before during prologue check, it
2793      has to be done with the epilogue check.
2794 
2795        if (prologue == 0)
2796          jmp to vector code
2797        else
2798          execute prologue
2799        if (prologue == num_iters)
2800 	 go to exit
2801        vector code:
2802          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2803            jmp to epilogue
2804 
2805      Hence the run-time scalar cost should be incremented by 2 taken
2806      branches.
2807 
2808      TODO: The back end may reorder the BBs differently and reverse
2809      conditions/branch directions.  Change the estimates below to
2810      something more reasonable.  */
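
  /* A hypothetical example of the three cases above (assuming the default
     hook costs of 3 for a taken and 1 for a not-taken branch): versioning
     alone adds 1 to the scalar path; an unknown peeling amount instead adds
     2 * 3 + 1 = 7; a loop that only needs the epilogue check adds
     2 * 3 = 6.  These match the increments applied just below.  */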
2811 
2812   /* If the number of iterations is known and we do not do versioning, we can
2813      decide whether to vectorize at compile time.  Hence the scalar version
2814      do not carry cost model guard costs.  */
2815   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2816       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2817       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2818     {
2819       /* Cost model check occurs at versioning.  */
2820       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2821           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2822 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2823       else
2824 	{
2825 	  /* Cost model check occurs at prologue generation.  */
2826 	  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2827 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2828 	      + vect_get_stmt_cost (cond_branch_not_taken);
2829 	  /* Cost model check occurs at epilogue generation.  */
2830 	  else
2831 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2832 	}
2833     }
2834 
2835   /* Complete the target-specific cost calculations.  */
2836   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2837 	       &vec_inside_cost, &vec_epilogue_cost);
2838 
2839   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2840 
2841   /* Calculate number of iterations required to make the vector version
2842      profitable, relative to the loop bodies only.  The following condition
2843      must hold true:
2844      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2845      where
2846      SIC = scalar iteration cost, VIC = vector iteration cost,
2847      VOC = vector outside cost, VF = vectorization factor,
2848      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2849      SOC = scalar outside cost for run time cost model check.  */
2850 
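  /* A worked example with hypothetical costs for the computation below:
     with SIC = 4, VIC = 6, VOC = 10, SOC = 0, VF = 4 and no peeling, the
     division gives (10 * 4) / (4 * 4 - 6) = 4; the correction that follows
     bumps this to 5, because at exactly 4 iterations the scalar and vector
     versions merely break even (16 vs. 6 + 10).  */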
2851   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2852     {
2853       if (vec_outside_cost <= 0)
2854         min_profitable_iters = 1;
2855       else
2856         {
2857           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2858 				  - vec_inside_cost * peel_iters_prologue
2859                                   - vec_inside_cost * peel_iters_epilogue)
2860                                  / ((scalar_single_iter_cost * vf)
2861                                     - vec_inside_cost);
2862 
2863           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2864               <= (((int) vec_inside_cost * min_profitable_iters)
2865                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2866             min_profitable_iters++;
2867         }
2868     }
2869   /* vector version will never be profitable.  */
2870   else
2871     {
2872       if (dump_enabled_p ())
2873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2874 			 "cost model: the vector iteration cost = %d "
2875 			 "divided by the scalar iteration cost = %d "
2876 			 "is greater than or equal to the vectorization factor = %d.",
2877 			 vec_inside_cost, scalar_single_iter_cost, vf);
2878       *ret_min_profitable_niters = -1;
2879       *ret_min_profitable_estimate = -1;
2880       return;
2881     }
2882 
2883   if (dump_enabled_p ())
2884     {
2885       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2886       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2887                    vec_inside_cost);
2888       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2889                    vec_prologue_cost);
2890       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2891                    vec_epilogue_cost);
2892       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2893                    scalar_single_iter_cost);
2894       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2895                    scalar_outside_cost);
2896       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2897                    vec_outside_cost);
2898       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2899                    peel_iters_prologue);
2900       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2901                    peel_iters_epilogue);
2902       dump_printf (MSG_NOTE,
2903                    "  Calculated minimum iters for profitability: %d\n",
2904                    min_profitable_iters);
2905     }
2906 
2907   min_profitable_iters =
2908 	min_profitable_iters < vf ? vf : min_profitable_iters;
2909 
2910   /* Because the condition we create is:
2911      if (niters <= min_profitable_iters)
2912        then skip the vectorized loop.  */
2913   min_profitable_iters--;
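  /* E.g. if 5 iterations are needed to profit, the guard emitted later is
     "if (niters <= 4) skip the vector loop", hence the decrement above.  */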
2914 
2915   if (dump_enabled_p ())
2916     dump_printf_loc (MSG_NOTE, vect_location,
2917                      "  Runtime profitability threshold = %d\n", min_profitable_iters);
2918 
2919   *ret_min_profitable_niters = min_profitable_iters;
2920 
2921   /* Calculate number of iterations required to make the vector version
2922      profitable, relative to the loop bodies only.
2923 
2924      Non-vectorized variant is SIC * niters and it must win over vector
2925      variant on the expected loop trip count.  The following condition must hold true:
2926      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2927 
2928   if (vec_outside_cost <= 0)
2929     min_profitable_estimate = 1;
2930   else
2931     {
2932       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2933 				 - vec_inside_cost * peel_iters_prologue
2934 				 - vec_inside_cost * peel_iters_epilogue)
2935 				 / ((scalar_single_iter_cost * vf)
2936 				   - vec_inside_cost);
2937     }
2938   min_profitable_estimate --;
2939   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2940   if (dump_enabled_p ())
2941     dump_printf_loc (MSG_NOTE, vect_location,
2942                      "  Static estimate profitability threshold = %d\n",
2943                       min_profitable_estimate);
2944 
2945   *ret_min_profitable_estimate = min_profitable_estimate;
2946 }
2947 
2948 
2949 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2950    functions. Design better to avoid maintenance issues.  */
2951 
2952 /* Function vect_model_reduction_cost.
2953 
2954    Models cost for a reduction operation, including the vector ops
2955    generated within the strip-mine loop, the initial definition before
2956    the loop, and the epilogue code that must be generated.  */
2957 
2958 static bool
2959 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2960 			   int ncopies)
2961 {
2962   int prologue_cost = 0, epilogue_cost = 0;
2963   enum tree_code code;
2964   optab optab;
2965   tree vectype;
2966   gimple stmt, orig_stmt;
2967   tree reduction_op;
2968   enum machine_mode mode;
2969   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2970   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2971   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2972 
2973   /* Cost of reduction op inside loop.  */
2974   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2975 					stmt_info, 0, vect_body);
2976   stmt = STMT_VINFO_STMT (stmt_info);
2977 
2978   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2979     {
2980     case GIMPLE_SINGLE_RHS:
2981       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2982       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2983       break;
2984     case GIMPLE_UNARY_RHS:
2985       reduction_op = gimple_assign_rhs1 (stmt);
2986       break;
2987     case GIMPLE_BINARY_RHS:
2988       reduction_op = gimple_assign_rhs2 (stmt);
2989       break;
2990     case GIMPLE_TERNARY_RHS:
2991       reduction_op = gimple_assign_rhs3 (stmt);
2992       break;
2993     default:
2994       gcc_unreachable ();
2995     }
2996 
2997   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2998   if (!vectype)
2999     {
3000       if (dump_enabled_p ())
3001         {
3002 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3003 			   "unsupported data-type ");
3004           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3005 			     TREE_TYPE (reduction_op));
3006         }
3007       return false;
3008     }
3009 
3010   mode = TYPE_MODE (vectype);
3011   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3012 
3013   if (!orig_stmt)
3014     orig_stmt = STMT_VINFO_STMT (stmt_info);
3015 
3016   code = gimple_assign_rhs_code (orig_stmt);
3017 
3018   /* Add in cost for initial definition.  */
3019   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3020 				  stmt_info, 0, vect_prologue);
3021 
3022   /* Determine cost of epilogue code.
3023 
3024      We have a reduction operator that will reduce the vector in one statement.
3025      Also requires scalar extract.  */
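  /* A sketch of the statement counts charged below, assuming a hypothetical
     V8HI reduction (nelements = 8):
       - direct reduc_code available:   1 vector_stmt + 1 vec_to_scalar;
       - whole-vector shifts available: exact_log2 (8) * 2 = 6 vector_stmts
                                        + 1 vec_to_scalar;
       - neither available:             8 + 8 - 1 = 15 vector_stmts
                                        (extracts and reduction ops).  */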
3026 
3027   if (!nested_in_vect_loop_p (loop, orig_stmt))
3028     {
3029       if (reduc_code != ERROR_MARK)
3030 	{
3031 	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3032 					  stmt_info, 0, vect_epilogue);
3033 	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3034 					  stmt_info, 0, vect_epilogue);
3035 	}
3036       else
3037 	{
3038 	  int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3039 	  tree bitsize =
3040 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3041 	  int element_bitsize = tree_low_cst (bitsize, 1);
3042 	  int nelements = vec_size_in_bits / element_bitsize;
3043 
3044 	  optab = optab_for_tree_code (code, vectype, optab_default);
3045 
3046 	  /* We have a whole vector shift available.  */
3047 	  if (VECTOR_MODE_P (mode)
3048 	      && optab_handler (optab, mode) != CODE_FOR_nothing
3049 	      && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3050 	    {
3051 	      /* Final reduction via vector shifts and the reduction operator.
3052 		 Also requires scalar extract.  */
3053 	      epilogue_cost += add_stmt_cost (target_cost_data,
3054 					      exact_log2 (nelements) * 2,
3055 					      vector_stmt, stmt_info, 0,
3056 					      vect_epilogue);
3057 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
3058 					      vec_to_scalar, stmt_info, 0,
3059 					      vect_epilogue);
3060 	    }
3061 	  else
3062 	    /* Use extracts and reduction op for final reduction.  For N
3063 	       elements, we have N extracts and N-1 reduction ops.  */
3064 	    epilogue_cost += add_stmt_cost (target_cost_data,
3065 					    nelements + nelements - 1,
3066 					    vector_stmt, stmt_info, 0,
3067 					    vect_epilogue);
3068 	}
3069     }
3070 
3071   if (dump_enabled_p ())
3072     dump_printf (MSG_NOTE,
3073                  "vect_model_reduction_cost: inside_cost = %d, "
3074                  "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3075                  prologue_cost, epilogue_cost);
3076 
3077   return true;
3078 }
3079 
3080 
3081 /* Function vect_model_induction_cost.
3082 
3083    Models cost for induction operations.  */
3084 
3085 static void
3086 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3087 {
3088   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3089   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3090   unsigned inside_cost, prologue_cost;
3091 
3092   /* loop cost for vec_loop.  */
3093   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3094 			       stmt_info, 0, vect_body);
3095 
3096   /* prologue cost for vec_init and vec_step.  */
3097   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3098 				 stmt_info, 0, vect_prologue);
3099 
3100   if (dump_enabled_p ())
3101     dump_printf_loc (MSG_NOTE, vect_location,
3102                      "vect_model_induction_cost: inside_cost = %d, "
3103                      "prologue_cost = %d .", inside_cost, prologue_cost);
3104 }
3105 
3106 
3107 /* Function get_initial_def_for_induction
3108 
3109    Input:
3110    STMT - a stmt that performs an induction operation in the loop.
3111    IV_PHI - the initial value of the induction variable
3112 
3113    Output:
3114    Return a vector variable, initialized with the first VF values of
3115    the induction variable.  E.g., for an iv with IV_PHI='X' and
3116    evolution S, for a vector of 4 units, we want to return:
3117    [X, X + S, X + 2*S, X + 3*S].  */
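/* A minimal illustrative sketch (hypothetical values): for a scalar iv with
   initial value 7 and step S = 3, vectorized with a 4-unit vectype and
   VF = 4, this routine builds vec_init = {7, 10, 13, 16} in the loop
   preheader and a step vector vec_step = {12, 12, 12, 12} (VF * S) that is
   added to the vector iv on every iteration of the vector loop.  */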
3118 
3119 static tree
3120 get_initial_def_for_induction (gimple iv_phi)
3121 {
3122   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3123   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3124   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3125   tree vectype;
3126   int nunits;
3127   edge pe = loop_preheader_edge (loop);
3128   struct loop *iv_loop;
3129   basic_block new_bb;
3130   tree new_vec, vec_init, vec_step, t;
3131   tree new_var;
3132   tree new_name;
3133   gimple init_stmt, induction_phi, new_stmt;
3134   tree induc_def, vec_def, vec_dest;
3135   tree init_expr, step_expr;
3136   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3137   int i;
3138   int ncopies;
3139   tree expr;
3140   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3141   bool nested_in_vect_loop = false;
3142   gimple_seq stmts = NULL;
3143   imm_use_iterator imm_iter;
3144   use_operand_p use_p;
3145   gimple exit_phi;
3146   edge latch_e;
3147   tree loop_arg;
3148   gimple_stmt_iterator si;
3149   basic_block bb = gimple_bb (iv_phi);
3150   tree stepvectype;
3151   tree resvectype;
3152 
3153   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3154   if (nested_in_vect_loop_p (loop, iv_phi))
3155     {
3156       nested_in_vect_loop = true;
3157       iv_loop = loop->inner;
3158     }
3159   else
3160     iv_loop = loop;
3161   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3162 
3163   latch_e = loop_latch_edge (iv_loop);
3164   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3165 
3166   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3167   gcc_assert (step_expr != NULL_TREE);
3168 
3169   pe = loop_preheader_edge (iv_loop);
3170   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3171 				     loop_preheader_edge (iv_loop));
3172 
3173   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3174   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3175   gcc_assert (vectype);
3176   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3177   ncopies = vf / nunits;
3178 
3179   gcc_assert (phi_info);
3180   gcc_assert (ncopies >= 1);
3181 
3182   /* Convert the step to the desired type.  */
3183   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3184 						  step_expr),
3185 				    &stmts, true, NULL_TREE);
3186   if (stmts)
3187     {
3188       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3189       gcc_assert (!new_bb);
3190     }
3191 
3192   /* Find the first insertion point in the BB.  */
3193   si = gsi_after_labels (bb);
3194 
3195   /* Create the vector that holds the initial_value of the induction.  */
3196   if (nested_in_vect_loop)
3197     {
3198       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3199 	 been created during vectorization of previous stmts.  We obtain it
3200 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3201       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3202       /* If the initial value is not of proper type, convert it.  */
3203       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3204 	{
3205 	  new_stmt = gimple_build_assign_with_ops
3206 	      (VIEW_CONVERT_EXPR,
3207 	       vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3208 	       build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3209 	  vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3210 	  gimple_assign_set_lhs (new_stmt, vec_init);
3211 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3212 						 new_stmt);
3213 	  gcc_assert (!new_bb);
3214 	  set_vinfo_for_stmt (new_stmt,
3215 			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3216 	}
3217     }
3218   else
3219     {
3220       vec<constructor_elt, va_gc> *v;
3221 
3222       /* iv_loop is the loop to be vectorized. Create:
3223 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3224       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3225 				       vect_scalar_var, "var_");
3226       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3227 						     init_expr),
3228 				       &stmts, false, new_var);
3229       if (stmts)
3230 	{
3231 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3232 	  gcc_assert (!new_bb);
3233 	}
3234 
3235       vec_alloc (v, nunits);
3236       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3237       for (i = 1; i < nunits; i++)
3238 	{
3239 	  /* Create: new_name_i = new_name + step_expr  */
3240 	  init_stmt = gimple_build_assign_with_ops (PLUS_EXPR, new_var,
3241 						    new_name, step_expr);
3242 	  new_name = make_ssa_name (new_var, init_stmt);
3243 	  gimple_assign_set_lhs (init_stmt, new_name);
3244 
3245 	  new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3246 	  gcc_assert (!new_bb);
3247 
3248 	  if (dump_enabled_p ())
3249 	    {
3250 	      dump_printf_loc (MSG_NOTE, vect_location,
3251 			       "created new init_stmt: ");
3252 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3253 	    }
3254 	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3255 	}
3256       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3257       new_vec = build_constructor (vectype, v);
3258       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3259     }
3260 
3261 
3262   /* Create the vector that holds the step of the induction.  */
3263   if (nested_in_vect_loop)
3264     /* iv_loop is nested in the loop to be vectorized. Generate:
3265        vec_step = [S, S, S, S]  */
3266     new_name = step_expr;
3267   else
3268     {
3269       /* iv_loop is the loop to be vectorized. Generate:
3270 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3271       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3272       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3273 			      expr, step_expr);
3274     }
3275 
3276   t = unshare_expr (new_name);
3277   gcc_assert (CONSTANT_CLASS_P (new_name));
3278   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3279   gcc_assert (stepvectype);
3280   new_vec = build_vector_from_val (stepvectype, t);
3281   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3282 
3283 
3284   /* Create the following def-use cycle:
3285      loop prolog:
3286          vec_init = ...
3287 	 vec_step = ...
3288      loop:
3289          vec_iv = PHI <vec_init, vec_loop>
3290          ...
3291          STMT
3292          ...
3293          vec_loop = vec_iv + vec_step;  */
3294 
3295   /* Create the induction-phi that defines the induction-operand.  */
3296   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3297   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3298   set_vinfo_for_stmt (induction_phi,
3299 		      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3300   induc_def = PHI_RESULT (induction_phi);
3301 
3302   /* Create the iv update inside the loop  */
3303   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3304 					   induc_def, vec_step);
3305   vec_def = make_ssa_name (vec_dest, new_stmt);
3306   gimple_assign_set_lhs (new_stmt, vec_def);
3307   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3308   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3309                                                    NULL));
3310 
3311   /* Set the arguments of the phi node:  */
3312   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3313   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3314 	       UNKNOWN_LOCATION);
3315 
3316 
3317   /* In case that vectorization factor (VF) is bigger than the number
3318      of elements that we can fit in a vectype (nunits), we have to generate
3319      more than one vector stmt - i.e - we need to "unroll" the
3320      vector stmt by a factor VF/nunits.  For more details see documentation
3321      in vectorizable_operation.  */
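  /* Illustrative sketch (hypothetical sizes): with VF = 8 and a 4-unit
     vectype, ncopies = 2.  The iv update created above advances the vector
     iv by {8*S, ...} (VF * S) per vector iteration, while the extra copy
     generated below is offset from the first by {4*S, 4*S, 4*S, 4*S},
     i.e. nunits * S.  */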
3322 
3323   if (ncopies > 1)
3324     {
3325       stmt_vec_info prev_stmt_vinfo;
3326       /* FORNOW. This restriction should be relaxed.  */
3327       gcc_assert (!nested_in_vect_loop);
3328 
3329       /* Create the vector that holds the step of the induction.  */
3330       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3331       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3332 			      expr, step_expr);
3333       t = unshare_expr (new_name);
3334       gcc_assert (CONSTANT_CLASS_P (new_name));
3335       new_vec = build_vector_from_val (stepvectype, t);
3336       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3337 
3338       vec_def = induc_def;
3339       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3340       for (i = 1; i < ncopies; i++)
3341 	{
3342 	  /* vec_i = vec_prev + vec_step  */
3343 	  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3344 						   vec_def, vec_step);
3345 	  vec_def = make_ssa_name (vec_dest, new_stmt);
3346 	  gimple_assign_set_lhs (new_stmt, vec_def);
3347 
3348 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3349 	  if (!useless_type_conversion_p (resvectype, vectype))
3350 	    {
3351 	      new_stmt = gimple_build_assign_with_ops
3352 		  (VIEW_CONVERT_EXPR,
3353 		   vect_get_new_vect_var (resvectype, vect_simple_var,
3354 					  "vec_iv_"),
3355 		   build1 (VIEW_CONVERT_EXPR, resvectype,
3356 			   gimple_assign_lhs (new_stmt)), NULL_TREE);
3357 	      gimple_assign_set_lhs (new_stmt,
3358 				     make_ssa_name
3359 				       (gimple_assign_lhs (new_stmt), new_stmt));
3360 	      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3361 	    }
3362 	  set_vinfo_for_stmt (new_stmt,
3363 			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3364 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3365 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3366 	}
3367     }
3368 
3369   if (nested_in_vect_loop)
3370     {
3371       /* Find the loop-closed exit-phi of the induction, and record
3372          the final vector of induction results:  */
3373       exit_phi = NULL;
3374       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3375         {
3376 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3377 	    {
3378 	      exit_phi = USE_STMT (use_p);
3379 	      break;
3380 	    }
3381         }
3382       if (exit_phi)
3383 	{
3384 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3385 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
3386 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
3387 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3388 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
3389 
3390 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3391 	  if (dump_enabled_p ())
3392 	    {
3393 	      dump_printf_loc (MSG_NOTE, vect_location,
3394 			       "vector of inductions after inner-loop:");
3395 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3396 	    }
3397 	}
3398     }
3399 
3400 
3401   if (dump_enabled_p ())
3402     {
3403       dump_printf_loc (MSG_NOTE, vect_location,
3404 		       "transform induction: created def-use cycle: ");
3405       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3406       dump_printf (MSG_NOTE, "\n");
3407       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3408 			SSA_NAME_DEF_STMT (vec_def), 0);
3409     }
3410 
3411   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3412   if (!useless_type_conversion_p (resvectype, vectype))
3413     {
3414       new_stmt = gimple_build_assign_with_ops
3415 	 (VIEW_CONVERT_EXPR,
3416 	  vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3417 	  build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3418       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3419       gimple_assign_set_lhs (new_stmt, induc_def);
3420       si = gsi_after_labels (bb);
3421       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3422       set_vinfo_for_stmt (new_stmt,
3423 			  new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3424       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3425 	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3426     }
3427 
3428   return induc_def;
3429 }
3430 
3431 
3432 /* Function get_initial_def_for_reduction
3433 
3434    Input:
3435    STMT - a stmt that performs a reduction operation in the loop.
3436    INIT_VAL - the initial value of the reduction variable
3437 
3438    Output:
3439    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3440         of the reduction (used for adjusting the epilog - see below).
3441    Return a vector variable, initialized according to the operation that STMT
3442         performs. This vector will be used as the initial value of the
3443         vector of partial results.
3444 
3445    Option1 (adjust in epilog): Initialize the vector as follows:
3446      add/bit or/xor:    [0,0,...,0,0]
3447      mult/bit and:      [1,1,...,1,1]
3448      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3449    and when necessary (e.g. add/mult case) let the caller know
3450    that it needs to adjust the result by init_val.
3451 
3452    Option2: Initialize the vector as follows:
3453      add/bit or/xor:    [init_val,0,0,...,0]
3454      mult/bit and:      [init_val,1,1,...,1]
3455      min/max/cond_expr: [init_val,init_val,...,init_val]
3456    and no adjustments are needed.
3457 
3458    For example, for the following code:
3459 
3460    s = init_val;
3461    for (i=0;i<n;i++)
3462      s = s + a[i];
3463 
3464    STMT is 's = s + a[i]', and the reduction variable is 's'.
3465    For a vector of 4 units, we want to return either [0,0,0,init_val],
3466    or [0,0,0,0] and let the caller know that it needs to adjust
3467    the result at the end by 'init_val'.
3468 
3469    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3470    is not NULL, because this way the initialization vector is simpler (same
3471    element in all entries), and Option2 otherwise.
3472 
3473    A cost model should help decide between these two schemes.  */
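/* A worked instance of the two options (illustrative values): for the sum
   above with init_val = 10 and a 4-unit vectype, Option1 returns
   {0, 0, 0, 0} and reports ADJUSTMENT_DEF = 10, so the epilog adds 10 to
   the reduced scalar result, while Option2 returns {10, 0, 0, 0} and needs
   no adjustment.  */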
3474 
3475 tree
3476 get_initial_def_for_reduction (gimple stmt, tree init_val,
3477                                tree *adjustment_def)
3478 {
3479   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3480   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3481   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3482   tree scalar_type = TREE_TYPE (init_val);
3483   tree vectype = get_vectype_for_scalar_type (scalar_type);
3484   int nunits;
3485   enum tree_code code = gimple_assign_rhs_code (stmt);
3486   tree def_for_init;
3487   tree init_def;
3488   tree *elts;
3489   int i;
3490   bool nested_in_vect_loop = false;
3491   tree init_value;
3492   REAL_VALUE_TYPE real_init_val = dconst0;
3493   int int_init_val = 0;
3494   gimple def_stmt = NULL;
3495 
3496   gcc_assert (vectype);
3497   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3498 
3499   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3500 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
3501 
3502   if (nested_in_vect_loop_p (loop, stmt))
3503     nested_in_vect_loop = true;
3504   else
3505     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3506 
3507   /* In case of double reduction we only create a vector variable to be put
3508      in the reduction phi node.  The actual statement creation is done in
3509      vect_create_epilog_for_reduction.  */
3510   if (adjustment_def && nested_in_vect_loop
3511       && TREE_CODE (init_val) == SSA_NAME
3512       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3513       && gimple_code (def_stmt) == GIMPLE_PHI
3514       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3515       && vinfo_for_stmt (def_stmt)
3516       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3517           == vect_double_reduction_def)
3518     {
3519       *adjustment_def = NULL;
3520       return vect_create_destination_var (init_val, vectype);
3521     }
3522 
3523   if (TREE_CONSTANT (init_val))
3524     {
3525       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3526         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3527       else
3528         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3529     }
3530   else
3531     init_value = init_val;
3532 
3533   switch (code)
3534     {
3535       case WIDEN_SUM_EXPR:
3536       case DOT_PROD_EXPR:
3537       case PLUS_EXPR:
3538       case MINUS_EXPR:
3539       case BIT_IOR_EXPR:
3540       case BIT_XOR_EXPR:
3541       case MULT_EXPR:
3542       case BIT_AND_EXPR:
3543         /* ADJUSTMENT_DEF is NULL when called from
3544            vect_create_epilog_for_reduction to vectorize double reduction.  */
3545         if (adjustment_def)
3546           {
3547             if (nested_in_vect_loop)
3548               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3549                                                               NULL);
3550             else
3551               *adjustment_def = init_val;
3552           }
3553 
3554         if (code == MULT_EXPR)
3555           {
3556             real_init_val = dconst1;
3557             int_init_val = 1;
3558           }
3559 
3560         if (code == BIT_AND_EXPR)
3561           int_init_val = -1;
3562 
3563         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3564           def_for_init = build_real (scalar_type, real_init_val);
3565         else
3566           def_for_init = build_int_cst (scalar_type, int_init_val);
3567 
3568         /* Create a vector of '0' or '1' except the first element.  */
3569 	elts = XALLOCAVEC (tree, nunits);
3570         for (i = nunits - 2; i >= 0; --i)
3571 	  elts[i + 1] = def_for_init;
3572 
3573         /* Option1: the first element is '0' or '1' as well.  */
3574         if (adjustment_def)
3575           {
3576 	    elts[0] = def_for_init;
3577             init_def = build_vector (vectype, elts);
3578             break;
3579           }
3580 
3581         /* Option2: the first element is INIT_VAL.  */
3582 	elts[0] = init_val;
3583         if (TREE_CONSTANT (init_val))
3584           init_def = build_vector (vectype, elts);
3585         else
3586 	  {
3587 	    vec<constructor_elt, va_gc> *v;
3588 	    vec_alloc (v, nunits);
3589 	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3590 	    for (i = 1; i < nunits; ++i)
3591 	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3592 	    init_def = build_constructor (vectype, v);
3593 	  }
3594 
3595         break;
3596 
3597       case MIN_EXPR:
3598       case MAX_EXPR:
3599       case COND_EXPR:
3600         if (adjustment_def)
3601           {
3602             *adjustment_def = NULL_TREE;
3603             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3604             break;
3605           }
3606 
3607 	init_def = build_vector_from_val (vectype, init_value);
3608         break;
3609 
3610       default:
3611         gcc_unreachable ();
3612     }
3613 
3614   return init_def;
3615 }
3616 
3617 
3618 /* Function vect_create_epilog_for_reduction
3619 
3620    Create code at the loop-epilog to finalize the result of a reduction
3621    computation.
3622 
3623    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3624      reduction statements.
3625    STMT is the scalar reduction stmt that is being vectorized.
3626    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3627      number of elements that we can fit in a vectype (nunits).  In this case
3628      we have to generate more than one vector stmt - i.e - we need to "unroll"
3629      the vector stmt by a factor VF/nunits.  For more details see documentation
3630      in vectorizable_operation.
3631    REDUC_CODE is the tree-code for the epilog reduction.
3632    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3633      computation.
3634    REDUC_INDEX is the index of the operand in the right hand side of the
3635      statement that is defined by REDUCTION_PHI.
3636    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3637    SLP_NODE is an SLP node containing a group of reduction statements. The
3638      first one in this group is STMT.
3639 
3640    This function:
3641    1. Creates the reduction def-use cycles: sets the arguments for
3642       REDUCTION_PHIS:
3643       The loop-entry argument is the vectorized initial-value of the reduction.
3644       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3645       sums.
3646    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3647       by applying the operation specified by REDUC_CODE if available, or by
3648       other means (whole-vector shifts or a scalar loop).
3649       The function also creates a new phi node at the loop exit to preserve
3650       loop-closed form, as illustrated below.
3651 
3652      The flow at the entry to this function:
3653 
3654         loop:
3655           vec_def = phi <null, null>            # REDUCTION_PHI
3656           VECT_DEF = vector_stmt                # vectorized form of STMT
3657           s_loop = scalar_stmt                  # (scalar) STMT
3658         loop_exit:
3659           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3660           use <s_out0>
3661           use <s_out0>
3662 
3663      The above is transformed by this function into:
3664 
3665         loop:
3666           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3667           VECT_DEF = vector_stmt                # vectorized form of STMT
3668           s_loop = scalar_stmt                  # (scalar) STMT
3669         loop_exit:
3670           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3671           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3672           v_out2 = reduce <v_out1>
3673           s_out3 = extract_field <v_out2, 0>
3674           s_out4 = adjust_result <s_out3>
3675           use <s_out4>
3676           use <s_out4>
3677 */
3678 
3679 static void
3680 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3681 				  int ncopies, enum tree_code reduc_code,
3682 				  vec<gimple> reduction_phis,
3683                                   int reduc_index, bool double_reduc,
3684                                   slp_tree slp_node)
3685 {
3686   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3687   stmt_vec_info prev_phi_info;
3688   tree vectype;
3689   enum machine_mode mode;
3690   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3691   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3692   basic_block exit_bb;
3693   tree scalar_dest;
3694   tree scalar_type;
3695   gimple new_phi = NULL, phi;
3696   gimple_stmt_iterator exit_gsi;
3697   tree vec_dest;
3698   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3699   gimple epilog_stmt = NULL;
3700   enum tree_code code = gimple_assign_rhs_code (stmt);
3701   gimple exit_phi;
3702   tree bitsize, bitpos;
3703   tree adjustment_def = NULL;
3704   tree vec_initial_def = NULL;
3705   tree reduction_op, expr, def;
3706   tree orig_name, scalar_result;
3707   imm_use_iterator imm_iter, phi_imm_iter;
3708   use_operand_p use_p, phi_use_p;
3709   bool extract_scalar_result = false;
3710   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3711   bool nested_in_vect_loop = false;
3712   vec<gimple> new_phis = vNULL;
3713   vec<gimple> inner_phis = vNULL;
3714   enum vect_def_type dt = vect_unknown_def_type;
3715   int j, i;
3716   vec<tree> scalar_results = vNULL;
3717   unsigned int group_size = 1, k, ratio;
3718   vec<tree> vec_initial_defs = vNULL;
3719   vec<gimple> phis;
3720   bool slp_reduc = false;
3721   tree new_phi_result;
3722   gimple inner_phi = NULL;
3723 
3724   if (slp_node)
3725     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3726 
3727   if (nested_in_vect_loop_p (loop, stmt))
3728     {
3729       outer_loop = loop;
3730       loop = loop->inner;
3731       nested_in_vect_loop = true;
3732       gcc_assert (!slp_node);
3733     }
3734 
3735   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3736     {
3737     case GIMPLE_SINGLE_RHS:
3738       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3739 		  == ternary_op);
3740       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3741       break;
3742     case GIMPLE_UNARY_RHS:
3743       reduction_op = gimple_assign_rhs1 (stmt);
3744       break;
3745     case GIMPLE_BINARY_RHS:
3746       reduction_op = reduc_index ?
3747                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3748       break;
3749     case GIMPLE_TERNARY_RHS:
3750       reduction_op = gimple_op (stmt, reduc_index + 1);
3751       break;
3752     default:
3753       gcc_unreachable ();
3754     }
3755 
3756   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3757   gcc_assert (vectype);
3758   mode = TYPE_MODE (vectype);
3759 
3760   /* 1. Create the reduction def-use cycle:
3761      Set the arguments of REDUCTION_PHIS, i.e., transform
3762 
3763         loop:
3764           vec_def = phi <null, null>            # REDUCTION_PHI
3765           VECT_DEF = vector_stmt                # vectorized form of STMT
3766           ...
3767 
3768      into:
3769 
3770         loop:
3771           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3772           VECT_DEF = vector_stmt                # vectorized form of STMT
3773           ...
3774 
3775      (in case of SLP, do it for all the phis). */
3776 
3777   /* Get the loop-entry arguments.  */
3778   if (slp_node)
3779     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3780                        NULL, slp_node, reduc_index);
3781   else
3782     {
3783       vec_initial_defs.create (1);
3784      /* For the case of reduction, vect_get_vec_def_for_operand returns
3785         the scalar def before the loop, that defines the initial value
3786         of the reduction variable.  */
3787       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3788                                                       &adjustment_def);
3789       vec_initial_defs.quick_push (vec_initial_def);
3790     }
3791 
3792   /* Set phi nodes arguments.  */
3793   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3794     {
3795       tree vec_init_def, def;
3796       gimple_seq stmts;
3797       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
3798 					   true, NULL_TREE);
3799       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3800       def = vect_defs[i];
3801       for (j = 0; j < ncopies; j++)
3802         {
3803           /* Set the loop-entry arg of the reduction-phi.  */
3804           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3805                        UNKNOWN_LOCATION);
3806 
3807           /* Set the loop-latch arg for the reduction-phi.  */
3808           if (j > 0)
3809             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3810 
3811           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3812 
3813           if (dump_enabled_p ())
3814             {
3815               dump_printf_loc (MSG_NOTE, vect_location,
3816 			       "transform reduction: created def-use cycle: ");
3817               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3818               dump_printf (MSG_NOTE, "\n");
3819               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3820             }
3821 
3822           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3823         }
3824     }
3825 
3826   vec_initial_defs.release ();
3827 
3828   /* 2. Create epilog code.
3829         The reduction epilog code operates across the elements of the vector
3830         of partial results computed by the vectorized loop.
3831         The reduction epilog code consists of:
3832 
3833         step 1: compute the scalar result in a vector (v_out2)
3834         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3835         step 3: adjust the scalar result (s_out3) if needed.
3836 
3837         Step 1 can be accomplished using one of the following three schemes:
3838           (scheme 1) using reduc_code, if available.
3839           (scheme 2) using whole-vector shifts, if available.
3840           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3841                      combined.
3842 
3843           The overall epilog code looks like this:
3844 
3845           s_out0 = phi <s_loop>         # original EXIT_PHI
3846           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3847           v_out2 = reduce <v_out1>              # step 1
3848           s_out3 = extract_field <v_out2, 0>    # step 2
3849           s_out4 = adjust_result <s_out3>       # step 3
3850 
3851           (step 3 is optional, and steps 1 and 2 may be combined).
3852           Lastly, the uses of s_out0 are replaced by s_out4.  */
3853 
3854 
3855   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3856          v_out1 = phi <VECT_DEF>
3857          Store them in NEW_PHIS.  */
3858 
3859   exit_bb = single_exit (loop)->dest;
3860   prev_phi_info = NULL;
3861   new_phis.create (vect_defs.length ());
3862   FOR_EACH_VEC_ELT (vect_defs, i, def)
3863     {
3864       for (j = 0; j < ncopies; j++)
3865         {
3866 	  tree new_def = copy_ssa_name (def, NULL);
3867           phi = create_phi_node (new_def, exit_bb);
3868           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3869           if (j == 0)
3870             new_phis.quick_push (phi);
3871           else
3872 	    {
3873 	      def = vect_get_vec_def_for_stmt_copy (dt, def);
3874 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3875 	    }
3876 
3877           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3878           prev_phi_info = vinfo_for_stmt (phi);
3879         }
3880     }
3881 
3882   /* The epilogue is created for the outer-loop, i.e., for the loop being
3883      vectorized.  Create exit phis for the outer loop.  */
3884   if (double_reduc)
3885     {
3886       loop = outer_loop;
3887       exit_bb = single_exit (loop)->dest;
3888       inner_phis.create (vect_defs.length ());
3889       FOR_EACH_VEC_ELT (new_phis, i, phi)
3890 	{
3891 	  tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3892 	  gimple outer_phi = create_phi_node (new_result, exit_bb);
3893 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3894 			   PHI_RESULT (phi));
3895 	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3896 							    loop_vinfo, NULL));
3897 	  inner_phis.quick_push (phi);
3898 	  new_phis[i] = outer_phi;
3899 	  prev_phi_info = vinfo_for_stmt (outer_phi);
3900           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3901             {
3902 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3903 	      new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3904 	      outer_phi = create_phi_node (new_result, exit_bb);
3905 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3906 			       PHI_RESULT (phi));
3907 	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3908 							loop_vinfo, NULL));
3909 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3910 	      prev_phi_info = vinfo_for_stmt (outer_phi);
3911 	    }
3912 	}
3913     }
3914 
3915   exit_gsi = gsi_after_labels (exit_bb);
3916 
3917   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3918          (i.e. when reduc_code is not available) and in the final adjustment
3919 	 code (if needed).  Also get the original scalar reduction variable as
3920          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3921          represents a reduction pattern), the tree-code and scalar-def are
3922          taken from the original stmt that the pattern-stmt (STMT) replaces.
3923          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3924          are taken from STMT.  */
3925 
3926   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3927   if (!orig_stmt)
3928     {
3929       /* Regular reduction  */
3930       orig_stmt = stmt;
3931     }
3932   else
3933     {
3934       /* Reduction pattern  */
3935       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3936       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3937       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3938     }
3939 
3940   code = gimple_assign_rhs_code (orig_stmt);
3941   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3942      partial results are added and not subtracted.  */
3943   if (code == MINUS_EXPR)
3944     code = PLUS_EXPR;
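  /* Illustrative reasoning (sketch): for s -= a[i] each lane of the vector
     accumulator holds the negated sum of the elements assigned to that lane
     (plus init_val in one lane), so combining the lanes must ADD them to
     obtain init_val - sum of all elements; hence the epilog uses PLUS_EXPR
     even though the scalar statement used MINUS_EXPR.  */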
3945 
3946   scalar_dest = gimple_assign_lhs (orig_stmt);
3947   scalar_type = TREE_TYPE (scalar_dest);
3948   scalar_results.create (group_size);
3949   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3950   bitsize = TYPE_SIZE (scalar_type);
3951 
3952   /* In case this is a reduction in an inner-loop while vectorizing an outer
3953      loop - we don't need to extract a single scalar result at the end of the
3954      inner-loop (unless it is double reduction, i.e., the use of reduction is
3955      outside the outer-loop).  The final vector of partial results will be used
3956      in the vectorized outer-loop, or reduced to a scalar result at the end of
3957      the outer-loop.  */
3958   if (nested_in_vect_loop && !double_reduc)
3959     goto vect_finalize_reduction;
3960 
3961   /* SLP reduction without reduction chain, e.g.,
3962      # a1 = phi <a2, a0>
3963      # b1 = phi <b2, b0>
3964      a2 = operation (a1)
3965      b2 = operation (b1)  */
3966   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3967 
3968   /* In case of reduction chain, e.g.,
3969      # a1 = phi <a3, a0>
3970      a2 = operation (a1)
3971      a3 = operation (a2),
3972 
3973      we may end up with more than one vector result.  Here we reduce them to
3974      one vector.  */
3975   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3976     {
3977       tree first_vect = PHI_RESULT (new_phis[0]);
3978       tree tmp;
3979       gimple new_vec_stmt = NULL;
3980 
3981       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3982       for (k = 1; k < new_phis.length (); k++)
3983         {
3984           gimple next_phi = new_phis[k];
3985           tree second_vect = PHI_RESULT (next_phi);
3986 
3987           tmp = build2 (code, vectype,  first_vect, second_vect);
3988           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3989           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3990           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3991           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3992         }
3993 
3994       new_phi_result = first_vect;
3995       if (new_vec_stmt)
3996         {
3997           new_phis.truncate (0);
3998           new_phis.safe_push (new_vec_stmt);
3999         }
4000     }
4001   else
4002     new_phi_result = PHI_RESULT (new_phis[0]);
4003 
4004   /* 2.3 Create the reduction code, using one of the three schemes described
4005          above. In SLP we simply need to extract all the elements from the
4006          vector (without reducing them), so we use scalar shifts.  */
4007   if (reduc_code != ERROR_MARK && !slp_reduc)
4008     {
4009       tree tmp;
4010 
4011       /*** Case 1:  Create:
4012            v_out2 = reduc_expr <v_out1>  */
4013 
4014       if (dump_enabled_p ())
4015         dump_printf_loc (MSG_NOTE, vect_location,
4016 			 "Reduce using direct vector reduction.");
4017 
4018       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4019       tmp = build1 (reduc_code, vectype, new_phi_result);
4020       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4021       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4022       gimple_assign_set_lhs (epilog_stmt, new_temp);
4023       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4024 
4025       extract_scalar_result = true;
4026     }
4027   else
4028     {
4029       enum tree_code shift_code = ERROR_MARK;
4030       bool have_whole_vector_shift = true;
4031       int bit_offset;
4032       int element_bitsize = tree_low_cst (bitsize, 1);
4033       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4034       tree vec_temp;
4035 
4036       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4037         shift_code = VEC_RSHIFT_EXPR;
4038       else
4039         have_whole_vector_shift = false;
4040 
4041       /* Regardless of whether we have a whole vector shift, if we're
4042          emulating the operation via tree-vect-generic, we don't want
4043          to use it.  Only the first round of the reduction is likely
4044          to still be profitable via emulation.  */
4045       /* ??? It might be better to emit a reduction tree code here, so that
4046          tree-vect-generic can expand the first round via bit tricks.  */
4047       if (!VECTOR_MODE_P (mode))
4048         have_whole_vector_shift = false;
4049       else
4050         {
4051           optab optab = optab_for_tree_code (code, vectype, optab_default);
4052           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4053             have_whole_vector_shift = false;
4054         }
4055 
4056       if (have_whole_vector_shift && !slp_reduc)
4057         {
4058           /*** Case 2: Create:
4059              for (offset = VS/2; offset >= element_size; offset/=2)
4060                 {
4061                   Create:  va' = vec_shift <va, offset>
4062                   Create:  va = vop <va, va'>
4063                 }  */
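          /* Worked sketch (illustrative, ignoring endianness details): for a
             V4SI accumulator {a0, a1, a2, a3}, vec_size = 128 and
             element_size = 32, the loop below emits
               shift by 64 then add  -> lanes hold {a0+a2, a1+a3, ...}
               shift by 32 then add  -> one lane holds a0+a1+a2+a3
             and that lane is extracted as the scalar result in step 2.4.  */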
4064 
4065           if (dump_enabled_p ())
4066             dump_printf_loc (MSG_NOTE, vect_location,
4067 			     "Reduce using vector shifts");
4068 
4069           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4070           new_temp = new_phi_result;
4071           for (bit_offset = vec_size_in_bits/2;
4072                bit_offset >= element_bitsize;
4073                bit_offset /= 2)
4074             {
4075               tree bitpos = size_int (bit_offset);
4076 
4077               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4078                                                vec_dest, new_temp, bitpos);
4079               new_name = make_ssa_name (vec_dest, epilog_stmt);
4080               gimple_assign_set_lhs (epilog_stmt, new_name);
4081               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4082 
4083               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4084                                                           new_name, new_temp);
4085               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4086               gimple_assign_set_lhs (epilog_stmt, new_temp);
4087               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4088             }
4089 
4090           extract_scalar_result = true;
4091         }
4092       else
4093         {
4094           tree rhs;
4095 
4096           /*** Case 3: Create:
4097              s = extract_field <v_out2, 0>
4098              for (offset = element_size;
4099                   offset < vector_size;
4100                   offset += element_size;)
4101                {
4102                  Create:  s' = extract_field <v_out2, offset>
4103                  Create:  s = op <s, s'>  // For non SLP cases
4104                }  */
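          /* Illustrative expansion for a non-SLP V4SI vector of partial
             results {a0, a1, a2, a3} (hypothetical lanes):
               s  = BIT_FIELD_REF <v, 32, 0>;
               s' = BIT_FIELD_REF <v, 32, 32>;  s = s op s';
               s' = BIT_FIELD_REF <v, 32, 64>;  s = s op s';
               s' = BIT_FIELD_REF <v, 32, 96>;  s = s op s';
             i.e. 4 extracts and 3 scalar ops per vector, matching the
             N extracts and N-1 ops counted in vect_model_reduction_cost.  */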
4105 
4106           if (dump_enabled_p ())
4107             dump_printf_loc (MSG_NOTE, vect_location,
4108 			     "Reduce using scalar code. ");
4109 
4110           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4111           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4112             {
4113               if (gimple_code (new_phi) == GIMPLE_PHI)
4114                 vec_temp = PHI_RESULT (new_phi);
4115               else
4116                 vec_temp = gimple_assign_lhs (new_phi);
4117               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4118                             bitsize_zero_node);
4119               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4120               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4121               gimple_assign_set_lhs (epilog_stmt, new_temp);
4122               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4123 
4124               /* In SLP we don't need to apply reduction operation, so we just
4125                  collect s' values in SCALAR_RESULTS.  */
4126               if (slp_reduc)
4127                 scalar_results.safe_push (new_temp);
4128 
4129               for (bit_offset = element_bitsize;
4130                    bit_offset < vec_size_in_bits;
4131                    bit_offset += element_bitsize)
4132                 {
4133                   tree bitpos = bitsize_int (bit_offset);
4134                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4135                                      bitsize, bitpos);
4136 
4137                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4138                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4139                   gimple_assign_set_lhs (epilog_stmt, new_name);
4140                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4141 
4142                   if (slp_reduc)
4143                     {
4144                       /* In SLP we don't need to apply reduction operation, so
4145                          we just collect s' values in SCALAR_RESULTS.  */
4146                       new_temp = new_name;
4147                       scalar_results.safe_push (new_name);
4148                     }
4149                   else
4150                     {
4151                       epilog_stmt = gimple_build_assign_with_ops (code,
4152                                           new_scalar_dest, new_name, new_temp);
4153                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4154                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4155                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4156                     }
4157                 }
4158             }
4159 
4160           /* The only case where we need to reduce scalar results in SLP, is
4161              unrolling.  If the size of SCALAR_RESULTS is greater than
4162              GROUP_SIZE, we reduce them combining elements modulo
4163              GROUP_SIZE.  */
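          /* Illustrative case (hypothetical sizes): with GROUP_SIZE = 2 and
             SCALAR_RESULTS = {r0, r1, r2, r3} after unrolling, the loop
             below folds them into scalar_results[0] = r0 op r2 and
             scalar_results[1] = r1 op r3.  */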
4164           if (slp_reduc)
4165             {
4166               tree res, first_res, new_res;
4167               gimple new_stmt;
4168 
4169               /* Reduce multiple scalar results in case of SLP unrolling.  */
4170               for (j = group_size; scalar_results.iterate (j, &res);
4171                    j++)
4172                 {
4173                   first_res = scalar_results[j % group_size];
4174                   new_stmt = gimple_build_assign_with_ops (code,
4175                                               new_scalar_dest, first_res, res);
4176                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4177                   gimple_assign_set_lhs (new_stmt, new_res);
4178                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4179                   scalar_results[j % group_size] = new_res;
4180                 }
4181             }
4182           else
4183             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4184             scalar_results.safe_push (new_temp);
4185 
4186           extract_scalar_result = false;
4187         }
4188     }
4189 
4190   /* 2.4  Extract the final scalar result.  Create:
4191           s_out3 = extract_field <v_out2, bitpos>  */
4192 
4193   if (extract_scalar_result)
4194     {
4195       tree rhs;
4196 
4197       if (dump_enabled_p ())
4198         dump_printf_loc (MSG_NOTE, vect_location,
4199 			 "extract scalar result");
4200 
4201       if (BYTES_BIG_ENDIAN)
4202         bitpos = size_binop (MULT_EXPR,
4203                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4204                              TYPE_SIZE (scalar_type));
4205       else
4206         bitpos = bitsize_zero_node;
4207 
4208       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4209       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4210       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4211       gimple_assign_set_lhs (epilog_stmt, new_temp);
4212       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4213       scalar_results.safe_push (new_temp);
4214     }
4215 
4216 vect_finalize_reduction:
4217 
4218   if (double_reduc)
4219     loop = loop->inner;
4220 
4221   /* 2.5 Adjust the final result by the initial value of the reduction
4222 	 variable. (When such adjustment is not needed, then
4223 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
4224 	 new_temp = loop_exit_def + adjustment_def  */
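  /* Illustrative instance (hypothetical values): if the scalar loop was
     s = 10; s += a[i]; ... and the lanes reduce to 35, the statement built
     here computes new_temp = 35 + 10 = 45, using the ADJUSTMENT_DEF handed
     back by get_initial_def_for_reduction.  */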
4225 
4226   if (adjustment_def)
4227     {
4228       gcc_assert (!slp_reduc);
4229       if (nested_in_vect_loop)
4230 	{
4231           new_phi = new_phis[0];
4232 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4233 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4234 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
4235 	}
4236       else
4237 	{
4238           new_temp = scalar_results[0];
4239 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4240 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
4241 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4242 	}
4243 
4244       epilog_stmt = gimple_build_assign (new_dest, expr);
4245       new_temp = make_ssa_name (new_dest, epilog_stmt);
4246       gimple_assign_set_lhs (epilog_stmt, new_temp);
4247       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4248       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4249       if (nested_in_vect_loop)
4250         {
4251           set_vinfo_for_stmt (epilog_stmt,
4252                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4253                                                  NULL));
4254           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4255                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4256 
4257           if (!double_reduc)
4258             scalar_results.quick_push (new_temp);
4259           else
4260             scalar_results[0] = new_temp;
4261         }
4262       else
4263         scalar_results[0] = new_temp;
4264 
4265       new_phis[0] = epilog_stmt;
4266     }
4267 
4268   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4269           phis with new adjusted scalar results, i.e., replace use <s_out0>
4270           with use <s_out4>.
4271 
4272      Transform:
4273         loop_exit:
4274           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4275           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4276           v_out2 = reduce <v_out1>
4277           s_out3 = extract_field <v_out2, 0>
4278           s_out4 = adjust_result <s_out3>
4279           use <s_out0>
4280           use <s_out0>
4281 
4282      into:
4283 
4284         loop_exit:
4285           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4286           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4287           v_out2 = reduce <v_out1>
4288           s_out3 = extract_field <v_out2, 0>
4289           s_out4 = adjust_result <s_out3>
4290           use <s_out4>
4291           use <s_out4> */
4292 
4293 
4294   /* In SLP reduction chain we reduce vector results into one vector if
4295      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4296      the last stmt in the reduction chain, since we are looking for the loop
4297      exit phi node.  */
4298   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4299     {
4300       scalar_dest = gimple_assign_lhs (
4301 			SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4302       group_size = 1;
4303     }
4304 
4305   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4306      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4307      need to match SCALAR_RESULTS with corresponding statements.  The first
4308      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4309      the first vector stmt, etc.
4310      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
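  /* Illustrative mapping (hypothetical sizes): GROUP_SIZE = 4 with two new
     vector stmts gives RATIO = 2, so the first two scalar results are
     matched with new_phis[0] / reduction_phis[0] and the last two with
     new_phis[1] / reduction_phis[1] in the loop below.  */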
4311   if (group_size > new_phis.length ())
4312     {
4313       ratio = group_size / new_phis.length ();
4314       gcc_assert (!(group_size % new_phis.length ()));
4315     }
4316   else
4317     ratio = 1;
4318 
4319   for (k = 0; k < group_size; k++)
4320     {
4321       if (k % ratio == 0)
4322         {
4323           epilog_stmt = new_phis[k / ratio];
4324           reduction_phi = reduction_phis[k / ratio];
4325 	  if (double_reduc)
4326 	    inner_phi = inner_phis[k / ratio];
4327         }
4328 
4329       if (slp_reduc)
4330         {
4331           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4332 
4333           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4334           /* SLP statements can't participate in patterns.  */
4335           gcc_assert (!orig_stmt);
4336           scalar_dest = gimple_assign_lhs (current_stmt);
4337         }
4338 
4339       phis.create (3);
4340       /* Find the loop-closed-use at the loop exit of the original scalar
4341          result.  (The reduction result is expected to have two immediate uses -
4342          one at the latch block, and one at the loop exit).  */
4343       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4344         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4345 	    && !is_gimple_debug (USE_STMT (use_p)))
4346           phis.safe_push (USE_STMT (use_p));
4347 
4348       /* While we expect to have found an exit_phi because of loop-closed-ssa
4349          form, we can end up without one if the scalar cycle is dead.  */
4350 
4351       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4352         {
4353           if (outer_loop)
4354             {
4355               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4356               gimple vect_phi;
4357 
4358               /* FORNOW. Currently not supporting the case that an inner-loop
4359                  reduction is not used in the outer-loop (but only outside the
4360                  outer-loop), unless it is double reduction.  */
4361               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4362                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4363                           || double_reduc);
4364 
4365 	      if (double_reduc)
4366 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4367 	      else
4368 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4369               if (!double_reduc
4370                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4371                       != vect_double_reduction_def)
4372                 continue;
4373 
4374               /* Handle double reduction:
4375 
4376                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4377                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4378                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4379                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4380 
4381                  At that point the regular reduction (stmt2 and stmt3) is
4382                  already vectorized, as well as the exit phi node, stmt4.
4383                  Here we vectorize the phi node of double reduction, stmt1, and
4384                  update all relevant statements.  */
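              /* A sketch of the corresponding vector statements, using the
                 names from the comments below (illustrative only; vs3 and
                 vs4 are made-up names for the vectorized inner cycle):

                 vstmt1: vs1 = phi <vs0, vs2>   - vectorized stmt1 (created below)
                 vstmt2:   vs3 = phi <vs1, vs4> - vectorized stmt2 (exists)
                 vstmt3:   vs4 = vec_use (vs3)  - vectorized stmt3 (exists)
                 vstmt4: vs2 = phi <vs4>        - vectorized stmt4 (exists)

                 The code below creates vstmt1 and wires vs0 (its initial
                 def) and vs2 into it.  */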
4385 
4386               /* Go through all the uses of s2 to find double reduction phi
4387                  node, i.e., stmt1 above.  */
4388               orig_name = PHI_RESULT (exit_phi);
4389               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4390                 {
4391                   stmt_vec_info use_stmt_vinfo;
4392                   stmt_vec_info new_phi_vinfo;
4393                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4394                   basic_block bb = gimple_bb (use_stmt);
4395                   gimple use;
4396 
4397                   /* Check that USE_STMT is really a double reduction
4398                      phi node.  */
4399                   if (gimple_code (use_stmt) != GIMPLE_PHI
4400                       || gimple_phi_num_args (use_stmt) != 2
4401                       || bb->loop_father != outer_loop)
4402                     continue;
4403                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4404                   if (!use_stmt_vinfo
4405                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4406                           != vect_double_reduction_def)
4407 		    continue;
4408 
4409                   /* Create vector phi node for double reduction:
4410                      vs1 = phi <vs0, vs2>
4411                      vs1 was created previously in this function by a call to
4412                        vect_get_vec_def_for_operand and is stored in
4413                        vec_initial_def;
4414                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4415                      vs0 is created here.  */
4416 
4417                   /* Create vector phi node.  */
4418                   vect_phi = create_phi_node (vec_initial_def, bb);
4419                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4420                                     loop_vec_info_for_loop (outer_loop), NULL);
4421                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4422 
4423                   /* Create vs0 - initial def of the double reduction phi.  */
4424                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4425                                              loop_preheader_edge (outer_loop));
4426                   init_def = get_initial_def_for_reduction (stmt,
4427                                                           preheader_arg, NULL);
4428                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4429                                                     vectype, NULL);
4430 
4431                   /* Update phi node arguments with vs0 and vs2.  */
4432                   add_phi_arg (vect_phi, vect_phi_init,
4433                                loop_preheader_edge (outer_loop),
4434                                UNKNOWN_LOCATION);
4435                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4436                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4437                   if (dump_enabled_p ())
4438                     {
4439                       dump_printf_loc (MSG_NOTE, vect_location,
4440 				       "created double reduction phi node: ");
4441                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4442                     }
4443 
4444                   vect_phi_res = PHI_RESULT (vect_phi);
4445 
4446                   /* Replace the use, i.e., set the correct vs1 in the regular
4447                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4448                      loop is redundant.  */
4449                   use = reduction_phi;
4450                   for (j = 0; j < ncopies; j++)
4451                     {
4452                       edge pr_edge = loop_preheader_edge (loop);
4453                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4454                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4455                     }
4456                 }
4457             }
4458         }
4459 
4460       phis.release ();
4461       if (nested_in_vect_loop)
4462         {
4463           if (double_reduc)
4464             loop = outer_loop;
4465           else
4466             continue;
4467         }
4468 
4469       phis.create (3);
4470       /* Find the loop-closed-use at the loop exit of the original scalar
4471          result.  (The reduction result is expected to have two immediate uses,
4472          one at the latch block, and one at the loop exit).  For double
4473          reductions we are looking for exit phis of the outer loop.  */
4474       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4475         {
4476           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4477 	    {
4478 	      if (!is_gimple_debug (USE_STMT (use_p)))
4479 		phis.safe_push (USE_STMT (use_p));
4480 	    }
4481           else
4482             {
4483               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4484                 {
4485                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4486 
4487                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4488                     {
4489                       if (!flow_bb_inside_loop_p (loop,
4490                                              gimple_bb (USE_STMT (phi_use_p)))
4491 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
4492                         phis.safe_push (USE_STMT (phi_use_p));
4493                     }
4494                 }
4495             }
4496         }
4497 
4498       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4499         {
4500           /* Replace the uses:  */
4501           orig_name = PHI_RESULT (exit_phi);
4502           scalar_result = scalar_results[k];
4503           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4504             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4505               SET_USE (use_p, scalar_result);
4506         }
4507 
4508       phis.release ();
4509     }
4510 
4511   scalar_results.release ();
4512   inner_phis.release ();
4513   new_phis.release ();
4514 }
4515 
4516 
4517 /* Function vectorizable_reduction.
4518 
4519    Check if STMT performs a reduction operation that can be vectorized.
4520    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4521    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4522    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4523 
4524    This function also handles reduction idioms (patterns) that have been
4525    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4526    of this form:
4527      X = pattern_expr (arg0, arg1, ..., X)
4528    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4529    sequence that had been detected and replaced by the pattern-stmt (STMT).
4530 
4531    In some cases of reduction patterns, the type of the reduction variable X is
4532    different than the type of the other arguments of STMT.
4533    In such cases, the vectype that is used when transforming STMT into a vector
4534    stmt is different than the vectype that is used to determine the
4535    vectorization factor, because it consists of a different number of elements
4536    than the actual number of elements that are being operated upon in parallel.
4537 
4538    For example, consider an accumulation of shorts into an int accumulator.
4539    On some targets it's possible to vectorize this pattern operating on 8
4540    shorts at a time (hence, the vectype for purposes of determining the
4541    vectorization factor should be V8HI); on the other hand, the vectype that
4542    is used to create the vector form is actually V4SI (the type of the result).
4543 
4544    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4545    indicates what is the actual level of parallelism (V8HI in the example), so
4546    that the right vectorization factor would be derived.  This vectype
4547    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4548    be used to create the vectorized stmt.  The right vectype for the vectorized
4549    stmt is obtained from the type of the result X:
4550         get_vectype_for_scalar_type (TREE_TYPE (X))
4551 
4552    This means that, contrary to "regular" reductions (or "regular" stmts in
4553    general), the following equation:
4554       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4555    does *NOT* necessarily hold for reduction patterns.  */
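/* For instance (an illustrative sketch of the short-into-int example above,
   not a complete testcase):

        short b[N]; int acc;
        for (i = 0; i < N; i++)
          acc += (int) b[i];

   may be recognized as the pattern stmt  acc = WIDEN_SUM_EXPR <b[i], acc>.
   Here STMT_VINFO_VECTYPE is V8HI (the type of b[i], which determines
   VF == 8), whereas the vectype used to create the vectorized stmt is V4SI,
   obtained from get_vectype_for_scalar_type (TREE_TYPE (acc)).  */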
4556 
4557 bool
4558 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4559 			gimple *vec_stmt, slp_tree slp_node)
4560 {
4561   tree vec_dest;
4562   tree scalar_dest;
4563   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4564   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4565   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4566   tree vectype_in = NULL_TREE;
4567   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4568   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4569   enum tree_code code, orig_code, epilog_reduc_code;
4570   enum machine_mode vec_mode;
4571   int op_type;
4572   optab optab, reduc_optab;
4573   tree new_temp = NULL_TREE;
4574   tree def;
4575   gimple def_stmt;
4576   enum vect_def_type dt;
4577   gimple new_phi = NULL;
4578   tree scalar_type;
4579   bool is_simple_use;
4580   gimple orig_stmt;
4581   stmt_vec_info orig_stmt_info;
4582   tree expr = NULL_TREE;
4583   int i;
4584   int ncopies;
4585   int epilog_copies;
4586   stmt_vec_info prev_stmt_info, prev_phi_info;
4587   bool single_defuse_cycle = false;
4588   tree reduc_def = NULL_TREE;
4589   gimple new_stmt = NULL;
4590   int j;
4591   tree ops[3];
4592   bool nested_cycle = false, found_nested_cycle_def = false;
4593   gimple reduc_def_stmt = NULL;
4594   /* The default is that the reduction variable is the last operand of the stmt.  */
4595   int reduc_index = 2;
4596   bool double_reduc = false, dummy;
4597   basic_block def_bb;
4598   struct loop * def_stmt_loop, *outer_loop = NULL;
4599   tree def_arg;
4600   gimple def_arg_stmt;
4601   vec<tree> vec_oprnds0 = vNULL;
4602   vec<tree> vec_oprnds1 = vNULL;
4603   vec<tree> vect_defs = vNULL;
4604   vec<gimple> phis = vNULL;
4605   int vec_num;
4606   tree def0, def1, tem, op0, op1 = NULL_TREE;
4607 
4608   /* In case of a reduction chain we switch to the first stmt in the chain, but
4609      we don't update STMT_INFO, since only the last stmt is marked as reduction
4610      and has reduction properties.  */
4611   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4612     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4613 
4614   if (nested_in_vect_loop_p (loop, stmt))
4615     {
4616       outer_loop = loop;
4617       loop = loop->inner;
4618       nested_cycle = true;
4619     }
4620 
4621   /* 1. Is vectorizable reduction?  */
4622   /* Not supportable if the reduction variable is used in the loop, unless
4623      it's a reduction chain.  */
4624   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4625       && !GROUP_FIRST_ELEMENT (stmt_info))
4626     return false;
4627 
4628   /* Reductions that are not used even in an enclosing outer-loop
4629      are expected to be "live" (used out of the loop).  */
4630   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4631       && !STMT_VINFO_LIVE_P (stmt_info))
4632     return false;
4633 
4634   /* Make sure it was already recognized as a reduction computation.  */
4635   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4636       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4637     return false;
4638 
4639   /* 2. Has this been recognized as a reduction pattern?
4640 
4641      Check if STMT represents a pattern that has been recognized
4642      in earlier analysis stages.  For stmts that represent a pattern,
4643      the STMT_VINFO_RELATED_STMT field records the last stmt in
4644      the original sequence that constitutes the pattern.  */
4645 
4646   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4647   if (orig_stmt)
4648     {
4649       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4650       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4651       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4652     }
4653 
4654   /* 3. Check the operands of the operation.  The first operands are defined
4655         inside the loop body. The last operand is the reduction variable,
4656         which is defined by the loop-header-phi.  */
4657 
4658   gcc_assert (is_gimple_assign (stmt));
4659 
4660   /* Flatten RHS.  */
4661   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4662     {
4663     case GIMPLE_SINGLE_RHS:
4664       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4665       if (op_type == ternary_op)
4666 	{
4667 	  tree rhs = gimple_assign_rhs1 (stmt);
4668 	  ops[0] = TREE_OPERAND (rhs, 0);
4669 	  ops[1] = TREE_OPERAND (rhs, 1);
4670 	  ops[2] = TREE_OPERAND (rhs, 2);
4671 	  code = TREE_CODE (rhs);
4672 	}
4673       else
4674 	return false;
4675       break;
4676 
4677     case GIMPLE_BINARY_RHS:
4678       code = gimple_assign_rhs_code (stmt);
4679       op_type = TREE_CODE_LENGTH (code);
4680       gcc_assert (op_type == binary_op);
4681       ops[0] = gimple_assign_rhs1 (stmt);
4682       ops[1] = gimple_assign_rhs2 (stmt);
4683       break;
4684 
4685     case GIMPLE_TERNARY_RHS:
4686       code = gimple_assign_rhs_code (stmt);
4687       op_type = TREE_CODE_LENGTH (code);
4688       gcc_assert (op_type == ternary_op);
4689       ops[0] = gimple_assign_rhs1 (stmt);
4690       ops[1] = gimple_assign_rhs2 (stmt);
4691       ops[2] = gimple_assign_rhs3 (stmt);
4692       break;
4693 
4694     case GIMPLE_UNARY_RHS:
4695       return false;
4696 
4697     default:
4698       gcc_unreachable ();
4699     }
4700 
4701   if (code == COND_EXPR && slp_node)
4702     return false;
4703 
4704   scalar_dest = gimple_assign_lhs (stmt);
4705   scalar_type = TREE_TYPE (scalar_dest);
4706   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4707       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4708     return false;
4709 
4710   /* Do not try to vectorize bit-precision reductions.  */
4711   if ((TYPE_PRECISION (scalar_type)
4712        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4713     return false;
4714 
4715   /* All uses but the last are expected to be defined in the loop.
4716      The last use is the reduction variable.  In case of nested cycle this
4717      assumption is not true: we use reduc_index to record the index of the
4718      reduction variable.  */
4719   for (i = 0; i < op_type - 1; i++)
4720     {
4721       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4722       if (i == 0 && code == COND_EXPR)
4723         continue;
4724 
4725       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4726 					    &def_stmt, &def, &dt, &tem);
4727       if (!vectype_in)
4728 	vectype_in = tem;
4729       gcc_assert (is_simple_use);
4730 
4731       if (dt != vect_internal_def
4732 	  && dt != vect_external_def
4733 	  && dt != vect_constant_def
4734 	  && dt != vect_induction_def
4735           && !(dt == vect_nested_cycle && nested_cycle))
4736 	return false;
4737 
4738       if (dt == vect_nested_cycle)
4739         {
4740           found_nested_cycle_def = true;
4741           reduc_def_stmt = def_stmt;
4742           reduc_index = i;
4743         }
4744     }
4745 
4746   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4747 					&def_stmt, &def, &dt, &tem);
4748   if (!vectype_in)
4749     vectype_in = tem;
4750   gcc_assert (is_simple_use);
4751   if (!(dt == vect_reduction_def
4752 	|| dt == vect_nested_cycle
4753 	|| ((dt == vect_internal_def || dt == vect_external_def
4754 	     || dt == vect_constant_def || dt == vect_induction_def)
4755 	    && nested_cycle && found_nested_cycle_def)))
4756     {
4757       /* For pattern recognized stmts, orig_stmt might be a reduction,
4758 	 but some helper statements for the pattern might not, or
4759 	 might be COND_EXPRs with reduction uses in the condition.  */
4760       gcc_assert (orig_stmt);
4761       return false;
4762     }
4763   if (!found_nested_cycle_def)
4764     reduc_def_stmt = def_stmt;
4765 
4766   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4767   if (orig_stmt)
4768     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4769                                                        reduc_def_stmt,
4770                                                        !nested_cycle,
4771                                                        &dummy));
4772   else
4773     {
4774       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4775                                              !nested_cycle, &dummy);
4776       /* We changed STMT to be the first stmt in reduction chain, hence we
4777          check that in this case the first element in the chain is STMT.  */
4778       gcc_assert (stmt == tmp
4779                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4780     }
4781 
4782   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4783     return false;
4784 
4785   if (slp_node || PURE_SLP_STMT (stmt_info))
4786     ncopies = 1;
4787   else
4788     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4789                / TYPE_VECTOR_SUBPARTS (vectype_in));
4790 
4791   gcc_assert (ncopies >= 1);
4792 
4793   vec_mode = TYPE_MODE (vectype_in);
4794 
4795   if (code == COND_EXPR)
4796     {
4797       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4798         {
4799           if (dump_enabled_p ())
4800 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4801 			     "unsupported condition in reduction");
4802 
4803           return false;
4804         }
4805     }
4806   else
4807     {
4808       /* 4. Supportable by target?  */
4809 
4810       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4811 	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
4812 	{
4813 	  /* Shifts and rotates are only supported by vectorizable_shifts,
4814 	     not vectorizable_reduction.  */
4815           if (dump_enabled_p ())
4816 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4817 			     "unsupported shift or rotation.");
4818 	  return false;
4819 	}
4820 
4821       /* 4.1. check support for the operation in the loop  */
4822       optab = optab_for_tree_code (code, vectype_in, optab_default);
4823       if (!optab)
4824         {
4825           if (dump_enabled_p ())
4826 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4827 			     "no optab.");
4828 
4829           return false;
4830         }
4831 
4832       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4833         {
4834           if (dump_enabled_p ())
4835             dump_printf (MSG_NOTE, "op not supported by target.");
4836 
4837           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4838               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4839 	          < vect_min_worthwhile_factor (code))
4840             return false;
4841 
4842           if (dump_enabled_p ())
4843   	    dump_printf (MSG_NOTE, "proceeding using word mode.");
4844         }
4845 
4846       /* Worthwhile without SIMD support?  */
4847       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4848           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4849    	     < vect_min_worthwhile_factor (code))
4850         {
4851           if (dump_enabled_p ())
4852 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4853 			     "not worthwhile without SIMD support.");
4854 
4855           return false;
4856         }
4857     }
4858 
4859   /* 4.2. Check support for the epilog operation.
4860 
4861           If STMT represents a reduction pattern, then the type of the
4862           reduction variable may be different than the type of the rest
4863           of the arguments.  For example, consider the case of accumulation
4864           of shorts into an int accumulator.  The original code:
4865                         S1: int_a = (int) short_a;
4866           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4867 
4868           was replaced with:
4869                         STMT: int_acc = widen_sum <short_a, int_acc>
4870 
4871           This means that:
4872           1. The tree-code that is used to create the vector operation in the
4873              epilog code (that reduces the partial results) is not the
4874              tree-code of STMT, but is rather the tree-code of the original
4875              stmt from the pattern that STMT is replacing.  I.e, in the example
4876              above we want to use 'widen_sum' in the loop, but 'plus' in the
4877              epilog.
4878           2. The type (mode) we use to check available target support
4879              for the vector operation to be created in the *epilog*, is
4880              determined by the type of the reduction variable (in the example
4881              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4882              However the type (mode) we use to check available target support
4883              for the vector operation to be created *inside the loop*, is
4884              determined by the type of the other arguments to STMT (in the
4885              example we'd check this: optab_handler (widen_sum_optab,
4886 	     vect_short_mode)).
4887 
4888           This is contrary to "regular" reductions, in which the types of all
4889           the arguments are the same as the type of the reduction variable.
4890           For "regular" reductions we can therefore use the same vector type
4891           (and also the same tree-code) when generating the epilog code and
4892           when generating the code inside the loop.  */
4893 
4894   if (orig_stmt)
4895     {
4896       /* This is a reduction pattern: get the vectype from the type of the
4897          reduction variable, and get the tree-code from orig_stmt.  */
4898       orig_code = gimple_assign_rhs_code (orig_stmt);
4899       gcc_assert (vectype_out);
4900       vec_mode = TYPE_MODE (vectype_out);
4901     }
4902   else
4903     {
4904       /* Regular reduction: the same vectype and tree-code as used for
4905          the vector code inside the loop can also be used for the epilog code. */
4906       orig_code = code;
4907     }
4908 
4909   if (nested_cycle)
4910     {
4911       def_bb = gimple_bb (reduc_def_stmt);
4912       def_stmt_loop = def_bb->loop_father;
4913       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4914                                        loop_preheader_edge (def_stmt_loop));
4915       if (TREE_CODE (def_arg) == SSA_NAME
4916           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4917           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4918           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4919           && vinfo_for_stmt (def_arg_stmt)
4920           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4921               == vect_double_reduction_def)
4922         double_reduc = true;
4923     }
4924 
4925   epilog_reduc_code = ERROR_MARK;
4926   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4927     {
4928       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4929                                          optab_default);
4930       if (!reduc_optab)
4931         {
4932           if (dump_enabled_p ())
4933 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4934 			     "no optab for reduction.");
4935 
4936           epilog_reduc_code = ERROR_MARK;
4937         }
4938 
4939       if (reduc_optab
4940           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4941         {
4942           if (dump_enabled_p ())
4943 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4944 			     "reduc op not supported by target.");
4945 
4946           epilog_reduc_code = ERROR_MARK;
4947         }
4948     }
4949   else
4950     {
4951       if (!nested_cycle || double_reduc)
4952         {
4953           if (dump_enabled_p ())
4954 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4955 			     "no reduc code for scalar code.");
4956 
4957           return false;
4958         }
4959     }
4960 
4961   if (double_reduc && ncopies > 1)
4962     {
4963       if (dump_enabled_p ())
4964 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4965 			 "multiple types in double reduction");
4966 
4967       return false;
4968     }
4969 
4970   /* In case of widening multiplication by a constant, we update the type
4971      of the constant to be the type of the other operand.  We check that the
4972      constant fits the type in the pattern recognition pass.  */
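  /* For instance (hypothetical operands): for DOT_PROD_EXPR <b[i], 3, acc>,
     where b[i] is a short and 3 is an int constant, the constant is folded
     to (short) 3 so that both multiplied operands have compatible types.  */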
4973   if (code == DOT_PROD_EXPR
4974       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4975     {
4976       if (TREE_CODE (ops[0]) == INTEGER_CST)
4977         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4978       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4979         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4980       else
4981         {
4982           if (dump_enabled_p ())
4983 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4984 			     "invalid types in dot-prod");
4985 
4986           return false;
4987         }
4988     }
4989 
4990   if (!vec_stmt) /* transformation not required.  */
4991     {
4992       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4993         return false;
4994       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4995       return true;
4996     }
4997 
4998   /** Transform.  **/
4999 
5000   if (dump_enabled_p ())
5001     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
5002 
5003   /* FORNOW: Multiple types are not supported for condition.  */
5004   if (code == COND_EXPR)
5005     gcc_assert (ncopies == 1);
5006 
5007   /* Create the destination vector  */
5008   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5009 
5010   /* In case the vectorization factor (VF) is bigger than the number
5011      of elements that we can fit in a vectype (nunits), we have to generate
5012      more than one vector stmt - i.e., we need to "unroll" the vector stmt
5013      by a factor VF/nunits (e.g. VF == 8 and nunits == 4 give ncopies == 2).
5014      For more details see documentation in vectorizable_operation.  */
5015 
5016   /* If the reduction is used in an outer loop we need to generate
5017      VF intermediate results, like so (e.g. for ncopies=2):
5018 	r0 = phi (init, r0)
5019 	r1 = phi (init, r1)
5020 	r0 = x0 + r0;
5021         r1 = x1 + r1;
5022     (i.e. we generate VF results in 2 registers).
5023     In this case we have a separate def-use cycle for each copy, and therefore
5024     for each copy we get the vector def for the reduction variable from the
5025     respective phi node created for this copy.
5026 
5027     Otherwise (the reduction is unused in the loop nest), we can combine
5028     together intermediate results, like so (e.g. for ncopies=2):
5029 	r = phi (init, r)
5030 	r = x0 + r;
5031 	r = x1 + r;
5032    (i.e. we generate VF/2 results in a single register).
5033    In this case for each copy we get the vector def for the reduction variable
5034    from the vectorized reduction operation generated in the previous iteration.
5035   */
5036 
5037   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5038     {
5039       single_defuse_cycle = true;
5040       epilog_copies = 1;
5041     }
5042   else
5043     epilog_copies = ncopies;
5044 
5045   prev_stmt_info = NULL;
5046   prev_phi_info = NULL;
5047   if (slp_node)
5048     {
5049       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5050       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5051                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5052     }
5053   else
5054     {
5055       vec_num = 1;
5056       vec_oprnds0.create (1);
5057       if (op_type == ternary_op)
5058         vec_oprnds1.create (1);
5059     }
5060 
5061   phis.create (vec_num);
5062   vect_defs.create (vec_num);
5063   if (!slp_node)
5064     vect_defs.quick_push (NULL_TREE);
5065 
5066   for (j = 0; j < ncopies; j++)
5067     {
5068       if (j == 0 || !single_defuse_cycle)
5069 	{
5070           for (i = 0; i < vec_num; i++)
5071             {
5072               /* Create the reduction-phi that defines the reduction
5073                  operand.  */
5074               new_phi = create_phi_node (vec_dest, loop->header);
5075               set_vinfo_for_stmt (new_phi,
5076                                   new_stmt_vec_info (new_phi, loop_vinfo,
5077                                                      NULL));
5078                if (j == 0 || slp_node)
5079                  phis.quick_push (new_phi);
5080             }
5081         }
5082 
5083       if (code == COND_EXPR)
5084         {
5085           gcc_assert (!slp_node);
5086           vectorizable_condition (stmt, gsi, vec_stmt,
5087                                   PHI_RESULT (phis[0]),
5088                                   reduc_index, NULL);
5089           /* Multiple types are not supported for condition.  */
5090           break;
5091         }
5092 
5093       /* Handle uses.  */
5094       if (j == 0)
5095         {
5096           op0 = ops[!reduc_index];
5097           if (op_type == ternary_op)
5098             {
5099               if (reduc_index == 0)
5100                 op1 = ops[2];
5101               else
5102                 op1 = ops[1];
5103             }
5104 
5105           if (slp_node)
5106             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5107                                slp_node, -1);
5108           else
5109             {
5110               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5111                                                             stmt, NULL);
5112               vec_oprnds0.quick_push (loop_vec_def0);
5113               if (op_type == ternary_op)
5114                {
5115                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5116                                                                NULL);
5117                  vec_oprnds1.quick_push (loop_vec_def1);
5118                }
5119             }
5120         }
5121       else
5122         {
5123           if (!slp_node)
5124             {
5125               enum vect_def_type dt;
5126               gimple dummy_stmt;
5127               tree dummy;
5128 
5129               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5130                                   &dummy_stmt, &dummy, &dt);
5131               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5132                                                               loop_vec_def0);
5133               vec_oprnds0[0] = loop_vec_def0;
5134               if (op_type == ternary_op)
5135                 {
5136                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5137                                       &dummy, &dt);
5138                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5139                                                                 loop_vec_def1);
5140                   vec_oprnds1[0] = loop_vec_def1;
5141                 }
5142             }
5143 
5144           if (single_defuse_cycle)
5145             reduc_def = gimple_assign_lhs (new_stmt);
5146 
5147           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5148         }
5149 
5150       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5151         {
5152           if (slp_node)
5153             reduc_def = PHI_RESULT (phis[i]);
5154           else
5155             {
5156               if (!single_defuse_cycle || j == 0)
5157                 reduc_def = PHI_RESULT (new_phi);
5158             }
5159 
5160           def1 = ((op_type == ternary_op)
5161                   ? vec_oprnds1[i] : NULL);
5162           if (op_type == binary_op)
5163             {
5164               if (reduc_index == 0)
5165                 expr = build2 (code, vectype_out, reduc_def, def0);
5166               else
5167                 expr = build2 (code, vectype_out, def0, reduc_def);
5168             }
5169           else
5170             {
5171               if (reduc_index == 0)
5172                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5173               else
5174                 {
5175                   if (reduc_index == 1)
5176                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5177                   else
5178                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5179                 }
5180             }
5181 
5182           new_stmt = gimple_build_assign (vec_dest, expr);
5183           new_temp = make_ssa_name (vec_dest, new_stmt);
5184           gimple_assign_set_lhs (new_stmt, new_temp);
5185           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5186 
5187           if (slp_node)
5188             {
5189               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5190               vect_defs.quick_push (new_temp);
5191             }
5192           else
5193             vect_defs[0] = new_temp;
5194         }
5195 
5196       if (slp_node)
5197         continue;
5198 
5199       if (j == 0)
5200 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5201       else
5202 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5203 
5204       prev_stmt_info = vinfo_for_stmt (new_stmt);
5205       prev_phi_info = vinfo_for_stmt (new_phi);
5206     }
5207 
5208   /* Finalize the reduction-phi (set its arguments) and create the
5209      epilog reduction code.  */
5210   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5211     {
5212       new_temp = gimple_assign_lhs (*vec_stmt);
5213       vect_defs[0] = new_temp;
5214     }
5215 
5216   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5217                                     epilog_reduc_code, phis, reduc_index,
5218                                     double_reduc, slp_node);
5219 
5220   phis.release ();
5221   vect_defs.release ();
5222   vec_oprnds0.release ();
5223   vec_oprnds1.release ();
5224 
5225   return true;
5226 }
5227 
5228 /* Function vect_min_worthwhile_factor.
5229 
5230    For a loop where we could vectorize the operation indicated by CODE,
5231    return the minimum vectorization factor that makes it worthwhile
5232    to use generic vectors.  */
5233 int
5234 vect_min_worthwhile_factor (enum tree_code code)
5235 {
5236   switch (code)
5237     {
5238     case PLUS_EXPR:
5239     case MINUS_EXPR:
5240     case NEGATE_EXPR:
5241       return 4;
5242 
5243     case BIT_AND_EXPR:
5244     case BIT_IOR_EXPR:
5245     case BIT_XOR_EXPR:
5246     case BIT_NOT_EXPR:
5247       return 2;
5248 
5249     default:
5250       return INT_MAX;
5251     }
5252 }
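
/* For example (a sketch of how the factor above is used): for PLUS_EXPR the
   function returns 4, so in vectorizable_reduction a plus-reduction whose
   operation has no vector-mode support is only carried out in word mode
   when LOOP_VINFO_VECT_FACTOR is at least 4.  */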
5253 
5254 
5255 /* Function vectorizable_induction
5256 
5257    Check if PHI performs an induction computation that can be vectorized.
5258    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5259    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5260    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
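
/* For instance (an illustrative sketch, not from a particular testcase):
   for the scalar induction

        i_1 = PHI <0, i_2>
        i_2 = i_1 + 1

   with VF == 4, get_initial_def_for_induction is expected to build a vector
   phi whose initial value is {0, 1, 2, 3} and whose per-iteration step is
   the broadcast vector {4, 4, 4, 4}.  */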
5261 
5262 bool
5263 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5264 			gimple *vec_stmt)
5265 {
5266   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5267   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5268   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5269   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5270   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5271   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5272   tree vec_def;
5273 
5274   gcc_assert (ncopies >= 1);
5275   /* FORNOW. These restrictions should be relaxed.  */
5276   if (nested_in_vect_loop_p (loop, phi))
5277     {
5278       imm_use_iterator imm_iter;
5279       use_operand_p use_p;
5280       gimple exit_phi;
5281       edge latch_e;
5282       tree loop_arg;
5283 
5284       if (ncopies > 1)
5285 	{
5286 	  if (dump_enabled_p ())
5287 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5288 			     "multiple types in nested loop.");
5289 	  return false;
5290 	}
5291 
5292       exit_phi = NULL;
5293       latch_e = loop_latch_edge (loop->inner);
5294       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5295       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5296 	{
5297 	  if (!flow_bb_inside_loop_p (loop->inner,
5298 				      gimple_bb (USE_STMT (use_p))))
5299 	    {
5300 	      exit_phi = USE_STMT (use_p);
5301 	      break;
5302 	    }
5303 	}
5304       if (exit_phi)
5305 	{
5306 	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5307 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5308 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5309 	    {
5310 	      if (dump_enabled_p ())
5311 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5312 				 "inner-loop induction only used outside "
5313 				 "of the outer vectorized loop.");
5314 	      return false;
5315 	    }
5316 	}
5317     }
5318 
5319   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5320     return false;
5321 
5322   /* FORNOW: SLP not supported.  */
5323   if (STMT_SLP_TYPE (stmt_info))
5324     return false;
5325 
5326   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5327 
5328   if (gimple_code (phi) != GIMPLE_PHI)
5329     return false;
5330 
5331   if (!vec_stmt) /* transformation not required.  */
5332     {
5333       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5334       if (dump_enabled_p ())
5335         dump_printf_loc (MSG_NOTE, vect_location,
5336                          "=== vectorizable_induction ===");
5337       vect_model_induction_cost (stmt_info, ncopies);
5338       return true;
5339     }
5340 
5341   /** Transform.  **/
5342 
5343   if (dump_enabled_p ())
5344     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5345 
5346   vec_def = get_initial_def_for_induction (phi);
5347   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5348   return true;
5349 }
5350 
5351 /* Function vectorizable_live_operation.
5352 
5353    STMT computes a value that is used outside the loop.  Check if
5354    it can be supported.  */
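
/* For instance (a sketch): in

        for (i = 0; i < n; i++)
          {
            t_5 = x_3 + 1;
            ...
          }
        ... = use of t_5 after the loop

   t_5 is "live", and its only operand x_3 is defined outside the loop
   (vect_external_def), so the scalar stmt can simply be left in place and
   its last computed value used directly.  */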
5355 
5356 bool
5357 vectorizable_live_operation (gimple stmt,
5358 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5359 			     gimple *vec_stmt ATTRIBUTE_UNUSED)
5360 {
5361   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5362   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5363   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5364   int i;
5365   int op_type;
5366   tree op;
5367   tree def;
5368   gimple def_stmt;
5369   enum vect_def_type dt;
5370   enum tree_code code;
5371   enum gimple_rhs_class rhs_class;
5372 
5373   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5374 
5375   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5376     return false;
5377 
5378   if (!is_gimple_assign (stmt))
5379     return false;
5380 
5381   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5382     return false;
5383 
5384   /* FORNOW. CHECKME. */
5385   if (nested_in_vect_loop_p (loop, stmt))
5386     return false;
5387 
5388   code = gimple_assign_rhs_code (stmt);
5389   op_type = TREE_CODE_LENGTH (code);
5390   rhs_class = get_gimple_rhs_class (code);
5391   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5392   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5393 
5394   /* FORNOW: support only if all uses are invariant.  This means
5395      that the scalar operations can remain in place, unvectorized.
5396      The original last scalar value that they compute will be used.  */
5397 
5398   for (i = 0; i < op_type; i++)
5399     {
5400       if (rhs_class == GIMPLE_SINGLE_RHS)
5401 	op = TREE_OPERAND (gimple_op (stmt, 1), i);
5402       else
5403 	op = gimple_op (stmt, i + 1);
5404       if (op
5405           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5406 				  &dt))
5407         {
5408           if (dump_enabled_p ())
5409 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5410 			     "use not simple.");
5411           return false;
5412         }
5413 
5414       if (dt != vect_external_def && dt != vect_constant_def)
5415         return false;
5416     }
5417 
5418   /* No transformation is required for the cases we currently support.  */
5419   return true;
5420 }
5421 
5422 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5423 
5424 static void
5425 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5426 {
5427   ssa_op_iter op_iter;
5428   imm_use_iterator imm_iter;
5429   def_operand_p def_p;
5430   gimple ustmt;
5431 
5432   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5433     {
5434       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5435 	{
5436 	  basic_block bb;
5437 
5438 	  if (!is_gimple_debug (ustmt))
5439 	    continue;
5440 
5441 	  bb = gimple_bb (ustmt);
5442 
5443 	  if (!flow_bb_inside_loop_p (loop, bb))
5444 	    {
5445 	      if (gimple_debug_bind_p (ustmt))
5446 		{
5447 		  if (dump_enabled_p ())
5448 		    dump_printf_loc (MSG_NOTE, vect_location,
5449                                      "killing debug use");
5450 
5451 		  gimple_debug_bind_reset_value (ustmt);
5452 		  update_stmt (ustmt);
5453 		}
5454 	      else
5455 		gcc_unreachable ();
5456 	    }
5457 	}
5458     }
5459 }
5460 
5461 /* Function vect_transform_loop.
5462 
5463    The analysis phase has determined that the loop is vectorizable.
5464    Vectorize the loop - created vectorized stmts to replace the scalar
5465    stmts in the loop, and update the loop exit condition.  */
5466 
5467 void
5468 vect_transform_loop (loop_vec_info loop_vinfo)
5469 {
5470   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5471   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5472   int nbbs = loop->num_nodes;
5473   gimple_stmt_iterator si;
5474   int i;
5475   tree ratio = NULL;
5476   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5477   bool grouped_store;
5478   bool slp_scheduled = false;
5479   unsigned int nunits;
5480   gimple stmt, pattern_stmt;
5481   gimple_seq pattern_def_seq = NULL;
5482   gimple_stmt_iterator pattern_def_si = gsi_none ();
5483   bool transform_pattern_stmt = false;
5484   bool check_profitability = false;
5485   int th;
5486   /* Record number of iterations before we started tampering with the profile. */
5487   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5488 
5489   if (dump_enabled_p ())
5490     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5491 
5492   /* If the profile is imprecise, we have a chance to fix it up.  */
5493   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5494     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5495 
5496   /* Use the more conservative vectorization threshold.  If the number
5497      of iterations is constant assume the cost check has been performed
5498      by our caller.  If the threshold makes all loops profitable that
5499      run at least the vectorization factor number of times checking
5500      is pointless, too.  */
5501   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5502 	 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5503   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5504   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5505       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5506     {
5507       if (dump_enabled_p ())
5508 	dump_printf_loc (MSG_NOTE, vect_location,
5509 			 "Profitability threshold is %d loop iterations.", th);
5510       check_profitability = true;
5511     }
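
  /* A hypothetical worked example (parameter values invented for
     illustration): with PARAM_MIN_VECT_LOOP_BOUND == 2, VF == 4 and a
     cost-model minimum of 10 iterations, th = MAX (2 * 4 - 1, 10) == 10;
     since 10 >= VF - 1 and the number of iterations is not known at compile
     time, a runtime profitability check will be emitted.  */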
5512 
5513   /* Peel the loop if there are data refs with unknown alignment.
5514      Only one data ref with unknown alignment is allowed.  */
5515 
5516   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5517     {
5518       vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5519       check_profitability = false;
5520     }
5521 
5522   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5523       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5524     {
5525       vect_loop_versioning (loop_vinfo, th, check_profitability);
5526       check_profitability = false;
5527     }
5528 
5529   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5530      compile time constant), or it is a constant that is not divisible by the
5531      vectorization factor, then an epilog loop needs to be created.
5532      We therefore duplicate the loop: the original loop will be vectorized,
5533      and will compute the first (n/VF) iterations.  The second copy of the loop
5534      will remain scalar and will compute the remaining (n%VF) iterations.
5535      (VF is the vectorization factor).  */
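
  /* For example (illustrative numbers): with n == 103 and VF == 8, the
     vectorized loop executes 103/8 == 12 iterations (covering 96 elements)
     and the scalar epilog loop executes the remaining 103%8 == 7
     iterations.  */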
5536 
5537   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5538        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5539 	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5540        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5541     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5542 				    th, check_profitability);
5543   else
5544     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5545 		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5546 
5547   /* 1) Make sure the loop header has exactly two entries
5548      2) Make sure we have a preheader basic block.  */
5549 
5550   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5551 
5552   split_edge (loop_preheader_edge (loop));
5553 
5554   /* FORNOW: the vectorizer supports only loops whose body consists
5555      of one basic block (header + empty latch).  When the vectorizer
5556      supports more involved loop forms, the order in which the BBs are
5557      traversed will need to be reconsidered.  */
5558 
5559   for (i = 0; i < nbbs; i++)
5560     {
5561       basic_block bb = bbs[i];
5562       stmt_vec_info stmt_info;
5563       gimple phi;
5564 
5565       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5566         {
5567 	  phi = gsi_stmt (si);
5568 	  if (dump_enabled_p ())
5569 	    {
5570 	      dump_printf_loc (MSG_NOTE, vect_location,
5571                                "------>vectorizing phi: ");
5572 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5573 	    }
5574 	  stmt_info = vinfo_for_stmt (phi);
5575 	  if (!stmt_info)
5576 	    continue;
5577 
5578 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5579 	    vect_loop_kill_debug_uses (loop, phi);
5580 
5581 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
5582 	      && !STMT_VINFO_LIVE_P (stmt_info))
5583 	    continue;
5584 
5585 	  if (STMT_VINFO_VECTYPE (stmt_info)
5586 	      && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5587 		  != (unsigned HOST_WIDE_INT) vectorization_factor)
5588 	      && dump_enabled_p ())
5589 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5590 
5591 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5592 	    {
5593 	      if (dump_enabled_p ())
5594 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5595 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5596 	    }
5597 	}
5598 
5599       pattern_stmt = NULL;
5600       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5601 	{
5602 	  bool is_store;
5603 
5604           if (transform_pattern_stmt)
5605 	    stmt = pattern_stmt;
5606           else
5607             stmt = gsi_stmt (si);
5608 
5609 	  if (dump_enabled_p ())
5610 	    {
5611 	      dump_printf_loc (MSG_NOTE, vect_location,
5612 			       "------>vectorizing statement: ");
5613 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5614 	    }
5615 
5616 	  stmt_info = vinfo_for_stmt (stmt);
5617 
5618 	  /* vector stmts created in the outer-loop during vectorization of
5619 	     stmts in an inner-loop may not have a stmt_info, and do not
5620 	     need to be vectorized.  */
5621 	  if (!stmt_info)
5622 	    {
5623 	      gsi_next (&si);
5624 	      continue;
5625 	    }
5626 
5627 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5628 	    vect_loop_kill_debug_uses (loop, stmt);
5629 
5630 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
5631 	      && !STMT_VINFO_LIVE_P (stmt_info))
5632             {
5633               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5634                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5635                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5636                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5637                 {
5638                   stmt = pattern_stmt;
5639                   stmt_info = vinfo_for_stmt (stmt);
5640                 }
5641               else
5642 	        {
5643    	          gsi_next (&si);
5644 	          continue;
5645                 }
5646 	    }
5647           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5648                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5649                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5650                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5651             transform_pattern_stmt = true;
5652 
5653 	  /* If pattern statement has def stmts, vectorize them too.  */
5654 	  if (is_pattern_stmt_p (stmt_info))
5655 	    {
5656 	      if (pattern_def_seq == NULL)
5657 		{
5658 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5659 		  pattern_def_si = gsi_start (pattern_def_seq);
5660 		}
5661 	      else if (!gsi_end_p (pattern_def_si))
5662 		gsi_next (&pattern_def_si);
5663 	      if (pattern_def_seq != NULL)
5664 		{
5665 		  gimple pattern_def_stmt = NULL;
5666 		  stmt_vec_info pattern_def_stmt_info = NULL;
5667 
5668 		  while (!gsi_end_p (pattern_def_si))
5669 		    {
5670 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
5671 		      pattern_def_stmt_info
5672 			= vinfo_for_stmt (pattern_def_stmt);
5673 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5674 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5675 			break;
5676 		      gsi_next (&pattern_def_si);
5677 		    }
5678 
5679 		  if (!gsi_end_p (pattern_def_si))
5680 		    {
5681 		      if (dump_enabled_p ())
5682 			{
5683 			  dump_printf_loc (MSG_NOTE, vect_location,
5684 					   "==> vectorizing pattern def "
5685 					   "stmt: ");
5686 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5687 					    pattern_def_stmt, 0);
5688 			}
5689 
5690 		      stmt = pattern_def_stmt;
5691 		      stmt_info = pattern_def_stmt_info;
5692 		    }
5693 		  else
5694 		    {
5695 		      pattern_def_si = gsi_none ();
5696 		      transform_pattern_stmt = false;
5697 		    }
5698 		}
5699 	      else
5700 		transform_pattern_stmt = false;
5701             }
5702 
5703 	  gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5704 	  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5705                                                STMT_VINFO_VECTYPE (stmt_info));
5706 	  if (!STMT_SLP_TYPE (stmt_info)
5707 	      && nunits != (unsigned int) vectorization_factor
5708               && dump_enabled_p ())
5709 	    /* For SLP VF is set according to unrolling factor, and not to
5710 	       vector size, hence for SLP this print is not valid.  */
5711             dump_printf_loc (MSG_NOTE, vect_location,
5712 			     "multiple-types.");
5713 
5714 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
5715 	     reached.  */
5716 	  if (STMT_SLP_TYPE (stmt_info))
5717 	    {
5718 	      if (!slp_scheduled)
5719 		{
5720 		  slp_scheduled = true;
5721 
5722 		  if (dump_enabled_p ())
5723 		    dump_printf_loc (MSG_NOTE, vect_location,
5724 				     "=== scheduling SLP instances ===");
5725 
5726 		  vect_schedule_slp (loop_vinfo, NULL);
5727 		}
5728 
5729 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5730 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5731 		{
5732 		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5733 		    {
5734 		      pattern_def_seq = NULL;
5735 		      gsi_next (&si);
5736 		    }
5737 		  continue;
5738 		}
5739 	    }
5740 
5741 	  /* -------- vectorize statement ------------ */
5742 	  if (dump_enabled_p ())
5743 	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5744 
5745 	  grouped_store = false;
5746 	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5747           if (is_store)
5748             {
5749 	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5750 		{
5751 		  /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5752 		     interleaving chain was completed - free all the stores in
5753 		     the chain.  */
5754 		  gsi_next (&si);
5755 		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5756  		  continue;
5757 		}
5758 	      else
5759 		{
5760 		  /* Free the attached stmt_vec_info and remove the stmt.  */
5761 		  gimple store = gsi_stmt (si);
5762 		  free_stmt_vec_info (store);
5763 		  unlink_stmt_vdef (store);
5764 		  gsi_remove (&si, true);
5765 		  release_defs (store);
5766 		  continue;
5767 		}
5768 	    }
5769 
5770 	  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5771 	    {
5772 	      pattern_def_seq = NULL;
5773 	      gsi_next (&si);
5774 	    }
5775 	}		        /* stmts in BB */
5776     }				/* BBs in loop */
5777 
5778   slpeel_make_loop_iterate_ntimes (loop, ratio);
5779 
5780   /* Reduce loop iterations by the vectorization factor.  */
5781   scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
5782 		      expected_iterations / vectorization_factor);
5783   loop->nb_iterations_upper_bound
5784     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5785 					    FLOOR_DIV_EXPR);
5786   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5787       && loop->nb_iterations_upper_bound != double_int_zero)
5788     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5789   if (loop->any_estimate)
5790     {
5791       loop->nb_iterations_estimate
5792         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5793 					     FLOOR_DIV_EXPR);
5794        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5795 	   && loop->nb_iterations_estimate != double_int_zero)
5796 	 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5797     }
5798 
5799   /* The memory tags and pointers in vectorized statements need to
5800      have their SSA forms updated.  FIXME, why can't this be delayed
5801      until all the loops have been transformed?  */
5802   update_ssa (TODO_update_ssa);
5803 
5804   if (dump_enabled_p ())
5805     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5806   if (loop->inner && dump_enabled_p ())
5807     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5808 		     "OUTER LOOP VECTORIZED.");
5809 }
5810