1 /* Loop Vectorization
2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
43 
44 /* Loop Vectorization Pass.
45 
46    This pass tries to vectorize loops.
47 
48    For example, the vectorizer transforms the following simple loop:
49 
50         short a[N]; short b[N]; short c[N]; int i;
51 
52         for (i=0; i<N; i++){
53           a[i] = b[i] + c[i];
54         }
55 
56    as if it was manually vectorized by rewriting the source code into:
57 
58         typedef int __attribute__((mode(V8HI))) v8hi;
59         short a[N];  short b[N]; short c[N];   int i;
60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61         v8hi va, vb, vc;
62 
63         for (i=0; i<N/8; i++){
64           vb = pb[i];
65           vc = pc[i];
66           va = vb + vc;
67           pa[i] = va;
68         }
69 
70         The main entry to this pass is vectorize_loops(), in which
71    the vectorizer applies a set of analyses on a given set of loops,
72    followed by the actual vectorization transformation for the loops that
73    had successfully passed the analysis phase.
74         Throughout this pass we make a distinction between two types of
75    data: scalars (which are represented by SSA_NAMES), and memory references
76    ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
81 
82    Analysis phase:
83    ===============
84         The driver for the analysis phase is vect_analyze_loop().
85    It applies a set of analyses, some of which rely on the scalar evolution
86    analyzer (scev) developed by Sebastian Pop.
87 
88         During the analysis phase the vectorizer records some information
89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90    loop, as well as general information about the loop as a whole, which is
91    recorded in a "loop_vec_info" struct attached to each loop.
92 
93    Transformation phase:
94    =====================
95         The loop transformation phase scans all the stmts in the loop, and
96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97    the loop that needs to be vectorized.  It inserts the vector code sequence
98    just before the scalar stmt S, and records a pointer to the vector code
99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100    attached to S).  This pointer will be used for the vectorization of following
101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102    otherwise, we rely on dead code elimination for removing it.
103 
104         For example, say stmt S1 was vectorized into stmt VS1:
105 
106    VS1: vb = px[i];
107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108    S2:  a = b;
109 
110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
113    resulting sequence would be:
114 
115    VS1: vb = px[i];
116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117    VS2: va = vb;
118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
119 
        Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
122 
123    Target modeling:
124    =================
125         Currently the only target specific information that is used is the
126    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127    Targets that can support different sizes of vectors, for now will need
128    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
129    flexibility will be added in the future.
130 
        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
137 
138    For additional information on this project see:
139    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
140 */
141 
/* Forward declaration of a file-local (static) function, so that it can be
   referenced before its definition.  */
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
143 
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   The function walks the PHIs and the statements of every basic block of the
   loop described by LOOP_VINFO.  For each relevant PHI, and for each relevant
   or live statement (or the pattern statement / pattern def statements that
   stand in for it), it obtains a vector type, records it in
   STMT_VINFO_VECTYPE if none was recorded yet, and keeps the largest number
   of vector subparts seen so far as the VF.

   Returns false (dumping the reason when dumping is enabled) if:
     - no vector type is available for one of the scalar types involved;
     - a statement has no LHS;
     - the type of a statement's expression already has a vector mode;
     - within one statement, the statement's vector type and the vector type
       of its smallest scalar type have different mode sizes; or
     - the resulting VF is <= 1.
   Otherwise stores the VF in LOOP_VINFO_VECT_FACTOR (LOOP_VINFO) and returns
   true.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;
  gimple stmt, pattern_stmt = NULL;
  /* State of the walk over the def sequence of the pattern stmt currently
     being analyzed; both are reset once that sequence is exhausted.  */
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  /* True when the next iteration of the statement loop must analyze
     PATTERN_STMT instead of fetching a statement from SI.  */
  bool analyze_pattern_stmt = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      /* PHIs: give every relevant PHI a vectype derived from the type of its
	 result, and fold its number of subparts into the VF.  */
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  phi = gsi_stmt (si);
	  stmt_info = vinfo_for_stmt (phi);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
	    }

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type:  ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		}

	      vectype = get_vectype_for_scalar_type (scalar_type);
	      if (!vectype)
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
		    }
		  return false;
		}
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
		}

	      nunits = TYPE_VECTOR_SUBPARTS (vectype);
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);

	      /* Keep the largest number of subparts seen so far.  */
	      if (!vectorization_factor
		  || (nunits > vectorization_factor))
		vectorization_factor = nunits;
	    }
	}

      /* Statements.  SI is advanced explicitly (in the skip path below, or at
	 the bottom of the loop body) rather than in the loop header, because
	 an iteration may stay on the same position in order to analyze a
	 pattern stmt or one of its pattern def stmts.  */
      for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
        {
          tree vf_vectype;

          if (analyze_pattern_stmt)
	    stmt = pattern_stmt;
          else
            stmt = gsi_stmt (si);

          stmt_info = vinfo_for_stmt (stmt);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining statement: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	    }

	  gcc_assert (stmt_info);

	  /* Skip stmts which do not need to be vectorized.  */
	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
            {
	      /* STMT itself is neither relevant nor live.  If it is part of a
		 pattern whose related stmt is relevant or live, analyze that
		 related stmt in its place; otherwise move on.  */
              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                {
                  stmt = pattern_stmt;
                  stmt_info = vinfo_for_stmt (pattern_stmt);
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "==> examining pattern statement: ");
                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                    }
                }
              else
	        {
	          if (dump_enabled_p ())
	            dump_printf_loc (MSG_NOTE, vect_location, "skip.");
                  gsi_next (&si);
	          continue;
                }
	    }
	  /* STMT is relevant or live, and so is its related pattern stmt:
	     analyze STMT in this iteration and request that PATTERN_STMT be
	     analyzed in a following one.  */
          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
            analyze_pattern_stmt = true;

	  /* If a pattern statement has def stmts, analyze them too.  */
	  if (is_pattern_stmt_p (stmt_info))
	    {
	      /* First visit of this pattern stmt: fetch its def sequence and
		 start iterating over it.  On later visits, step past the def
		 stmt that was analyzed in the previous iteration.  */
	      if (pattern_def_seq == NULL)
		{
		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		  pattern_def_si = gsi_start (pattern_def_seq);
		}
	      else if (!gsi_end_p (pattern_def_si))
		gsi_next (&pattern_def_si);
	      if (pattern_def_seq != NULL)
		{
		  gimple pattern_def_stmt = NULL;
		  stmt_vec_info pattern_def_stmt_info = NULL;

		  /* Find the next def stmt that is relevant or live.  */
		  while (!gsi_end_p (pattern_def_si))
		    {
		      pattern_def_stmt = gsi_stmt (pattern_def_si);
		      pattern_def_stmt_info
			= vinfo_for_stmt (pattern_def_stmt);
		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
			break;
		      gsi_next (&pattern_def_si);
		    }

		  if (!gsi_end_p (pattern_def_si))
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> examining pattern def stmt: ");
			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
			}

		      /* Analyze the def stmt in this iteration; SI is not
			 advanced at the bottom while PATTERN_DEF_SI still
			 points at a stmt.  */
		      stmt = pattern_def_stmt;
		      stmt_info = pattern_def_stmt_info;
		    }
		  else
		    {
		      /* No def stmts left: the pattern stmt itself is
			 analyzed in this iteration.  */
		      pattern_def_si = gsi_none ();
		      analyze_pattern_stmt = false;
		    }
		}
	      else
		analyze_pattern_stmt = false;
	    }

	  /* A stmt without an LHS cannot be handled here.  */
	  if (gimple_get_lhs (stmt) == NULL_TREE)
	    {
	      if (dump_enabled_p ())
		{
	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: irregular stmt.");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
                                    0);
		}
	      return false;
	    }

	  /* Give up if the type of the stmt's expression already has a
	     vector mode.  */
	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
	    {
	      if (dump_enabled_p ())
	        {
	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: vector stmt in loop:");
	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	        }
	      return false;
	    }

	  if (STMT_VINFO_VECTYPE (stmt_info))
	    {
	      /* The only case when a vectype had been already set is for stmts
	         that contain a dataref, or for "pattern-stmts" (stmts
		 generated by the vectorizer to represent/replace a certain
		 idiom).  */
	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
			  || is_pattern_stmt_p (stmt_info)
			  || !gsi_end_p (pattern_def_si));
	      vectype = STMT_VINFO_VECTYPE (stmt_info);
	    }
	  else
	    {
	      /* No vectype recorded yet: derive one from the type of the LHS
		 and record it.  */
	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
	      scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type:  ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		}
	      vectype = get_vectype_for_scalar_type (scalar_type);
	      if (!vectype)
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
		    }
		  return false;
		}

	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

	  /* The vectorization factor is according to the smallest
	     scalar type (or the largest vector size, but we only
	     support one vector size per loop).  */
	  scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
						       &dummy);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
                               "get vectype for scalar type:  ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
	    }
	  vf_vectype = get_vectype_for_scalar_type (scalar_type);
	  if (!vf_vectype)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: unsupported data-type ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     scalar_type);
		}
	      return false;
	    }

	  /* The stmt's own vector type and the vector type that determines
	     the VF must have the same mode size.  */
	  if ((GET_MODE_SIZE (TYPE_MODE (vectype))
	       != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: different sized vector "
                                   "types in statement, ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vectype);
		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vf_vectype);
		}
	      return false;
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
	    }

	  /* Keep the largest number of subparts seen so far.  */
	  nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
	  if (!vectorization_factor
	      || (nunits > vectorization_factor))
	    vectorization_factor = nunits;

	  /* Move on to the next stmt of the block only when there is neither
	     a pattern stmt nor a pattern def stmt left to analyze for the
	     current position.  */
	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
	    {
	      pattern_def_seq = NULL;
	      gsi_next (&si);
	    }
        }
    }

  /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
                     vectorization_factor);
  /* A factor of 0 (nothing analyzed) or 1 is rejected.  */
  if (vectorization_factor <= 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}
491 
492 
493 /* Function vect_is_simple_iv_evolution.
494 
495    FORNOW: A simple evolution of an induction variables in the loop is
496    considered a polynomial evolution with constant step.  */
497 
498 static bool
vect_is_simple_iv_evolution(unsigned loop_nb,tree access_fn,tree * init,tree * step)499 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
500                              tree * step)
501 {
502   tree init_expr;
503   tree step_expr;
504   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
505 
506   /* When there is no evolution in this loop, the evolution function
507      is not "simple".  */
508   if (evolution_part == NULL_TREE)
509     return false;
510 
511   /* When the evolution is a polynomial of degree >= 2
512      the evolution function is not "simple".  */
513   if (tree_is_chrec (evolution_part))
514     return false;
515 
516   step_expr = evolution_part;
517   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
518 
519   if (dump_enabled_p ())
520     {
521       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
522       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
523       dump_printf (MSG_NOTE, ",  init: ");
524       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
525     }
526 
527   *init = init_expr;
528   *step = step_expr;
529 
530   if (TREE_CODE (step_expr) != INTEGER_CST)
531     {
532       if (dump_enabled_p ())
533         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534                          "step unknown.");
535       return false;
536     }
537 
538   return true;
539 }
540 
541 /* Function vect_analyze_scalar_cycles_1.
542 
543    Examine the cross iteration def-use cycles of scalar variables
544    in LOOP.  LOOP_VINFO represents the loop that is now being
545    considered for vectorization (can be LOOP, or an outer-loop
546    enclosing LOOP).  */
547 
548 static void
vect_analyze_scalar_cycles_1(loop_vec_info loop_vinfo,struct loop * loop)549 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
550 {
551   basic_block bb = loop->header;
552   tree dumy;
553   vec<gimple> worklist;
554   worklist.create (64);
555   gimple_stmt_iterator gsi;
556   bool double_reduc;
557 
558   if (dump_enabled_p ())
559     dump_printf_loc (MSG_NOTE, vect_location,
560                      "=== vect_analyze_scalar_cycles ===");
561 
562   /* First - identify all inductions.  Reduction detection assumes that all the
563      inductions have been identified, therefore, this order must not be
564      changed.  */
565   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
566     {
567       gimple phi = gsi_stmt (gsi);
568       tree access_fn = NULL;
569       tree def = PHI_RESULT (phi);
570       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
571 
572       if (dump_enabled_p ())
573 	{
574 	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
575 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
576 	}
577 
578       /* Skip virtual phi's.  The data dependences that are associated with
579          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
580       if (virtual_operand_p (def))
581 	continue;
582 
583       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
584 
585       /* Analyze the evolution function.  */
586       access_fn = analyze_scalar_evolution (loop, def);
587       if (access_fn)
588 	{
589 	  STRIP_NOPS (access_fn);
590 	  if (dump_enabled_p ())
591 	    {
592 	      dump_printf_loc (MSG_NOTE, vect_location,
593                                "Access function of PHI: ");
594 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
595 	    }
596 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
597 	    = evolution_part_in_loop_num (access_fn, loop->num);
598 	}
599 
600       if (!access_fn
601 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
602 	{
603 	  worklist.safe_push (phi);
604 	  continue;
605 	}
606 
607       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
608 
609       if (dump_enabled_p ())
610 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
611       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
612     }
613 
614 
615   /* Second - identify all reductions and nested cycles.  */
616   while (worklist.length () > 0)
617     {
618       gimple phi = worklist.pop ();
619       tree def = PHI_RESULT (phi);
620       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
621       gimple reduc_stmt;
622       bool nested_cycle;
623 
624       if (dump_enabled_p ())
625         {
626           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
627           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
628         }
629 
630       gcc_assert (!virtual_operand_p (def)
631 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
632 
633       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
634       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
635 						&double_reduc);
636       if (reduc_stmt)
637         {
638           if (double_reduc)
639             {
640               if (dump_enabled_p ())
641                 dump_printf_loc (MSG_NOTE, vect_location,
642 				 "Detected double reduction.");
643 
644               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
645               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
646                                                     vect_double_reduction_def;
647             }
648           else
649             {
650               if (nested_cycle)
651                 {
652                   if (dump_enabled_p ())
653                     dump_printf_loc (MSG_NOTE, vect_location,
654 				     "Detected vectorizable nested cycle.");
655 
656                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
657                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
658                                                              vect_nested_cycle;
659                 }
660               else
661                 {
662                   if (dump_enabled_p ())
663                     dump_printf_loc (MSG_NOTE, vect_location,
664 				     "Detected reduction.");
665 
666                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
667                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
668                                                            vect_reduction_def;
669                   /* Store the reduction cycles for possible vectorization in
670                      loop-aware SLP.  */
671                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
672                 }
673             }
674         }
675       else
676         if (dump_enabled_p ())
677           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
678 			   "Unknown def-use cycle pattern.");
679     }
680 
681   worklist.release ();
682 }
683 
684 
685 /* Function vect_analyze_scalar_cycles.
686 
687    Examine the cross iteration def-use cycles of scalar variables, by
688    analyzing the loop-header PHIs of scalar variables.  Classify each
689    cycle as one of the following: invariant, induction, reduction, unknown.
690    We do that for the loop represented by LOOP_VINFO, and also to its
691    inner-loop, if exists.
692    Examples for scalar cycles:
693 
694    Example1: reduction:
695 
696               loop1:
697               for (i=0; i<N; i++)
698                  sum += a[i];
699 
700    Example2: induction:
701 
702               loop2:
703               for (i=0; i<N; i++)
704                  a[i] = i;  */
705 
706 static void
vect_analyze_scalar_cycles(loop_vec_info loop_vinfo)707 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
708 {
709   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
710 
711   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
712 
713   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
714      Reductions in such inner-loop therefore have different properties than
715      the reductions in the nest that gets vectorized:
716      1. When vectorized, they are executed in the same order as in the original
717         scalar loop, so we can't change the order of computation when
718         vectorizing them.
719      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
720         current checks are too strict.  */
721 
722   if (loop->inner)
723     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
724 }
725 
726 /* Function vect_get_loop_niters.
727 
728    Determine how many iterations the loop is executed.
729    If an expression that represents the number of iterations
730    can be constructed, place it in NUMBER_OF_ITERATIONS.
731    Return the loop exit condition.  */
732 
733 static gimple
vect_get_loop_niters(struct loop * loop,tree * number_of_iterations)734 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
735 {
736   tree niters;
737 
738   if (dump_enabled_p ())
739     dump_printf_loc (MSG_NOTE, vect_location,
740 		     "=== get_loop_niters ===");
741   niters = number_of_exit_cond_executions (loop);
742 
743   if (niters != NULL_TREE
744       && niters != chrec_dont_know)
745     {
746       *number_of_iterations = niters;
747 
748       if (dump_enabled_p ())
749         {
750           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
751           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
752         }
753     }
754 
755   return get_loop_exit_condition (loop);
756 }
757 
758 
759 /* Function bb_in_loop_p
760 
761    Used as predicate for dfs order traversal of the loop bbs.  */
762 
763 static bool
bb_in_loop_p(const_basic_block bb,const void * data)764 bb_in_loop_p (const_basic_block bb, const void *data)
765 {
766   const struct loop *const loop = (const struct loop *)data;
767   if (flow_bb_inside_loop_p (loop, bb))
768     return true;
769   return false;
770 }
771 
772 
773 /* Function new_loop_vec_info.
774 
775    Create and initialize a new loop_vec_info struct for LOOP, as well as
776    stmt_vec_info structs for all the stmts in LOOP.  */
777 
778 static loop_vec_info
new_loop_vec_info(struct loop * loop)779 new_loop_vec_info (struct loop *loop)
780 {
781   loop_vec_info res;
782   basic_block *bbs;
783   gimple_stmt_iterator si;
784   unsigned int i, nbbs;
785 
786   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
787   LOOP_VINFO_LOOP (res) = loop;
788 
789   bbs = get_loop_body (loop);
790 
791   /* Create/Update stmt_info for all stmts in the loop.  */
792   for (i = 0; i < loop->num_nodes; i++)
793     {
794       basic_block bb = bbs[i];
795 
796       /* BBs in a nested inner-loop will have been already processed (because
797          we will have called vect_analyze_loop_form for any nested inner-loop).
798          Therefore, for stmts in an inner-loop we just want to update the
799          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
800          loop_info of the outer-loop we are currently considering to vectorize
801          (instead of the loop_info of the inner-loop).
802          For stmts in other BBs we need to create a stmt_info from scratch.  */
803       if (bb->loop_father != loop)
804         {
805           /* Inner-loop bb.  */
806           gcc_assert (loop->inner && bb->loop_father == loop->inner);
807           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
808             {
809               gimple phi = gsi_stmt (si);
810               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
811               loop_vec_info inner_loop_vinfo =
812                 STMT_VINFO_LOOP_VINFO (stmt_info);
813               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
814               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
815             }
816           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
817            {
818               gimple stmt = gsi_stmt (si);
819               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
820               loop_vec_info inner_loop_vinfo =
821                  STMT_VINFO_LOOP_VINFO (stmt_info);
822               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
823               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
824            }
825         }
826       else
827         {
828           /* bb in current nest.  */
829           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
830             {
831               gimple phi = gsi_stmt (si);
832               gimple_set_uid (phi, 0);
833               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
834             }
835 
836           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
837             {
838               gimple stmt = gsi_stmt (si);
839               gimple_set_uid (stmt, 0);
840               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
841             }
842         }
843     }
844 
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as a reversed postorder traversal, so we are safe.  */
849 
850    free (bbs);
851    bbs = XCNEWVEC (basic_block, loop->num_nodes);
852    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
853                               bbs, loop->num_nodes, loop);
854    gcc_assert (nbbs == loop->num_nodes);
855 
856   LOOP_VINFO_BBS (res) = bbs;
857   LOOP_VINFO_NITERS (res) = NULL;
858   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
859   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
860   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
861   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
862   LOOP_VINFO_VECT_FACTOR (res) = 0;
863   LOOP_VINFO_LOOP_NEST (res).create (3);
864   LOOP_VINFO_DATAREFS (res).create (10);
865   LOOP_VINFO_DDRS (res).create (10 * 10);
866   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
867   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
868 	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
869   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
870 	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
871   LOOP_VINFO_GROUPED_STORES (res).create (10);
872   LOOP_VINFO_REDUCTIONS (res).create (10);
873   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
874   LOOP_VINFO_SLP_INSTANCES (res).create (10);
875   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
876   LOOP_VINFO_PEELING_HTAB (res) = NULL;
877   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
878   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
879   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
880 
881   return res;
882 }
883 
884 
885 /* Function destroy_loop_vec_info.
886 
887    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
888    stmts in the loop.  */
889 
890 void
destroy_loop_vec_info(loop_vec_info loop_vinfo,bool clean_stmts)891 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
892 {
893   struct loop *loop;
894   basic_block *bbs;
895   int nbbs;
896   gimple_stmt_iterator si;
897   int j;
898   vec<slp_instance> slp_instances;
899   slp_instance instance;
900   bool swapped;
901 
902   if (!loop_vinfo)
903     return;
904 
905   loop = LOOP_VINFO_LOOP (loop_vinfo);
906 
907   bbs = LOOP_VINFO_BBS (loop_vinfo);
908   nbbs = clean_stmts ? loop->num_nodes : 0;
909   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
910 
911   for (j = 0; j < nbbs; j++)
912     {
913       basic_block bb = bbs[j];
914       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
915         free_stmt_vec_info (gsi_stmt (si));
916 
917       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
918         {
919           gimple stmt = gsi_stmt (si);
920 
921 	  /* We may have broken canonical form by moving a constant
922 	     into RHS1 of a commutative op.  Fix such occurrences.  */
923 	  if (swapped && is_gimple_assign (stmt))
924 	    {
925 	      enum tree_code code = gimple_assign_rhs_code (stmt);
926 
927 	      if ((code == PLUS_EXPR
928 		   || code == POINTER_PLUS_EXPR
929 		   || code == MULT_EXPR)
930 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
931 		swap_tree_operands (stmt,
932 				    gimple_assign_rhs1_ptr (stmt),
933 				    gimple_assign_rhs2_ptr (stmt));
934 	    }
935 
936 	  /* Free stmt_vec_info.  */
937 	  free_stmt_vec_info (stmt);
938           gsi_next (&si);
939         }
940     }
941 
942   free (LOOP_VINFO_BBS (loop_vinfo));
943   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
944   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
945   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
946   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
947   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
948   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
949   FOR_EACH_VEC_ELT (slp_instances, j, instance)
950     vect_free_slp_instance (instance);
951 
952   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
953   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
954   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
955   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
956 
957   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
958     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
959 
960   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
961 
962   free (loop_vinfo);
963   loop->aux = NULL;
964 }
965 
966 
967 /* Function vect_analyze_loop_1.
968 
969    Apply a set of analyses on LOOP, and create a loop_vec_info struct
970    for it. The different analyses will record information in the
971    loop_vec_info struct.  This is a subset of the analyses applied in
972    vect_analyze_loop, to be applied on an inner-loop nested in the loop
973    that is now considered for (outer-loop) vectorization.  */
974 
975 static loop_vec_info
vect_analyze_loop_1(struct loop * loop)976 vect_analyze_loop_1 (struct loop *loop)
977 {
978   loop_vec_info loop_vinfo;
979 
980   if (dump_enabled_p ())
981     dump_printf_loc (MSG_NOTE, vect_location,
982 		     "===== analyze_loop_nest_1 =====");
983 
984   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
985 
986   loop_vinfo = vect_analyze_loop_form (loop);
987   if (!loop_vinfo)
988     {
989       if (dump_enabled_p ())
990         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
991 			 "bad inner-loop form.");
992       return NULL;
993     }
994 
995   return loop_vinfo;
996 }
997 
998 
999 /* Function vect_analyze_loop_form.
1000 
1001    Verify that certain CFG restrictions hold, including:
1002    - the loop has a pre-header
1003    - the loop has a single entry and exit
1004    - the loop exit condition is simple enough, and the number of iterations
1005      can be analyzed (a countable loop).  */
1006 
1007 loop_vec_info
vect_analyze_loop_form(struct loop * loop)1008 vect_analyze_loop_form (struct loop *loop)
1009 {
1010   loop_vec_info loop_vinfo;
1011   gimple loop_cond;
1012   tree number_of_iterations = NULL;
1013   loop_vec_info inner_loop_vinfo = NULL;
1014 
1015   if (dump_enabled_p ())
1016     dump_printf_loc (MSG_NOTE, vect_location,
1017 		     "=== vect_analyze_loop_form ===");
1018 
1019   /* Different restrictions apply when we are considering an inner-most loop,
1020      vs. an outer (nested) loop.
1021      (FORNOW. May want to relax some of these restrictions in the future).  */
1022 
1023   if (!loop->inner)
1024     {
1025       /* Inner-most loop.  We currently require that the number of BBs is
1026 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1027 	 look like this:
1028 
1029                         (pre-header)
1030                            |
1031                           header <--------+
1032                            | |            |
1033                            | +--> latch --+
1034                            |
1035                         (exit-bb)  */
1036 
1037       if (loop->num_nodes != 2)
1038         {
1039           if (dump_enabled_p ())
1040             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1041 			     "not vectorized: control flow in loop.");
1042           return NULL;
1043         }
1044 
1045       if (empty_block_p (loop->header))
1046     {
1047           if (dump_enabled_p ())
1048             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049 			     "not vectorized: empty loop.");
1050       return NULL;
1051     }
1052     }
1053   else
1054     {
1055       struct loop *innerloop = loop->inner;
1056       edge entryedge;
1057 
1058       /* Nested loop. We currently require that the loop is doubly-nested,
1059 	 contains a single inner loop, and the number of BBs is exactly 5.
1060 	 Vectorizable outer-loops look like this:
1061 
1062 			(pre-header)
1063 			   |
1064 			  header <---+
1065 			   |         |
1066 		          inner-loop |
1067 			   |         |
1068 			  tail ------+
1069 			   |
1070 		        (exit-bb)
1071 
1072 	 The inner-loop has the properties expected of inner-most loops
1073 	 as described above.  */
1074 
1075       if ((loop->inner)->inner || (loop->inner)->next)
1076 	{
1077 	  if (dump_enabled_p ())
1078 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 			     "not vectorized: multiple nested loops.");
1080 	  return NULL;
1081 	}
1082 
1083       /* Analyze the inner-loop.  */
1084       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1085       if (!inner_loop_vinfo)
1086 	{
1087 	  if (dump_enabled_p ())
1088             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089 			     "not vectorized: Bad inner loop.");
1090 	  return NULL;
1091 	}
1092 
1093       if (!expr_invariant_in_loop_p (loop,
1094 					LOOP_VINFO_NITERS (inner_loop_vinfo)))
1095 	{
1096 	  if (dump_enabled_p ())
1097 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098 			     "not vectorized: inner-loop count not invariant.");
1099 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1100 	  return NULL;
1101 	}
1102 
1103       if (loop->num_nodes != 5)
1104         {
1105 	  if (dump_enabled_p ())
1106 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1107 			     "not vectorized: control flow in loop.");
1108 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1109 	  return NULL;
1110         }
1111 
1112       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1113       entryedge = EDGE_PRED (innerloop->header, 0);
1114       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1115 	entryedge = EDGE_PRED (innerloop->header, 1);
1116 
1117       if (entryedge->src != loop->header
1118 	  || !single_exit (innerloop)
1119 	  || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1120 	{
1121 	  if (dump_enabled_p ())
1122 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 			     "not vectorized: unsupported outerloop form.");
1124 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1125 	  return NULL;
1126 	}
1127 
1128       if (dump_enabled_p ())
1129         dump_printf_loc (MSG_NOTE, vect_location,
1130 			 "Considering outer-loop vectorization.");
1131     }
1132 
1133   if (!single_exit (loop)
1134       || EDGE_COUNT (loop->header->preds) != 2)
1135     {
1136       if (dump_enabled_p ())
1137         {
1138           if (!single_exit (loop))
1139 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 			     "not vectorized: multiple exits.");
1141           else if (EDGE_COUNT (loop->header->preds) != 2)
1142 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143 			     "not vectorized: too many incoming edges.");
1144         }
1145       if (inner_loop_vinfo)
1146 	destroy_loop_vec_info (inner_loop_vinfo, true);
1147       return NULL;
1148     }
1149 
1150   /* We assume that the loop exit condition is at the end of the loop. i.e,
1151      that the loop is represented as a do-while (with a proper if-guard
1152      before the loop if needed), where the loop header contains all the
1153      executable statements, and the latch is empty.  */
1154   if (!empty_block_p (loop->latch)
1155       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1156     {
1157       if (dump_enabled_p ())
1158 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 			 "not vectorized: latch block not empty.");
1160       if (inner_loop_vinfo)
1161 	destroy_loop_vec_info (inner_loop_vinfo, true);
1162       return NULL;
1163     }
1164 
1165   /* Make sure there exists a single-predecessor exit bb:  */
1166   if (!single_pred_p (single_exit (loop)->dest))
1167     {
1168       edge e = single_exit (loop);
1169       if (!(e->flags & EDGE_ABNORMAL))
1170 	{
1171 	  split_loop_exit_edge (e);
1172 	  if (dump_enabled_p ())
1173 	    dump_printf (MSG_NOTE, "split exit edge.");
1174 	}
1175       else
1176 	{
1177 	  if (dump_enabled_p ())
1178 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 			     "not vectorized: abnormal loop exit edge.");
1180 	  if (inner_loop_vinfo)
1181 	    destroy_loop_vec_info (inner_loop_vinfo, true);
1182 	  return NULL;
1183 	}
1184     }
1185 
1186   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1187   if (!loop_cond)
1188     {
1189       if (dump_enabled_p ())
1190 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 			 "not vectorized: complicated exit condition.");
1192       if (inner_loop_vinfo)
1193 	destroy_loop_vec_info (inner_loop_vinfo, true);
1194       return NULL;
1195     }
1196 
1197   if (!number_of_iterations)
1198     {
1199       if (dump_enabled_p ())
1200 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1201 			 "not vectorized: number of iterations cannot be "
1202 			 "computed.");
1203       if (inner_loop_vinfo)
1204 	destroy_loop_vec_info (inner_loop_vinfo, true);
1205       return NULL;
1206     }
1207 
1208   if (chrec_contains_undetermined (number_of_iterations))
1209     {
1210       if (dump_enabled_p ())
1211 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1212 			     "Infinite number of iterations.");
1213       if (inner_loop_vinfo)
1214 	destroy_loop_vec_info (inner_loop_vinfo, true);
1215       return NULL;
1216     }
1217 
1218   if (!NITERS_KNOWN_P (number_of_iterations))
1219     {
1220       if (dump_enabled_p ())
1221         {
1222           dump_printf_loc (MSG_NOTE, vect_location,
1223 			   "Symbolic number of iterations is ");
1224 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1225         }
1226     }
1227   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1228     {
1229       if (dump_enabled_p ())
1230 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 			 "not vectorized: number of iterations = 0.");
1232       if (inner_loop_vinfo)
1233         destroy_loop_vec_info (inner_loop_vinfo, true);
1234       return NULL;
1235     }
1236 
1237   loop_vinfo = new_loop_vec_info (loop);
1238   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1239   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1240 
1241   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1242 
1243   /* CHECKME: May want to keep it around it in the future.  */
1244   if (inner_loop_vinfo)
1245     destroy_loop_vec_info (inner_loop_vinfo, false);
1246 
1247   gcc_assert (!loop->aux);
1248   loop->aux = loop_vinfo;
1249   return loop_vinfo;
1250 }
1251 
1252 
1253 /* Function vect_analyze_loop_operations.
1254 
1255    Scan the loop stmts and make sure they are all vectorizable.  */
1256 
1257 static bool
vect_analyze_loop_operations(loop_vec_info loop_vinfo,bool slp)1258 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1259 {
1260   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1261   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1262   int nbbs = loop->num_nodes;
1263   gimple_stmt_iterator si;
1264   unsigned int vectorization_factor = 0;
1265   int i;
1266   gimple phi;
1267   stmt_vec_info stmt_info;
1268   bool need_to_vectorize = false;
1269   int min_profitable_iters;
1270   int min_scalar_loop_bound;
1271   unsigned int th;
1272   bool only_slp_in_loop = true, ok;
1273   HOST_WIDE_INT max_niter;
1274   HOST_WIDE_INT estimated_niter;
1275   int min_profitable_estimate;
1276 
1277   if (dump_enabled_p ())
1278     dump_printf_loc (MSG_NOTE, vect_location,
1279 		     "=== vect_analyze_loop_operations ===");
1280 
1281   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1282   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1283   if (slp)
1284     {
1285       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1286 	 vectorization factor of the loop is the unrolling factor required by
1287 	 the SLP instances.  If that unrolling factor is 1, we say, that we
1288 	 perform pure SLP on loop - cross iteration parallelism is not
1289 	 exploited.  */
1290       for (i = 0; i < nbbs; i++)
1291 	{
1292 	  basic_block bb = bbs[i];
1293 	  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1294 	    {
1295 	      gimple stmt = gsi_stmt (si);
1296 	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1297 	      gcc_assert (stmt_info);
1298 	      if ((STMT_VINFO_RELEVANT_P (stmt_info)
1299 		   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1300 		  && !PURE_SLP_STMT (stmt_info))
1301 		/* STMT needs both SLP and loop-based vectorization.  */
1302 		only_slp_in_loop = false;
1303 	    }
1304 	}
1305 
1306       if (only_slp_in_loop)
1307 	vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1308       else
1309 	vectorization_factor = least_common_multiple (vectorization_factor,
1310 				LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1311 
1312       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1313       if (dump_enabled_p ())
1314 	dump_printf_loc (MSG_NOTE, vect_location,
1315 			 "Updating vectorization factor to %d ",
1316 			 vectorization_factor);
1317     }
1318 
1319   for (i = 0; i < nbbs; i++)
1320     {
1321       basic_block bb = bbs[i];
1322 
1323       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1324         {
1325           phi = gsi_stmt (si);
1326           ok = true;
1327 
1328           stmt_info = vinfo_for_stmt (phi);
1329           if (dump_enabled_p ())
1330             {
1331               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1332               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1333             }
1334 
1335           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1336              (i.e., a phi in the tail of the outer-loop).  */
1337           if (! is_loop_header_bb_p (bb))
1338             {
1339               /* FORNOW: we currently don't support the case that these phis
1340                  are not used in the outerloop (unless it is double reduction,
1341                  i.e., this phi is vect_reduction_def), cause this case
1342                  requires to actually do something here.  */
1343               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1344                    || STMT_VINFO_LIVE_P (stmt_info))
1345                   && STMT_VINFO_DEF_TYPE (stmt_info)
1346                      != vect_double_reduction_def)
1347                 {
1348                   if (dump_enabled_p ())
1349 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350 				     "Unsupported loop-closed phi in "
1351 				     "outer-loop.");
1352                   return false;
1353                 }
1354 
1355               /* If PHI is used in the outer loop, we check that its operand
1356                  is defined in the inner loop.  */
1357               if (STMT_VINFO_RELEVANT_P (stmt_info))
1358                 {
1359                   tree phi_op;
1360                   gimple op_def_stmt;
1361 
1362                   if (gimple_phi_num_args (phi) != 1)
1363                     return false;
1364 
1365                   phi_op = PHI_ARG_DEF (phi, 0);
1366                   if (TREE_CODE (phi_op) != SSA_NAME)
1367                     return false;
1368 
1369                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1370 		  if (!op_def_stmt
1371 		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1372 		      || !vinfo_for_stmt (op_def_stmt))
1373                     return false;
1374 
1375                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1376                         != vect_used_in_outer
1377                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1378                            != vect_used_in_outer_by_reduction)
1379                     return false;
1380                 }
1381 
1382               continue;
1383             }
1384 
1385           gcc_assert (stmt_info);
1386 
1387           if (STMT_VINFO_LIVE_P (stmt_info))
1388             {
1389               /* FORNOW: not yet supported.  */
1390               if (dump_enabled_p ())
1391 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392 				 "not vectorized: value used after loop.");
1393               return false;
1394             }
1395 
1396           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1397               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1398             {
1399               /* A scalar-dependence cycle that we don't support.  */
1400               if (dump_enabled_p ())
1401 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402 				 "not vectorized: scalar dependence cycle.");
1403               return false;
1404             }
1405 
1406           if (STMT_VINFO_RELEVANT_P (stmt_info))
1407             {
1408               need_to_vectorize = true;
1409               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1410                 ok = vectorizable_induction (phi, NULL, NULL);
1411             }
1412 
1413           if (!ok)
1414             {
1415               if (dump_enabled_p ())
1416                 {
1417 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1418 				   "not vectorized: relevant phi not "
1419 				   "supported: ");
1420                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1421                 }
1422 	      return false;
1423             }
1424         }
1425 
1426       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1427         {
1428           gimple stmt = gsi_stmt (si);
1429 	  if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1430 	    return false;
1431         }
1432     } /* bbs */
1433 
1434   /* All operations in the loop are either irrelevant (deal with loop
1435      control, or dead), or only used outside the loop and can be moved
1436      out of the loop (e.g. invariants, inductions).  The loop can be
1437      optimized away by scalar optimizations.  We're better off not
1438      touching this loop.  */
1439   if (!need_to_vectorize)
1440     {
1441       if (dump_enabled_p ())
1442         dump_printf_loc (MSG_NOTE, vect_location,
1443 			 "All the computation can be taken out of the loop.");
1444       if (dump_enabled_p ())
1445 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446 			 "not vectorized: redundant loop. no profit to "
1447 			 "vectorize.");
1448       return false;
1449     }
1450 
1451   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1452     dump_printf_loc (MSG_NOTE, vect_location,
1453 		     "vectorization_factor = %d, niters = "
1454 		     HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1455 		     LOOP_VINFO_INT_NITERS (loop_vinfo));
1456 
1457   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1458        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1459       || ((max_niter = max_stmt_executions_int (loop)) != -1
1460 	  && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1461     {
1462       if (dump_enabled_p ())
1463 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 			 "not vectorized: iteration count too small.");
1465       if (dump_enabled_p ())
1466 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467 			 "not vectorized: iteration count smaller than "
1468 			 "vectorization factor.");
1469       return false;
1470     }
1471 
1472   /* Analyze cost.  Decide if worth while to vectorize.  */
1473 
1474   /* Once VF is set, SLP costs should be updated since the number of created
1475      vector stmts depends on VF.  */
1476   vect_update_slp_costs_according_to_vf (loop_vinfo);
1477 
1478   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1479 				      &min_profitable_estimate);
1480   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1481 
1482   if (min_profitable_iters < 0)
1483     {
1484       if (dump_enabled_p ())
1485 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486 			 "not vectorized: vectorization not profitable.");
1487       if (dump_enabled_p ())
1488 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489 			 "not vectorized: vector version will never be "
1490 			 "profitable.");
1491       return false;
1492     }
1493 
1494   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1495                             * vectorization_factor) - 1);
1496 
1497 
1498   /* Use the cost model only if it is more conservative than user specified
1499      threshold.  */
1500 
1501   th = (unsigned) min_scalar_loop_bound;
1502   if (min_profitable_iters
1503       && (!min_scalar_loop_bound
1504           || min_profitable_iters > min_scalar_loop_bound))
1505     th = (unsigned) min_profitable_iters;
1506 
1507   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1508       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1509     {
1510       if (dump_enabled_p ())
1511 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1512 			 "not vectorized: vectorization not profitable.");
1513       if (dump_enabled_p ())
1514         dump_printf_loc (MSG_NOTE, vect_location,
1515 			 "not vectorized: iteration count smaller than user "
1516 			 "specified loop bound parameter or minimum profitable "
1517 			 "iterations (whichever is more conservative).");
1518       return false;
1519     }
1520 
1521   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1522       && ((unsigned HOST_WIDE_INT) estimated_niter
1523           <= MAX (th, (unsigned)min_profitable_estimate)))
1524     {
1525       if (dump_enabled_p ())
1526 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1527 			 "not vectorized: estimated iteration count too "
1528                          "small.");
1529       if (dump_enabled_p ())
1530         dump_printf_loc (MSG_NOTE, vect_location,
1531 			 "not vectorized: estimated iteration count smaller "
1532                          "than specified loop bound parameter or minimum "
1533                          "profitable iterations (whichever is more "
1534                          "conservative).");
1535       return false;
1536     }
1537 
1538   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1539       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1540       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1541     {
1542       if (dump_enabled_p ())
1543         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1544       if (!vect_can_advance_ivs_p (loop_vinfo))
1545         {
1546           if (dump_enabled_p ())
1547 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1548 			     "not vectorized: can't create epilog loop 1.");
1549           return false;
1550         }
1551       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1552         {
1553           if (dump_enabled_p ())
1554 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555 			     "not vectorized: can't create epilog loop 2.");
1556           return false;
1557         }
1558     }
1559 
1560   return true;
1561 }
1562 
1563 
1564 /* Function vect_analyze_loop_2.
1565 
1566    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1567    for it.  The different analyses will record information in the
1568    loop_vec_info struct.  */
1569 static bool
vect_analyze_loop_2(loop_vec_info loop_vinfo)1570 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1571 {
1572   bool ok, slp = false;
1573   int max_vf = MAX_VECTORIZATION_FACTOR;
1574   int min_vf = 2;
1575 
1576   /* Find all data references in the loop (which correspond to vdefs/vuses)
1577      and analyze their evolution in the loop.  Also adjust the minimal
1578      vectorization factor according to the loads and stores.
1579 
1580      FORNOW: Handle only simple, array references, which
1581      alignment can be forced, and aligned pointer-references.  */
1582 
1583   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1584   if (!ok)
1585     {
1586       if (dump_enabled_p ())
1587 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1588 			 "bad data references.");
1589       return false;
1590     }
1591 
1592   /* Classify all cross-iteration scalar data-flow cycles.
1593      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1594 
1595   vect_analyze_scalar_cycles (loop_vinfo);
1596 
1597   vect_pattern_recog (loop_vinfo, NULL);
1598 
1599   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1600 
1601   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1602   if (!ok)
1603     {
1604       if (dump_enabled_p ())
1605 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 			 "unexpected pattern.");
1607       return false;
1608     }
1609 
1610   /* Analyze data dependences between the data-refs in the loop
1611      and adjust the maximum vectorization factor according to
1612      the dependences.
1613      FORNOW: fail at the first data dependence that we encounter.  */
1614 
1615   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1616   if (!ok
1617       || max_vf < min_vf)
1618     {
1619       if (dump_enabled_p ())
1620 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621 			     "bad data dependence.");
1622       return false;
1623     }
1624 
1625   ok = vect_determine_vectorization_factor (loop_vinfo);
1626   if (!ok)
1627     {
1628       if (dump_enabled_p ())
1629 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 			 "can't determine vectorization factor.");
1631       return false;
1632     }
1633   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1634     {
1635       if (dump_enabled_p ())
1636 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637 			 "bad data dependence.");
1638       return false;
1639     }
1640 
1641   /* Analyze the alignment of the data-refs in the loop.
1642      Fail if a data reference is found that cannot be vectorized.  */
1643 
1644   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1645   if (!ok)
1646     {
1647       if (dump_enabled_p ())
1648 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 			 "bad data alignment.");
1650       return false;
1651     }
1652 
1653   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1654      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1655 
1656   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1657   if (!ok)
1658     {
1659       if (dump_enabled_p ())
1660 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661 			 "bad data access.");
1662       return false;
1663     }
1664 
1665   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1666      It is important to call pruning after vect_analyze_data_ref_accesses,
1667      since we use grouping information gathered by interleaving analysis.  */
1668   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1669   if (!ok)
1670     {
1671       if (dump_enabled_p ())
1672 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673 			 "too long list of versioning for alias "
1674 			 "run-time tests.");
1675       return false;
1676     }
1677 
1678   /* This pass will decide on using loop versioning and/or loop peeling in
1679      order to enhance the alignment of data references in the loop.  */
1680 
1681   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1682   if (!ok)
1683     {
1684       if (dump_enabled_p ())
1685 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 			 "bad data alignment.");
1687       return false;
1688     }
1689 
1690   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1691   ok = vect_analyze_slp (loop_vinfo, NULL);
1692   if (ok)
1693     {
1694       /* Decide which possible SLP instances to SLP.  */
1695       slp = vect_make_slp_decision (loop_vinfo);
1696 
1697       /* Find stmts that need to be both vectorized and SLPed.  */
1698       vect_detect_hybrid_slp (loop_vinfo);
1699     }
1700   else
1701     return false;
1702 
1703   /* Scan all the operations in the loop and make sure they are
1704      vectorizable.  */
1705 
1706   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1707   if (!ok)
1708     {
1709       if (dump_enabled_p ())
1710 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1711 			 "bad operation or unsupported loop bound.");
1712       return false;
1713     }
1714 
1715   return true;
1716 }
1717 
1718 /* Function vect_analyze_loop.
1719 
1720    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1721    for it.  The different analyses will record information in the
1722    loop_vec_info struct.  */
1723 loop_vec_info
vect_analyze_loop(struct loop * loop)1724 vect_analyze_loop (struct loop *loop)
1725 {
1726   loop_vec_info loop_vinfo;
1727   unsigned int vector_sizes;
1728 
1729   /* Autodetect first vector size we try.  */
1730   current_vector_size = 0;
1731   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1732 
1733   if (dump_enabled_p ())
1734     dump_printf_loc (MSG_NOTE, vect_location,
1735 		     "===== analyze_loop_nest =====");
1736 
1737   if (loop_outer (loop)
1738       && loop_vec_info_for_loop (loop_outer (loop))
1739       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1740     {
1741       if (dump_enabled_p ())
1742 	dump_printf_loc (MSG_NOTE, vect_location,
1743 			 "outer-loop already vectorized.");
1744       return NULL;
1745     }
1746 
1747   while (1)
1748     {
1749       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1750       loop_vinfo = vect_analyze_loop_form (loop);
1751       if (!loop_vinfo)
1752 	{
1753 	  if (dump_enabled_p ())
1754 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1755 			     "bad loop form.");
1756 	  return NULL;
1757 	}
1758 
1759       if (vect_analyze_loop_2 (loop_vinfo))
1760 	{
1761 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1762 
1763 	  return loop_vinfo;
1764 	}
1765 
1766       destroy_loop_vec_info (loop_vinfo, true);
1767 
1768       vector_sizes &= ~current_vector_size;
1769       if (vector_sizes == 0
1770 	  || current_vector_size == 0)
1771 	return NULL;
1772 
1773       /* Try the next biggest vector size.  */
1774       current_vector_size = 1 << floor_log2 (vector_sizes);
1775       if (dump_enabled_p ())
1776 	dump_printf_loc (MSG_NOTE, vect_location,
1777 			 "***** Re-trying analysis with "
1778 			 "vector size %d\n", current_vector_size);
1779     }
1780 }
1781 
1782 
1783 /* Function reduction_code_for_scalar_code
1784 
1785    Input:
1786    CODE - tree_code of a reduction operations.
1787 
1788    Output:
1789    REDUC_CODE - the corresponding tree-code to be used to reduce the
1790       vector of partial results into a single scalar result (which
1791       will also reside in a vector) or ERROR_MARK if the operation is
1792       a supported reduction operation, but does not have such tree-code.
1793 
1794    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1795 
1796 static bool
reduction_code_for_scalar_code(enum tree_code code,enum tree_code * reduc_code)1797 reduction_code_for_scalar_code (enum tree_code code,
1798                                 enum tree_code *reduc_code)
1799 {
1800   switch (code)
1801     {
1802       case MAX_EXPR:
1803         *reduc_code = REDUC_MAX_EXPR;
1804         return true;
1805 
1806       case MIN_EXPR:
1807         *reduc_code = REDUC_MIN_EXPR;
1808         return true;
1809 
1810       case PLUS_EXPR:
1811         *reduc_code = REDUC_PLUS_EXPR;
1812         return true;
1813 
1814       case MULT_EXPR:
1815       case MINUS_EXPR:
1816       case BIT_IOR_EXPR:
1817       case BIT_XOR_EXPR:
1818       case BIT_AND_EXPR:
1819         *reduc_code = ERROR_MARK;
1820         return true;
1821 
1822       default:
1823        return false;
1824     }
1825 }
1826 
1827 
1828 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1829    STMT is printed with a message MSG. */
1830 
1831 static void
report_vect_op(int msg_type,gimple stmt,const char * msg)1832 report_vect_op (int msg_type, gimple stmt, const char *msg)
1833 {
1834   dump_printf_loc (msg_type, vect_location, "%s", msg);
1835   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1836 }
1837 
1838 
1839 /* Detect SLP reduction of the form:
1840 
1841    #a1 = phi <a5, a0>
1842    a2 = operation (a1)
1843    a3 = operation (a2)
1844    a4 = operation (a3)
1845    a5 = operation (a4)
1846 
1847    #a = phi <a5>
1848 
1849    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1850    FIRST_STMT is the first reduction stmt in the chain
1851    (a2 = operation (a1)).
1852 
1853    Return TRUE if a reduction chain was detected.  */
1854 
1855 static bool
vect_is_slp_reduction(loop_vec_info loop_info,gimple phi,gimple first_stmt)1856 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1857 {
1858   struct loop *loop = (gimple_bb (phi))->loop_father;
1859   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1860   enum tree_code code;
1861   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1862   stmt_vec_info use_stmt_info, current_stmt_info;
1863   tree lhs;
1864   imm_use_iterator imm_iter;
1865   use_operand_p use_p;
1866   int nloop_uses, size = 0, n_out_of_loop_uses;
1867   bool found = false;
1868 
1869   if (loop != vect_loop)
1870     return false;
1871 
1872   lhs = PHI_RESULT (phi);
1873   code = gimple_assign_rhs_code (first_stmt);
1874   while (1)
1875     {
1876       nloop_uses = 0;
1877       n_out_of_loop_uses = 0;
1878       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1879         {
1880 	  gimple use_stmt = USE_STMT (use_p);
1881           if (is_gimple_debug (use_stmt))
1882             continue;
1883 
1884 	  use_stmt = USE_STMT (use_p);
1885 
1886           /* Check if we got back to the reduction phi.  */
1887 	  if (use_stmt == phi)
1888             {
1889 	      loop_use_stmt = use_stmt;
1890               found = true;
1891               break;
1892             }
1893 
1894           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1895             {
1896               if (vinfo_for_stmt (use_stmt)
1897                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1898                 {
1899                   loop_use_stmt = use_stmt;
1900                   nloop_uses++;
1901                 }
1902             }
1903            else
1904              n_out_of_loop_uses++;
1905 
1906            /* There are can be either a single use in the loop or two uses in
1907               phi nodes.  */
1908            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1909              return false;
1910         }
1911 
1912       if (found)
1913         break;
1914 
1915       /* We reached a statement with no loop uses.  */
1916       if (nloop_uses == 0)
1917 	return false;
1918 
1919       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1920       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1921         return false;
1922 
1923       if (!is_gimple_assign (loop_use_stmt)
1924 	  || code != gimple_assign_rhs_code (loop_use_stmt)
1925 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1926         return false;
1927 
1928       /* Insert USE_STMT into reduction chain.  */
1929       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1930       if (current_stmt)
1931         {
1932           current_stmt_info = vinfo_for_stmt (current_stmt);
1933 	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1934           GROUP_FIRST_ELEMENT (use_stmt_info)
1935             = GROUP_FIRST_ELEMENT (current_stmt_info);
1936         }
1937       else
1938 	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1939 
1940       lhs = gimple_assign_lhs (loop_use_stmt);
1941       current_stmt = loop_use_stmt;
1942       size++;
1943    }
1944 
1945   if (!found || loop_use_stmt != phi || size < 2)
1946     return false;
1947 
1948   /* Swap the operands, if needed, to make the reduction operand be the second
1949      operand.  */
1950   lhs = PHI_RESULT (phi);
1951   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1952   while (next_stmt)
1953     {
1954       if (gimple_assign_rhs2 (next_stmt) == lhs)
1955 	{
1956 	  tree op = gimple_assign_rhs1 (next_stmt);
1957           gimple def_stmt = NULL;
1958 
1959           if (TREE_CODE (op) == SSA_NAME)
1960             def_stmt = SSA_NAME_DEF_STMT (op);
1961 
1962 	  /* Check that the other def is either defined in the loop
1963 	     ("vect_internal_def"), or it's an induction (defined by a
1964 	     loop-header phi-node).  */
1965           if (def_stmt
1966               && gimple_bb (def_stmt)
1967 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1968               && (is_gimple_assign (def_stmt)
1969                   || is_gimple_call (def_stmt)
1970                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1971                            == vect_induction_def
1972                   || (gimple_code (def_stmt) == GIMPLE_PHI
1973                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1974                                   == vect_internal_def
1975                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1976 	    {
1977 	      lhs = gimple_assign_lhs (next_stmt);
1978 	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1979  	      continue;
1980 	    }
1981 
1982 	  return false;
1983 	}
1984       else
1985 	{
1986           tree op = gimple_assign_rhs2 (next_stmt);
1987           gimple def_stmt = NULL;
1988 
1989           if (TREE_CODE (op) == SSA_NAME)
1990             def_stmt = SSA_NAME_DEF_STMT (op);
1991 
1992           /* Check that the other def is either defined in the loop
1993             ("vect_internal_def"), or it's an induction (defined by a
1994             loop-header phi-node).  */
1995           if (def_stmt
1996               && gimple_bb (def_stmt)
1997 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1998               && (is_gimple_assign (def_stmt)
1999                   || is_gimple_call (def_stmt)
2000                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2001                               == vect_induction_def
2002                   || (gimple_code (def_stmt) == GIMPLE_PHI
2003                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2004                                   == vect_internal_def
2005                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2006   	    {
2007 	      if (dump_enabled_p ())
2008 		{
2009 		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2010 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2011 		}
2012 
2013 	      swap_tree_operands (next_stmt,
2014 	 		          gimple_assign_rhs1_ptr (next_stmt),
2015                                   gimple_assign_rhs2_ptr (next_stmt));
2016 	      update_stmt (next_stmt);
2017 
2018 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2019 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2020 	    }
2021 	  else
2022 	    return false;
2023         }
2024 
2025       lhs = gimple_assign_lhs (next_stmt);
2026       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2027     }
2028 
2029   /* Save the chain for further analysis in SLP detection.  */
2030   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2031   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2032   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2033 
2034   return true;
2035 }
2036 
2037 
2038 /* Function vect_is_simple_reduction_1
2039 
2040    (1) Detect a cross-iteration def-use cycle that represents a simple
2041    reduction computation.  We look for the following pattern:
2042 
2043    loop_header:
2044      a1 = phi < a0, a2 >
2045      a3 = ...
2046      a2 = operation (a3, a1)
2047 
2048    such that:
2049    1. operation is commutative and associative and it is safe to
2050       change the order of the computation (if CHECK_REDUCTION is true)
2051    2. no uses for a2 in the loop (a2 is used out of the loop)
2052    3. no uses of a1 in the loop besides the reduction operation
2053    4. no uses of a1 outside the loop.
2054 
2055    Conditions 1,4 are tested here.
2056    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2057 
2058    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2059    nested cycles, if CHECK_REDUCTION is false.
2060 
2061    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2062    reductions:
2063 
2064      a1 = phi < a0, a2 >
2065      inner loop (def of a3)
2066      a2 = phi < a3 >
2067 
2068    If MODIFY is true it tries also to rework the code in-place to enable
2069    detection of more reduction patterns.  For the time being we rewrite
2070    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2071 */
2072 
2073 static gimple
vect_is_simple_reduction_1(loop_vec_info loop_info,gimple phi,bool check_reduction,bool * double_reduc,bool modify)2074 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2075 			    bool check_reduction, bool *double_reduc,
2076 			    bool modify)
2077 {
2078   struct loop *loop = (gimple_bb (phi))->loop_father;
2079   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2080   edge latch_e = loop_latch_edge (loop);
2081   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2082   gimple def_stmt, def1 = NULL, def2 = NULL;
2083   enum tree_code orig_code, code;
2084   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2085   tree type;
2086   int nloop_uses;
2087   tree name;
2088   imm_use_iterator imm_iter;
2089   use_operand_p use_p;
2090   bool phi_def;
2091 
2092   *double_reduc = false;
2093 
2094   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2095      otherwise, we assume outer loop vectorization.  */
2096   gcc_assert ((check_reduction && loop == vect_loop)
2097               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2098 
2099   name = PHI_RESULT (phi);
2100   nloop_uses = 0;
2101   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2102     {
2103       gimple use_stmt = USE_STMT (use_p);
2104       if (is_gimple_debug (use_stmt))
2105 	continue;
2106 
2107       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2108         {
2109           if (dump_enabled_p ())
2110 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2111 			     "intermediate value used outside loop.");
2112 
2113           return NULL;
2114         }
2115 
2116       if (vinfo_for_stmt (use_stmt)
2117 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2118         nloop_uses++;
2119       if (nloop_uses > 1)
2120         {
2121           if (dump_enabled_p ())
2122 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123 			     "reduction used in loop.");
2124           return NULL;
2125         }
2126     }
2127 
2128   if (TREE_CODE (loop_arg) != SSA_NAME)
2129     {
2130       if (dump_enabled_p ())
2131 	{
2132 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133 			   "reduction: not ssa_name: ");
2134 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2135 	}
2136       return NULL;
2137     }
2138 
2139   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2140   if (!def_stmt)
2141     {
2142       if (dump_enabled_p ())
2143 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 			 "reduction: no def_stmt.");
2145       return NULL;
2146     }
2147 
2148   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2149     {
2150       if (dump_enabled_p ())
2151         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2152       return NULL;
2153     }
2154 
2155   if (is_gimple_assign (def_stmt))
2156     {
2157       name = gimple_assign_lhs (def_stmt);
2158       phi_def = false;
2159     }
2160   else
2161     {
2162       name = PHI_RESULT (def_stmt);
2163       phi_def = true;
2164     }
2165 
2166   nloop_uses = 0;
2167   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2168     {
2169       gimple use_stmt = USE_STMT (use_p);
2170       if (is_gimple_debug (use_stmt))
2171 	continue;
2172       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2173 	  && vinfo_for_stmt (use_stmt)
2174 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2175 	nloop_uses++;
2176       if (nloop_uses > 1)
2177 	{
2178 	  if (dump_enabled_p ())
2179 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 			     "reduction used in loop.");
2181 	  return NULL;
2182 	}
2183     }
2184 
2185   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2186      defined in the inner loop.  */
2187   if (phi_def)
2188     {
2189       op1 = PHI_ARG_DEF (def_stmt, 0);
2190 
2191       if (gimple_phi_num_args (def_stmt) != 1
2192           || TREE_CODE (op1) != SSA_NAME)
2193         {
2194           if (dump_enabled_p ())
2195 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2196 			     "unsupported phi node definition.");
2197 
2198           return NULL;
2199         }
2200 
2201       def1 = SSA_NAME_DEF_STMT (op1);
2202       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2203           && loop->inner
2204           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2205           && is_gimple_assign (def1))
2206         {
2207           if (dump_enabled_p ())
2208             report_vect_op (MSG_NOTE, def_stmt,
2209 			    "detected double reduction: ");
2210 
2211           *double_reduc = true;
2212           return def_stmt;
2213         }
2214 
2215       return NULL;
2216     }
2217 
2218   code = orig_code = gimple_assign_rhs_code (def_stmt);
2219 
2220   /* We can handle "res -= x[i]", which is non-associative by
2221      simply rewriting this into "res += -x[i]".  Avoid changing
2222      gimple instruction for the first simple tests and only do this
2223      if we're allowed to change code at all.  */
2224   if (code == MINUS_EXPR
2225       && modify
2226       && (op1 = gimple_assign_rhs1 (def_stmt))
2227       && TREE_CODE (op1) == SSA_NAME
2228       && SSA_NAME_DEF_STMT (op1) == phi)
2229     code = PLUS_EXPR;
2230 
2231   if (check_reduction
2232       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2233     {
2234       if (dump_enabled_p ())
2235         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2236 			"reduction: not commutative/associative: ");
2237       return NULL;
2238     }
2239 
2240   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2241     {
2242       if (code != COND_EXPR)
2243         {
2244 	  if (dump_enabled_p ())
2245 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2246 			    "reduction: not binary operation: ");
2247 
2248           return NULL;
2249         }
2250 
2251       op3 = gimple_assign_rhs1 (def_stmt);
2252       if (COMPARISON_CLASS_P (op3))
2253         {
2254           op4 = TREE_OPERAND (op3, 1);
2255           op3 = TREE_OPERAND (op3, 0);
2256         }
2257 
2258       op1 = gimple_assign_rhs2 (def_stmt);
2259       op2 = gimple_assign_rhs3 (def_stmt);
2260 
2261       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2262         {
2263           if (dump_enabled_p ())
2264             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2265 			    "reduction: uses not ssa_names: ");
2266 
2267           return NULL;
2268         }
2269     }
2270   else
2271     {
2272       op1 = gimple_assign_rhs1 (def_stmt);
2273       op2 = gimple_assign_rhs2 (def_stmt);
2274 
2275       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2276         {
2277           if (dump_enabled_p ())
2278 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2279 			    "reduction: uses not ssa_names: ");
2280 
2281           return NULL;
2282         }
2283    }
2284 
2285   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2286   if ((TREE_CODE (op1) == SSA_NAME
2287        && !types_compatible_p (type,TREE_TYPE (op1)))
2288       || (TREE_CODE (op2) == SSA_NAME
2289           && !types_compatible_p (type, TREE_TYPE (op2)))
2290       || (op3 && TREE_CODE (op3) == SSA_NAME
2291           && !types_compatible_p (type, TREE_TYPE (op3)))
2292       || (op4 && TREE_CODE (op4) == SSA_NAME
2293           && !types_compatible_p (type, TREE_TYPE (op4))))
2294     {
2295       if (dump_enabled_p ())
2296         {
2297           dump_printf_loc (MSG_NOTE, vect_location,
2298 			   "reduction: multiple types: operation type: ");
2299           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2300           dump_printf (MSG_NOTE, ", operands types: ");
2301           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2302 			     TREE_TYPE (op1));
2303           dump_printf (MSG_NOTE, ",");
2304           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2305 			     TREE_TYPE (op2));
2306           if (op3)
2307             {
2308               dump_printf (MSG_NOTE, ",");
2309               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2310 				 TREE_TYPE (op3));
2311             }
2312 
2313           if (op4)
2314             {
2315               dump_printf (MSG_NOTE, ",");
2316               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2317 				 TREE_TYPE (op4));
2318             }
2319         }
2320 
2321       return NULL;
2322     }
2323 
2324   /* Check that it's ok to change the order of the computation.
2325      Generally, when vectorizing a reduction we change the order of the
2326      computation.  This may change the behavior of the program in some
2327      cases, so we need to check that this is ok.  One exception is when
2328      vectorizing an outer-loop: the inner-loop is executed sequentially,
2329      and therefore vectorizing reductions in the inner-loop during
2330      outer-loop vectorization is safe.  */
2331 
2332   /* CHECKME: check for !flag_finite_math_only too?  */
2333   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2334       && check_reduction)
2335     {
2336       /* Changing the order of operations changes the semantics.  */
2337       if (dump_enabled_p ())
2338 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2339 			"reduction: unsafe fp math optimization: ");
2340       return NULL;
2341     }
2342   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2343 	   && check_reduction)
2344     {
2345       /* Changing the order of operations changes the semantics.  */
2346       if (dump_enabled_p ())
2347 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2348 			"reduction: unsafe int math optimization: ");
2349       return NULL;
2350     }
2351   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2352     {
2353       /* Changing the order of operations changes the semantics.  */
2354       if (dump_enabled_p ())
2355 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2356 			"reduction: unsafe fixed-point math optimization: ");
2357       return NULL;
2358     }
2359 
2360   /* If we detected "res -= x[i]" earlier, rewrite it into
2361      "res += -x[i]" now.  If this turns out to be useless reassoc
2362      will clean it up again.  */
2363   if (orig_code == MINUS_EXPR)
2364     {
2365       tree rhs = gimple_assign_rhs2 (def_stmt);
2366       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2367       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2368 							 rhs, NULL);
2369       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2370       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2371 							  loop_info, NULL));
2372       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2373       gimple_assign_set_rhs2 (def_stmt, negrhs);
2374       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2375       update_stmt (def_stmt);
2376     }
2377 
2378   /* Reduction is safe. We're dealing with one of the following:
2379      1) integer arithmetic and no trapv
2380      2) floating point arithmetic, and special flags permit this optimization
2381      3) nested cycle (i.e., outer loop vectorization).  */
2382   if (TREE_CODE (op1) == SSA_NAME)
2383     def1 = SSA_NAME_DEF_STMT (op1);
2384 
2385   if (TREE_CODE (op2) == SSA_NAME)
2386     def2 = SSA_NAME_DEF_STMT (op2);
2387 
2388   if (code != COND_EXPR
2389       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2390     {
2391       if (dump_enabled_p ())
2392 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2393       return NULL;
2394     }
2395 
2396   /* Check that one def is the reduction def, defined by PHI,
2397      the other def is either defined in the loop ("vect_internal_def"),
2398      or it's an induction (defined by a loop-header phi-node).  */
2399 
2400   if (def2 && def2 == phi
2401       && (code == COND_EXPR
2402 	  || !def1 || gimple_nop_p (def1)
2403           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2404               && (is_gimple_assign (def1)
2405 		  || is_gimple_call (def1)
2406   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2407                       == vect_induction_def
2408    	          || (gimple_code (def1) == GIMPLE_PHI
2409 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2410                           == vect_internal_def
2411  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
2412     {
2413       if (dump_enabled_p ())
2414 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2415       return def_stmt;
2416     }
2417 
2418   if (def1 && def1 == phi
2419       && (code == COND_EXPR
2420 	  || !def2 || gimple_nop_p (def2)
2421           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2422  	      && (is_gimple_assign (def2)
2423 		  || is_gimple_call (def2)
2424 	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2425                       == vect_induction_def
2426  	          || (gimple_code (def2) == GIMPLE_PHI
2427 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2428                           == vect_internal_def
2429 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
2430     {
2431       if (check_reduction)
2432         {
2433           /* Swap operands (just for simplicity - so that the rest of the code
2434 	     can assume that the reduction variable is always the last (second)
2435 	     argument).  */
2436           if (dump_enabled_p ())
2437 	    report_vect_op (MSG_NOTE, def_stmt,
2438 	  	            "detected reduction: need to swap operands: ");
2439 
2440           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2441  			      gimple_assign_rhs2_ptr (def_stmt));
2442 
2443 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2444 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2445         }
2446       else
2447         {
2448           if (dump_enabled_p ())
2449             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2450         }
2451 
2452       return def_stmt;
2453     }
2454 
2455   /* Try to find SLP reduction chain.  */
2456   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2457     {
2458       if (dump_enabled_p ())
2459         report_vect_op (MSG_NOTE, def_stmt,
2460 			"reduction: detected reduction chain: ");
2461 
2462       return def_stmt;
2463     }
2464 
2465   if (dump_enabled_p ())
2466     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2467 		    "reduction: unknown pattern: ");
2468 
2469   return NULL;
2470 }
2471 
2472 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2473    in-place.  Arguments as there.  */
2474 
2475 static gimple
vect_is_simple_reduction(loop_vec_info loop_info,gimple phi,bool check_reduction,bool * double_reduc)2476 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2477                           bool check_reduction, bool *double_reduc)
2478 {
2479   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2480 				     double_reduc, false);
2481 }
2482 
2483 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2484    in-place if it enables detection of more reductions.  Arguments
2485    as there.  */
2486 
2487 gimple
vect_force_simple_reduction(loop_vec_info loop_info,gimple phi,bool check_reduction,bool * double_reduc)2488 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2489                           bool check_reduction, bool *double_reduc)
2490 {
2491   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2492 				     double_reduc, true);
2493 }
2494 
2495 /* Calculate the cost of one scalar iteration of the loop.  */
2496 int
vect_get_single_scalar_iteration_cost(loop_vec_info loop_vinfo)2497 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2498 {
2499   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2500   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2501   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2502   int innerloop_iters, i, stmt_cost;
2503 
2504   /* Count statements in scalar loop.  Using this as scalar cost for a single
2505      iteration for now.
2506 
2507      TODO: Add outer loop support.
2508 
2509      TODO: Consider assigning different costs to different scalar
2510      statements.  */
2511 
2512   /* FORNOW.  */
2513   innerloop_iters = 1;
2514   if (loop->inner)
2515     innerloop_iters = 50; /* FIXME */
2516 
2517   for (i = 0; i < nbbs; i++)
2518     {
2519       gimple_stmt_iterator si;
2520       basic_block bb = bbs[i];
2521 
2522       if (bb->loop_father == loop->inner)
2523         factor = innerloop_iters;
2524       else
2525         factor = 1;
2526 
2527       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2528         {
2529           gimple stmt = gsi_stmt (si);
2530           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2531 
2532           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2533             continue;
2534 
2535           /* Skip stmts that are not vectorized inside the loop.  */
2536           if (stmt_info
2537               && !STMT_VINFO_RELEVANT_P (stmt_info)
2538               && (!STMT_VINFO_LIVE_P (stmt_info)
2539                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2540 	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2541             continue;
2542 
2543           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2544             {
2545               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2546                stmt_cost = vect_get_stmt_cost (scalar_load);
2547              else
2548                stmt_cost = vect_get_stmt_cost (scalar_store);
2549             }
2550           else
2551             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2552 
2553           scalar_single_iter_cost += stmt_cost * factor;
2554         }
2555     }
2556   return scalar_single_iter_cost;
2557 }
2558 
2559 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2560 int
vect_get_known_peeling_cost(loop_vec_info loop_vinfo,int peel_iters_prologue,int * peel_iters_epilogue,int scalar_single_iter_cost,stmt_vector_for_cost * prologue_cost_vec,stmt_vector_for_cost * epilogue_cost_vec)2561 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2562                              int *peel_iters_epilogue,
2563                              int scalar_single_iter_cost,
2564 			     stmt_vector_for_cost *prologue_cost_vec,
2565 			     stmt_vector_for_cost *epilogue_cost_vec)
2566 {
2567   int retval = 0;
2568   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2569 
2570   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2571     {
2572       *peel_iters_epilogue = vf/2;
2573       if (dump_enabled_p ())
2574         dump_printf_loc (MSG_NOTE, vect_location,
2575 			 "cost model: epilogue peel iters set to vf/2 "
2576 			 "because loop iterations are unknown .");
2577 
2578       /* If peeled iterations are known but number of scalar loop
2579          iterations are unknown, count a taken branch per peeled loop.  */
2580       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2581 				 NULL, 0, vect_prologue);
2582     }
2583   else
2584     {
2585       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2586       peel_iters_prologue = niters < peel_iters_prologue ?
2587                             niters : peel_iters_prologue;
2588       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2589       /* If we need to peel for gaps, but no peeling is required, we have to
2590 	 peel VF iterations.  */
2591       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2592         *peel_iters_epilogue = vf;
2593     }
2594 
2595   if (peel_iters_prologue)
2596     retval += record_stmt_cost (prologue_cost_vec,
2597 				peel_iters_prologue * scalar_single_iter_cost,
2598 				scalar_stmt, NULL, 0, vect_prologue);
2599   if (*peel_iters_epilogue)
2600     retval += record_stmt_cost (epilogue_cost_vec,
2601 				*peel_iters_epilogue * scalar_single_iter_cost,
2602 				scalar_stmt, NULL, 0, vect_epilogue);
2603   return retval;
2604 }
2605 
2606 /* Function vect_estimate_min_profitable_iters
2607 
2608    Return the number of iterations required for the vector version of the
2609    loop to be profitable relative to the cost of the scalar version of the
2610    loop.  */
2611 
2612 static void
vect_estimate_min_profitable_iters(loop_vec_info loop_vinfo,int * ret_min_profitable_niters,int * ret_min_profitable_estimate)2613 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2614 				    int *ret_min_profitable_niters,
2615 				    int *ret_min_profitable_estimate)
2616 {
2617   int min_profitable_iters;
2618   int min_profitable_estimate;
2619   int peel_iters_prologue;
2620   int peel_iters_epilogue;
2621   unsigned vec_inside_cost = 0;
2622   int vec_outside_cost = 0;
2623   unsigned vec_prologue_cost = 0;
2624   unsigned vec_epilogue_cost = 0;
2625   int scalar_single_iter_cost = 0;
2626   int scalar_outside_cost = 0;
2627   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2628   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2629   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2630 
2631   /* Cost model disabled.  */
2632   if (!flag_vect_cost_model)
2633     {
2634       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2635       *ret_min_profitable_niters = 0;
2636       *ret_min_profitable_estimate = 0;
2637       return;
2638     }
2639 
2640   /* Requires loop versioning tests to handle misalignment.  */
2641   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2642     {
2643       /*  FIXME: Make cost depend on complexity of individual check.  */
2644       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2645       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2646 			    vect_prologue);
2647       dump_printf (MSG_NOTE,
2648                    "cost model: Adding cost of checks for loop "
2649                    "versioning to treat misalignment.\n");
2650     }
2651 
2652   /* Requires loop versioning with alias checks.  */
2653   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2654     {
2655       /*  FIXME: Make cost depend on complexity of individual check.  */
2656       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2657       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2658 			    vect_prologue);
2659       dump_printf (MSG_NOTE,
2660                    "cost model: Adding cost of checks for loop "
2661                    "versioning aliasing.\n");
2662     }
2663 
2664   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2665       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2666     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2667 			  vect_prologue);
2668 
2669   /* Count statements in scalar loop.  Using this as scalar cost for a single
2670      iteration for now.
2671 
2672      TODO: Add outer loop support.
2673 
2674      TODO: Consider assigning different costs to different scalar
2675      statements.  */
2676 
2677   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2678 
2679   /* Add additional cost for the peeled instructions in prologue and epilogue
2680      loop.
2681 
2682      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2683      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2684 
2685      TODO: Build an expression that represents peel_iters for prologue and
2686      epilogue to be used in a run-time test.  */
2687 
2688   if (npeel  < 0)
2689     {
2690       peel_iters_prologue = vf/2;
2691       dump_printf (MSG_NOTE, "cost model: "
2692                    "prologue peel iters set to vf/2.");
2693 
2694       /* If peeling for alignment is unknown, loop bound of main loop becomes
2695          unknown.  */
2696       peel_iters_epilogue = vf/2;
2697       dump_printf (MSG_NOTE, "cost model: "
2698                    "epilogue peel iters set to vf/2 because "
2699                    "peeling for alignment is unknown.");
2700 
2701       /* If peeled iterations are unknown, count a taken branch and a not taken
2702          branch per peeled loop. Even if scalar loop iterations are known,
2703          vector iterations are not known since peeled prologue iterations are
2704          not known. Hence guards remain the same.  */
2705       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2706 			    NULL, 0, vect_prologue);
2707       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2708 			    NULL, 0, vect_prologue);
2709       /* FORNOW: Don't attempt to pass individual scalar instructions to
2710 	 the model; just assume linear cost for scalar iterations.  */
2711       (void) add_stmt_cost (target_cost_data,
2712 			    peel_iters_prologue * scalar_single_iter_cost,
2713 			    scalar_stmt, NULL, 0, vect_prologue);
2714       (void) add_stmt_cost (target_cost_data,
2715 			    peel_iters_epilogue * scalar_single_iter_cost,
2716 			    scalar_stmt, NULL, 0, vect_epilogue);
2717     }
2718   else
2719     {
2720       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2721       stmt_info_for_cost *si;
2722       int j;
2723       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2724 
2725       prologue_cost_vec.create (2);
2726       epilogue_cost_vec.create (2);
2727       peel_iters_prologue = npeel;
2728 
2729       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2730 					  &peel_iters_epilogue,
2731 					  scalar_single_iter_cost,
2732 					  &prologue_cost_vec,
2733 					  &epilogue_cost_vec);
2734 
2735       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2736 	{
2737 	  struct _stmt_vec_info *stmt_info
2738 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2739 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2740 				si->misalign, vect_prologue);
2741 	}
2742 
2743       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2744 	{
2745 	  struct _stmt_vec_info *stmt_info
2746 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2747 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2748 				si->misalign, vect_epilogue);
2749 	}
2750 
2751       prologue_cost_vec.release ();
2752       epilogue_cost_vec.release ();
2753     }
2754 
2755   /* FORNOW: The scalar outside cost is incremented in one of the
2756      following ways:
2757 
2758      1. The vectorizer checks for alignment and aliasing and generates
2759      a condition that allows dynamic vectorization.  A cost model
2760      check is ANDED with the versioning condition.  Hence scalar code
2761      path now has the added cost of the versioning check.
2762 
2763        if (cost > th & versioning_check)
2764          jmp to vector code
2765 
2766      Hence run-time scalar is incremented by not-taken branch cost.
2767 
2768      2. The vectorizer then checks if a prologue is required.  If the
2769      cost model check was not done before during versioning, it has to
2770      be done before the prologue check.
2771 
2772        if (cost <= th)
2773          prologue = scalar_iters
2774        if (prologue == 0)
2775          jmp to vector code
2776        else
2777          execute prologue
2778        if (prologue == num_iters)
2779 	 go to exit
2780 
2781      Hence the run-time scalar cost is incremented by a taken branch,
2782      plus a not-taken branch, plus a taken branch cost.
2783 
2784      3. The vectorizer then checks if an epilogue is required.  If the
2785      cost model check was not done before during prologue check, it
2786      has to be done with the epilogue check.
2787 
2788        if (prologue == 0)
2789          jmp to vector code
2790        else
2791          execute prologue
2792        if (prologue == num_iters)
2793 	 go to exit
2794        vector code:
2795          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2796            jmp to epilogue
2797 
2798      Hence the run-time scalar cost should be incremented by 2 taken
2799      branches.
2800 
2801      TODO: The back end may reorder the BBS's differently and reverse
2802      conditions/branch directions.  Change the estimates below to
2803      something more reasonable.  */
2804 
2805   /* If the number of iterations is known and we do not do versioning, we can
2806      decide whether to vectorize at compile time.  Hence the scalar version
2807      do not carry cost model guard costs.  */
2808   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2809       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2810       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2811     {
2812       /* Cost model check occurs at versioning.  */
2813       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2814           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2815 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2816       else
2817 	{
2818 	  /* Cost model check occurs at prologue generation.  */
2819 	  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2820 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2821 	      + vect_get_stmt_cost (cond_branch_not_taken);
2822 	  /* Cost model check occurs at epilogue generation.  */
2823 	  else
2824 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2825 	}
2826     }
2827 
2828   /* Complete the target-specific cost calculations.  */
2829   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2830 	       &vec_inside_cost, &vec_epilogue_cost);
2831 
2832   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2833 
2834   /* Calculate number of iterations required to make the vector version
2835      profitable, relative to the loop bodies only.  The following condition
2836      must hold true:
2837      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2838      where
2839      SIC = scalar iteration cost, VIC = vector iteration cost,
2840      VOC = vector outside cost, VF = vectorization factor,
2841      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2842      SOC = scalar outside cost for run time cost model check.  */
2843 
2844   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2845     {
2846       if (vec_outside_cost <= 0)
2847         min_profitable_iters = 1;
2848       else
2849         {
2850           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2851 				  - vec_inside_cost * peel_iters_prologue
2852                                   - vec_inside_cost * peel_iters_epilogue)
2853                                  / ((scalar_single_iter_cost * vf)
2854                                     - vec_inside_cost);
2855 
2856           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2857               <= (((int) vec_inside_cost * min_profitable_iters)
2858                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2859             min_profitable_iters++;
2860         }
2861     }
2862   /* vector version will never be profitable.  */
2863   else
2864     {
2865       if (dump_enabled_p ())
2866         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2867 			 "cost model: the vector iteration cost = %d "
2868 			 "divided by the scalar iteration cost = %d "
2869 			 "is greater or equal to the vectorization factor = %d.",
2870 			 vec_inside_cost, scalar_single_iter_cost, vf);
2871       *ret_min_profitable_niters = -1;
2872       *ret_min_profitable_estimate = -1;
2873       return;
2874     }
2875 
2876   if (dump_enabled_p ())
2877     {
2878       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2879       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2880                    vec_inside_cost);
2881       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2882                    vec_prologue_cost);
2883       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2884                    vec_epilogue_cost);
2885       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2886                    scalar_single_iter_cost);
2887       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2888                    scalar_outside_cost);
2889       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2890                    vec_outside_cost);
2891       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2892                    peel_iters_prologue);
2893       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2894                    peel_iters_epilogue);
2895       dump_printf (MSG_NOTE,
2896                    "  Calculated minimum iters for profitability: %d\n",
2897                    min_profitable_iters);
2898     }
2899 
2900   min_profitable_iters =
2901 	min_profitable_iters < vf ? vf : min_profitable_iters;
2902 
2903   /* Because the condition we create is:
2904      if (niters <= min_profitable_iters)
2905        then skip the vectorized loop.  */
2906   min_profitable_iters--;
2907 
2908   if (dump_enabled_p ())
2909     dump_printf_loc (MSG_NOTE, vect_location,
2910                      "  Runtime profitability threshold = %d\n", min_profitable_iters);
2911 
2912   *ret_min_profitable_niters = min_profitable_iters;
2913 
2914   /* Calculate number of iterations required to make the vector version
2915      profitable, relative to the loop bodies only.
2916 
2917      Non-vectorized variant is SIC * niters and it must win over vector
2918      variant on the expected loop trip count.  The following condition must hold true:
2919      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2920 
2921   if (vec_outside_cost <= 0)
2922     min_profitable_estimate = 1;
2923   else
2924     {
2925       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2926 				 - vec_inside_cost * peel_iters_prologue
2927 				 - vec_inside_cost * peel_iters_epilogue)
2928 				 / ((scalar_single_iter_cost * vf)
2929 				   - vec_inside_cost);
2930     }
2931   min_profitable_estimate --;
2932   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2933   if (dump_enabled_p ())
2934     dump_printf_loc (MSG_NOTE, vect_location,
2935                      "  Static estimate profitability threshold = %d\n",
2936                       min_profitable_iters);
2937 
2938   *ret_min_profitable_estimate = min_profitable_estimate;
2939 }
2940 
2941 
2942 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2943    functions. Design better to avoid maintenance issues.  */
2944 
2945 /* Function vect_model_reduction_cost.
2946 
2947    Models cost for a reduction operation, including the vector ops
2948    generated within the strip-mine loop, the initial definition before
2949    the loop, and the epilogue code that must be generated.  */
2950 
2951 static bool
vect_model_reduction_cost(stmt_vec_info stmt_info,enum tree_code reduc_code,int ncopies)2952 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2953 			   int ncopies)
2954 {
2955   int prologue_cost = 0, epilogue_cost = 0;
2956   enum tree_code code;
2957   optab optab;
2958   tree vectype;
2959   gimple stmt, orig_stmt;
2960   tree reduction_op;
2961   enum machine_mode mode;
2962   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2963   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2964   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2965 
2966   /* Cost of reduction op inside loop.  */
2967   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2968 					stmt_info, 0, vect_body);
2969   stmt = STMT_VINFO_STMT (stmt_info);
2970 
2971   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2972     {
2973     case GIMPLE_SINGLE_RHS:
2974       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2975       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2976       break;
2977     case GIMPLE_UNARY_RHS:
2978       reduction_op = gimple_assign_rhs1 (stmt);
2979       break;
2980     case GIMPLE_BINARY_RHS:
2981       reduction_op = gimple_assign_rhs2 (stmt);
2982       break;
2983     case GIMPLE_TERNARY_RHS:
2984       reduction_op = gimple_assign_rhs3 (stmt);
2985       break;
2986     default:
2987       gcc_unreachable ();
2988     }
2989 
2990   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2991   if (!vectype)
2992     {
2993       if (dump_enabled_p ())
2994         {
2995 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2996 			   "unsupported data-type ");
2997           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2998 			     TREE_TYPE (reduction_op));
2999         }
3000       return false;
3001    }
3002 
3003   mode = TYPE_MODE (vectype);
3004   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3005 
3006   if (!orig_stmt)
3007     orig_stmt = STMT_VINFO_STMT (stmt_info);
3008 
3009   code = gimple_assign_rhs_code (orig_stmt);
3010 
3011   /* Add in cost for initial definition.  */
3012   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3013 				  stmt_info, 0, vect_prologue);
3014 
3015   /* Determine cost of epilogue code.
3016 
3017      We have a reduction operator that will reduce the vector in one statement.
3018      Also requires scalar extract.  */
3019 
3020   if (!nested_in_vect_loop_p (loop, orig_stmt))
3021     {
3022       if (reduc_code != ERROR_MARK)
3023 	{
3024 	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3025 					  stmt_info, 0, vect_epilogue);
3026 	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3027 					  stmt_info, 0, vect_epilogue);
3028 	}
3029       else
3030 	{
3031 	  int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3032 	  tree bitsize =
3033 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3034 	  int element_bitsize = tree_low_cst (bitsize, 1);
3035 	  int nelements = vec_size_in_bits / element_bitsize;
3036 
3037 	  optab = optab_for_tree_code (code, vectype, optab_default);
3038 
3039 	  /* We have a whole vector shift available.  */
3040 	  if (VECTOR_MODE_P (mode)
3041 	      && optab_handler (optab, mode) != CODE_FOR_nothing
3042 	      && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3043 	    {
3044 	      /* Final reduction via vector shifts and the reduction operator.
3045 		 Also requires scalar extract.  */
3046 	      epilogue_cost += add_stmt_cost (target_cost_data,
3047 					      exact_log2 (nelements) * 2,
3048 					      vector_stmt, stmt_info, 0,
3049 					      vect_epilogue);
3050 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
3051 					      vec_to_scalar, stmt_info, 0,
3052 					      vect_epilogue);
3053 	    }
3054 	  else
3055 	    /* Use extracts and reduction op for final reduction.  For N
3056 	       elements, we have N extracts and N-1 reduction ops.  */
3057 	    epilogue_cost += add_stmt_cost (target_cost_data,
3058 					    nelements + nelements - 1,
3059 					    vector_stmt, stmt_info, 0,
3060 					    vect_epilogue);
3061 	}
3062     }
3063 
3064   if (dump_enabled_p ())
3065     dump_printf (MSG_NOTE,
3066                  "vect_model_reduction_cost: inside_cost = %d, "
3067                  "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3068                  prologue_cost, epilogue_cost);
3069 
3070   return true;
3071 }
3072 
3073 
3074 /* Function vect_model_induction_cost.
3075 
3076    Models cost for induction operations.  */
3077 
3078 static void
vect_model_induction_cost(stmt_vec_info stmt_info,int ncopies)3079 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3080 {
3081   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3082   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3083   unsigned inside_cost, prologue_cost;
3084 
3085   /* loop cost for vec_loop.  */
3086   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3087 			       stmt_info, 0, vect_body);
3088 
3089   /* prologue cost for vec_init and vec_step.  */
3090   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3091 				 stmt_info, 0, vect_prologue);
3092 
3093   if (dump_enabled_p ())
3094     dump_printf_loc (MSG_NOTE, vect_location,
3095                      "vect_model_induction_cost: inside_cost = %d, "
3096                      "prologue_cost = %d .", inside_cost, prologue_cost);
3097 }
3098 
3099 
3100 /* Function get_initial_def_for_induction
3101 
3102    Input:
3103    STMT - a stmt that performs an induction operation in the loop.
3104    IV_PHI - the initial value of the induction variable
3105 
3106    Output:
3107    Return a vector variable, initialized with the first VF values of
3108    the induction variable.  E.g., for an iv with IV_PHI='X' and
3109    evolution S, for a vector of 4 units, we want to return:
3110    [X, X + S, X + 2*S, X + 3*S].  */
3111 
3112 static tree
get_initial_def_for_induction(gimple iv_phi)3113 get_initial_def_for_induction (gimple iv_phi)
3114 {
3115   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3116   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3117   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3118   tree scalar_type;
3119   tree vectype;
3120   int nunits;
3121   edge pe = loop_preheader_edge (loop);
3122   struct loop *iv_loop;
3123   basic_block new_bb;
3124   tree new_vec, vec_init, vec_step, t;
3125   tree access_fn;
3126   tree new_var;
3127   tree new_name;
3128   gimple init_stmt, induction_phi, new_stmt;
3129   tree induc_def, vec_def, vec_dest;
3130   tree init_expr, step_expr;
3131   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3132   int i;
3133   bool ok;
3134   int ncopies;
3135   tree expr;
3136   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3137   bool nested_in_vect_loop = false;
3138   gimple_seq stmts = NULL;
3139   imm_use_iterator imm_iter;
3140   use_operand_p use_p;
3141   gimple exit_phi;
3142   edge latch_e;
3143   tree loop_arg;
3144   gimple_stmt_iterator si;
3145   basic_block bb = gimple_bb (iv_phi);
3146   tree stepvectype;
3147   tree resvectype;
3148 
3149   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3150   if (nested_in_vect_loop_p (loop, iv_phi))
3151     {
3152       nested_in_vect_loop = true;
3153       iv_loop = loop->inner;
3154     }
3155   else
3156     iv_loop = loop;
3157   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3158 
3159   latch_e = loop_latch_edge (iv_loop);
3160   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3161 
3162   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3163   gcc_assert (access_fn);
3164   STRIP_NOPS (access_fn);
3165   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3166                                     &init_expr, &step_expr);
3167   gcc_assert (ok);
3168   pe = loop_preheader_edge (iv_loop);
3169 
3170   scalar_type = TREE_TYPE (init_expr);
3171   vectype = get_vectype_for_scalar_type (scalar_type);
3172   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3173   gcc_assert (vectype);
3174   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3175   ncopies = vf / nunits;
3176 
3177   gcc_assert (phi_info);
3178   gcc_assert (ncopies >= 1);
3179 
3180   /* Find the first insertion point in the BB.  */
3181   si = gsi_after_labels (bb);
3182 
3183   /* Create the vector that holds the initial_value of the induction.  */
3184   if (nested_in_vect_loop)
3185     {
3186       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3187 	 been created during vectorization of previous stmts.  We obtain it
3188 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3189       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3190                                            loop_preheader_edge (iv_loop));
3191       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3192       /* If the initial value is not of proper type, convert it.  */
3193       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3194 	{
3195 	  new_stmt = gimple_build_assign_with_ops
3196 	      (VIEW_CONVERT_EXPR,
3197 	       vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3198 	       build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3199 	  vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3200 	  gimple_assign_set_lhs (new_stmt, vec_init);
3201 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3202 						 new_stmt);
3203 	  gcc_assert (!new_bb);
3204 	  set_vinfo_for_stmt (new_stmt,
3205 			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3206 	}
3207     }
3208   else
3209     {
3210       vec<constructor_elt, va_gc> *v;
3211 
3212       /* iv_loop is the loop to be vectorized. Create:
3213 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3214       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3215       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3216       if (stmts)
3217 	{
3218 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3219 	  gcc_assert (!new_bb);
3220 	}
3221 
3222       vec_alloc (v, nunits);
3223       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3224       for (i = 1; i < nunits; i++)
3225 	{
3226 	  /* Create: new_name_i = new_name + step_expr  */
3227 	  enum tree_code code = POINTER_TYPE_P (scalar_type)
3228 				? POINTER_PLUS_EXPR : PLUS_EXPR;
3229 	  init_stmt = gimple_build_assign_with_ops (code, new_var,
3230 						    new_name, step_expr);
3231 	  new_name = make_ssa_name (new_var, init_stmt);
3232 	  gimple_assign_set_lhs (init_stmt, new_name);
3233 
3234 	  new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3235 	  gcc_assert (!new_bb);
3236 
3237 	  if (dump_enabled_p ())
3238 	    {
3239 	      dump_printf_loc (MSG_NOTE, vect_location,
3240 			       "created new init_stmt: ");
3241 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3242 	    }
3243 	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3244 	}
3245       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3246       new_vec = build_constructor (vectype, v);
3247       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3248     }
3249 
3250 
3251   /* Create the vector that holds the step of the induction.  */
3252   if (nested_in_vect_loop)
3253     /* iv_loop is nested in the loop to be vectorized. Generate:
3254        vec_step = [S, S, S, S]  */
3255     new_name = step_expr;
3256   else
3257     {
3258       /* iv_loop is the loop to be vectorized. Generate:
3259 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3260       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3261       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3262 			      expr, step_expr);
3263     }
3264 
3265   t = unshare_expr (new_name);
3266   gcc_assert (CONSTANT_CLASS_P (new_name));
3267   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3268   gcc_assert (stepvectype);
3269   new_vec = build_vector_from_val (stepvectype, t);
3270   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3271 
3272 
3273   /* Create the following def-use cycle:
3274      loop prolog:
3275          vec_init = ...
3276 	 vec_step = ...
3277      loop:
3278          vec_iv = PHI <vec_init, vec_loop>
3279          ...
3280          STMT
3281          ...
3282          vec_loop = vec_iv + vec_step;  */
3283 
3284   /* Create the induction-phi that defines the induction-operand.  */
3285   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3286   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3287   set_vinfo_for_stmt (induction_phi,
3288 		      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3289   induc_def = PHI_RESULT (induction_phi);
3290 
3291   /* Create the iv update inside the loop  */
3292   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3293 					   induc_def, vec_step);
3294   vec_def = make_ssa_name (vec_dest, new_stmt);
3295   gimple_assign_set_lhs (new_stmt, vec_def);
3296   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3297   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3298                                                    NULL));
3299 
3300   /* Set the arguments of the phi node:  */
3301   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3302   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3303 	       UNKNOWN_LOCATION);
3304 
3305 
3306   /* In case that vectorization factor (VF) is bigger than the number
3307      of elements that we can fit in a vectype (nunits), we have to generate
3308      more than one vector stmt - i.e - we need to "unroll" the
3309      vector stmt by a factor VF/nunits.  For more details see documentation
3310      in vectorizable_operation.  */
3311 
3312   if (ncopies > 1)
3313     {
3314       stmt_vec_info prev_stmt_vinfo;
3315       /* FORNOW. This restriction should be relaxed.  */
3316       gcc_assert (!nested_in_vect_loop);
3317 
3318       /* Create the vector that holds the step of the induction.  */
3319       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3320       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3321 			      expr, step_expr);
3322       t = unshare_expr (new_name);
3323       gcc_assert (CONSTANT_CLASS_P (new_name));
3324       new_vec = build_vector_from_val (stepvectype, t);
3325       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3326 
3327       vec_def = induc_def;
3328       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3329       for (i = 1; i < ncopies; i++)
3330 	{
3331 	  /* vec_i = vec_prev + vec_step  */
3332 	  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3333 						   vec_def, vec_step);
3334 	  vec_def = make_ssa_name (vec_dest, new_stmt);
3335 	  gimple_assign_set_lhs (new_stmt, vec_def);
3336 
3337 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3338 	  if (!useless_type_conversion_p (resvectype, vectype))
3339 	    {
3340 	      new_stmt = gimple_build_assign_with_ops
3341 		  (VIEW_CONVERT_EXPR,
3342 		   vect_get_new_vect_var (resvectype, vect_simple_var,
3343 					  "vec_iv_"),
3344 		   build1 (VIEW_CONVERT_EXPR, resvectype,
3345 			   gimple_assign_lhs (new_stmt)), NULL_TREE);
3346 	      gimple_assign_set_lhs (new_stmt,
3347 				     make_ssa_name
3348 				       (gimple_assign_lhs (new_stmt), new_stmt));
3349 	      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3350 	    }
3351 	  set_vinfo_for_stmt (new_stmt,
3352 			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3353 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3354 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3355 	}
3356     }
3357 
3358   if (nested_in_vect_loop)
3359     {
3360       /* Find the loop-closed exit-phi of the induction, and record
3361          the final vector of induction results:  */
3362       exit_phi = NULL;
3363       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3364         {
3365 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3366 	    {
3367 	      exit_phi = USE_STMT (use_p);
3368 	      break;
3369 	    }
3370         }
3371       if (exit_phi)
3372 	{
3373 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3374 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
3375 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
3376 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3377 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
3378 
3379 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3380 	  if (dump_enabled_p ())
3381 	    {
3382 	      dump_printf_loc (MSG_NOTE, vect_location,
3383 			       "vector of inductions after inner-loop:");
3384 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3385 	    }
3386 	}
3387     }
3388 
3389 
3390   if (dump_enabled_p ())
3391     {
3392       dump_printf_loc (MSG_NOTE, vect_location,
3393 		       "transform induction: created def-use cycle: ");
3394       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3395       dump_printf (MSG_NOTE, "\n");
3396       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3397 			SSA_NAME_DEF_STMT (vec_def), 0);
3398     }
3399 
3400   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3401   if (!useless_type_conversion_p (resvectype, vectype))
3402     {
3403       new_stmt = gimple_build_assign_with_ops
3404 	 (VIEW_CONVERT_EXPR,
3405 	  vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3406 	  build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3407       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3408       gimple_assign_set_lhs (new_stmt, induc_def);
3409       si = gsi_after_labels (bb);
3410       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3411       set_vinfo_for_stmt (new_stmt,
3412 			  new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3413       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3414 	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3415     }
3416 
3417   return induc_def;
3418 }
3419 
3420 
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for adjusting the epilog - see below).  It may
        be NULL, in which case Option2 below is used.
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add/bit or/xor:    [0,0,...,0,0]
     mult:              [1,1,...,1,1]
     bit and:           [-1,-1,...,-1,-1]
     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add/bit or/xor:    [init_val,0,0,...,0]
     mult:              [init_val,1,1,...,1]
     bit and:           [init_val,-1,-1,...,-1]
     min/max/cond_expr: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [init_val,0,0,0]
   (Option2), or [0,0,0,0] (Option1) and let the caller know that it needs
   to adjust the result at the end by 'init_val'.

   FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
   is not NULL, because this way the initialization vector is simpler (same
   element in all entries), and Option2 otherwise.

   A cost model should help decide between these two schemes.  */
3463 
3464 tree
get_initial_def_for_reduction(gimple stmt,tree init_val,tree * adjustment_def)3465 get_initial_def_for_reduction (gimple stmt, tree init_val,
3466                                tree *adjustment_def)
3467 {
3468   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3469   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3470   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3471   tree scalar_type = TREE_TYPE (init_val);
3472   tree vectype = get_vectype_for_scalar_type (scalar_type);
3473   int nunits;
3474   enum tree_code code = gimple_assign_rhs_code (stmt);
3475   tree def_for_init;
3476   tree init_def;
3477   tree *elts;
3478   int i;
3479   bool nested_in_vect_loop = false;
3480   tree init_value;
3481   REAL_VALUE_TYPE real_init_val = dconst0;
3482   int int_init_val = 0;
3483   gimple def_stmt = NULL;
3484 
3485   gcc_assert (vectype);
3486   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3487 
3488   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3489 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
3490 
3491   if (nested_in_vect_loop_p (loop, stmt))
3492     nested_in_vect_loop = true;
3493   else
3494     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3495 
3496   /* In case of double reduction we only create a vector variable to be put
3497      in the reduction phi node.  The actual statement creation is done in
3498      vect_create_epilog_for_reduction.  */
3499   if (adjustment_def && nested_in_vect_loop
3500       && TREE_CODE (init_val) == SSA_NAME
3501       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3502       && gimple_code (def_stmt) == GIMPLE_PHI
3503       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3504       && vinfo_for_stmt (def_stmt)
3505       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3506           == vect_double_reduction_def)
3507     {
3508       *adjustment_def = NULL;
3509       return vect_create_destination_var (init_val, vectype);
3510     }
3511 
3512   if (TREE_CONSTANT (init_val))
3513     {
3514       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3515         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3516       else
3517         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3518     }
3519   else
3520     init_value = init_val;
3521 
3522   switch (code)
3523     {
3524       case WIDEN_SUM_EXPR:
3525       case DOT_PROD_EXPR:
3526       case PLUS_EXPR:
3527       case MINUS_EXPR:
3528       case BIT_IOR_EXPR:
3529       case BIT_XOR_EXPR:
3530       case MULT_EXPR:
3531       case BIT_AND_EXPR:
3532         /* ADJUSMENT_DEF is NULL when called from
3533            vect_create_epilog_for_reduction to vectorize double reduction.  */
3534         if (adjustment_def)
3535           {
3536             if (nested_in_vect_loop)
3537               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3538                                                               NULL);
3539             else
3540               *adjustment_def = init_val;
3541           }
3542 
3543         if (code == MULT_EXPR)
3544           {
3545             real_init_val = dconst1;
3546             int_init_val = 1;
3547           }
3548 
3549         if (code == BIT_AND_EXPR)
3550           int_init_val = -1;
3551 
3552         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3553           def_for_init = build_real (scalar_type, real_init_val);
3554         else
3555           def_for_init = build_int_cst (scalar_type, int_init_val);
3556 
3557         /* Create a vector of '0' or '1' except the first element.  */
3558 	elts = XALLOCAVEC (tree, nunits);
3559         for (i = nunits - 2; i >= 0; --i)
3560 	  elts[i + 1] = def_for_init;
3561 
3562         /* Option1: the first element is '0' or '1' as well.  */
3563         if (adjustment_def)
3564           {
3565 	    elts[0] = def_for_init;
3566             init_def = build_vector (vectype, elts);
3567             break;
3568           }
3569 
3570         /* Option2: the first element is INIT_VAL.  */
3571 	elts[0] = init_val;
3572         if (TREE_CONSTANT (init_val))
3573           init_def = build_vector (vectype, elts);
3574         else
3575 	  {
3576 	    vec<constructor_elt, va_gc> *v;
3577 	    vec_alloc (v, nunits);
3578 	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3579 	    for (i = 1; i < nunits; ++i)
3580 	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3581 	    init_def = build_constructor (vectype, v);
3582 	  }
3583 
3584         break;
3585 
3586       case MIN_EXPR:
3587       case MAX_EXPR:
3588       case COND_EXPR:
3589         if (adjustment_def)
3590           {
3591             *adjustment_def = NULL_TREE;
3592             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3593             break;
3594           }
3595 
3596 	init_def = build_vector_from_val (vectype, init_value);
3597         break;
3598 
3599       default:
3600         gcc_unreachable ();
3601     }
3602 
3603   return init_def;
3604 }
3605 
3606 
3607 /* Function vect_create_epilog_for_reduction
3608 
3609    Create code at the loop-epilog to finalize the result of a reduction
3610    computation.
3611 
   VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of the
     vector reduction statements.
3614    STMT is the scalar reduction stmt that is being vectorized.
3615    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3616      number of elements that we can fit in a vectype (nunits).  In this case
3617      we have to generate more than one vector stmt - i.e - we need to "unroll"
3618      the vector stmt by a factor VF/nunits.  For more details see documentation
3619      in vectorizable_operation.
3620    REDUC_CODE is the tree-code for the epilog reduction.
3621    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3622      computation.
3623    REDUC_INDEX is the index of the operand in the right hand side of the
3624      statement that is defined by REDUCTION_PHI.
3625    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3626    SLP_NODE is an SLP node containing a group of reduction statements. The
3627      first one in this group is STMT.
3628 
3629    This function:
3630    1. Creates the reduction def-use cycles: sets the arguments for
3631       REDUCTION_PHIS:
3632       The loop-entry argument is the vectorized initial-value of the reduction.
3633       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3634       sums.
3635    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3636       by applying the operation specified by REDUC_CODE if available, or by
3637       other means (whole-vector shifts or a scalar loop).
3638       The function also creates a new phi node at the loop exit to preserve
3639       loop-closed form, as illustrated below.
3640 
3641      The flow at the entry to this function:
3642 
3643         loop:
3644           vec_def = phi <null, null>            # REDUCTION_PHI
3645           VECT_DEF = vector_stmt                # vectorized form of STMT
3646           s_loop = scalar_stmt                  # (scalar) STMT
3647         loop_exit:
3648           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3649           use <s_out0>
3650           use <s_out0>
3651 
3652      The above is transformed by this function into:
3653 
3654         loop:
3655           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3656           VECT_DEF = vector_stmt                # vectorized form of STMT
3657           s_loop = scalar_stmt                  # (scalar) STMT
3658         loop_exit:
3659           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3660           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3661           v_out2 = reduce <v_out1>
3662           s_out3 = extract_field <v_out2, 0>
3663           s_out4 = adjust_result <s_out3>
3664           use <s_out4>
3665           use <s_out4>
3666 */
3667 
3668 static void
vect_create_epilog_for_reduction(vec<tree> vect_defs,gimple stmt,int ncopies,enum tree_code reduc_code,vec<gimple> reduction_phis,int reduc_index,bool double_reduc,slp_tree slp_node)3669 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3670 				  int ncopies, enum tree_code reduc_code,
3671 				  vec<gimple> reduction_phis,
3672                                   int reduc_index, bool double_reduc,
3673                                   slp_tree slp_node)
3674 {
3675   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3676   stmt_vec_info prev_phi_info;
3677   tree vectype;
3678   enum machine_mode mode;
3679   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3680   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3681   basic_block exit_bb;
3682   tree scalar_dest;
3683   tree scalar_type;
3684   gimple new_phi = NULL, phi;
3685   gimple_stmt_iterator exit_gsi;
3686   tree vec_dest;
3687   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3688   gimple epilog_stmt = NULL;
3689   enum tree_code code = gimple_assign_rhs_code (stmt);
3690   gimple exit_phi;
3691   tree bitsize, bitpos;
3692   tree adjustment_def = NULL;
3693   tree vec_initial_def = NULL;
3694   tree reduction_op, expr, def;
3695   tree orig_name, scalar_result;
3696   imm_use_iterator imm_iter, phi_imm_iter;
3697   use_operand_p use_p, phi_use_p;
3698   bool extract_scalar_result = false;
3699   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3700   bool nested_in_vect_loop = false;
3701   vec<gimple> new_phis = vNULL;
3702   vec<gimple> inner_phis = vNULL;
3703   enum vect_def_type dt = vect_unknown_def_type;
3704   int j, i;
3705   vec<tree> scalar_results = vNULL;
3706   unsigned int group_size = 1, k, ratio;
3707   vec<tree> vec_initial_defs = vNULL;
3708   vec<gimple> phis;
3709   bool slp_reduc = false;
3710   tree new_phi_result;
3711   gimple inner_phi = NULL;
3712 
3713   if (slp_node)
3714     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3715 
3716   if (nested_in_vect_loop_p (loop, stmt))
3717     {
3718       outer_loop = loop;
3719       loop = loop->inner;
3720       nested_in_vect_loop = true;
3721       gcc_assert (!slp_node);
3722     }
3723 
3724   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3725     {
3726     case GIMPLE_SINGLE_RHS:
3727       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3728 		  == ternary_op);
3729       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3730       break;
3731     case GIMPLE_UNARY_RHS:
3732       reduction_op = gimple_assign_rhs1 (stmt);
3733       break;
3734     case GIMPLE_BINARY_RHS:
3735       reduction_op = reduc_index ?
3736                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3737       break;
3738     case GIMPLE_TERNARY_RHS:
3739       reduction_op = gimple_op (stmt, reduc_index + 1);
3740       break;
3741     default:
3742       gcc_unreachable ();
3743     }
3744 
3745   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3746   gcc_assert (vectype);
3747   mode = TYPE_MODE (vectype);
3748 
3749   /* 1. Create the reduction def-use cycle:
3750      Set the arguments of REDUCTION_PHIS, i.e., transform
3751 
3752         loop:
3753           vec_def = phi <null, null>            # REDUCTION_PHI
3754           VECT_DEF = vector_stmt                # vectorized form of STMT
3755           ...
3756 
3757      into:
3758 
3759         loop:
3760           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3761           VECT_DEF = vector_stmt                # vectorized form of STMT
3762           ...
3763 
3764      (in case of SLP, do it for all the phis). */
3765 
3766   /* Get the loop-entry arguments.  */
3767   if (slp_node)
3768     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3769                        NULL, slp_node, reduc_index);
3770   else
3771     {
3772       vec_initial_defs.create (1);
3773      /* For the case of reduction, vect_get_vec_def_for_operand returns
3774         the scalar def before the loop, that defines the initial value
3775         of the reduction variable.  */
3776       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3777                                                       &adjustment_def);
3778       vec_initial_defs.quick_push (vec_initial_def);
3779     }
3780 
3781   /* Set phi nodes arguments.  */
3782   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3783     {
3784       tree vec_init_def = vec_initial_defs[i];
3785       tree def = vect_defs[i];
3786       for (j = 0; j < ncopies; j++)
3787         {
3788           /* Set the loop-entry arg of the reduction-phi.  */
3789           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3790                        UNKNOWN_LOCATION);
3791 
3792           /* Set the loop-latch arg for the reduction-phi.  */
3793           if (j > 0)
3794             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3795 
3796           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3797 
3798           if (dump_enabled_p ())
3799             {
3800               dump_printf_loc (MSG_NOTE, vect_location,
3801 			       "transform reduction: created def-use cycle: ");
3802               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3803               dump_printf (MSG_NOTE, "\n");
3804               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3805             }
3806 
3807           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3808         }
3809     }
3810 
3811   vec_initial_defs.release ();
3812 
3813   /* 2. Create epilog code.
3814         The reduction epilog code operates across the elements of the vector
3815         of partial results computed by the vectorized loop.
3816         The reduction epilog code consists of:
3817 
3818         step 1: compute the scalar result in a vector (v_out2)
3819         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3820         step 3: adjust the scalar result (s_out3) if needed.
3821 
        Step 1 can be accomplished using one of the following three schemes:
3823           (scheme 1) using reduc_code, if available.
3824           (scheme 2) using whole-vector shifts, if available.
3825           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3826                      combined.
3827 
3828           The overall epilog code looks like this:
3829 
3830           s_out0 = phi <s_loop>         # original EXIT_PHI
3831           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3832           v_out2 = reduce <v_out1>              # step 1
3833           s_out3 = extract_field <v_out2, 0>    # step 2
3834           s_out4 = adjust_result <s_out3>       # step 3
3835 
3836           (step 3 is optional, and steps 1 and 2 may be combined).
3837           Lastly, the uses of s_out0 are replaced by s_out4.  */
3838 
3839 
3840   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3841          v_out1 = phi <VECT_DEF>
3842          Store them in NEW_PHIS.  */
3843 
3844   exit_bb = single_exit (loop)->dest;
3845   prev_phi_info = NULL;
3846   new_phis.create (vect_defs.length ());
3847   FOR_EACH_VEC_ELT (vect_defs, i, def)
3848     {
3849       for (j = 0; j < ncopies; j++)
3850         {
3851 	  tree new_def = copy_ssa_name (def, NULL);
3852           phi = create_phi_node (new_def, exit_bb);
3853           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3854           if (j == 0)
3855             new_phis.quick_push (phi);
3856           else
3857 	    {
3858 	      def = vect_get_vec_def_for_stmt_copy (dt, def);
3859 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3860 	    }
3861 
3862           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3863           prev_phi_info = vinfo_for_stmt (phi);
3864         }
3865     }
3866 
3867   /* The epilogue is created for the outer-loop, i.e., for the loop being
3868      vectorized.  Create exit phis for the outer loop.  */
3869   if (double_reduc)
3870     {
3871       loop = outer_loop;
3872       exit_bb = single_exit (loop)->dest;
3873       inner_phis.create (vect_defs.length ());
3874       FOR_EACH_VEC_ELT (new_phis, i, phi)
3875 	{
3876 	  tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3877 	  gimple outer_phi = create_phi_node (new_result, exit_bb);
3878 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3879 			   PHI_RESULT (phi));
3880 	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3881 							    loop_vinfo, NULL));
3882 	  inner_phis.quick_push (phi);
3883 	  new_phis[i] = outer_phi;
3884 	  prev_phi_info = vinfo_for_stmt (outer_phi);
3885           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3886             {
3887 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3888 	      new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3889 	      outer_phi = create_phi_node (new_result, exit_bb);
3890 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3891 			       PHI_RESULT (phi));
3892 	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3893 							loop_vinfo, NULL));
3894 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3895 	      prev_phi_info = vinfo_for_stmt (outer_phi);
3896 	    }
3897 	}
3898     }
3899 
3900   exit_gsi = gsi_after_labels (exit_bb);
3901 
3902   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3903          (i.e. when reduc_code is not available) and in the final adjustment
3904 	 code (if needed).  Also get the original scalar reduction variable as
3905          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3906          represents a reduction pattern), the tree-code and scalar-def are
3907          taken from the original stmt that the pattern-stmt (STMT) replaces.
3908          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3909          are taken from STMT.  */
3910 
3911   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3912   if (!orig_stmt)
3913     {
3914       /* Regular reduction  */
3915       orig_stmt = stmt;
3916     }
3917   else
3918     {
3919       /* Reduction pattern  */
3920       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3921       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3922       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3923     }
3924 
3925   code = gimple_assign_rhs_code (orig_stmt);
3926   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3927      partial results are added and not subtracted.  */
3928   if (code == MINUS_EXPR)
3929     code = PLUS_EXPR;
3930 
3931   scalar_dest = gimple_assign_lhs (orig_stmt);
3932   scalar_type = TREE_TYPE (scalar_dest);
3933   scalar_results.create (group_size);
3934   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3935   bitsize = TYPE_SIZE (scalar_type);
3936 
3937   /* In case this is a reduction in an inner-loop while vectorizing an outer
3938      loop - we don't need to extract a single scalar result at the end of the
3939      inner-loop (unless it is double reduction, i.e., the use of reduction is
3940      outside the outer-loop).  The final vector of partial results will be used
3941      in the vectorized outer-loop, or reduced to a scalar result at the end of
3942      the outer-loop.  */
3943   if (nested_in_vect_loop && !double_reduc)
3944     goto vect_finalize_reduction;
3945 
3946   /* SLP reduction without reduction chain, e.g.,
3947      # a1 = phi <a2, a0>
3948      # b1 = phi <b2, b0>
3949      a2 = operation (a1)
3950      b2 = operation (b1)  */
3951   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3952 
3953   /* In case of reduction chain, e.g.,
3954      # a1 = phi <a3, a0>
3955      a2 = operation (a1)
3956      a3 = operation (a2),
3957 
3958      we may end up with more than one vector result.  Here we reduce them to
3959      one vector.  */
3960   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3961     {
3962       tree first_vect = PHI_RESULT (new_phis[0]);
3963       tree tmp;
3964       gimple new_vec_stmt = NULL;
3965 
3966       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3967       for (k = 1; k < new_phis.length (); k++)
3968         {
3969           gimple next_phi = new_phis[k];
3970           tree second_vect = PHI_RESULT (next_phi);
3971 
3972           tmp = build2 (code, vectype,  first_vect, second_vect);
3973           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3974           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3975           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3976           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3977         }
3978 
3979       new_phi_result = first_vect;
3980       if (new_vec_stmt)
3981         {
3982           new_phis.truncate (0);
3983           new_phis.safe_push (new_vec_stmt);
3984         }
3985     }
3986   else
3987     new_phi_result = PHI_RESULT (new_phis[0]);
3988 
3989   /* 2.3 Create the reduction code, using one of the three schemes described
3990          above. In SLP we simply need to extract all the elements from the
3991          vector (without reducing them), so we use scalar shifts.  */
3992   if (reduc_code != ERROR_MARK && !slp_reduc)
3993     {
3994       tree tmp;
3995 
3996       /*** Case 1:  Create:
3997            v_out2 = reduc_expr <v_out1>  */
3998 
3999       if (dump_enabled_p ())
4000         dump_printf_loc (MSG_NOTE, vect_location,
4001 			 "Reduce using direct vector reduction.");
4002 
4003       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4004       tmp = build1 (reduc_code, vectype, new_phi_result);
4005       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4006       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4007       gimple_assign_set_lhs (epilog_stmt, new_temp);
4008       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4009 
4010       extract_scalar_result = true;
4011     }
4012   else
4013     {
4014       enum tree_code shift_code = ERROR_MARK;
4015       bool have_whole_vector_shift = true;
4016       int bit_offset;
4017       int element_bitsize = tree_low_cst (bitsize, 1);
4018       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4019       tree vec_temp;
4020 
4021       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4022         shift_code = VEC_RSHIFT_EXPR;
4023       else
4024         have_whole_vector_shift = false;
4025 
4026       /* Regardless of whether we have a whole vector shift, if we're
4027          emulating the operation via tree-vect-generic, we don't want
4028          to use it.  Only the first round of the reduction is likely
4029          to still be profitable via emulation.  */
4030       /* ??? It might be better to emit a reduction tree code here, so that
4031          tree-vect-generic can expand the first round via bit tricks.  */
4032       if (!VECTOR_MODE_P (mode))
4033         have_whole_vector_shift = false;
4034       else
4035         {
4036           optab optab = optab_for_tree_code (code, vectype, optab_default);
4037           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4038             have_whole_vector_shift = false;
4039         }
4040 
4041       if (have_whole_vector_shift && !slp_reduc)
4042         {
4043           /*** Case 2: Create:
4044              for (offset = VS/2; offset >= element_size; offset/=2)
4045                 {
4046                   Create:  va' = vec_shift <va, offset>
4047                   Create:  va = vop <va, va'>
4048                 }  */
4049 
4050           if (dump_enabled_p ())
4051             dump_printf_loc (MSG_NOTE, vect_location,
4052 			     "Reduce using vector shifts");
4053 
4054           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4055           new_temp = new_phi_result;
4056           for (bit_offset = vec_size_in_bits/2;
4057                bit_offset >= element_bitsize;
4058                bit_offset /= 2)
4059             {
4060               tree bitpos = size_int (bit_offset);
4061 
4062               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4063                                                vec_dest, new_temp, bitpos);
4064               new_name = make_ssa_name (vec_dest, epilog_stmt);
4065               gimple_assign_set_lhs (epilog_stmt, new_name);
4066               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4067 
4068               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4069                                                           new_name, new_temp);
4070               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4071               gimple_assign_set_lhs (epilog_stmt, new_temp);
4072               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4073             }
4074 
4075           extract_scalar_result = true;
4076         }
4077       else
4078         {
4079           tree rhs;
4080 
4081           /*** Case 3: Create:
4082              s = extract_field <v_out2, 0>
4083              for (offset = element_size;
4084                   offset < vector_size;
4085                   offset += element_size;)
4086                {
4087                  Create:  s' = extract_field <v_out2, offset>
4088                  Create:  s = op <s, s'>  // For non SLP cases
4089                }  */
4090 
4091           if (dump_enabled_p ())
4092             dump_printf_loc (MSG_NOTE, vect_location,
4093 			     "Reduce using scalar code. ");
4094 
4095           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4096           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4097             {
4098               if (gimple_code (new_phi) == GIMPLE_PHI)
4099                 vec_temp = PHI_RESULT (new_phi);
4100               else
4101                 vec_temp = gimple_assign_lhs (new_phi);
4102               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4103                             bitsize_zero_node);
4104               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4105               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4106               gimple_assign_set_lhs (epilog_stmt, new_temp);
4107               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4108 
4109               /* In SLP we don't need to apply reduction operation, so we just
4110                  collect s' values in SCALAR_RESULTS.  */
4111               if (slp_reduc)
4112                 scalar_results.safe_push (new_temp);
4113 
4114               for (bit_offset = element_bitsize;
4115                    bit_offset < vec_size_in_bits;
4116                    bit_offset += element_bitsize)
4117                 {
4118                   tree bitpos = bitsize_int (bit_offset);
4119                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4120                                      bitsize, bitpos);
4121 
4122                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4123                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4124                   gimple_assign_set_lhs (epilog_stmt, new_name);
4125                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4126 
4127                   if (slp_reduc)
4128                     {
4129                       /* In SLP we don't need to apply reduction operation, so
4130                          we just collect s' values in SCALAR_RESULTS.  */
4131                       new_temp = new_name;
4132                       scalar_results.safe_push (new_name);
4133                     }
4134                   else
4135                     {
4136                       epilog_stmt = gimple_build_assign_with_ops (code,
4137                                           new_scalar_dest, new_name, new_temp);
4138                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4139                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4140                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4141                     }
4142                 }
4143             }
4144 
4145           /* The only case where we need to reduce scalar results in SLP, is
4146              unrolling.  If the size of SCALAR_RESULTS is greater than
4147              GROUP_SIZE, we reduce them combining elements modulo
4148              GROUP_SIZE.  */
4149           if (slp_reduc)
4150             {
4151               tree res, first_res, new_res;
4152               gimple new_stmt;
4153 
4154               /* Reduce multiple scalar results in case of SLP unrolling.  */
4155               for (j = group_size; scalar_results.iterate (j, &res);
4156                    j++)
4157                 {
4158                   first_res = scalar_results[j % group_size];
4159                   new_stmt = gimple_build_assign_with_ops (code,
4160                                               new_scalar_dest, first_res, res);
4161                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4162                   gimple_assign_set_lhs (new_stmt, new_res);
4163                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4164                   scalar_results[j % group_size] = new_res;
4165                 }
4166             }
4167           else
4168             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4169             scalar_results.safe_push (new_temp);
4170 
4171           extract_scalar_result = false;
4172         }
4173     }
4174 
4175   /* 2.4  Extract the final scalar result.  Create:
4176           s_out3 = extract_field <v_out2, bitpos>  */
4177 
4178   if (extract_scalar_result)
4179     {
4180       tree rhs;
4181 
4182       if (dump_enabled_p ())
4183         dump_printf_loc (MSG_NOTE, vect_location,
4184 			 "extract scalar result");
4185 
4186       if (BYTES_BIG_ENDIAN)
4187         bitpos = size_binop (MULT_EXPR,
4188                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4189                              TYPE_SIZE (scalar_type));
4190       else
4191         bitpos = bitsize_zero_node;
4192 
4193       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4194       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4195       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4196       gimple_assign_set_lhs (epilog_stmt, new_temp);
4197       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4198       scalar_results.safe_push (new_temp);
4199     }
4200 
4201 vect_finalize_reduction:
4202 
4203   if (double_reduc)
4204     loop = loop->inner;
4205 
4206   /* 2.5 Adjust the final result by the initial value of the reduction
4207 	 variable. (When such adjustment is not needed, then
4208 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
4209 	 new_temp = loop_exit_def + adjustment_def  */
4210 
4211   if (adjustment_def)
4212     {
4213       gcc_assert (!slp_reduc);
4214       if (nested_in_vect_loop)
4215 	{
4216           new_phi = new_phis[0];
4217 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4218 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4219 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
4220 	}
4221       else
4222 	{
4223           new_temp = scalar_results[0];
4224 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4225 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
4226 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4227 	}
4228 
4229       epilog_stmt = gimple_build_assign (new_dest, expr);
4230       new_temp = make_ssa_name (new_dest, epilog_stmt);
4231       gimple_assign_set_lhs (epilog_stmt, new_temp);
4232       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4233       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4234       if (nested_in_vect_loop)
4235         {
4236           set_vinfo_for_stmt (epilog_stmt,
4237                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4238                                                  NULL));
4239           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4240                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4241 
4242           if (!double_reduc)
4243             scalar_results.quick_push (new_temp);
4244           else
4245             scalar_results[0] = new_temp;
4246         }
4247       else
4248         scalar_results[0] = new_temp;
4249 
4250       new_phis[0] = epilog_stmt;
4251     }
4252 
4253   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4254           phis with new adjusted scalar results, i.e., replace use <s_out0>
4255           with use <s_out4>.
4256 
4257      Transform:
4258         loop_exit:
4259           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4260           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4261           v_out2 = reduce <v_out1>
4262           s_out3 = extract_field <v_out2, 0>
4263           s_out4 = adjust_result <s_out3>
4264           use <s_out0>
4265           use <s_out0>
4266 
4267      into:
4268 
4269         loop_exit:
4270           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4271           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4272           v_out2 = reduce <v_out1>
4273           s_out3 = extract_field <v_out2, 0>
4274           s_out4 = adjust_result <s_out3>
4275           use <s_out4>
4276           use <s_out4> */
4277 
4278 
4279   /* In SLP reduction chain we reduce vector results into one vector if
4280      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4281      the last stmt in the reduction chain, since we are looking for the loop
4282      exit phi node.  */
4283   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4284     {
4285       scalar_dest = gimple_assign_lhs (
4286 			SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4287       group_size = 1;
4288     }
4289 
4290   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4291      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4292      need to match SCALAR_RESULTS with corresponding statements.  The first
4293      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4294      the first vector stmt, etc.
4295      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4296   if (group_size > new_phis.length ())
4297     {
4298       ratio = group_size / new_phis.length ();
4299       gcc_assert (!(group_size % new_phis.length ()));
4300     }
4301   else
4302     ratio = 1;
4303 
4304   for (k = 0; k < group_size; k++)
4305     {
4306       if (k % ratio == 0)
4307         {
4308           epilog_stmt = new_phis[k / ratio];
4309           reduction_phi = reduction_phis[k / ratio];
4310 	  if (double_reduc)
4311 	    inner_phi = inner_phis[k / ratio];
4312         }
4313 
4314       if (slp_reduc)
4315         {
4316           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4317 
4318           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4319           /* SLP statements can't participate in patterns.  */
4320           gcc_assert (!orig_stmt);
4321           scalar_dest = gimple_assign_lhs (current_stmt);
4322         }
4323 
4324       phis.create (3);
4325       /* Find the loop-closed-use at the loop exit of the original scalar
4326          result.  (The reduction result is expected to have two immediate uses -
4327          one at the latch block, and one at the loop exit).  */
4328       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4329         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4330           phis.safe_push (USE_STMT (use_p));
4331 
4332       /* We expect to have found an exit_phi because of loop-closed-ssa
4333          form.  */
4334       gcc_assert (!phis.is_empty ());
4335 
4336       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4337         {
4338           if (outer_loop)
4339             {
4340               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4341               gimple vect_phi;
4342 
4343               /* FORNOW. Currently not supporting the case that an inner-loop
4344                  reduction is not used in the outer-loop (but only outside the
4345                  outer-loop), unless it is double reduction.  */
4346               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4347                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4348                           || double_reduc);
4349 
4350               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4351               if (!double_reduc
4352                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4353                       != vect_double_reduction_def)
4354                 continue;
4355 
4356               /* Handle double reduction:
4357 
4358                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4359                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4360                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4361                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4362 
4363                  At that point the regular reduction (stmt2 and stmt3) is
4364                  already vectorized, as well as the exit phi node, stmt4.
4365                  Here we vectorize the phi node of double reduction, stmt1, and
4366                  update all relevant statements.  */
4367 
4368               /* Go through all the uses of s2 to find double reduction phi
4369                  node, i.e., stmt1 above.  */
4370               orig_name = PHI_RESULT (exit_phi);
4371               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4372                 {
4373                   stmt_vec_info use_stmt_vinfo;
4374                   stmt_vec_info new_phi_vinfo;
4375                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4376                   basic_block bb = gimple_bb (use_stmt);
4377                   gimple use;
4378 
4379                   /* Check that USE_STMT is really double reduction phi
4380                      node.  */
4381                   if (gimple_code (use_stmt) != GIMPLE_PHI
4382                       || gimple_phi_num_args (use_stmt) != 2
4383                       || bb->loop_father != outer_loop)
4384                     continue;
4385                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4386                   if (!use_stmt_vinfo
4387                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4388                           != vect_double_reduction_def)
4389 		    continue;
4390 
4391                   /* Create vector phi node for double reduction:
4392                      vs1 = phi <vs0, vs2>
4393                      vs1 was created previously in this function by a call to
4394                        vect_get_vec_def_for_operand and is stored in
4395                        vec_initial_def;
4396                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4397                      vs0 is created here.  */
4398 
4399                   /* Create vector phi node.  */
4400                   vect_phi = create_phi_node (vec_initial_def, bb);
4401                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4402                                     loop_vec_info_for_loop (outer_loop), NULL);
4403                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4404 
4405                   /* Create vs0 - initial def of the double reduction phi.  */
4406                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4407                                              loop_preheader_edge (outer_loop));
4408                   init_def = get_initial_def_for_reduction (stmt,
4409                                                           preheader_arg, NULL);
4410                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4411                                                     vectype, NULL);
4412 
4413                   /* Update phi node arguments with vs0 and vs2.  */
4414                   add_phi_arg (vect_phi, vect_phi_init,
4415                                loop_preheader_edge (outer_loop),
4416                                UNKNOWN_LOCATION);
4417                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4418                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4419                   if (dump_enabled_p ())
4420                     {
4421                       dump_printf_loc (MSG_NOTE, vect_location,
4422 				       "created double reduction phi node: ");
4423                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4424                     }
4425 
4426                   vect_phi_res = PHI_RESULT (vect_phi);
4427 
4428                   /* Replace the use, i.e., set the correct vs1 in the regular
4429                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4430                      loop is redundant.  */
4431                   use = reduction_phi;
4432                   for (j = 0; j < ncopies; j++)
4433                     {
4434                       edge pr_edge = loop_preheader_edge (loop);
4435                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4436                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4437                     }
4438                 }
4439             }
4440         }
4441 
4442       phis.release ();
4443       if (nested_in_vect_loop)
4444         {
4445           if (double_reduc)
4446             loop = outer_loop;
4447           else
4448             continue;
4449         }
4450 
4451       phis.create (3);
4452       /* Find the loop-closed-use at the loop exit of the original scalar
4453          result.  (The reduction result is expected to have two immediate uses,
4454          one at the latch block, and one at the loop exit).  For double
4455          reductions we are looking for exit phis of the outer loop.  */
4456       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4457         {
4458           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4459             phis.safe_push (USE_STMT (use_p));
4460           else
4461             {
4462               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4463                 {
4464                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4465 
4466                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4467                     {
4468                       if (!flow_bb_inside_loop_p (loop,
4469                                              gimple_bb (USE_STMT (phi_use_p))))
4470                         phis.safe_push (USE_STMT (phi_use_p));
4471                     }
4472                 }
4473             }
4474         }
4475 
4476       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4477         {
4478           /* Replace the uses:  */
4479           orig_name = PHI_RESULT (exit_phi);
4480           scalar_result = scalar_results[k];
4481           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4482             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4483               SET_USE (use_p, scalar_result);
4484         }
4485 
4486       phis.release ();
4487     }
4488 
4489   scalar_results.release ();
4490   inner_phis.release ();
4491   new_phis.release ();
4492 }
4493 
4494 
4495 /* Function vectorizable_reduction.
4496 
4497    Check if STMT performs a reduction operation that can be vectorized.
4498    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4499    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4500    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4501 
4502    This function also handles reduction idioms (patterns) that have been
4503    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4504    of this form:
4505      X = pattern_expr (arg0, arg1, ..., X)
4506    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4507    sequence that had been detected and replaced by the pattern-stmt (STMT).
4508 
4509    In some cases of reduction patterns, the type of the reduction variable X is
4510    different than the type of the other arguments of STMT.
4511    In such cases, the vectype that is used when transforming STMT into a vector
4512    stmt is different than the vectype that is used to determine the
4513    vectorization factor, because it consists of a different number of elements
4514    than the actual number of elements that are being operated upon in parallel.
4515 
4516    For example, consider an accumulation of shorts into an int accumulator.
4517    On some targets it's possible to vectorize this pattern operating on 8
4518    shorts at a time (hence, the vectype for purposes of determining the
4519    vectorization factor should be V8HI); on the other hand, the vectype that
4520    is used to create the vector form is actually V4SI (the type of the result).
4521 
4522    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4523    indicates what is the actual level of parallelism (V8HI in the example), so
4524    that the right vectorization factor would be derived.  This vectype
4525    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4526    be used to create the vectorized stmt.  The right vectype for the vectorized
4527    stmt is obtained from the type of the result X:
4528         get_vectype_for_scalar_type (TREE_TYPE (X))
4529 
4530    This means that, contrary to "regular" reductions (or "regular" stmts in
4531    general), the following equation:
4532       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4533    does *NOT* necessarily hold for reduction patterns.  */
4534 
4535 bool
vectorizable_reduction(gimple stmt,gimple_stmt_iterator * gsi,gimple * vec_stmt,slp_tree slp_node)4536 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4537 			gimple *vec_stmt, slp_tree slp_node)
4538 {
4539   tree vec_dest;
4540   tree scalar_dest;
4541   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4542   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4543   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4544   tree vectype_in = NULL_TREE;
4545   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4546   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4547   enum tree_code code, orig_code, epilog_reduc_code;
4548   enum machine_mode vec_mode;
4549   int op_type;
4550   optab optab, reduc_optab;
4551   tree new_temp = NULL_TREE;
4552   tree def;
4553   gimple def_stmt;
4554   enum vect_def_type dt;
4555   gimple new_phi = NULL;
4556   tree scalar_type;
4557   bool is_simple_use;
4558   gimple orig_stmt;
4559   stmt_vec_info orig_stmt_info;
4560   tree expr = NULL_TREE;
4561   int i;
4562   int ncopies;
4563   int epilog_copies;
4564   stmt_vec_info prev_stmt_info, prev_phi_info;
4565   bool single_defuse_cycle = false;
4566   tree reduc_def = NULL_TREE;
4567   gimple new_stmt = NULL;
4568   int j;
4569   tree ops[3];
4570   bool nested_cycle = false, found_nested_cycle_def = false;
4571   gimple reduc_def_stmt = NULL;
4572   /* The default is that the reduction variable is the last in statement.  */
4573   int reduc_index = 2;
4574   bool double_reduc = false, dummy;
4575   basic_block def_bb;
4576   struct loop * def_stmt_loop, *outer_loop = NULL;
4577   tree def_arg;
4578   gimple def_arg_stmt;
4579   vec<tree> vec_oprnds0 = vNULL;
4580   vec<tree> vec_oprnds1 = vNULL;
4581   vec<tree> vect_defs = vNULL;
4582   vec<gimple> phis = vNULL;
4583   int vec_num;
4584   tree def0, def1, tem, op0, op1 = NULL_TREE;
4585 
4586   /* In case of reduction chain we switch to the first stmt in the chain, but
4587      we don't update STMT_INFO, since only the last stmt is marked as reduction
4588      and has reduction properties.  */
4589   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4590     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4591 
4592   if (nested_in_vect_loop_p (loop, stmt))
4593     {
4594       outer_loop = loop;
4595       loop = loop->inner;
4596       nested_cycle = true;
4597     }
4598 
4599   /* 1. Is vectorizable reduction?  */
4600   /* Not supportable if the reduction variable is used in the loop, unless
4601      it's a reduction chain.  */
4602   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4603       && !GROUP_FIRST_ELEMENT (stmt_info))
4604     return false;
4605 
4606   /* Reductions that are not used even in an enclosing outer-loop,
4607      are expected to be "live" (used out of the loop).  */
4608   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4609       && !STMT_VINFO_LIVE_P (stmt_info))
4610     return false;
4611 
4612   /* Make sure it was already recognized as a reduction computation.  */
4613   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4614       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4615     return false;
4616 
4617   /* 2. Has this been recognized as a reduction pattern?
4618 
4619      Check if STMT represents a pattern that has been recognized
4620      in earlier analysis stages.  For stmts that represent a pattern,
4621      the STMT_VINFO_RELATED_STMT field records the last stmt in
4622      the original sequence that constitutes the pattern.  */
4623 
4624   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4625   if (orig_stmt)
4626     {
4627       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4628       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4629       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4630     }
4631 
4632   /* 3. Check the operands of the operation.  The first operands are defined
4633         inside the loop body. The last operand is the reduction variable,
4634         which is defined by the loop-header-phi.  */
4635 
4636   gcc_assert (is_gimple_assign (stmt));
4637 
4638   /* Flatten RHS.  */
4639   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4640     {
4641     case GIMPLE_SINGLE_RHS:
4642       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4643       if (op_type == ternary_op)
4644 	{
4645 	  tree rhs = gimple_assign_rhs1 (stmt);
4646 	  ops[0] = TREE_OPERAND (rhs, 0);
4647 	  ops[1] = TREE_OPERAND (rhs, 1);
4648 	  ops[2] = TREE_OPERAND (rhs, 2);
4649 	  code = TREE_CODE (rhs);
4650 	}
4651       else
4652 	return false;
4653       break;
4654 
4655     case GIMPLE_BINARY_RHS:
4656       code = gimple_assign_rhs_code (stmt);
4657       op_type = TREE_CODE_LENGTH (code);
4658       gcc_assert (op_type == binary_op);
4659       ops[0] = gimple_assign_rhs1 (stmt);
4660       ops[1] = gimple_assign_rhs2 (stmt);
4661       break;
4662 
4663     case GIMPLE_TERNARY_RHS:
4664       code = gimple_assign_rhs_code (stmt);
4665       op_type = TREE_CODE_LENGTH (code);
4666       gcc_assert (op_type == ternary_op);
4667       ops[0] = gimple_assign_rhs1 (stmt);
4668       ops[1] = gimple_assign_rhs2 (stmt);
4669       ops[2] = gimple_assign_rhs3 (stmt);
4670       break;
4671 
4672     case GIMPLE_UNARY_RHS:
4673       return false;
4674 
4675     default:
4676       gcc_unreachable ();
4677     }
4678 
4679   if (code == COND_EXPR && slp_node)
4680     return false;
4681 
4682   scalar_dest = gimple_assign_lhs (stmt);
4683   scalar_type = TREE_TYPE (scalar_dest);
4684   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4685       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4686     return false;
4687 
4688   /* Do not try to vectorize bit-precision reductions.  */
4689   if ((TYPE_PRECISION (scalar_type)
4690        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4691     return false;
4692 
4693   /* All uses but the last are expected to be defined in the loop.
4694      The last use is the reduction variable.  In case of nested cycle this
4695      assumption is not true: we use reduc_index to record the index of the
4696      reduction variable.  */
4697   for (i = 0; i < op_type - 1; i++)
4698     {
4699       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4700       if (i == 0 && code == COND_EXPR)
4701         continue;
4702 
4703       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4704 					    &def_stmt, &def, &dt, &tem);
4705       if (!vectype_in)
4706 	vectype_in = tem;
4707       gcc_assert (is_simple_use);
4708 
4709       if (dt != vect_internal_def
4710 	  && dt != vect_external_def
4711 	  && dt != vect_constant_def
4712 	  && dt != vect_induction_def
4713           && !(dt == vect_nested_cycle && nested_cycle))
4714 	return false;
4715 
4716       if (dt == vect_nested_cycle)
4717         {
4718           found_nested_cycle_def = true;
4719           reduc_def_stmt = def_stmt;
4720           reduc_index = i;
4721         }
4722     }
4723 
4724   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4725 					&def_stmt, &def, &dt, &tem);
4726   if (!vectype_in)
4727     vectype_in = tem;
4728   gcc_assert (is_simple_use);
4729   if (!(dt == vect_reduction_def
4730 	|| dt == vect_nested_cycle
4731 	|| ((dt == vect_internal_def || dt == vect_external_def
4732 	     || dt == vect_constant_def || dt == vect_induction_def)
4733 	    && nested_cycle && found_nested_cycle_def)))
4734     {
4735       /* For pattern recognized stmts, orig_stmt might be a reduction,
4736 	 but some helper statements for the pattern might not, or
4737 	 might be COND_EXPRs with reduction uses in the condition.  */
4738       gcc_assert (orig_stmt);
4739       return false;
4740     }
4741   if (!found_nested_cycle_def)
4742     reduc_def_stmt = def_stmt;
4743 
4744   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4745   if (orig_stmt)
4746     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4747                                                        reduc_def_stmt,
4748                                                        !nested_cycle,
4749                                                        &dummy));
4750   else
4751     {
4752       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4753                                              !nested_cycle, &dummy);
4754       /* We changed STMT to be the first stmt in reduction chain, hence we
4755          check that in this case the first element in the chain is STMT.  */
4756       gcc_assert (stmt == tmp
4757                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4758     }
4759 
4760   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4761     return false;
4762 
4763   if (slp_node || PURE_SLP_STMT (stmt_info))
4764     ncopies = 1;
4765   else
4766     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4767                / TYPE_VECTOR_SUBPARTS (vectype_in));
4768 
4769   gcc_assert (ncopies >= 1);
4770 
4771   vec_mode = TYPE_MODE (vectype_in);
4772 
4773   if (code == COND_EXPR)
4774     {
4775       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4776         {
4777           if (dump_enabled_p ())
4778 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4779 			     "unsupported condition in reduction");
4780 
4781             return false;
4782         }
4783     }
4784   else
4785     {
4786       /* 4. Supportable by target?  */
4787 
4788       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4789 	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
4790 	{
4791 	  /* Shifts and rotates are only supported by vectorizable_shifts,
4792 	     not vectorizable_reduction.  */
4793           if (dump_enabled_p ())
4794 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4795 			     "unsupported shift or rotation.");
4796 	  return false;
4797 	}
4798 
4799       /* 4.1. check support for the operation in the loop  */
4800       optab = optab_for_tree_code (code, vectype_in, optab_default);
4801       if (!optab)
4802         {
4803           if (dump_enabled_p ())
4804 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4805 			     "no optab.");
4806 
4807           return false;
4808         }
4809 
4810       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4811         {
4812           if (dump_enabled_p ())
4813             dump_printf (MSG_NOTE, "op not supported by target.");
4814 
4815           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4816               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4817 	          < vect_min_worthwhile_factor (code))
4818             return false;
4819 
4820           if (dump_enabled_p ())
4821   	    dump_printf (MSG_NOTE, "proceeding using word mode.");
4822         }
4823 
4824       /* Worthwhile without SIMD support?  */
4825       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4826           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4827    	     < vect_min_worthwhile_factor (code))
4828         {
4829           if (dump_enabled_p ())
4830 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4831 			     "not worthwhile without SIMD support.");
4832 
4833           return false;
4834         }
4835     }
4836 
4837   /* 4.2. Check support for the epilog operation.
4838 
4839           If STMT represents a reduction pattern, then the type of the
4840           reduction variable may be different than the type of the rest
4841           of the arguments.  For example, consider the case of accumulation
4842           of shorts into an int accumulator; The original code:
4843                         S1: int_a = (int) short_a;
4844           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4845 
4846           was replaced with:
4847                         STMT: int_acc = widen_sum <short_a, int_acc>
4848 
4849           This means that:
4850           1. The tree-code that is used to create the vector operation in the
4851              epilog code (that reduces the partial results) is not the
4852              tree-code of STMT, but is rather the tree-code of the original
4853              stmt from the pattern that STMT is replacing.  I.e, in the example
4854              above we want to use 'widen_sum' in the loop, but 'plus' in the
4855              epilog.
4856           2. The type (mode) we use to check available target support
4857              for the vector operation to be created in the *epilog*, is
4858              determined by the type of the reduction variable (in the example
4859              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4860              However the type (mode) we use to check available target support
4861              for the vector operation to be created *inside the loop*, is
4862              determined by the type of the other arguments to STMT (in the
4863              example we'd check this: optab_handler (widen_sum_optab,
4864 	     vect_short_mode)).
4865 
4866           This is contrary to "regular" reductions, in which the types of all
4867           the arguments are the same as the type of the reduction variable.
4868           For "regular" reductions we can therefore use the same vector type
4869           (and also the same tree-code) when generating the epilog code and
4870           when generating the code inside the loop.  */
4871 
4872   if (orig_stmt)
4873     {
4874       /* This is a reduction pattern: get the vectype from the type of the
4875          reduction variable, and get the tree-code from orig_stmt.  */
4876       orig_code = gimple_assign_rhs_code (orig_stmt);
4877       gcc_assert (vectype_out);
4878       vec_mode = TYPE_MODE (vectype_out);
4879     }
4880   else
4881     {
4882       /* Regular reduction: the same vectype and tree-code as used for
4883          the vector code inside the loop can be used for the epilog code. */
4884       orig_code = code;
4885     }
4886 
4887   if (nested_cycle)
4888     {
4889       def_bb = gimple_bb (reduc_def_stmt);
4890       def_stmt_loop = def_bb->loop_father;
4891       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4892                                        loop_preheader_edge (def_stmt_loop));
4893       if (TREE_CODE (def_arg) == SSA_NAME
4894           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4895           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4896           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4897           && vinfo_for_stmt (def_arg_stmt)
4898           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4899               == vect_double_reduction_def)
4900         double_reduc = true;
4901     }
4902 
4903   epilog_reduc_code = ERROR_MARK;
4904   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4905     {
4906       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4907                                          optab_default);
4908       if (!reduc_optab)
4909         {
4910           if (dump_enabled_p ())
4911 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4912 			     "no optab for reduction.");
4913 
4914           epilog_reduc_code = ERROR_MARK;
4915         }
4916 
4917       if (reduc_optab
4918           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4919         {
4920           if (dump_enabled_p ())
4921 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4922 			     "reduc op not supported by target.");
4923 
4924           epilog_reduc_code = ERROR_MARK;
4925         }
4926     }
4927   else
4928     {
4929       if (!nested_cycle || double_reduc)
4930         {
4931           if (dump_enabled_p ())
4932 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4933 			     "no reduc code for scalar code.");
4934 
4935           return false;
4936         }
4937     }
4938 
4939   if (double_reduc && ncopies > 1)
4940     {
4941       if (dump_enabled_p ())
4942 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4943 			 "multiple types in double reduction");
4944 
4945       return false;
4946     }
4947 
4948   /* In case of widening multiplication by a constant, we update the type
4949      of the constant to be the type of the other operand.  We check that the
4950      constant fits the type in the pattern recognition pass.  */
4951   if (code == DOT_PROD_EXPR
4952       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4953     {
4954       if (TREE_CODE (ops[0]) == INTEGER_CST)
4955         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4956       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4957         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4958       else
4959         {
4960           if (dump_enabled_p ())
4961 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4962 			     "invalid types in dot-prod");
4963 
4964           return false;
4965         }
4966     }
4967 
4968   if (!vec_stmt) /* transformation not required.  */
4969     {
4970       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4971         return false;
4972       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4973       return true;
4974     }
4975 
4976   /** Transform.  **/
4977 
4978   if (dump_enabled_p ())
4979     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4980 
4981   /* FORNOW: Multiple types are not supported for condition.  */
4982   if (code == COND_EXPR)
4983     gcc_assert (ncopies == 1);
4984 
4985   /* Create the destination vector  */
4986   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4987 
4988   /* In case the vectorization factor (VF) is bigger than the number
4989      of elements that we can fit in a vectype (nunits), we have to generate
4990      more than one vector stmt - i.e - we need to "unroll" the
4991      vector stmt by a factor VF/nunits.  For more details see documentation
4992      in vectorizable_operation.  */
4993 
4994   /* If the reduction is used in an outer loop we need to generate
4995      VF intermediate results, like so (e.g. for ncopies=2):
4996 	r0 = phi (init, r0)
4997 	r1 = phi (init, r1)
4998 	r0 = x0 + r0;
4999         r1 = x1 + r1;
5000     (i.e. we generate VF results in 2 registers).
5001     In this case we have a separate def-use cycle for each copy, and therefore
5002     for each copy we get the vector def for the reduction variable from the
5003     respective phi node created for this copy.
5004 
5005     Otherwise (the reduction is unused in the loop nest), we can combine
5006     together intermediate results, like so (e.g. for ncopies=2):
5007 	r = phi (init, r)
5008 	r = x0 + r;
5009 	r = x1 + r;
5010    (i.e. we generate VF/2 results in a single register).
5011    In this case for each copy we get the vector def for the reduction variable
5012    from the vectorized reduction operation generated in the previous iteration.
5013   */
5014 
5015   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5016     {
5017       single_defuse_cycle = true;
5018       epilog_copies = 1;
5019     }
5020   else
5021     epilog_copies = ncopies;
5022 
5023   prev_stmt_info = NULL;
5024   prev_phi_info = NULL;
5025   if (slp_node)
5026     {
5027       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5028       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5029                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5030     }
5031   else
5032     {
5033       vec_num = 1;
5034       vec_oprnds0.create (1);
5035       if (op_type == ternary_op)
5036         vec_oprnds1.create (1);
5037     }
5038 
5039   phis.create (vec_num);
5040   vect_defs.create (vec_num);
5041   if (!slp_node)
5042     vect_defs.quick_push (NULL_TREE);
5043 
5044   for (j = 0; j < ncopies; j++)
5045     {
5046       if (j == 0 || !single_defuse_cycle)
5047 	{
5048           for (i = 0; i < vec_num; i++)
5049             {
5050               /* Create the reduction-phi that defines the reduction
5051                  operand.  */
5052               new_phi = create_phi_node (vec_dest, loop->header);
5053               set_vinfo_for_stmt (new_phi,
5054                                   new_stmt_vec_info (new_phi, loop_vinfo,
5055                                                      NULL));
5056                if (j == 0 || slp_node)
5057                  phis.quick_push (new_phi);
5058             }
5059         }
5060 
5061       if (code == COND_EXPR)
5062         {
5063           gcc_assert (!slp_node);
5064           vectorizable_condition (stmt, gsi, vec_stmt,
5065                                   PHI_RESULT (phis[0]),
5066                                   reduc_index, NULL);
5067           /* Multiple types are not supported for condition.  */
5068           break;
5069         }
5070 
5071       /* Handle uses.  */
5072       if (j == 0)
5073         {
5074           op0 = ops[!reduc_index];
5075           if (op_type == ternary_op)
5076             {
5077               if (reduc_index == 0)
5078                 op1 = ops[2];
5079               else
5080                 op1 = ops[1];
5081             }
5082 
5083           if (slp_node)
5084             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5085                                slp_node, -1);
5086           else
5087             {
5088               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5089                                                             stmt, NULL);
5090               vec_oprnds0.quick_push (loop_vec_def0);
5091               if (op_type == ternary_op)
5092                {
5093                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5094                                                                NULL);
5095                  vec_oprnds1.quick_push (loop_vec_def1);
5096                }
5097             }
5098         }
5099       else
5100         {
5101           if (!slp_node)
5102             {
5103               enum vect_def_type dt;
5104               gimple dummy_stmt;
5105               tree dummy;
5106 
5107               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5108                                   &dummy_stmt, &dummy, &dt);
5109               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5110                                                               loop_vec_def0);
5111               vec_oprnds0[0] = loop_vec_def0;
5112               if (op_type == ternary_op)
5113                 {
5114                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5115                                       &dummy, &dt);
5116                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5117                                                                 loop_vec_def1);
5118                   vec_oprnds1[0] = loop_vec_def1;
5119                 }
5120             }
5121 
5122           if (single_defuse_cycle)
5123             reduc_def = gimple_assign_lhs (new_stmt);
5124 
5125           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5126         }
5127 
5128       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5129         {
5130           if (slp_node)
5131             reduc_def = PHI_RESULT (phis[i]);
5132           else
5133             {
5134               if (!single_defuse_cycle || j == 0)
5135                 reduc_def = PHI_RESULT (new_phi);
5136             }
5137 
5138           def1 = ((op_type == ternary_op)
5139                   ? vec_oprnds1[i] : NULL);
5140           if (op_type == binary_op)
5141             {
5142               if (reduc_index == 0)
5143                 expr = build2 (code, vectype_out, reduc_def, def0);
5144               else
5145                 expr = build2 (code, vectype_out, def0, reduc_def);
5146             }
5147           else
5148             {
5149               if (reduc_index == 0)
5150                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5151               else
5152                 {
5153                   if (reduc_index == 1)
5154                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5155                   else
5156                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5157                 }
5158             }
5159 
5160           new_stmt = gimple_build_assign (vec_dest, expr);
5161           new_temp = make_ssa_name (vec_dest, new_stmt);
5162           gimple_assign_set_lhs (new_stmt, new_temp);
5163           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5164 
5165           if (slp_node)
5166             {
5167               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5168               vect_defs.quick_push (new_temp);
5169             }
5170           else
5171             vect_defs[0] = new_temp;
5172         }
5173 
5174       if (slp_node)
5175         continue;
5176 
5177       if (j == 0)
5178 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5179       else
5180 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5181 
5182       prev_stmt_info = vinfo_for_stmt (new_stmt);
5183       prev_phi_info = vinfo_for_stmt (new_phi);
5184     }
5185 
5186   /* Finalize the reduction-phi (set its arguments) and create the
5187      epilog reduction code.  */
5188   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5189     {
5190       new_temp = gimple_assign_lhs (*vec_stmt);
5191       vect_defs[0] = new_temp;
5192     }
5193 
5194   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5195                                     epilog_reduc_code, phis, reduc_index,
5196                                     double_reduc, slp_node);
5197 
5198   phis.release ();
5199   vect_defs.release ();
5200   vec_oprnds0.release ();
5201   vec_oprnds1.release ();
5202 
5203   return true;
5204 }
5205 
5206 /* Function vect_min_worthwhile_factor.
5207 
5208    For a loop where we could vectorize the operation indicated by CODE,
5209    return the minimum vectorization factor that makes it worthwhile
5210    to use generic vectors.  */
5211 int
vect_min_worthwhile_factor(enum tree_code code)5212 vect_min_worthwhile_factor (enum tree_code code)
5213 {
5214   switch (code)
5215     {
5216     case PLUS_EXPR:
5217     case MINUS_EXPR:
5218     case NEGATE_EXPR:
5219       return 4;
5220 
5221     case BIT_AND_EXPR:
5222     case BIT_IOR_EXPR:
5223     case BIT_XOR_EXPR:
5224     case BIT_NOT_EXPR:
5225       return 2;
5226 
5227     default:
5228       return INT_MAX;
5229     }
5230 }
5231 
5232 
5233 /* Function vectorizable_induction
5234 
5235    Check if PHI performs an induction computation that can be vectorized.
5236    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5237    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5238    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5239 
5240 bool
vectorizable_induction(gimple phi,gimple_stmt_iterator * gsi ATTRIBUTE_UNUSED,gimple * vec_stmt)5241 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5242 			gimple *vec_stmt)
5243 {
5244   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5245   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5246   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5247   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5248   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5249   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5250   tree vec_def;
5251 
5252   gcc_assert (ncopies >= 1);
5253   /* FORNOW. These restrictions should be relaxed.  */
5254   if (nested_in_vect_loop_p (loop, phi))
5255     {
5256       imm_use_iterator imm_iter;
5257       use_operand_p use_p;
5258       gimple exit_phi;
5259       edge latch_e;
5260       tree loop_arg;
5261 
5262       if (ncopies > 1)
5263 	{
5264 	  if (dump_enabled_p ())
5265 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5266 			     "multiple types in nested loop.");
5267 	  return false;
5268 	}
5269 
5270       exit_phi = NULL;
5271       latch_e = loop_latch_edge (loop->inner);
5272       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5273       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5274 	{
5275 	  if (!flow_bb_inside_loop_p (loop->inner,
5276 				      gimple_bb (USE_STMT (use_p))))
5277 	    {
5278 	      exit_phi = USE_STMT (use_p);
5279 	      break;
5280 	    }
5281 	}
5282       if (exit_phi)
5283 	{
5284 	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5285 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5286 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5287 	    {
5288 	      if (dump_enabled_p ())
5289 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5290 				 "inner-loop induction only used outside "
5291 				 "of the outer vectorized loop.");
5292 	      return false;
5293 	    }
5294 	}
5295     }
5296 
5297   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5298     return false;
5299 
5300   /* FORNOW: SLP not supported.  */
5301   if (STMT_SLP_TYPE (stmt_info))
5302     return false;
5303 
5304   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5305 
5306   if (gimple_code (phi) != GIMPLE_PHI)
5307     return false;
5308 
5309   if (!vec_stmt) /* transformation not required.  */
5310     {
5311       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5312       if (dump_enabled_p ())
5313         dump_printf_loc (MSG_NOTE, vect_location,
5314                          "=== vectorizable_induction ===");
5315       vect_model_induction_cost (stmt_info, ncopies);
5316       return true;
5317     }
5318 
5319   /** Transform.  **/
5320 
5321   if (dump_enabled_p ())
5322     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5323 
5324   vec_def = get_initial_def_for_induction (phi);
5325   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5326   return true;
5327 }
5328 
5329 /* Function vectorizable_live_operation.
5330 
5331    STMT computes a value that is used outside the loop.  Check if
5332    it can be supported.  */
5333 
5334 bool
vectorizable_live_operation(gimple stmt,gimple_stmt_iterator * gsi ATTRIBUTE_UNUSED,gimple * vec_stmt ATTRIBUTE_UNUSED)5335 vectorizable_live_operation (gimple stmt,
5336 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5337 			     gimple *vec_stmt ATTRIBUTE_UNUSED)
5338 {
5339   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5340   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5341   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5342   int i;
5343   int op_type;
5344   tree op;
5345   tree def;
5346   gimple def_stmt;
5347   enum vect_def_type dt;
5348   enum tree_code code;
5349   enum gimple_rhs_class rhs_class;
5350 
5351   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5352 
5353   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5354     return false;
5355 
5356   if (!is_gimple_assign (stmt))
5357     return false;
5358 
5359   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5360     return false;
5361 
5362   /* FORNOW. CHECKME. */
5363   if (nested_in_vect_loop_p (loop, stmt))
5364     return false;
5365 
5366   code = gimple_assign_rhs_code (stmt);
5367   op_type = TREE_CODE_LENGTH (code);
5368   rhs_class = get_gimple_rhs_class (code);
5369   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5370   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5371 
5372   /* FORNOW: support only if all uses are invariant.  This means
5373      that the scalar operations can remain in place, unvectorized.
5374      The original last scalar value that they compute will be used.  */
5375 
5376   for (i = 0; i < op_type; i++)
5377     {
5378       if (rhs_class == GIMPLE_SINGLE_RHS)
5379 	op = TREE_OPERAND (gimple_op (stmt, 1), i);
5380       else
5381 	op = gimple_op (stmt, i + 1);
5382       if (op
5383           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5384 				  &dt))
5385         {
5386           if (dump_enabled_p ())
5387 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5388 			     "use not simple.");
5389           return false;
5390         }
5391 
5392       if (dt != vect_external_def && dt != vect_constant_def)
5393         return false;
5394     }
5395 
5396   /* No transformation is required for the cases we currently support.  */
5397   return true;
5398 }
5399 
5400 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5401 
5402 static void
vect_loop_kill_debug_uses(struct loop * loop,gimple stmt)5403 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5404 {
5405   ssa_op_iter op_iter;
5406   imm_use_iterator imm_iter;
5407   def_operand_p def_p;
5408   gimple ustmt;
5409 
5410   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5411     {
5412       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5413 	{
5414 	  basic_block bb;
5415 
5416 	  if (!is_gimple_debug (ustmt))
5417 	    continue;
5418 
5419 	  bb = gimple_bb (ustmt);
5420 
5421 	  if (!flow_bb_inside_loop_p (loop, bb))
5422 	    {
5423 	      if (gimple_debug_bind_p (ustmt))
5424 		{
5425 		  if (dump_enabled_p ())
5426 		    dump_printf_loc (MSG_NOTE, vect_location,
5427                                      "killing debug use");
5428 
5429 		  gimple_debug_bind_reset_value (ustmt);
5430 		  update_stmt (ustmt);
5431 		}
5432 	      else
5433 		gcc_unreachable ();
5434 	    }
5435 	}
5436     }
5437 }
5438 
5439 /* Function vect_transform_loop.
5440 
5441    The analysis phase has determined that the loop is vectorizable.
5442    Vectorize the loop - create vectorized stmts to replace the scalar
5443    stmts in the loop, and update the loop exit condition.  */
5444 
5445 void
vect_transform_loop(loop_vec_info loop_vinfo)5446 vect_transform_loop (loop_vec_info loop_vinfo)
5447 {
5448   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5449   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5450   int nbbs = loop->num_nodes;
5451   gimple_stmt_iterator si;
5452   int i;
5453   tree ratio = NULL;
5454   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5455   bool grouped_store;
5456   bool slp_scheduled = false;
5457   unsigned int nunits;
5458   gimple stmt, pattern_stmt;
5459   gimple_seq pattern_def_seq = NULL;
5460   gimple_stmt_iterator pattern_def_si = gsi_none ();
5461   bool transform_pattern_stmt = false;
5462   bool check_profitability = false;
5463   int th;
5464   /* Record number of iterations before we started tampering with the profile. */
5465   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5466 
5467   if (dump_enabled_p ())
5468     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5469 
5470   /* If profile is imprecise, we have a chance to fix it up.  */
5471   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5472     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5473 
5474   /* Use the more conservative vectorization threshold.  If the number
5475      of iterations is constant assume the cost check has been performed
5476      by our caller.  If the threshold makes all loops profitable that
5477      run at least the vectorization factor number of times checking
5478      is pointless, too.  */
5479   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5480 	 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5481   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5482   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5483       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5484     {
5485       if (dump_enabled_p ())
5486 	dump_printf_loc (MSG_NOTE, vect_location,
5487 			 "Profitability threshold is %d loop iterations.", th);
5488       check_profitability = true;
5489     }
5490 
5491   /* Peel the loop if there are data refs with unknown alignment.
5492      Only one data ref with unknown store is allowed.  */
5493 
5494   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5495     {
5496       vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5497       check_profitability = false;
5498     }
5499 
5500   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5501       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5502     {
5503       vect_loop_versioning (loop_vinfo, th, check_profitability);
5504       check_profitability = false;
5505     }
5506 
5507   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5508      compile time constant), or it is a constant that doesn't divide by the
5509      vectorization factor, then an epilog loop needs to be created.
5510      We therefore duplicate the loop: the original loop will be vectorized,
5511      and will compute the first (n/VF) iterations.  The second copy of the loop
5512      will remain scalar and will compute the remaining (n%VF) iterations.
5513      (VF is the vectorization factor).  */
5514 
5515   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5516        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5517 	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5518        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5519     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5520 				    th, check_profitability);
5521   else
5522     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5523 		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5524 
5525   /* 1) Make sure the loop header has exactly two entries
5526      2) Make sure we have a preheader basic block.  */
5527 
5528   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5529 
5530   split_edge (loop_preheader_edge (loop));
5531 
5532   /* FORNOW: the vectorizer supports only loops which body consist
5533      of one basic block (header + empty latch). When the vectorizer will
5534      support more involved loop forms, the order by which the BBs are
5535      traversed need to be reconsidered.  */
5536 
5537   for (i = 0; i < nbbs; i++)
5538     {
5539       basic_block bb = bbs[i];
5540       stmt_vec_info stmt_info;
5541       gimple phi;
5542 
5543       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5544         {
5545 	  phi = gsi_stmt (si);
5546 	  if (dump_enabled_p ())
5547 	    {
5548 	      dump_printf_loc (MSG_NOTE, vect_location,
5549                                "------>vectorizing phi: ");
5550 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5551 	    }
5552 	  stmt_info = vinfo_for_stmt (phi);
5553 	  if (!stmt_info)
5554 	    continue;
5555 
5556 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5557 	    vect_loop_kill_debug_uses (loop, phi);
5558 
5559 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
5560 	      && !STMT_VINFO_LIVE_P (stmt_info))
5561 	    continue;
5562 
5563 	  if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5564 	        != (unsigned HOST_WIDE_INT) vectorization_factor)
5565 	      && dump_enabled_p ())
5566 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5567 
5568 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5569 	    {
5570 	      if (dump_enabled_p ())
5571 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5572 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5573 	    }
5574 	}
5575 
5576       pattern_stmt = NULL;
5577       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5578 	{
5579 	  bool is_store;
5580 
5581           if (transform_pattern_stmt)
5582 	    stmt = pattern_stmt;
5583           else
5584             stmt = gsi_stmt (si);
5585 
5586 	  if (dump_enabled_p ())
5587 	    {
5588 	      dump_printf_loc (MSG_NOTE, vect_location,
5589 			       "------>vectorizing statement: ");
5590 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5591 	    }
5592 
5593 	  stmt_info = vinfo_for_stmt (stmt);
5594 
5595 	  /* vector stmts created in the outer-loop during vectorization of
5596 	     stmts in an inner-loop may not have a stmt_info, and do not
5597 	     need to be vectorized.  */
5598 	  if (!stmt_info)
5599 	    {
5600 	      gsi_next (&si);
5601 	      continue;
5602 	    }
5603 
5604 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5605 	    vect_loop_kill_debug_uses (loop, stmt);
5606 
5607 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
5608 	      && !STMT_VINFO_LIVE_P (stmt_info))
5609             {
5610               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5611                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5612                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5613                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5614                 {
5615                   stmt = pattern_stmt;
5616                   stmt_info = vinfo_for_stmt (stmt);
5617                 }
5618               else
5619 	        {
5620    	          gsi_next (&si);
5621 	          continue;
5622                 }
5623 	    }
5624           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5625                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5626                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5627                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5628             transform_pattern_stmt = true;
5629 
5630 	  /* If pattern statement has def stmts, vectorize them too.  */
5631 	  if (is_pattern_stmt_p (stmt_info))
5632 	    {
5633 	      if (pattern_def_seq == NULL)
5634 		{
5635 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5636 		  pattern_def_si = gsi_start (pattern_def_seq);
5637 		}
5638 	      else if (!gsi_end_p (pattern_def_si))
5639 		gsi_next (&pattern_def_si);
5640 	      if (pattern_def_seq != NULL)
5641 		{
5642 		  gimple pattern_def_stmt = NULL;
5643 		  stmt_vec_info pattern_def_stmt_info = NULL;
5644 
5645 		  while (!gsi_end_p (pattern_def_si))
5646 		    {
5647 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
5648 		      pattern_def_stmt_info
5649 			= vinfo_for_stmt (pattern_def_stmt);
5650 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5651 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5652 			break;
5653 		      gsi_next (&pattern_def_si);
5654 		    }
5655 
5656 		  if (!gsi_end_p (pattern_def_si))
5657 		    {
5658 		      if (dump_enabled_p ())
5659 			{
5660 			  dump_printf_loc (MSG_NOTE, vect_location,
5661 					   "==> vectorizing pattern def "
5662 					   "stmt: ");
5663 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5664 					    pattern_def_stmt, 0);
5665 			}
5666 
5667 		      stmt = pattern_def_stmt;
5668 		      stmt_info = pattern_def_stmt_info;
5669 		    }
5670 		  else
5671 		    {
5672 		      pattern_def_si = gsi_none ();
5673 		      transform_pattern_stmt = false;
5674 		    }
5675 		}
5676 	      else
5677 		transform_pattern_stmt = false;
5678             }
5679 
5680 	  gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5681 	  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5682                                                STMT_VINFO_VECTYPE (stmt_info));
5683 	  if (!STMT_SLP_TYPE (stmt_info)
5684 	      && nunits != (unsigned int) vectorization_factor
5685               && dump_enabled_p ())
5686 	    /* For SLP VF is set according to unrolling factor, and not to
5687 	       vector size, hence for SLP this print is not valid.  */
5688             dump_printf_loc (MSG_NOTE, vect_location,
5689 			     "multiple-types.");
5690 
5691 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
5692 	     reached.  */
5693 	  if (STMT_SLP_TYPE (stmt_info))
5694 	    {
5695 	      if (!slp_scheduled)
5696 		{
5697 		  slp_scheduled = true;
5698 
5699 		  if (dump_enabled_p ())
5700 		    dump_printf_loc (MSG_NOTE, vect_location,
5701 				     "=== scheduling SLP instances ===");
5702 
5703 		  vect_schedule_slp (loop_vinfo, NULL);
5704 		}
5705 
5706 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5707 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5708 		{
5709 		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5710 		    {
5711 		      pattern_def_seq = NULL;
5712 		      gsi_next (&si);
5713 		    }
5714 		  continue;
5715 		}
5716 	    }
5717 
5718 	  /* -------- vectorize statement ------------ */
5719 	  if (dump_enabled_p ())
5720 	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5721 
5722 	  grouped_store = false;
5723 	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5724           if (is_store)
5725             {
5726 	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5727 		{
5728 		  /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5729 		     interleaving chain was completed - free all the stores in
5730 		     the chain.  */
5731 		  gsi_next (&si);
5732 		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5733  		  continue;
5734 		}
5735 	      else
5736 		{
5737 		  /* Free the attached stmt_vec_info and remove the stmt.  */
5738 		  gimple store = gsi_stmt (si);
5739 		  free_stmt_vec_info (store);
5740 		  unlink_stmt_vdef (store);
5741 		  gsi_remove (&si, true);
5742 		  release_defs (store);
5743 		  continue;
5744 		}
5745 	    }
5746 
5747 	  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5748 	    {
5749 	      pattern_def_seq = NULL;
5750 	      gsi_next (&si);
5751 	    }
5752 	}		        /* stmts in BB */
5753     }				/* BBs in loop */
5754 
5755   slpeel_make_loop_iterate_ntimes (loop, ratio);
5756 
5757   /* Reduce loop iterations by the vectorization factor.  */
5758   scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
5759 		      expected_iterations / vectorization_factor);
5760   loop->nb_iterations_upper_bound
5761     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5762 					    FLOOR_DIV_EXPR);
5763   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5764       && loop->nb_iterations_upper_bound != double_int_zero)
5765     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5766   if (loop->any_estimate)
5767     {
5768       loop->nb_iterations_estimate
5769         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5770 					     FLOOR_DIV_EXPR);
5771        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5772 	   && loop->nb_iterations_estimate != double_int_zero)
5773 	 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5774     }
5775 
5776   /* The memory tags and pointers in vectorized statements need to
5777      have their SSA forms updated.  FIXME, why can't this be delayed
5778      until all the loops have been transformed?  */
5779   update_ssa (TODO_update_ssa);
5780 
5781   if (dump_enabled_p ())
5782     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5783   if (loop->inner && dump_enabled_p ())
5784     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5785 		     "OUTER LOOP VECTORIZED.");
5786 }
5787