1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 #define vec_step vec_step_
59
60 /* Loop Vectorization Pass.
61
62 This pass tries to vectorize loops.
63
64 For example, the vectorizer transforms the following simple loop:
65
66 short a[N]; short b[N]; short c[N]; int i;
67
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
70 }
71
72 as if it was manually vectorized by rewriting the source code into:
73
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
78
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
84 }
85
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
97
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
103
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
108
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of following
117 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
119
120 For example, say stmt S1 was vectorized into stmt VS1:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
125
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
130
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135
Operands that are not SSA_NAMEs are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
138
139 Target modeling:
140 =================
141 Currently the only target specific information that is used is the
142 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
Targets that can support different vector sizes will, for now, need
to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
145 flexibility will be added in the future.
146
Since we only vectorize operations whose vector form can be
148 expressed using existing tree codes, to verify that an operation is
149 supported, the vectorizer checks the relevant optab at the relevant
machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
153
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
157
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
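/* For example, a comparison such as

     mask_1 = a_2 < b_3;

   produces a boolean result whose vector (mask) type depends on the
   vector types chosen for its operands; such statements are therefore
   queued in MASK_PRODUCERS and only get their vectype once the
   vectorization factor is known (see vect_determine_vectorization_factor).  */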
163
164 static opt_result
vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf,
168 vec<stmt_vec_info > *mask_producers)
169 {
170 gimple *stmt = stmt_info->stmt;
171
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
175 {
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
179 }
180
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
/* The only case when a vectype had already been set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else if (stmt_vectype == boolean_type_node)
197 mask_producers->safe_push (stmt_info);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. If some of the statements
211 produce a mask result whose vector type can only be calculated later,
212 add them to MASK_PRODUCERS. Return true on success or false if
213 something prevented vectorization. */
214
215 static opt_result
vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
217 vec<stmt_vec_info > *mask_producers)
218 {
219 vec_info *vinfo = stmt_info->vinfo;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res
224 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
225 if (!res)
226 return res;
227
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229 && STMT_VINFO_RELATED_STMT (stmt_info))
230 {
231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 !gsi_end_p (si); gsi_next (&si))
237 {
238 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE, vect_location,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info->stmt);
res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
vf, mask_producers);
247 if (!res)
248 return res;
249 }
250
251 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location,
253 "==> examining pattern statement: %G",
254 stmt_info->stmt);
255 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
256 if (!res)
257 return res;
258 }
259
260 return opt_result::success ();
261 }
262
263 /* Function vect_determine_vectorization_factor
264
265 Determine the vectorization factor (VF). VF is the number of data elements
266 that are operated upon in parallel in a single iteration of the vectorized
loop.  For example, when vectorizing a loop that operates on 4-byte elements,
on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
269 elements can fit in a single vector register.
270
271 We currently support vectorization of loops in which all types operated upon
272 are of the same size. Therefore this function currently sets VF according to
273 the size of the types operated upon, and fails if there are multiple sizes
274 in the loop.
275
276 VF is also the factor by which the loop iterations are strip-mined, e.g.:
277 original loop:
278 for (i=0; i<N; i++){
279 a[i] = b[i] + c[i];
280 }
281
282 vectorized loop:
283 for (i=0; i<N; i+=VF){
284 a[i:VF] = b[i:VF] + c[i:VF];
285 }
286 */
287
288 static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
290 {
291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
292 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
293 unsigned nbbs = loop->num_nodes;
294 poly_uint64 vectorization_factor = 1;
295 tree scalar_type = NULL_TREE;
296 gphi *phi;
297 tree vectype;
298 stmt_vec_info stmt_info;
299 unsigned i;
300 auto_vec<stmt_vec_info> mask_producers;
301
302 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
303
304 for (i = 0; i < nbbs; i++)
305 {
306 basic_block bb = bbs[i];
307
308 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
309 gsi_next (&si))
310 {
311 phi = si.phi ();
312 stmt_info = loop_vinfo->lookup_stmt (phi);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
315 phi);
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: %T\n",
328 scalar_type);
329
330 vectype = get_vectype_for_scalar_type (scalar_type);
331 if (!vectype)
332 return opt_result::failure_at (phi,
333 "not vectorized: unsupported "
334 "data-type %T\n",
335 scalar_type);
336 STMT_VINFO_VECTYPE (stmt_info) = vectype;
337
338 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
340 vectype);
341
342 if (dump_enabled_p ())
343 {
344 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
345 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
346 dump_printf (MSG_NOTE, "\n");
347 }
348
349 vect_update_max_nunits (&vectorization_factor, vectype);
350 }
351 }
352
353 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
354 gsi_next (&si))
355 {
356 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
357 opt_result res
358 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
359 &mask_producers);
360 if (!res)
361 return res;
362 }
363 }
364
365 /* TODO: Analyze cost. Decide if worth while to vectorize. */
366 if (dump_enabled_p ())
367 {
368 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
369 dump_dec (MSG_NOTE, vectorization_factor);
370 dump_printf (MSG_NOTE, "\n");
371 }
372
373 if (known_le (vectorization_factor, 1U))
374 return opt_result::failure_at (vect_location,
375 "not vectorized: unsupported data-type\n");
376 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
377
378 for (i = 0; i < mask_producers.length (); i++)
379 {
380 stmt_info = mask_producers[i];
381 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
382 if (!mask_type)
383 return opt_result::propagate_failure (mask_type);
384 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 }
386
387 return opt_result::success ();
388 }
389
390
391 /* Function vect_is_simple_iv_evolution.
392
FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution. */
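/* For example, for the access function {0, +, 4}_1 (a variable that
   starts at 0 and is incremented by 4 in each iteration of loop 1),
   *INIT is set to 0 and *STEP to 4.  An access function such as
   {0, +, {1, +, 2}_1}_1, whose step in loop 1 is itself a chrec, is a
   polynomial of degree >= 2 and is rejected as not "simple".  */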
395
396 static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
399 {
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 basic_block bb;
404
405 /* When there is no evolution in this loop, the evolution function
406 is not "simple". */
407 if (evolution_part == NULL_TREE)
408 return false;
409
410 /* When the evolution is a polynomial of degree >= 2
411 the evolution function is not "simple". */
412 if (tree_is_chrec (evolution_part))
413 return false;
414
415 step_expr = evolution_part;
416 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
417
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
420 step_expr, init_expr);
421
422 *init = init_expr;
423 *step = step_expr;
424
425 if (TREE_CODE (step_expr) != INTEGER_CST
426 && (TREE_CODE (step_expr) != SSA_NAME
427 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
428 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
429 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
430 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
431 || !flag_associative_math)))
432 && (TREE_CODE (step_expr) != REAL_CST
433 || !flag_associative_math))
434 {
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "step unknown.\n");
438 return false;
439 }
440
441 return true;
442 }
443
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445 what we are assuming is a double reduction. For example, given
446 a structure like this:
447
448 outer1:
449 x_1 = PHI <x_4(outer2), ...>;
450 ...
451
452 inner:
453 x_2 = PHI <x_1(outer1), ...>;
454 ...
455 x_3 = ...;
456 ...
457
458 outer2:
459 x_4 = PHI <x_3(inner)>;
460 ...
461
462 outer loop analysis would treat x_1 as a double reduction phi and
463 this function would then return true for x_2. */
464
465 static bool
vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
467 {
468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
469 use_operand_p use_p;
470 ssa_op_iter op_iter;
471 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
472 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
473 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
474 return true;
475 return false;
476 }
477
478 /* Function vect_analyze_scalar_cycles_1.
479
480 Examine the cross iteration def-use cycles of scalar variables
481 in LOOP. LOOP_VINFO represents the loop that is now being
482 considered for vectorization (can be LOOP, or an outer-loop
483 enclosing LOOP). */
484
485 static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
487 {
488 basic_block bb = loop->header;
489 tree init, step;
490 auto_vec<stmt_vec_info, 64> worklist;
491 gphi_iterator gsi;
492 bool double_reduc;
493
494 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
495
496 /* First - identify all inductions. Reduction detection assumes that all the
497 inductions have been identified, therefore, this order must not be
498 changed. */
499 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
500 {
501 gphi *phi = gsi.phi ();
502 tree access_fn = NULL;
503 tree def = PHI_RESULT (phi);
504 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
505
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
508
509 /* Skip virtual phi's. The data dependences that are associated with
510 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
511 if (virtual_operand_p (def))
512 continue;
513
514 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
515
516 /* Analyze the evolution function. */
517 access_fn = analyze_scalar_evolution (loop, def);
518 if (access_fn)
519 {
520 STRIP_NOPS (access_fn);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "Access function of PHI: %T\n", access_fn);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
525 = initial_condition_in_loop_num (access_fn, loop->num);
526 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
527 = evolution_part_in_loop_num (access_fn, loop->num);
528 }
529
530 if (!access_fn
531 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
532 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
533 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
534 && TREE_CODE (step) != INTEGER_CST))
535 {
536 worklist.safe_push (stmt_vinfo);
537 continue;
538 }
539
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
541 != NULL_TREE);
542 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
543
544 if (dump_enabled_p ())
545 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
546 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
547 }
548
549
550 /* Second - identify all reductions and nested cycles. */
551 while (worklist.length () > 0)
552 {
553 stmt_vec_info stmt_vinfo = worklist.pop ();
554 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
555 tree def = PHI_RESULT (phi);
556
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
559
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
562
563 stmt_vec_info reduc_stmt_info
564 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
565 &double_reduc, false);
566 if (reduc_stmt_info)
567 {
568 if (double_reduc)
569 {
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location,
572 "Detected double reduction.\n");
573
574 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
575 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
576 = vect_double_reduction_def;
577 }
578 else
579 {
580 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 {
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location,
584 "Detected vectorizable nested cycle.\n");
585
586 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
587 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
588 }
589 else
590 {
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Detected reduction.\n");
594
595 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
597 /* Store the reduction cycles for possible vectorization in
598 loop-aware SLP if it was not detected as reduction
599 chain. */
600 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
601 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
602 (reduc_stmt_info);
603 }
604 }
605 }
606 else
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 "Unknown def-use cycle pattern.\n");
610 }
611 }
612
613
614 /* Function vect_analyze_scalar_cycles.
615
616 Examine the cross iteration def-use cycles of scalar variables, by
617 analyzing the loop-header PHIs of scalar variables. Classify each
618 cycle as one of the following: invariant, induction, reduction, unknown.
We do that for the loop represented by LOOP_VINFO, and also for its
inner-loop, if it exists.
621 Examples for scalar cycles:
622
623 Example1: reduction:
624
625 loop1:
626 for (i=0; i<N; i++)
627 sum += a[i];
628
629 Example2: induction:
630
631 loop2:
632 for (i=0; i<N; i++)
633 a[i] = i; */
634
635 static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
637 {
638 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
639
640 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
641
642 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
Reductions in such an inner-loop therefore have different properties than
644 the reductions in the nest that gets vectorized:
645 1. When vectorized, they are executed in the same order as in the original
646 scalar loop, so we can't change the order of computation when
647 vectorizing them.
648 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649 current checks are too strict. */
650
651 if (loop->inner)
652 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 }
654
655 /* Transfer group and reduction information from STMT_INFO to its
656 pattern stmt. */
657
658 static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
660 {
661 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
662 stmt_vec_info stmtp;
663 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
664 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
665 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 do
667 {
668 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
669 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
670 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
671 if (stmt_info)
672 REDUC_GROUP_NEXT_ELEMENT (stmtp)
673 = STMT_VINFO_RELATED_STMT (stmt_info);
674 }
675 while (stmt_info);
676 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 }
678
679 /* Fixup scalar cycles that now have their stmts detected as patterns. */
680
681 static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
683 {
684 stmt_vec_info first;
685 unsigned i;
686
687 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
688 if (STMT_VINFO_IN_PATTERN_P (first))
689 {
690 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
691 while (next)
692 {
693 if (! STMT_VINFO_IN_PATTERN_P (next))
694 break;
695 next = REDUC_GROUP_NEXT_ELEMENT (next);
696 }
/* If not all stmts in the chain are patterns, try to handle
698 the chain without patterns. */
699 if (! next)
700 {
701 vect_fixup_reduc_chain (first);
702 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
703 = STMT_VINFO_RELATED_STMT (first);
704 }
705 }
706 }
707
708 /* Function vect_get_loop_niters.
709
Determine how many iterations the loop executes and place it
711 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
712 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
713 niter information holds in ASSUMPTIONS.
714
715 Return the loop exit condition. */
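/* For example, for a loop of the form

     for (i = 0; i < n; i++)
       ...

   with n known to be positive, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 while NUMBER_OF_ITERATIONS (the
   number of header executions) is n.  */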
716
717
718 static gcond *
vect_get_loop_niters (struct loop *loop, tree *assumptions,
720 tree *number_of_iterations, tree *number_of_iterationsm1)
721 {
722 edge exit = single_exit (loop);
723 struct tree_niter_desc niter_desc;
724 tree niter_assumptions, niter, may_be_zero;
725 gcond *cond = get_loop_exit_condition (loop);
726
727 *assumptions = boolean_true_node;
728 *number_of_iterationsm1 = chrec_dont_know;
729 *number_of_iterations = chrec_dont_know;
730 DUMP_VECT_SCOPE ("get_loop_niters");
731
732 if (!exit)
733 return cond;
734
735 niter = chrec_dont_know;
736 may_be_zero = NULL_TREE;
737 niter_assumptions = boolean_true_node;
738 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
739 || chrec_contains_undetermined (niter_desc.niter))
740 return cond;
741
742 niter_assumptions = niter_desc.assumptions;
743 may_be_zero = niter_desc.may_be_zero;
744 niter = niter_desc.niter;
745
746 if (may_be_zero && integer_zerop (may_be_zero))
747 may_be_zero = NULL_TREE;
748
749 if (may_be_zero)
750 {
751 if (COMPARISON_CLASS_P (may_be_zero))
752 {
753 /* Try to combine may_be_zero with assumptions, this can simplify
754 computation of niter expression. */
755 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
756 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
757 niter_assumptions,
758 fold_build1 (TRUTH_NOT_EXPR,
759 boolean_type_node,
760 may_be_zero));
761 else
762 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
763 build_int_cst (TREE_TYPE (niter), 0),
764 rewrite_to_non_trapping_overflow (niter));
765
766 may_be_zero = NULL_TREE;
767 }
768 else if (integer_nonzerop (may_be_zero))
769 {
770 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
771 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
772 return cond;
773 }
774 else
775 return cond;
776 }
777
778 *assumptions = niter_assumptions;
779 *number_of_iterationsm1 = niter;
780
781 /* We want the number of loop header executions which is the number
782 of latch executions plus one.
783 ??? For UINT_MAX latch executions this number overflows to zero
784 for loops like do { n++; } while (n != 0); */
785 if (niter && !chrec_contains_undetermined (niter))
786 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
787 build_int_cst (TREE_TYPE (niter), 1));
788 *number_of_iterations = niter;
789
790 return cond;
791 }
792
793 /* Function bb_in_loop_p
794
795 Used as predicate for dfs order traversal of the loop bbs. */
796
797 static bool
bb_in_loop_p (const_basic_block bb, const void *data)
799 {
800 const struct loop *const loop = (const struct loop *)data;
801 if (flow_bb_inside_loop_p (loop, bb))
802 return true;
803 return false;
804 }
805
806
807 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
808 stmt_vec_info structs for all the stmts in LOOP_IN. */
809
_loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
811 : vec_info (vec_info::loop, init_cost (loop_in), shared),
812 loop (loop_in),
813 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
814 num_itersm1 (NULL_TREE),
815 num_iters (NULL_TREE),
816 num_iters_unchanged (NULL_TREE),
817 num_iters_assumptions (NULL_TREE),
818 th (0),
819 versioning_threshold (0),
820 vectorization_factor (0),
821 max_vectorization_factor (0),
822 mask_skip_niters (NULL_TREE),
823 mask_compare_type (NULL_TREE),
824 simd_if_cond (NULL_TREE),
825 unaligned_dr (NULL),
826 peeling_for_alignment (0),
827 ptr_mask (0),
828 ivexpr_map (NULL),
829 slp_unrolling_factor (1),
830 single_scalar_iteration_cost (0),
831 vectorizable (false),
832 can_fully_mask_p (true),
833 fully_masked_p (false),
834 peeling_for_gaps (false),
835 peeling_for_niter (false),
836 operands_swapped (false),
837 no_data_dependencies (false),
838 has_mask_store (false),
839 scalar_loop (NULL),
840 orig_loop_info (NULL)
841 {
842 /* CHECKME: We want to visit all BBs before their successors (except for
843 latch blocks, for which this assertion wouldn't hold). In the simple
case of the loop forms we allow, a dfs order of the BBs would be the same
845 as reversed postorder traversal, so we are safe. */
846
847 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
848 bbs, loop->num_nodes, loop);
849 gcc_assert (nbbs == loop->num_nodes);
850
851 for (unsigned int i = 0; i < nbbs; i++)
852 {
853 basic_block bb = bbs[i];
854 gimple_stmt_iterator si;
855
856 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
857 {
858 gimple *phi = gsi_stmt (si);
859 gimple_set_uid (phi, 0);
860 add_stmt (phi);
861 }
862
863 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
864 {
865 gimple *stmt = gsi_stmt (si);
866 gimple_set_uid (stmt, 0);
867 add_stmt (stmt);
/* If the .GOMP_SIMD_LANE call for the current loop has 2 arguments, the
second argument is the #pragma omp simd if (x) condition: when it is 0,
the loop shouldn't be vectorized; when it is a non-zero constant, it
should be vectorized normally; otherwise the loop is versioned, with the
vectorized copy used only if the condition is non-zero at runtime. */
873 if (loop_in->simduid
874 && is_gimple_call (stmt)
875 && gimple_call_internal_p (stmt)
876 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
877 && gimple_call_num_args (stmt) >= 2
878 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
879 && (loop_in->simduid
880 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
881 {
882 tree arg = gimple_call_arg (stmt, 1);
883 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
884 simd_if_cond = arg;
885 else
886 gcc_assert (integer_nonzerop (arg));
887 }
888 }
889 }
890 }
891
892 /* Free all levels of MASKS. */
893
894 void
release_vec_loop_masks (vec_loop_masks *masks)
896 {
897 rgroup_masks *rgm;
898 unsigned int i;
899 FOR_EACH_VEC_ELT (*masks, i, rgm)
900 rgm->masks.release ();
901 masks->release ();
902 }
903
904 /* Free all memory used by the _loop_vec_info, as well as all the
905 stmt_vec_info structs of all the stmts in the loop. */
906
_loop_vec_info::~_loop_vec_info ()
908 {
909 int nbbs;
910 gimple_stmt_iterator si;
911 int j;
912
913 nbbs = loop->num_nodes;
914 for (j = 0; j < nbbs; j++)
915 {
916 basic_block bb = bbs[j];
917 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
918 {
919 gimple *stmt = gsi_stmt (si);
920
921 /* We may have broken canonical form by moving a constant
922 into RHS1 of a commutative op. Fix such occurrences. */
923 if (operands_swapped && is_gimple_assign (stmt))
924 {
925 enum tree_code code = gimple_assign_rhs_code (stmt);
926
927 if ((code == PLUS_EXPR
928 || code == POINTER_PLUS_EXPR
929 || code == MULT_EXPR)
930 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
931 swap_ssa_operands (stmt,
932 gimple_assign_rhs1_ptr (stmt),
933 gimple_assign_rhs2_ptr (stmt));
934 else if (code == COND_EXPR
935 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
936 {
937 tree cond_expr = gimple_assign_rhs1 (stmt);
938 enum tree_code cond_code = TREE_CODE (cond_expr);
939
940 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
941 {
942 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
943 0));
944 cond_code = invert_tree_comparison (cond_code,
945 honor_nans);
946 if (cond_code != ERROR_MARK)
947 {
948 TREE_SET_CODE (cond_expr, cond_code);
949 swap_ssa_operands (stmt,
950 gimple_assign_rhs2_ptr (stmt),
951 gimple_assign_rhs3_ptr (stmt));
952 }
953 }
954 }
955 }
956 gsi_next (&si);
957 }
958 }
959
960 free (bbs);
961
962 release_vec_loop_masks (&masks);
963 delete ivexpr_map;
964
965 loop->aux = NULL;
966 }
967
968 /* Return an invariant or register for EXPR and emit necessary
969 computations in the LOOP_VINFO loop preheader. */
970
971 tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
973 {
974 if (is_gimple_reg (expr)
975 || is_gimple_min_invariant (expr))
976 return expr;
977
978 if (! loop_vinfo->ivexpr_map)
979 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
980 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
981 if (! cached)
982 {
983 gimple_seq stmts = NULL;
984 cached = force_gimple_operand (unshare_expr (expr),
985 &stmts, true, NULL_TREE);
986 if (stmts)
987 {
988 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
989 gsi_insert_seq_on_edge_immediate (e, stmts);
990 }
991 }
992 return cached;
993 }
994
995 /* Return true if we can use CMP_TYPE as the comparison type to produce
996 all masks required to mask LOOP_VINFO. */
997
998 static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1000 {
1001 rgroup_masks *rgm;
1002 unsigned int i;
1003 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1004 if (rgm->mask_type != NULL_TREE
1005 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1006 cmp_type, rgm->mask_type,
1007 OPTIMIZE_FOR_SPEED))
1008 return false;
1009 return true;
1010 }
1011
1012 /* Calculate the maximum number of scalars per iteration for every
1013 rgroup in LOOP_VINFO. */
1014
1015 static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1017 {
1018 unsigned int res = 1;
1019 unsigned int i;
1020 rgroup_masks *rgm;
1021 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1022 res = MAX (res, rgm->max_nscalars_per_iter);
1023 return res;
1024 }
1025
1026 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1027 whether we can actually generate the masks required. Return true if so,
1028 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
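/* As a rough illustration: if the loop is known to run at most 1000
   times and the widest rgroup needs 4 mask elements per scalar
   iteration, the comparison IV must be able to represent 4000, which
   needs 12 bits; any supported integer mode of at least that width for
   which WHILE_ULT is implemented can then serve as the comparison type,
   with the search continuing up to Pmode so the operands are more
   likely to be reusable in address calculations.  */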
1029
1030 static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
1032 {
1033 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1034 unsigned int min_ni_width;
1035
1036 /* Use a normal loop if there are no statements that need masking.
1037 This only happens in rare degenerate cases: it means that the loop
1038 has no loads, no stores, and no live-out values. */
1039 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1040 return false;
1041
1042 /* Get the maximum number of iterations that is representable
1043 in the counter type. */
1044 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1045 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1046
1047 /* Get a more refined estimate for the number of iterations. */
1048 widest_int max_back_edges;
1049 if (max_loop_iterations (loop, &max_back_edges))
1050 max_ni = wi::smin (max_ni, max_back_edges + 1);
1051
1052 /* Account for rgroup masks, in which each bit is replicated N times. */
1053 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1054
1055 /* Work out how many bits we need to represent the limit. */
1056 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1057
1058 /* Find a scalar mode for which WHILE_ULT is supported. */
1059 opt_scalar_int_mode cmp_mode_iter;
1060 tree cmp_type = NULL_TREE;
1061 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1062 {
1063 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1064 if (cmp_bits >= min_ni_width
1065 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1066 {
1067 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1068 if (this_type
1069 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1070 {
1071 /* Although we could stop as soon as we find a valid mode,
1072 it's often better to continue until we hit Pmode, since the
1073 operands to the WHILE are more likely to be reusable in
1074 address calculations. */
1075 cmp_type = this_type;
1076 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1077 break;
1078 }
1079 }
1080 }
1081
1082 if (!cmp_type)
1083 return false;
1084
1085 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1086 return true;
1087 }
1088
1089 /* Calculate the cost of one scalar iteration of the loop. */
1090 static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1092 {
1093 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1094 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1095 int nbbs = loop->num_nodes, factor;
1096 int innerloop_iters, i;
1097
1098 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1099
1100 /* Gather costs for statements in the scalar loop. */
1101
1102 /* FORNOW. */
1103 innerloop_iters = 1;
1104 if (loop->inner)
1105 innerloop_iters = 50; /* FIXME */
1106
1107 for (i = 0; i < nbbs; i++)
1108 {
1109 gimple_stmt_iterator si;
1110 basic_block bb = bbs[i];
1111
1112 if (bb->loop_father == loop->inner)
1113 factor = innerloop_iters;
1114 else
1115 factor = 1;
1116
1117 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1118 {
1119 gimple *stmt = gsi_stmt (si);
1120 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1121
1122 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1123 continue;
1124
1125 /* Skip stmts that are not vectorized inside the loop. */
1126 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1127 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1128 && (!STMT_VINFO_LIVE_P (vstmt_info)
1129 || !VECTORIZABLE_CYCLE_DEF
1130 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1131 continue;
1132
1133 vect_cost_for_stmt kind;
1134 if (STMT_VINFO_DATA_REF (stmt_info))
1135 {
1136 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1137 kind = scalar_load;
1138 else
1139 kind = scalar_store;
1140 }
1141 else
1142 kind = scalar_stmt;
1143
1144 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1145 factor, kind, stmt_info, 0, vect_prologue);
1146 }
1147 }
1148
1149 /* Now accumulate cost. */
1150 void *target_cost_data = init_cost (loop);
1151 stmt_info_for_cost *si;
1152 int j;
1153 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1154 j, si)
1155 (void) add_stmt_cost (target_cost_data, si->count,
1156 si->kind, si->stmt_info, si->misalign,
1157 vect_body);
1158 unsigned dummy, body_cost = 0;
1159 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1160 destroy_cost_data (target_cost_data);
1161 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1162 }
1163
1164
1165 /* Function vect_analyze_loop_form_1.
1166
1167 Verify that certain CFG restrictions hold, including:
1168 - the loop has a pre-header
1169 - the loop has a single entry and exit
1170 - the loop exit condition is simple enough
- the number of iterations can be analyzed, i.e., a countable loop.  The
1172 niter could be analyzed under some assumptions. */
1173
1174 opt_result
vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1176 tree *assumptions, tree *number_of_iterationsm1,
1177 tree *number_of_iterations, gcond **inner_loop_cond)
1178 {
1179 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1180
1181 /* Different restrictions apply when we are considering an inner-most loop,
1182 vs. an outer (nested) loop.
1183 (FORNOW. May want to relax some of these restrictions in the future). */
1184
1185 if (!loop->inner)
1186 {
1187 /* Inner-most loop. We currently require that the number of BBs is
1188 exactly 2 (the header and latch). Vectorizable inner-most loops
1189 look like this:
1190
1191 (pre-header)
1192 |
1193 header <--------+
1194 | | |
1195 | +--> latch --+
1196 |
1197 (exit-bb) */
1198
1199 if (loop->num_nodes != 2)
1200 return opt_result::failure_at (vect_location,
1201 "not vectorized:"
1202 " control flow in loop.\n");
1203
1204 if (empty_block_p (loop->header))
1205 return opt_result::failure_at (vect_location,
1206 "not vectorized: empty loop.\n");
1207 }
1208 else
1209 {
1210 struct loop *innerloop = loop->inner;
1211 edge entryedge;
1212
1213 /* Nested loop. We currently require that the loop is doubly-nested,
1214 contains a single inner loop, and the number of BBs is exactly 5.
1215 Vectorizable outer-loops look like this:
1216
1217 (pre-header)
1218 |
1219 header <---+
1220 | |
1221 inner-loop |
1222 | |
1223 tail ------+
1224 |
1225 (exit-bb)
1226
1227 The inner-loop has the properties expected of inner-most loops
1228 as described above. */
1229
1230 if ((loop->inner)->inner || (loop->inner)->next)
1231 return opt_result::failure_at (vect_location,
1232 "not vectorized:"
1233 " multiple nested loops.\n");
1234
1235 if (loop->num_nodes != 5)
1236 return opt_result::failure_at (vect_location,
1237 "not vectorized:"
1238 " control flow in loop.\n");
1239
1240 entryedge = loop_preheader_edge (innerloop);
1241 if (entryedge->src != loop->header
1242 || !single_exit (innerloop)
1243 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1244 return opt_result::failure_at (vect_location,
1245 "not vectorized:"
1246 " unsupported outerloop form.\n");
1247
1248 /* Analyze the inner-loop. */
1249 tree inner_niterm1, inner_niter, inner_assumptions;
1250 opt_result res
1251 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1252 &inner_assumptions, &inner_niterm1,
1253 &inner_niter, NULL);
1254 if (!res)
1255 {
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "not vectorized: Bad inner loop.\n");
1259 return res;
1260 }
1261
1262 /* Don't support analyzing niter under assumptions for inner
1263 loop. */
1264 if (!integer_onep (inner_assumptions))
1265 return opt_result::failure_at (vect_location,
1266 "not vectorized: Bad inner loop.\n");
1267
1268 if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: inner-loop count not"
1271 " invariant.\n");
1272
1273 if (dump_enabled_p ())
1274 dump_printf_loc (MSG_NOTE, vect_location,
1275 "Considering outer-loop vectorization.\n");
1276 }
1277
1278 if (!single_exit (loop))
1279 return opt_result::failure_at (vect_location,
1280 "not vectorized: multiple exits.\n");
1281 if (EDGE_COUNT (loop->header->preds) != 2)
1282 return opt_result::failure_at (vect_location,
1283 "not vectorized:"
1284 " too many incoming edges.\n");
1285
/* We assume that the loop exit condition is at the end of the loop, i.e.,
1287 that the loop is represented as a do-while (with a proper if-guard
1288 before the loop if needed), where the loop header contains all the
1289 executable statements, and the latch is empty. */
1290 if (!empty_block_p (loop->latch)
1291 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1292 return opt_result::failure_at (vect_location,
1293 "not vectorized: latch block not empty.\n");
1294
1295 /* Make sure the exit is not abnormal. */
1296 edge e = single_exit (loop);
1297 if (e->flags & EDGE_ABNORMAL)
1298 return opt_result::failure_at (vect_location,
1299 "not vectorized:"
1300 " abnormal loop exit edge.\n");
1301
1302 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1303 number_of_iterationsm1);
1304 if (!*loop_cond)
1305 return opt_result::failure_at
1306 (vect_location,
1307 "not vectorized: complicated exit condition.\n");
1308
1309 if (integer_zerop (*assumptions)
1310 || !*number_of_iterations
1311 || chrec_contains_undetermined (*number_of_iterations))
1312 return opt_result::failure_at
1313 (*loop_cond,
1314 "not vectorized: number of iterations cannot be computed.\n");
1315
1316 if (integer_zerop (*number_of_iterations))
1317 return opt_result::failure_at
1318 (*loop_cond,
1319 "not vectorized: number of iterations = 0.\n");
1320
1321 return opt_result::success ();
1322 }
1323
1324 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1325
1326 opt_loop_vec_info
vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1328 {
1329 tree assumptions, number_of_iterations, number_of_iterationsm1;
1330 gcond *loop_cond, *inner_loop_cond = NULL;
1331
1332 opt_result res
1333 = vect_analyze_loop_form_1 (loop, &loop_cond,
1334 &assumptions, &number_of_iterationsm1,
1335 &number_of_iterations, &inner_loop_cond);
1336 if (!res)
1337 return opt_loop_vec_info::propagate_failure (res);
1338
1339 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1340 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1341 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1342 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1343 if (!integer_onep (assumptions))
1344 {
1345 /* We consider to vectorize this loop by versioning it under
1346 some assumptions. In order to do this, we need to clear
1347 existing information computed by scev and niter analyzer. */
1348 scev_reset_htab ();
1349 free_numbers_of_iterations_estimates (loop);
1350 /* Also set flag for this loop so that following scev and niter
1351 analysis are done under the assumptions. */
1352 loop_constraint_set (loop, LOOP_C_FINITE);
1353 /* Also record the assumptions for versioning. */
1354 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1355 }
1356
1357 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1358 {
1359 if (dump_enabled_p ())
1360 {
1361 dump_printf_loc (MSG_NOTE, vect_location,
1362 "Symbolic number of iterations is ");
1363 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1364 dump_printf (MSG_NOTE, "\n");
1365 }
1366 }
1367
1368 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1369 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1370 if (inner_loop_cond)
1371 {
1372 stmt_vec_info inner_loop_cond_info
1373 = loop_vinfo->lookup_stmt (inner_loop_cond);
1374 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1375 }
1376
1377 gcc_assert (!loop->aux);
1378 loop->aux = loop_vinfo;
1379 return opt_loop_vec_info::success (loop_vinfo);
1380 }
1381
1382
1383
1384 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1385 statements update the vectorization factor. */
1386
1387 static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1389 {
1390 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1392 int nbbs = loop->num_nodes;
1393 poly_uint64 vectorization_factor;
1394 int i;
1395
1396 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1397
1398 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1399 gcc_assert (known_ne (vectorization_factor, 0U));
1400
1401 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1402 vectorization factor of the loop is the unrolling factor required by
the SLP instances.  If that unrolling factor is 1, we say that we
perform pure SLP on the loop; cross-iteration parallelism is not
1405 exploited. */
1406 bool only_slp_in_loop = true;
1407 for (i = 0; i < nbbs; i++)
1408 {
1409 basic_block bb = bbs[i];
1410 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1411 gsi_next (&si))
1412 {
1413 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1414 stmt_info = vect_stmt_to_vectorize (stmt_info);
1415 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1416 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1417 && !PURE_SLP_STMT (stmt_info))
1418 /* STMT needs both SLP and loop-based vectorization. */
1419 only_slp_in_loop = false;
1420 }
1421 }
1422
1423 if (only_slp_in_loop)
1424 {
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Loop contains only SLP stmts\n");
1428 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1429 }
1430 else
1431 {
1432 if (dump_enabled_p ())
1433 dump_printf_loc (MSG_NOTE, vect_location,
1434 "Loop contains SLP and non-SLP stmts\n");
1435 /* Both the vectorization factor and unroll factor have the form
1436 current_vector_size * X for some rational X, so they must have
1437 a common multiple. */
1438 vectorization_factor
1439 = force_common_multiple (vectorization_factor,
1440 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1441 }
1442
1443 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1444 if (dump_enabled_p ())
1445 {
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "Updating vectorization factor to ");
1448 dump_dec (MSG_NOTE, vectorization_factor);
1449 dump_printf (MSG_NOTE, ".\n");
1450 }
1451 }
1452
1453 /* Return true if STMT_INFO describes a double reduction phi and if
1454 the other phi in the reduction is also relevant for vectorization.
1455 This rejects cases such as:
1456
1457 outer1:
1458 x_1 = PHI <x_3(outer2), ...>;
1459 ...
1460
1461 inner:
1462 x_2 = ...;
1463 ...
1464
1465 outer2:
1466 x_3 = PHI <x_2(inner)>;
1467
1468 if nothing in x_2 or elsewhere makes x_1 relevant. */
1469
1470 static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
1472 {
1473 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1474 return false;
1475
1476 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1477 }
1478
1479 /* Function vect_analyze_loop_operations.
1480
1481 Scan the loop stmts and make sure they are all vectorizable. */
1482
1483 static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1485 {
1486 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1487 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1488 int nbbs = loop->num_nodes;
1489 int i;
1490 stmt_vec_info stmt_info;
1491 bool need_to_vectorize = false;
1492 bool ok;
1493
1494 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1495
1496 auto_vec<stmt_info_for_cost> cost_vec;
1497
1498 for (i = 0; i < nbbs; i++)
1499 {
1500 basic_block bb = bbs[i];
1501
1502 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1503 gsi_next (&si))
1504 {
1505 gphi *phi = si.phi ();
1506 ok = true;
1507
1508 stmt_info = loop_vinfo->lookup_stmt (phi);
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1511 if (virtual_operand_p (gimple_phi_result (phi)))
1512 continue;
1513
1514 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1515 (i.e., a phi in the tail of the outer-loop). */
1516 if (! is_loop_header_bb_p (bb))
1517 {
1518 /* FORNOW: we currently don't support the case that these phis
1519 are not used in the outerloop (unless it is double reduction,
i.e., this phi is vect_reduction_def), because this case
requires us to actually do something here. */
1522 if (STMT_VINFO_LIVE_P (stmt_info)
1523 && !vect_active_double_reduction_p (stmt_info))
1524 return opt_result::failure_at (phi,
1525 "Unsupported loop-closed phi"
1526 " in outer-loop.\n");
1527
1528 /* If PHI is used in the outer loop, we check that its operand
1529 is defined in the inner loop. */
1530 if (STMT_VINFO_RELEVANT_P (stmt_info))
1531 {
1532 tree phi_op;
1533
1534 if (gimple_phi_num_args (phi) != 1)
1535 return opt_result::failure_at (phi, "unsupported phi");
1536
1537 phi_op = PHI_ARG_DEF (phi, 0);
1538 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1539 if (!op_def_info)
1540 return opt_result::failure_at (phi, "unsupported phi");
1541
1542 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1543 && (STMT_VINFO_RELEVANT (op_def_info)
1544 != vect_used_in_outer_by_reduction))
1545 return opt_result::failure_at (phi, "unsupported phi");
1546 }
1547
1548 continue;
1549 }
1550
1551 gcc_assert (stmt_info);
1552
1553 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1554 || STMT_VINFO_LIVE_P (stmt_info))
1555 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1556 /* A scalar-dependence cycle that we don't support. */
1557 return opt_result::failure_at (phi,
1558 "not vectorized:"
1559 " scalar dependence cycle.\n");
1560
1561 if (STMT_VINFO_RELEVANT_P (stmt_info))
1562 {
1563 need_to_vectorize = true;
1564 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1565 && ! PURE_SLP_STMT (stmt_info))
1566 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1567 &cost_vec);
1568 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1569 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1570 && ! PURE_SLP_STMT (stmt_info))
1571 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1572 &cost_vec);
1573 }
1574
1575 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1576 if (ok
1577 && STMT_VINFO_LIVE_P (stmt_info)
1578 && !PURE_SLP_STMT (stmt_info))
1579 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1580 &cost_vec);
1581
1582 if (!ok)
1583 return opt_result::failure_at (phi,
1584 "not vectorized: relevant phi not "
1585 "supported: %G",
1586 static_cast <gimple *> (phi));
1587 }
1588
1589 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1590 gsi_next (&si))
1591 {
1592 gimple *stmt = gsi_stmt (si);
1593 if (!gimple_clobber_p (stmt))
1594 {
1595 opt_result res
1596 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1597 &need_to_vectorize,
1598 NULL, NULL, &cost_vec);
1599 if (!res)
1600 return res;
1601 }
1602 }
1603 } /* bbs */
1604
1605 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1606
1607 /* All operations in the loop are either irrelevant (deal with loop
1608 control, or dead), or only used outside the loop and can be moved
1609 out of the loop (e.g. invariants, inductions). The loop can be
1610 optimized away by scalar optimizations. We're better off not
1611 touching this loop. */
1612 if (!need_to_vectorize)
1613 {
1614 if (dump_enabled_p ())
1615 dump_printf_loc (MSG_NOTE, vect_location,
1616 "All the computation can be taken out of the loop.\n");
1617 return opt_result::failure_at
1618 (vect_location,
1619 "not vectorized: redundant loop. no profit to vectorize.\n");
1620 }
1621
1622 return opt_result::success ();
1623 }
1624
1625 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1626 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1627 definitely no, or -1 if it's worth retrying. */
1628
1629 static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1631 {
1632 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1633 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1634
1635 /* Only fully-masked loops can have iteration counts less than the
1636 vectorization factor. */
1637 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1638 {
1639 HOST_WIDE_INT max_niter;
1640
1641 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1642 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1643 else
1644 max_niter = max_stmt_executions_int (loop);
1645
1646 if (max_niter != -1
1647 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1648 {
1649 if (dump_enabled_p ())
1650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1651 "not vectorized: iteration count smaller than "
1652 "vectorization factor.\n");
1653 return 0;
1654 }
1655 }
1656
1657 int min_profitable_iters, min_profitable_estimate;
1658 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1659 &min_profitable_estimate);
1660
1661 if (min_profitable_iters < 0)
1662 {
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 "not vectorized: vectorization not profitable.\n");
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1668 "not vectorized: vector version will never be "
1669 "profitable.\n");
1670 return -1;
1671 }
1672
1673 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1674 * assumed_vf);
1675
1676 /* Use the cost model only if it is more conservative than user specified
1677 threshold. */
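/* Illustrative numbers: with --param min-vect-loop-bound=4 and an
   assumed VF of 8, MIN_SCALAR_LOOP_BOUND is 32; if the cost model
   reports MIN_PROFITABLE_ITERS of 40, TH becomes 40.  */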
1678 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1679 min_profitable_iters);
1680
1681 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1682
1683 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1684 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1685 {
1686 if (dump_enabled_p ())
1687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1688 "not vectorized: vectorization not profitable.\n");
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_NOTE, vect_location,
1691 "not vectorized: iteration count smaller than user "
1692 "specified loop bound parameter or minimum profitable "
1693 "iterations (whichever is more conservative).\n");
1694 return 0;
1695 }
1696
1697 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1698 if (estimated_niter == -1)
1699 estimated_niter = likely_max_stmt_executions_int (loop);
1700 if (estimated_niter != -1
1701 && ((unsigned HOST_WIDE_INT) estimated_niter
1702 < MAX (th, (unsigned) min_profitable_estimate)))
1703 {
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1706 "not vectorized: estimated iteration count too "
1707 "small.\n");
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE, vect_location,
1710 "not vectorized: estimated iteration count smaller "
1711 "than specified loop bound parameter or minimum "
1712 "profitable iterations (whichever is more "
1713 "conservative).\n");
1714 return -1;
1715 }
1716
1717 return 1;
1718 }
1719
1720 static opt_result
1721 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1722 vec<data_reference_p> *datarefs,
1723 unsigned int *n_stmts)
1724 {
1725 *n_stmts = 0;
1726 for (unsigned i = 0; i < loop->num_nodes; i++)
1727 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1728 !gsi_end_p (gsi); gsi_next (&gsi))
1729 {
1730 gimple *stmt = gsi_stmt (gsi);
1731 if (is_gimple_debug (stmt))
1732 continue;
1733 ++(*n_stmts);
1734 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1735 if (!res)
1736 {
1737 if (is_gimple_call (stmt) && loop->safelen)
1738 {
1739 tree fndecl = gimple_call_fndecl (stmt), op;
1740 if (fndecl != NULL_TREE)
1741 {
1742 cgraph_node *node = cgraph_node::get (fndecl);
1743 if (node != NULL && node->simd_clones != NULL)
1744 {
1745 unsigned int j, n = gimple_call_num_args (stmt);
1746 for (j = 0; j < n; j++)
1747 {
1748 op = gimple_call_arg (stmt, j);
1749 if (DECL_P (op)
1750 || (REFERENCE_CLASS_P (op)
1751 && get_base_address (op)))
1752 break;
1753 }
1754 op = gimple_call_lhs (stmt);
1755 /* Ignore #pragma omp declare simd functions
1756 if they don't have data references in the
1757 call stmt itself. */
1758 if (j == n
1759 && !(op
1760 && (DECL_P (op)
1761 || (REFERENCE_CLASS_P (op)
1762 && get_base_address (op)))))
1763 continue;
1764 }
1765 }
1766 }
1767 return res;
1768 }
1769 /* If dependence analysis will give up due to the limit on the
1770 number of datarefs, stop here and fail fatally. */
1771 if (datarefs->length ()
1772 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1773 return opt_result::failure_at (stmt, "exceeded param "
1774 "loop-max-datarefs-for-datadeps\n");
1775 }
1776 return opt_result::success ();
1777 }
1778
1779 /* Function vect_analyze_loop_2.
1780
1781 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1782 for it. The different analyses will record information in the
1783 loop_vec_info struct. */
1784 static opt_result
1785 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1786 {
1787 opt_result ok = opt_result::success ();
1788 int res;
1789 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1790 poly_uint64 min_vf = 2;
1791
1792 /* The first group of checks is independent of the vector size. */
1793 fatal = true;
1794
1795 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1796 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1797 return opt_result::failure_at (vect_location,
1798 "not vectorized: simd if(0)\n");
1799
1800 /* Find all data references in the loop (which correspond to vdefs/vuses)
1801 and analyze their evolution in the loop. */
1802
1803 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1804
1805 /* Gather the data references and count stmts in the loop. */
1806 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1807 {
1808 opt_result res
1809 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1810 &LOOP_VINFO_DATAREFS (loop_vinfo),
1811 n_stmts);
1812 if (!res)
1813 {
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1816 "not vectorized: loop contains function "
1817 "calls or data references that cannot "
1818 "be analyzed\n");
1819 return res;
1820 }
1821 loop_vinfo->shared->save_datarefs ();
1822 }
1823 else
1824 loop_vinfo->shared->check_datarefs ();
1825
1826 /* Analyze the data references and also adjust the minimal
1827 vectorization factor according to the loads and stores. */
1828
1829 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1830 if (!ok)
1831 {
1832 if (dump_enabled_p ())
1833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834 "bad data references.\n");
1835 return ok;
1836 }
1837
1838 /* Classify all cross-iteration scalar data-flow cycles.
1839 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1840 vect_analyze_scalar_cycles (loop_vinfo);
1841
1842 vect_pattern_recog (loop_vinfo);
1843
1844 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1845
1846 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1847 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1848
1849 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data access.\n");
1855 return ok;
1856 }
1857
1858 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1859
1860 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1861 if (!ok)
1862 {
1863 if (dump_enabled_p ())
1864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 "unexpected pattern.\n");
1866 return ok;
1867 }
1868
1869 /* The rest of the analysis below depends on the vector size in some way,
so from here on failures are no longer considered fatal. */
1870 fatal = false;
1871
1872 /* Analyze data dependences between the data-refs in the loop
1873 and adjust the maximum vectorization factor according to
1874 the dependences.
1875 FORNOW: fail at the first data dependence that we encounter. */
1876
1877 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1878 if (!ok)
1879 {
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1882 "bad data dependence.\n");
1883 return ok;
1884 }
1885 if (max_vf != MAX_VECTORIZATION_FACTOR
1886 && maybe_lt (max_vf, min_vf))
1887 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1888 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1889
1890 ok = vect_determine_vectorization_factor (loop_vinfo);
1891 if (!ok)
1892 {
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "can't determine vectorization factor.\n");
1896 return ok;
1897 }
1898 if (max_vf != MAX_VECTORIZATION_FACTOR
1899 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1900 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1901
1902 /* Compute the scalar iteration cost. */
1903 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1904
1905 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1906 unsigned th;
1907
1908 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1909 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1910 if (!ok)
1911 return ok;
1912
1913 /* If there are any SLP instances mark them as pure_slp. */
1914 bool slp = vect_make_slp_decision (loop_vinfo);
1915 if (slp)
1916 {
1917 /* Find stmts that need to be both vectorized and SLPed. */
1918 vect_detect_hybrid_slp (loop_vinfo);
1919
1920 /* Update the vectorization factor based on the SLP decision. */
1921 vect_update_vf_for_slp (loop_vinfo);
1922 }
1923
1924 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1925
1926 /* We don't expect to have to roll back to anything other than an empty
1927 set of rgroups. */
1928 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1929
1930 /* This is the point where we can re-start analysis with SLP forced off. */
1931 start_over:
1932
1933 /* Now the vectorization factor is final. */
1934 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1935 gcc_assert (known_ne (vectorization_factor, 0U));
1936
1937 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1938 {
1939 dump_printf_loc (MSG_NOTE, vect_location,
1940 "vectorization_factor = ");
1941 dump_dec (MSG_NOTE, vectorization_factor);
1942 dump_printf (MSG_NOTE, ", niters = %wd\n",
1943 LOOP_VINFO_INT_NITERS (loop_vinfo));
1944 }
1945
1946 HOST_WIDE_INT max_niter
1947 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1948
1949 /* Analyze the alignment of the data-refs in the loop.
1950 Fail if a data reference is found that cannot be vectorized. */
1951
1952 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1953 if (!ok)
1954 {
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1957 "bad data alignment.\n");
1958 return ok;
1959 }
1960
1961 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1962 It is important to call pruning after vect_analyze_data_ref_accesses,
1963 since we use grouping information gathered by interleaving analysis. */
1964 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1965 if (!ok)
1966 return ok;
1967
1968 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1969 vectorization, since we do not want to add extra peeling or
1970 add versioning for alignment. */
1971 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1972 /* This pass will decide on using loop versioning and/or loop peeling in
1973 order to enhance the alignment of data references in the loop. */
1974 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1975 else
1976 ok = vect_verify_datarefs_alignment (loop_vinfo);
1977 if (!ok)
1978 return ok;
1979
1980 if (slp)
1981 {
1982 /* Analyze operations in the SLP instances. Note this may
1983 remove unsupported SLP instances which makes the above
1984 SLP kind detection invalid. */
1985 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1986 vect_slp_analyze_operations (loop_vinfo);
1987 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1988 {
1989 ok = opt_result::failure_at (vect_location,
1990 "unsupported SLP instances\n");
1991 goto again;
1992 }
1993 }
1994
1995 /* Scan all the remaining operations in the loop that are not subject
1996 to SLP and make sure they are vectorizable. */
1997 ok = vect_analyze_loop_operations (loop_vinfo);
1998 if (!ok)
1999 {
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2002 "bad operation or unsupported loop bound.\n");
2003 return ok;
2004 }
2005
2006 /* Decide whether to use a fully-masked loop for this vectorization
2007 factor. */
2008 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2009 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2010 && vect_verify_full_masking (loop_vinfo));
2011 if (dump_enabled_p ())
2012 {
2013 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2014 dump_printf_loc (MSG_NOTE, vect_location,
2015 "using a fully-masked loop.\n");
2016 else
2017 dump_printf_loc (MSG_NOTE, vect_location,
2018 "not using a fully-masked loop.\n");
2019 }
2020
2021 /* If an epilog loop is required because of data accesses with gaps,
2022 one additional iteration needs to be peeled. Check if there are
2023 enough iterations for vectorization. */
2024 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2025 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2026 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 {
2028 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2029 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2030
2031 if (known_lt (wi::to_widest (scalar_niters), vf))
2032 return opt_result::failure_at (vect_location,
2033 "loop has no enough iterations to"
2034 " support peeling for gaps.\n");
2035 }
2036
2037 /* Check the costings of the loop make vectorizing worthwhile. */
2038 res = vect_analyze_loop_costing (loop_vinfo);
2039 if (res < 0)
2040 {
2041 ok = opt_result::failure_at (vect_location,
2042 "Loop costings may not be worthwhile.\n");
2043 goto again;
2044 }
2045 if (!res)
2046 return opt_result::failure_at (vect_location,
2047 "Loop costings not worthwhile.\n");
2048
2049 /* Decide whether we need to create an epilogue loop to handle
2050 remaining scalar iterations. */
2051 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2052
2053 unsigned HOST_WIDE_INT const_vf;
2054 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2055 /* The main loop handles all iterations. */
2056 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2057 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2058 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2059 {
2060 /* Work out the (constant) number of iterations that need to be
2061 peeled for reasons other than niters. */
2062 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2063 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2064 peel_niter += 1;
2065 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2066 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2067 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2068 }
2069 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2070 /* ??? When peeling for gaps but not alignment, we could
2071 try to check whether the (variable) niters is known to be
2072 VF * N + 1. That's something of a niche case though. */
2073 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2074 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2075 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2076 < (unsigned) exact_log2 (const_vf))
2077 /* In case of versioning, check if the maximum number of
2078 iterations is greater than th. If they are identical,
2079 the epilogue is unnecessary. */
2080 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2081 || ((unsigned HOST_WIDE_INT) max_niter
2082 > (th / const_vf) * const_vf))))
2083 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2084
2085 /* If an epilogue loop is required make sure we can create one. */
2086 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2087 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2088 {
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2091 if (!vect_can_advance_ivs_p (loop_vinfo)
2092 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2093 single_exit (LOOP_VINFO_LOOP
2094 (loop_vinfo))))
2095 {
2096 ok = opt_result::failure_at (vect_location,
2097 "not vectorized: can't create required "
2098 "epilog loop\n");
2099 goto again;
2100 }
2101 }
2102
2103 /* During peeling, we need to check if the number of loop iterations is
2104 enough for both the peeled prolog loop and the vector loop. This check
2105 can be merged along with threshold check of loop versioning, so
2106 increase threshold for this case if necessary. */
2107 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2108 {
2109 poly_uint64 niters_th = 0;
2110
2111 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2112 {
2113 /* Niters for peeled prolog loop. */
2114 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2115 {
2116 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2117 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2118 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2119 }
2120 else
2121 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2122 }
2123
2124 /* Niters for at least one iteration of vectorized loop. */
2125 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2126 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2127 /* One additional iteration because of peeling for gap. */
2128 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2129 niters_th += 1;
2130 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2131 }
2132
2133 gcc_assert (known_eq (vectorization_factor,
2134 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2135
2136 /* Ok to vectorize! */
2137 return opt_result::success ();
2138
2139 again:
2140 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2141 gcc_assert (!ok);
2142
2143 /* Try again with SLP forced off, but if we didn't do any SLP there is
2144 no point in re-trying. */
2145 if (!slp)
2146 return ok;
2147
2148 /* If there are reduction chains re-trying will fail anyway. */
2149 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2150 return ok;
2151
2152 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2153 via interleaving or lane instructions. */
2154 slp_instance instance;
2155 slp_tree node;
2156 unsigned i, j;
2157 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2158 {
2159 stmt_vec_info vinfo;
2160 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2161 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2162 continue;
2163 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2164 unsigned int size = DR_GROUP_SIZE (vinfo);
2165 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2166 if (! vect_store_lanes_supported (vectype, size, false)
2167 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2168 && ! vect_grouped_store_supported (vectype, size))
2169 return opt_result::failure_at (vinfo->stmt,
2170 "unsupported grouped store\n");
2171 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2172 {
2173 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2174 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2175 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2176 size = DR_GROUP_SIZE (vinfo);
2177 vectype = STMT_VINFO_VECTYPE (vinfo);
2178 if (! vect_load_lanes_supported (vectype, size, false)
2179 && ! vect_grouped_load_supported (vectype, single_element_p,
2180 size))
2181 return opt_result::failure_at (vinfo->stmt,
2182 "unsupported grouped load\n");
2183 }
2184 }
2185
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_NOTE, vect_location,
2188 "re-trying with SLP disabled\n");
2189
2190 /* Roll back state appropriately. No SLP this time. */
2191 slp = false;
2192 /* Restore the vectorization factor as it was without SLP. */
2193 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2194 /* Free the SLP instances. */
2195 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2196 vect_free_slp_instance (instance, false);
2197 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2198 /* Reset SLP type to loop_vect on all stmts. */
2199 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2200 {
2201 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2202 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2203 !gsi_end_p (si); gsi_next (&si))
2204 {
2205 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2206 STMT_SLP_TYPE (stmt_info) = loop_vect;
2207 }
2208 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2209 !gsi_end_p (si); gsi_next (&si))
2210 {
2211 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2212 STMT_SLP_TYPE (stmt_info) = loop_vect;
2213 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2214 {
2215 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2216 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2217 STMT_SLP_TYPE (stmt_info) = loop_vect;
2218 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2219 !gsi_end_p (pi); gsi_next (&pi))
2220 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2221 = loop_vect;
2222 }
2223 }
2224 }
2225 /* Free optimized alias test DDRS. */
2226 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2227 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2228 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2229 /* Reset target cost data. */
2230 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2231 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2232 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2233 /* Reset accumulated rgroup information. */
2234 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2235 /* Reset assorted flags. */
2236 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2237 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2238 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2239 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2240 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2241
2242 goto start_over;
2243 }
2244
2245 /* Function vect_analyze_loop.
2246
2247 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2248 for it. The different analyses will record information in the
2249 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2250 be vectorized. */
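/* As an illustration (the actual sizes come from the target hook
   targetm.vectorize.autovectorize_vector_sizes, so the numbers here are
   hypothetical): on a target advertising 32-byte and 16-byte vectors,
   the loop below first analyzes with the autodetected vector size; if
   that fails non-fatally it re-tries with the remaining sizes in turn
   until analysis succeeds, a fatal failure is hit, or the list of sizes
   is exhausted.  */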
2251 opt_loop_vec_info
2252 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2253 vec_info_shared *shared)
2254 {
2255 auto_vector_sizes vector_sizes;
2256
2257 /* Autodetect first vector size we try. */
2258 current_vector_size = 0;
2259 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2260 unsigned int next_size = 0;
2261
2262 DUMP_VECT_SCOPE ("analyze_loop_nest");
2263
2264 if (loop_outer (loop)
2265 && loop_vec_info_for_loop (loop_outer (loop))
2266 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2267 return opt_loop_vec_info::failure_at (vect_location,
2268 "outer-loop already vectorized.\n");
2269
2270 if (!find_loop_nest (loop, &shared->loop_nest))
2271 return opt_loop_vec_info::failure_at
2272 (vect_location,
2273 "not vectorized: loop nest containing two or more consecutive inner"
2274 " loops cannot be vectorized\n");
2275
2276 unsigned n_stmts = 0;
2277 poly_uint64 autodetected_vector_size = 0;
2278 while (1)
2279 {
2280 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2281 opt_loop_vec_info loop_vinfo
2282 = vect_analyze_loop_form (loop, shared);
2283 if (!loop_vinfo)
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "bad loop form.\n");
2288 return loop_vinfo;
2289 }
2290
2291 bool fatal = false;
2292
2293 if (orig_loop_vinfo)
2294 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2295
2296 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2297 if (res)
2298 {
2299 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2300
2301 return loop_vinfo;
2302 }
2303
2304 delete loop_vinfo;
2305
2306 if (next_size == 0)
2307 autodetected_vector_size = current_vector_size;
2308
2309 if (next_size < vector_sizes.length ()
2310 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2311 next_size += 1;
2312
2313 if (fatal
2314 || next_size == vector_sizes.length ()
2315 || known_eq (current_vector_size, 0U))
2316 return opt_loop_vec_info::propagate_failure (res);
2317
2318 /* Try the next biggest vector size. */
2319 current_vector_size = vector_sizes[next_size++];
2320 if (dump_enabled_p ())
2321 {
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "***** Re-trying analysis with "
2324 "vector size ");
2325 dump_dec (MSG_NOTE, current_vector_size);
2326 dump_printf (MSG_NOTE, "\n");
2327 }
2328 }
2329 }
2330
2331 /* Return true if there is an in-order reduction function for CODE, storing
2332 it in *REDUC_FN if so. */
2333
2334 static bool
2335 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2336 {
2337 switch (code)
2338 {
2339 case PLUS_EXPR:
2340 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2341 return true;
2342
2343 default:
2344 return false;
2345 }
2346 }
2347
2348 /* Function reduction_fn_for_scalar_code
2349
2350 Input:
2351 CODE - tree_code of the reduction operation.
2352
2353 Output:
2354 REDUC_FN - the corresponding internal function to be used to reduce the
2355 vector of partial results into a single scalar result, or IFN_LAST
2356 if the operation is a supported reduction operation, but does not have
2357 such an internal function.
2358
2359 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2360
2361 static bool
2362 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2363 {
2364 switch (code)
2365 {
2366 case MAX_EXPR:
2367 *reduc_fn = IFN_REDUC_MAX;
2368 return true;
2369
2370 case MIN_EXPR:
2371 *reduc_fn = IFN_REDUC_MIN;
2372 return true;
2373
2374 case PLUS_EXPR:
2375 *reduc_fn = IFN_REDUC_PLUS;
2376 return true;
2377
2378 case BIT_AND_EXPR:
2379 *reduc_fn = IFN_REDUC_AND;
2380 return true;
2381
2382 case BIT_IOR_EXPR:
2383 *reduc_fn = IFN_REDUC_IOR;
2384 return true;
2385
2386 case BIT_XOR_EXPR:
2387 *reduc_fn = IFN_REDUC_XOR;
2388 return true;
2389
2390 case MULT_EXPR:
2391 case MINUS_EXPR:
2392 *reduc_fn = IFN_LAST;
2393 return true;
2394
2395 default:
2396 return false;
2397 }
2398 }
2399
2400 /* If there is a neutral value X such that SLP reduction NODE would not
2401 be affected by the introduction of additional X elements, return that X,
2402 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2403 is true if the SLP statements perform a single reduction, false if each
2404 statement performs an independent reduction. */
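/* Illustrative example (the values follow from the switch below): for a
   PLUS_EXPR SLP reduction whose group of 3 scalar statements is widened
   to a 4-element vector, the unused lane can be filled with the neutral
   value 0 without changing the sum; for MULT_EXPR the filler would be 1
   and for BIT_AND_EXPR all-ones.  For MIN_EXPR/MAX_EXPR no constant is
   neutral in general, so the single initial value of a reduction chain
   is reused instead.  */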
2405
2406 static tree
2407 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2408 bool reduc_chain)
2409 {
2410 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2411 stmt_vec_info stmt_vinfo = stmts[0];
2412 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2413 tree scalar_type = TREE_TYPE (vector_type);
2414 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2415 gcc_assert (loop);
2416
2417 switch (code)
2418 {
2419 case WIDEN_SUM_EXPR:
2420 case DOT_PROD_EXPR:
2421 case SAD_EXPR:
2422 case PLUS_EXPR:
2423 case MINUS_EXPR:
2424 case BIT_IOR_EXPR:
2425 case BIT_XOR_EXPR:
2426 return build_zero_cst (scalar_type);
2427
2428 case MULT_EXPR:
2429 return build_one_cst (scalar_type);
2430
2431 case BIT_AND_EXPR:
2432 return build_all_ones_cst (scalar_type);
2433
2434 case MAX_EXPR:
2435 case MIN_EXPR:
2436 /* For MIN/MAX the initial values are neutral. A reduction chain
2437 has only a single initial value, so that value is neutral for
2438 all statements. */
2439 if (reduc_chain)
2440 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2441 loop_preheader_edge (loop));
2442 return NULL_TREE;
2443
2444 default:
2445 return NULL_TREE;
2446 }
2447 }
2448
2449 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2450 STMT is printed with a message MSG. */
2451
2452 static void
2453 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2454 {
2455 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2456 }
2457
2458 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2459 operation. Return true if the results of DEF_STMT_INFO are something
2460 that can be accumulated by such a reduction. */
2461
2462 static bool
2463 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2464 {
2465 return (is_gimple_assign (def_stmt_info->stmt)
2466 || is_gimple_call (def_stmt_info->stmt)
2467 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2468 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2469 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2470 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2471 }
2472
2473 /* Detect SLP reduction of the form:
2474
2475 #a1 = phi <a5, a0>
2476 a2 = operation (a1)
2477 a3 = operation (a2)
2478 a4 = operation (a3)
2479 a5 = operation (a4)
2480
2481 #a = phi <a5>
2482
2483 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2484 FIRST_STMT is the first reduction stmt in the chain
2485 (a2 = operation (a1)).
2486
2487 Return TRUE if a reduction chain was detected. */
2488
2489 static bool
2490 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2491 gimple *first_stmt)
2492 {
2493 struct loop *loop = (gimple_bb (phi))->loop_father;
2494 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2495 enum tree_code code;
2496 gimple *loop_use_stmt = NULL;
2497 stmt_vec_info use_stmt_info;
2498 tree lhs;
2499 imm_use_iterator imm_iter;
2500 use_operand_p use_p;
2501 int nloop_uses, size = 0, n_out_of_loop_uses;
2502 bool found = false;
2503
2504 if (loop != vect_loop)
2505 return false;
2506
2507 auto_vec<stmt_vec_info, 8> reduc_chain;
2508 lhs = PHI_RESULT (phi);
2509 code = gimple_assign_rhs_code (first_stmt);
2510 while (1)
2511 {
2512 nloop_uses = 0;
2513 n_out_of_loop_uses = 0;
2514 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2515 {
2516 gimple *use_stmt = USE_STMT (use_p);
2517 if (is_gimple_debug (use_stmt))
2518 continue;
2519
2520 /* Check if we got back to the reduction phi. */
2521 if (use_stmt == phi)
2522 {
2523 loop_use_stmt = use_stmt;
2524 found = true;
2525 break;
2526 }
2527
2528 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2529 {
2530 loop_use_stmt = use_stmt;
2531 nloop_uses++;
2532 }
2533 else
2534 n_out_of_loop_uses++;
2535
2536 /* There can be either a single use in the loop or two uses in
2537 phi nodes. */
2538 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2539 return false;
2540 }
2541
2542 if (found)
2543 break;
2544
2545 /* We reached a statement with no loop uses. */
2546 if (nloop_uses == 0)
2547 return false;
2548
2549 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2550 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2551 return false;
2552
2553 if (!is_gimple_assign (loop_use_stmt)
2554 || code != gimple_assign_rhs_code (loop_use_stmt)
2555 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2556 return false;
2557
2558 /* Insert USE_STMT into reduction chain. */
2559 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2560 reduc_chain.safe_push (use_stmt_info);
2561
2562 lhs = gimple_assign_lhs (loop_use_stmt);
2563 size++;
2564 }
2565
2566 if (!found || loop_use_stmt != phi || size < 2)
2567 return false;
2568
2569 /* Swap the operands, if needed, to make the reduction operand be the second
2570 operand. */
2571 lhs = PHI_RESULT (phi);
2572 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2573 {
2574 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2575 if (gimple_assign_rhs2 (next_stmt) == lhs)
2576 {
2577 tree op = gimple_assign_rhs1 (next_stmt);
2578 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2579
2580 /* Check that the other def is either defined in the loop
2581 ("vect_internal_def"), or it's an induction (defined by a
2582 loop-header phi-node). */
2583 if (def_stmt_info
2584 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2585 && vect_valid_reduction_input_p (def_stmt_info))
2586 {
2587 lhs = gimple_assign_lhs (next_stmt);
2588 continue;
2589 }
2590
2591 return false;
2592 }
2593 else
2594 {
2595 tree op = gimple_assign_rhs2 (next_stmt);
2596 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2597
2598 /* Check that the other def is either defined in the loop
2599 ("vect_internal_def"), or it's an induction (defined by a
2600 loop-header phi-node). */
2601 if (def_stmt_info
2602 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2603 && vect_valid_reduction_input_p (def_stmt_info))
2604 {
2605 if (dump_enabled_p ())
2606 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2607 next_stmt);
2608
2609 swap_ssa_operands (next_stmt,
2610 gimple_assign_rhs1_ptr (next_stmt),
2611 gimple_assign_rhs2_ptr (next_stmt));
2612 update_stmt (next_stmt);
2613
2614 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2615 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2616 }
2617 else
2618 return false;
2619 }
2620
2621 lhs = gimple_assign_lhs (next_stmt);
2622 }
2623
2624 /* Build up the actual chain. */
2625 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2626 {
2627 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2628 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2629 }
2630 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2631 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2632
2633 /* Save the chain for further analysis in SLP detection. */
2634 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2635 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2636
2637 return true;
2638 }
2639
2640 /* Return true if we need an in-order reduction for operation CODE
2641 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2642 overflow must wrap. */
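/* For example (a sketch, not tied to any particular target):

     float s = 0.0f;
     for (i = 0; i < n; i++)
       s += a[i];

   cannot be reassociated into a tree reduction without
   -fassociative-math, because FP addition is not associative, so it
   needs an in-order (FOLD_LEFT_REDUCTION) strategy; FP MIN_EXPR and
   MAX_EXPR reductions do not have this problem.  Likewise an integer
   accumulation whose overflow could trap must keep the original
   evaluation order.  */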
2643
2644 static bool
2645 needs_fold_left_reduction_p (tree type, tree_code code,
2646 bool need_wrapping_integral_overflow)
2647 {
2648 /* CHECKME: check for !flag_finite_math_only too? */
2649 if (SCALAR_FLOAT_TYPE_P (type))
2650 switch (code)
2651 {
2652 case MIN_EXPR:
2653 case MAX_EXPR:
2654 return false;
2655
2656 default:
2657 return !flag_associative_math;
2658 }
2659
2660 if (INTEGRAL_TYPE_P (type))
2661 {
2662 if (!operation_no_trapping_overflow (type, code))
2663 return true;
2664 if (need_wrapping_integral_overflow
2665 && !TYPE_OVERFLOW_WRAPS (type)
2666 && operation_can_overflow (code))
2667 return true;
2668 return false;
2669 }
2670
2671 if (SAT_FIXED_POINT_TYPE_P (type))
2672 return true;
2673
2674 return false;
2675 }
2676
2677 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2678 reduction operation CODE has a handled computation expression. */
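/* For instance, given the illustrative GIMPLE

     x_1 = PHI <x_4(latch), x_0(preheader)>
     ...
     x_2 = x_1 + a_5;
     x_3 = x_2 + b_6;
     x_4 = x_3 + c_7;

   the path from the latch argument x_4 back to the PHI result x_1 uses
   PLUS_EXPR throughout and every intermediate value has a single use,
   so the path is accepted.  A PLUS_EXPR reduction may also contain
   MINUS_EXPR statements, as long as the running reduction value is not
   negated an odd number of times overall.  */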
2679
2680 bool
2681 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2682 tree loop_arg, enum tree_code code)
2683 {
2684 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2685 auto_bitmap visited;
2686 tree lookfor = PHI_RESULT (phi);
2687 ssa_op_iter curri;
2688 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2689 while (USE_FROM_PTR (curr) != loop_arg)
2690 curr = op_iter_next_use (&curri);
2691 curri.i = curri.numops;
2692 do
2693 {
2694 path.safe_push (std::make_pair (curri, curr));
2695 tree use = USE_FROM_PTR (curr);
2696 if (use == lookfor)
2697 break;
2698 gimple *def = SSA_NAME_DEF_STMT (use);
2699 if (gimple_nop_p (def)
2700 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2701 {
2702 pop:
2703 do
2704 {
2705 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2706 curri = x.first;
2707 curr = x.second;
2708 do
2709 curr = op_iter_next_use (&curri);
2710 /* Skip already visited or non-SSA operands (from iterating
2711 over PHI args). */
2712 while (curr != NULL_USE_OPERAND_P
2713 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2714 || ! bitmap_set_bit (visited,
2715 SSA_NAME_VERSION
2716 (USE_FROM_PTR (curr)))));
2717 }
2718 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2719 if (curr == NULL_USE_OPERAND_P)
2720 break;
2721 }
2722 else
2723 {
2724 if (gimple_code (def) == GIMPLE_PHI)
2725 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2726 else
2727 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2728 while (curr != NULL_USE_OPERAND_P
2729 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2730 || ! bitmap_set_bit (visited,
2731 SSA_NAME_VERSION
2732 (USE_FROM_PTR (curr)))))
2733 curr = op_iter_next_use (&curri);
2734 if (curr == NULL_USE_OPERAND_P)
2735 goto pop;
2736 }
2737 }
2738 while (1);
2739 if (dump_file && (dump_flags & TDF_DETAILS))
2740 {
2741 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2742 unsigned i;
2743 std::pair<ssa_op_iter, use_operand_p> *x;
2744 FOR_EACH_VEC_ELT (path, i, x)
2745 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2746 dump_printf (MSG_NOTE, "\n");
2747 }
2748
2749 /* Check whether the reduction path detected is valid. */
2750 bool fail = path.length () == 0;
2751 bool neg = false;
2752 for (unsigned i = 1; i < path.length (); ++i)
2753 {
2754 gimple *use_stmt = USE_STMT (path[i].second);
2755 tree op = USE_FROM_PTR (path[i].second);
2756 if (! has_single_use (op)
2757 || ! is_gimple_assign (use_stmt))
2758 {
2759 fail = true;
2760 break;
2761 }
2762 if (gimple_assign_rhs_code (use_stmt) != code)
2763 {
2764 if (code == PLUS_EXPR
2765 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2766 {
2767 /* Track whether we negate the reduction value each iteration. */
2768 if (gimple_assign_rhs2 (use_stmt) == op)
2769 neg = ! neg;
2770 }
2771 else
2772 {
2773 fail = true;
2774 break;
2775 }
2776 }
2777 }
2778 return ! fail && ! neg;
2779 }
2780
2781
2782 /* Function vect_is_simple_reduction
2783
2784 (1) Detect a cross-iteration def-use cycle that represents a simple
2785 reduction computation. We look for the following pattern:
2786
2787 loop_header:
2788 a1 = phi < a0, a2 >
2789 a3 = ...
2790 a2 = operation (a3, a1)
2791
2792 or
2793
2794 a3 = ...
2795 loop_header:
2796 a1 = phi < a0, a2 >
2797 a2 = operation (a3, a1)
2798
2799 such that:
2800 1. operation is commutative and associative and it is safe to
2801 change the order of the computation
2802 2. no uses for a2 in the loop (a2 is used out of the loop)
2803 3. no uses of a1 in the loop besides the reduction operation
2804 4. no uses of a1 outside the loop.
2805
2806 Conditions 1,4 are tested here.
2807 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2808
2809 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2810 nested cycles.
2811
2812 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2813 reductions:
2814
2815 a1 = phi < a0, a2 >
2816 inner loop (def of a3)
2817 a2 = phi < a3 >
2818
2819 (4) Detect condition expressions, i.e.:
2820 for (int i = 0; i < N; i++)
2821 if (a[i] < val)
2822 ret_val = a[i];
2823
2824 */
2825
2826 static stmt_vec_info
2827 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2828 bool *double_reduc,
2829 bool need_wrapping_integral_overflow,
2830 enum vect_reduction_type *v_reduc_type)
2831 {
2832 gphi *phi = as_a <gphi *> (phi_info->stmt);
2833 struct loop *loop = (gimple_bb (phi))->loop_father;
2834 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2835 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2836 gimple *phi_use_stmt = NULL;
2837 enum tree_code orig_code, code;
2838 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2839 tree type;
2840 tree name;
2841 imm_use_iterator imm_iter;
2842 use_operand_p use_p;
2843 bool phi_def;
2844
2845 *double_reduc = false;
2846 *v_reduc_type = TREE_CODE_REDUCTION;
2847
2848 tree phi_name = PHI_RESULT (phi);
2849 /* ??? If there are no uses of the PHI result the inner loop reduction
2850 won't be detected as possibly double-reduction by vectorizable_reduction
2851 because that tries to walk the PHI arg from the preheader edge which
2852 can be constant. See PR60382. */
2853 if (has_zero_uses (phi_name))
2854 return NULL;
2855 unsigned nphi_def_loop_uses = 0;
2856 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2857 {
2858 gimple *use_stmt = USE_STMT (use_p);
2859 if (is_gimple_debug (use_stmt))
2860 continue;
2861
2862 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2863 {
2864 if (dump_enabled_p ())
2865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2866 "intermediate value used outside loop.\n");
2867
2868 return NULL;
2869 }
2870
2871 nphi_def_loop_uses++;
2872 phi_use_stmt = use_stmt;
2873 }
2874
2875 edge latch_e = loop_latch_edge (loop);
2876 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2877 if (TREE_CODE (loop_arg) != SSA_NAME)
2878 {
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881 "reduction: not ssa_name: %T\n", loop_arg);
2882 return NULL;
2883 }
2884
2885 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2886 if (!def_stmt_info
2887 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2888 return NULL;
2889
2890 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2891 {
2892 name = gimple_assign_lhs (def_stmt);
2893 phi_def = false;
2894 }
2895 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2896 {
2897 name = PHI_RESULT (def_stmt);
2898 phi_def = true;
2899 }
2900 else
2901 {
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904 "reduction: unhandled reduction operation: %G",
2905 def_stmt_info->stmt);
2906 return NULL;
2907 }
2908
2909 unsigned nlatch_def_loop_uses = 0;
2910 auto_vec<gphi *, 3> lcphis;
2911 bool inner_loop_of_double_reduc = false;
2912 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2913 {
2914 gimple *use_stmt = USE_STMT (use_p);
2915 if (is_gimple_debug (use_stmt))
2916 continue;
2917 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2918 nlatch_def_loop_uses++;
2919 else
2920 {
2921 /* We can have more than one loop-closed PHI. */
2922 lcphis.safe_push (as_a <gphi *> (use_stmt));
2923 if (nested_in_vect_loop
2924 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2925 == vect_double_reduction_def))
2926 inner_loop_of_double_reduc = true;
2927 }
2928 }
2929
2930 /* If this isn't a nested cycle, or if the nested cycle reduction value
2931 is used outside of the inner loop, we cannot handle uses of the reduction
2932 value. */
2933 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2934 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2935 {
2936 if (dump_enabled_p ())
2937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2938 "reduction used in loop.\n");
2939 return NULL;
2940 }
2941
2942 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2943 defined in the inner loop. */
2944 if (phi_def)
2945 {
2946 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2947 op1 = PHI_ARG_DEF (def_stmt, 0);
2948
2949 if (gimple_phi_num_args (def_stmt) != 1
2950 || TREE_CODE (op1) != SSA_NAME)
2951 {
2952 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954 "unsupported phi node definition.\n");
2955
2956 return NULL;
2957 }
2958
2959 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2960 if (gimple_bb (def1)
2961 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2962 && loop->inner
2963 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2964 && is_gimple_assign (def1)
2965 && is_a <gphi *> (phi_use_stmt)
2966 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2967 {
2968 if (dump_enabled_p ())
2969 report_vect_op (MSG_NOTE, def_stmt,
2970 "detected double reduction: ");
2971
2972 *double_reduc = true;
2973 return def_stmt_info;
2974 }
2975
2976 return NULL;
2977 }
2978
2979 /* If we are vectorizing an inner reduction, we execute it in the
2980 original order only when we are not dealing with a double
2981 reduction. */
2982 bool check_reduction = true;
2983 if (flow_loop_nested_p (vect_loop, loop))
2984 {
2985 gphi *lcphi;
2986 unsigned i;
2987 check_reduction = false;
2988 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2989 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2990 {
2991 gimple *use_stmt = USE_STMT (use_p);
2992 if (is_gimple_debug (use_stmt))
2993 continue;
2994 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2995 check_reduction = true;
2996 }
2997 }
2998
2999 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3000 code = orig_code = gimple_assign_rhs_code (def_stmt);
3001
3002 if (nested_in_vect_loop && !check_reduction)
3003 {
3004 /* FIXME: Even for non-reductions code generation is funneled
3005 through vectorizable_reduction for the stmt defining the
3006 PHI latch value. So we have to artificially restrict ourselves
3007 for the supported operations. */
3008 switch (get_gimple_rhs_class (code))
3009 {
3010 case GIMPLE_BINARY_RHS:
3011 case GIMPLE_TERNARY_RHS:
3012 break;
3013 default:
3014 /* Not supported by vectorizable_reduction. */
3015 if (dump_enabled_p ())
3016 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3017 "nested cycle: not handled operation: ");
3018 return NULL;
3019 }
3020 if (dump_enabled_p ())
3021 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3022 return def_stmt_info;
3023 }
3024
3025 /* We can handle "res -= x[i]", which is non-associative by
3026 simply rewriting this into "res += -x[i]". Avoid changing
3027 gimple instruction for the first simple tests and only do this
3028 if we're allowed to change code at all. */
3029 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3030 code = PLUS_EXPR;
3031
3032 if (code == COND_EXPR)
3033 {
3034 if (! nested_in_vect_loop)
3035 *v_reduc_type = COND_REDUCTION;
3036
3037 op3 = gimple_assign_rhs1 (def_stmt);
3038 if (COMPARISON_CLASS_P (op3))
3039 {
3040 op4 = TREE_OPERAND (op3, 1);
3041 op3 = TREE_OPERAND (op3, 0);
3042 }
3043 if (op3 == phi_name || op4 == phi_name)
3044 {
3045 if (dump_enabled_p ())
3046 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3047 "reduction: condition depends on previous"
3048 " iteration: ");
3049 return NULL;
3050 }
3051
3052 op1 = gimple_assign_rhs2 (def_stmt);
3053 op2 = gimple_assign_rhs3 (def_stmt);
3054 }
3055 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3056 {
3057 if (dump_enabled_p ())
3058 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3059 "reduction: not commutative/associative: ");
3060 return NULL;
3061 }
3062 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3063 {
3064 op1 = gimple_assign_rhs1 (def_stmt);
3065 op2 = gimple_assign_rhs2 (def_stmt);
3066 }
3067 else
3068 {
3069 if (dump_enabled_p ())
3070 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3071 "reduction: not handled operation: ");
3072 return NULL;
3073 }
3074
3075 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3076 {
3077 if (dump_enabled_p ())
3078 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3079 "reduction: both uses not ssa_names: ");
3080
3081 return NULL;
3082 }
3083
3084 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3085 if ((TREE_CODE (op1) == SSA_NAME
3086 && !types_compatible_p (type,TREE_TYPE (op1)))
3087 || (TREE_CODE (op2) == SSA_NAME
3088 && !types_compatible_p (type, TREE_TYPE (op2)))
3089 || (op3 && TREE_CODE (op3) == SSA_NAME
3090 && !types_compatible_p (type, TREE_TYPE (op3)))
3091 || (op4 && TREE_CODE (op4) == SSA_NAME
3092 && !types_compatible_p (type, TREE_TYPE (op4))))
3093 {
3094 if (dump_enabled_p ())
3095 {
3096 dump_printf_loc (MSG_NOTE, vect_location,
3097 "reduction: multiple types: operation type: "
3098 "%T, operands types: %T,%T",
3099 type, TREE_TYPE (op1), TREE_TYPE (op2));
3100 if (op3)
3101 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3102
3103 if (op4)
3104 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3105 dump_printf (MSG_NOTE, "\n");
3106 }
3107
3108 return NULL;
3109 }
3110
3111 /* Check whether it's ok to change the order of the computation.
3112 Generally, when vectorizing a reduction we change the order of the
3113 computation. This may change the behavior of the program in some
3114 cases, so we need to check that this is ok. One exception is when
3115 vectorizing an outer-loop: the inner-loop is executed sequentially,
3116 and therefore vectorizing reductions in the inner-loop during
3117 outer-loop vectorization is safe. */
3118 if (check_reduction
3119 && *v_reduc_type == TREE_CODE_REDUCTION
3120 && needs_fold_left_reduction_p (type, code,
3121 need_wrapping_integral_overflow))
3122 *v_reduc_type = FOLD_LEFT_REDUCTION;
3123
3124 /* Reduction is safe. We're dealing with one of the following:
3125 1) integer arithmetic and no trapv
3126 2) floating point arithmetic, and special flags permit this optimization
3127 3) nested cycle (i.e., outer loop vectorization). */
3128 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3129 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3130 if (code != COND_EXPR && !def1_info && !def2_info)
3131 {
3132 if (dump_enabled_p ())
3133 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3134 return NULL;
3135 }
3136
3137 /* Check that one def is the reduction def, defined by PHI,
3138 the other def is either defined in the loop ("vect_internal_def"),
3139 or it's an induction (defined by a loop-header phi-node). */
3140
3141 if (def2_info
3142 && def2_info->stmt == phi
3143 && (code == COND_EXPR
3144 || !def1_info
3145 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3146 || vect_valid_reduction_input_p (def1_info)))
3147 {
3148 if (dump_enabled_p ())
3149 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3150 return def_stmt_info;
3151 }
3152
3153 if (def1_info
3154 && def1_info->stmt == phi
3155 && (code == COND_EXPR
3156 || !def2_info
3157 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3158 || vect_valid_reduction_input_p (def2_info)))
3159 {
3160 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3161 {
3162 /* Check if we can swap operands (just for simplicity - so that
3163 the rest of the code can assume that the reduction variable
3164 is always the last (second) argument). */
3165 if (code == COND_EXPR)
3166 {
3167 /* Swap cond_expr by inverting the condition. */
3168 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3169 enum tree_code invert_code = ERROR_MARK;
3170 enum tree_code cond_code = TREE_CODE (cond_expr);
3171
3172 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3173 {
3174 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3175 invert_code = invert_tree_comparison (cond_code, honor_nans);
3176 }
3177 if (invert_code != ERROR_MARK)
3178 {
3179 TREE_SET_CODE (cond_expr, invert_code);
3180 swap_ssa_operands (def_stmt,
3181 gimple_assign_rhs2_ptr (def_stmt),
3182 gimple_assign_rhs3_ptr (def_stmt));
3183 }
3184 else
3185 {
3186 if (dump_enabled_p ())
3187 report_vect_op (MSG_NOTE, def_stmt,
3188 "detected reduction: cannot swap operands "
3189 "for cond_expr");
3190 return NULL;
3191 }
3192 }
3193 else
3194 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3195 gimple_assign_rhs2_ptr (def_stmt));
3196
3197 if (dump_enabled_p ())
3198 report_vect_op (MSG_NOTE, def_stmt,
3199 "detected reduction: need to swap operands: ");
3200
3201 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3202 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3203 }
3204 else
3205 {
3206 if (dump_enabled_p ())
3207 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3208 }
3209
3210 return def_stmt_info;
3211 }
3212
3213 /* Try to find SLP reduction chain. */
3214 if (! nested_in_vect_loop
3215 && code != COND_EXPR
3216 && orig_code != MINUS_EXPR
3217 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3218 {
3219 if (dump_enabled_p ())
3220 report_vect_op (MSG_NOTE, def_stmt,
3221 "reduction: detected reduction chain: ");
3222
3223 return def_stmt_info;
3224 }
3225
3226 /* Look for the expression computing loop_arg from loop PHI result. */
3227 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3228 return def_stmt_info;
3229
3230 if (dump_enabled_p ())
3231 {
3232 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3233 "reduction: unknown pattern: ");
3234 }
3235
3236 return NULL;
3237 }
3238
3239 /* Wrapper around vect_is_simple_reduction, which will modify code
3240 in-place if it enables detection of more reductions. Arguments
3241 as there. */
3242
3243 stmt_vec_info
3244 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3245 bool *double_reduc,
3246 bool need_wrapping_integral_overflow)
3247 {
3248 enum vect_reduction_type v_reduc_type;
3249 stmt_vec_info def_info
3250 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3251 need_wrapping_integral_overflow,
3252 &v_reduc_type);
3253 if (def_info)
3254 {
3255 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3256 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3257 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3258 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3259 }
3260 return def_info;
3261 }
3262
3263 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
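/* Worked example with illustrative numbers: for a loop with 103 known
   iterations, an assumed VF of 8 and PEEL_ITERS_PROLOGUE of 3, the
   epilogue gets (103 - 3) % 8 = 4 iterations; the scalar cost vector is
   then charged 3 times into the prologue costs and 4 times into the
   epilogue costs.  */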
3264 int
3265 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3266 int *peel_iters_epilogue,
3267 stmt_vector_for_cost *scalar_cost_vec,
3268 stmt_vector_for_cost *prologue_cost_vec,
3269 stmt_vector_for_cost *epilogue_cost_vec)
3270 {
3271 int retval = 0;
3272 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3273
3274 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3275 {
3276 *peel_iters_epilogue = assumed_vf / 2;
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_NOTE, vect_location,
3279 "cost model: epilogue peel iters set to vf/2 "
3280 "because loop iterations are unknown .\n");
3281
3282 /* If peeled iterations are known but the number of scalar loop
3283 iterations is unknown, count a taken branch per peeled loop. */
3284 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3285 NULL, 0, vect_prologue);
3286 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3287 NULL, 0, vect_epilogue);
3288 }
3289 else
3290 {
3291 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3292 peel_iters_prologue = niters < peel_iters_prologue ?
3293 niters : peel_iters_prologue;
3294 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3295 /* If we need to peel for gaps, but no peeling is required, we have to
3296 peel VF iterations. */
3297 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3298 *peel_iters_epilogue = assumed_vf;
3299 }
3300
3301 stmt_info_for_cost *si;
3302 int j;
3303 if (peel_iters_prologue)
3304 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3305 retval += record_stmt_cost (prologue_cost_vec,
3306 si->count * peel_iters_prologue,
3307 si->kind, si->stmt_info, si->misalign,
3308 vect_prologue);
3309 if (*peel_iters_epilogue)
3310 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3311 retval += record_stmt_cost (epilogue_cost_vec,
3312 si->count * *peel_iters_epilogue,
3313 si->kind, si->stmt_info, si->misalign,
3314 vect_epilogue);
3315
3316 return retval;
3317 }
3318
3319 /* Function vect_estimate_min_profitable_iters
3320
3321 Return the number of iterations required for the vector version of the
3322 loop to be profitable relative to the cost of the scalar version of the
3323 loop.
3324
3325 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3326 of iterations for vectorization. -1 value means loop vectorization
3327 is not profitable. This returned value may be used for dynamic
3328 profitability check.
3329
3330 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3331 for static check against estimated number of iterations. */
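/* As a rough sketch with illustrative numbers (the real computation in
   this function also accounts for prologue/epilogue peeling, masking and
   versioning costs): with SIC = scalar cost per iteration, VIC = vector
   cost per vector iteration, VOC = one-off vector overhead and SOC =
   one-off scalar overhead, vectorization starts to win once

     SOC + SIC * niters > VOC + VIC * (niters / VF)

   which solves to niters > (VOC - SOC) * VF / (SIC * VF - VIC); e.g.
   SIC = 4, VIC = 6, VOC - SOC = 20, VF = 4 gives niters > 80 / 10 = 8.  */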
3332
3333 static void
3334 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3335 int *ret_min_profitable_niters,
3336 int *ret_min_profitable_estimate)
3337 {
3338 int min_profitable_iters;
3339 int min_profitable_estimate;
3340 int peel_iters_prologue;
3341 int peel_iters_epilogue;
3342 unsigned vec_inside_cost = 0;
3343 int vec_outside_cost = 0;
3344 unsigned vec_prologue_cost = 0;
3345 unsigned vec_epilogue_cost = 0;
3346 int scalar_single_iter_cost = 0;
3347 int scalar_outside_cost = 0;
3348 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3349 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3350 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3351
3352 /* Cost model disabled. */
3353 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3354 {
3355 if (dump_enabled_p ())
3356 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3357 *ret_min_profitable_niters = 0;
3358 *ret_min_profitable_estimate = 0;
3359 return;
3360 }
3361
3362 /* Requires loop versioning tests to handle misalignment. */
3363 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3364 {
3365 /* FIXME: Make cost depend on complexity of individual check. */
3366 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3367 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3368 vect_prologue);
3369 if (dump_enabled_p ())
3370 dump_printf (MSG_NOTE,
3371 "cost model: Adding cost of checks for loop "
3372 "versioning to treat misalignment.\n");
3373 }
3374
3375 /* Requires loop versioning with alias checks. */
3376 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3377 {
3378 /* FIXME: Make cost depend on complexity of individual check. */
3379 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3380 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3381 vect_prologue);
3382 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3383 if (len)
3384 /* Count LEN - 1 ANDs and LEN comparisons. */
3385 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3386 NULL, 0, vect_prologue);
3387 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3388 if (len)
3389 {
3390 /* Count LEN - 1 ANDs and LEN comparisons. */
3391 unsigned int nstmts = len * 2 - 1;
3392 /* +1 for each bias that needs adding. */
3393 for (unsigned int i = 0; i < len; ++i)
3394 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3395 nstmts += 1;
3396 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3397 NULL, 0, vect_prologue);
3398 }
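/* Illustrative example (invented numbers): for LEN == 3 lower bounds of
which two are signed, the above counts 3 * 2 - 1 == 5 comparison/AND
statements plus 2 bias additions, i.e. NSTMTS == 7.  */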
3399 if (dump_enabled_p ())
3400 dump_printf (MSG_NOTE,
3401 "cost model: Adding cost of checks for loop "
3402 "versioning aliasing.\n");
3403 }
3404
3405 /* Requires loop versioning with niter checks. */
3406 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3407 {
3408 /* FIXME: Make cost depend on complexity of individual check. */
3409 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3410 vect_prologue);
3411 if (dump_enabled_p ())
3412 dump_printf (MSG_NOTE,
3413 "cost model: Adding cost of checks for loop "
3414 "versioning niters.\n");
3415 }
3416
3417 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3418 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3419 vect_prologue);
3420
3421 /* Count statements in scalar loop. Using this as scalar cost for a single
3422 iteration for now.
3423
3424 TODO: Add outer loop support.
3425
3426 TODO: Consider assigning different costs to different scalar
3427 statements. */
3428
3429 scalar_single_iter_cost
3430 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3431
3432 /* Add additional cost for the peeled instructions in prologue and epilogue
3433 loop. (For fully-masked loops there will be no peeling.)
3434
3435 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3436 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3437
3438 TODO: Build an expression that represents peel_iters for prologue and
3439 epilogue to be used in a run-time test. */
3440
3441 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3442 {
3443 peel_iters_prologue = 0;
3444 peel_iters_epilogue = 0;
3445
3446 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3447 {
3448 /* We need to peel exactly one iteration. */
3449 peel_iters_epilogue += 1;
3450 stmt_info_for_cost *si;
3451 int j;
3452 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3453 j, si)
3454 (void) add_stmt_cost (target_cost_data, si->count,
3455 si->kind, si->stmt_info, si->misalign,
3456 vect_epilogue);
3457 }
3458 }
3459 else if (npeel < 0)
3460 {
3461 peel_iters_prologue = assumed_vf / 2;
3462 if (dump_enabled_p ())
3463 dump_printf (MSG_NOTE, "cost model: "
3464 "prologue peel iters set to vf/2.\n");
3465
3466 /* If peeling for alignment is unknown, the loop bound of the main loop
3467 becomes unknown. */
3468 peel_iters_epilogue = assumed_vf / 2;
3469 if (dump_enabled_p ())
3470 dump_printf (MSG_NOTE, "cost model: "
3471 "epilogue peel iters set to vf/2 because "
3472 "peeling for alignment is unknown.\n");
3473
3474 /* If peeled iterations are unknown, count a taken branch and a not taken
3475 branch per peeled loop. Even if scalar loop iterations are known,
3476 vector iterations are not known since peeled prologue iterations are
3477 not known. Hence guards remain the same. */
3478 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3479 NULL, 0, vect_prologue);
3480 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3481 NULL, 0, vect_prologue);
3482 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3483 NULL, 0, vect_epilogue);
3484 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3485 NULL, 0, vect_epilogue);
3486 stmt_info_for_cost *si;
3487 int j;
3488 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3489 {
3490 (void) add_stmt_cost (target_cost_data,
3491 si->count * peel_iters_prologue,
3492 si->kind, si->stmt_info, si->misalign,
3493 vect_prologue);
3494 (void) add_stmt_cost (target_cost_data,
3495 si->count * peel_iters_epilogue,
3496 si->kind, si->stmt_info, si->misalign,
3497 vect_epilogue);
3498 }
3499 }
3500 else
3501 {
3502 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3503 stmt_info_for_cost *si;
3504 int j;
3505 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3506
3507 prologue_cost_vec.create (2);
3508 epilogue_cost_vec.create (2);
3509 peel_iters_prologue = npeel;
3510
3511 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3512 &peel_iters_epilogue,
3513 &LOOP_VINFO_SCALAR_ITERATION_COST
3514 (loop_vinfo),
3515 &prologue_cost_vec,
3516 &epilogue_cost_vec);
3517
3518 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3519 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3520 si->misalign, vect_prologue);
3521
3522 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3523 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3524 si->misalign, vect_epilogue);
3525
3526 prologue_cost_vec.release ();
3527 epilogue_cost_vec.release ();
3528 }
3529
3530 /* FORNOW: The scalar outside cost is incremented in one of the
3531 following ways:
3532
3533 1. The vectorizer checks for alignment and aliasing and generates
3534 a condition that allows dynamic vectorization. A cost model
3535 check is ANDED with the versioning condition. Hence scalar code
3536 path now has the added cost of the versioning check.
3537
3538 if (cost > th & versioning_check)
3539 jmp to vector code
3540
3541 Hence run-time scalar is incremented by not-taken branch cost.
3542
3543 2. The vectorizer then checks if a prologue is required. If the
3544 cost model check was not done before during versioning, it has to
3545 be done before the prologue check.
3546
3547 if (cost <= th)
3548 prologue = scalar_iters
3549 if (prologue == 0)
3550 jmp to vector code
3551 else
3552 execute prologue
3553 if (prologue == num_iters)
3554 go to exit
3555
3556 Hence the run-time scalar cost is incremented by a taken branch,
3557 plus a not-taken branch, plus a taken branch cost.
3558
3559 3. The vectorizer then checks if an epilogue is required. If the
3560 cost model check was not done before during prologue check, it
3561 has to be done with the epilogue check.
3562
3563 if (prologue == 0)
3564 jmp to vector code
3565 else
3566 execute prologue
3567 if (prologue == num_iters)
3568 go to exit
3569 vector code:
3570 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3571 jmp to epilogue
3572
3573 Hence the run-time scalar cost should be incremented by 2 taken
3574 branches.
3575
3576 TODO: The back end may reorder the BBs differently and reverse
3577 conditions/branch directions. Change the estimates below to
3578 something more reasonable. */
3579
3580 /* If the number of iterations is known and we do not do versioning, we can
3581 decide whether to vectorize at compile time. Hence the scalar version
3582 does not carry cost model guard costs. */
3583 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3584 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3585 {
3586 /* Cost model check occurs at versioning. */
3587 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3588 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3589 else
3590 {
3591 /* Cost model check occurs at prologue generation. */
3592 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3593 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3594 + vect_get_stmt_cost (cond_branch_not_taken);
3595 /* Cost model check occurs at epilogue generation. */
3596 else
3597 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3598 }
3599 }
3600
3601 /* Complete the target-specific cost calculations. */
3602 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3603 &vec_inside_cost, &vec_epilogue_cost);
3604
3605 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3606
3607 if (dump_enabled_p ())
3608 {
3609 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3610 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3611 vec_inside_cost);
3612 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3613 vec_prologue_cost);
3614 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3615 vec_epilogue_cost);
3616 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3617 scalar_single_iter_cost);
3618 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3619 scalar_outside_cost);
3620 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3621 vec_outside_cost);
3622 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3623 peel_iters_prologue);
3624 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3625 peel_iters_epilogue);
3626 }
3627
3628 /* Calculate number of iterations required to make the vector version
3629 profitable, relative to the loop bodies only. The following condition
3630 must hold true:
3631 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3632 where
3633 SIC = scalar iteration cost, VIC = vector iteration cost,
3634 VOC = vector outside cost, VF = vectorization factor,
3635 NPEEL = prologue iterations + epilogue iterations,
3636 SOC = scalar outside cost for run time cost model check. */
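/* Purely illustrative example (all numbers invented): with SIC == 4,
VIC == 6, VF == 4, VOC == 20, SOC == 0 and one peeled iteration each in
the prologue and epilogue, the unmasked path below computes
(20 * 4 - 6 - 6) / (4 * 4 - 6) == 6 and then rounds up to 7, the
smallest trip count for which 4 * niters > 6 * ((niters - 2) / 4) + 20
holds.  */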
3637
3638 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3639 - vec_inside_cost);
3640 if (saving_per_viter <= 0)
3641 {
3642 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3643 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3644 "vectorization did not happen for a simd loop");
3645
3646 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3648 "cost model: the vector iteration cost = %d "
3649 "divided by the scalar iteration cost = %d "
3650 "is greater or equal to the vectorization factor = %d"
3651 ".\n",
3652 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3653 *ret_min_profitable_niters = -1;
3654 *ret_min_profitable_estimate = -1;
3655 return;
3656 }
3657
3658 /* ??? The "if" arm is written to handle all cases; see below for what
3659 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3660 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3661 {
3662 /* Rewriting the condition above in terms of the number of
3663 vector iterations (vniters) rather than the number of
3664 scalar iterations (niters) gives:
3665
3666 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3667
3668 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3669
3670 For integer N, X and Y when X > 0:
3671
3672 N * X > Y <==> N >= (Y /[floor] X) + 1. */
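/* For instance, with X == 3 and Y == 7, N * 3 > 7 first holds for
N == 3, which matches 7 / 3 + 1 under flooring division.  */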
3673 int outside_overhead = (vec_outside_cost
3674 - scalar_single_iter_cost * peel_iters_prologue
3675 - scalar_single_iter_cost * peel_iters_epilogue
3676 - scalar_outside_cost);
3677 /* We're only interested in cases that require at least one
3678 vector iteration. */
3679 int min_vec_niters = 1;
3680 if (outside_overhead > 0)
3681 min_vec_niters = outside_overhead / saving_per_viter + 1;
3682
3683 if (dump_enabled_p ())
3684 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3685 min_vec_niters);
3686
3687 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3688 {
3689 /* Now that we know the minimum number of vector iterations,
3690 find the minimum niters for which the scalar cost is larger:
3691
3692 SIC * niters > VIC * vniters + VOC - SOC
3693
3694 We know that the minimum niters is no more than
3695 vniters * VF + NPEEL, but it might be (and often is) less
3696 than that if a partial vector iteration is cheaper than the
3697 equivalent scalar code. */
3698 int threshold = (vec_inside_cost * min_vec_niters
3699 + vec_outside_cost
3700 - scalar_outside_cost);
3701 if (threshold <= 0)
3702 min_profitable_iters = 1;
3703 else
3704 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3705 }
3706 else
3707 /* Convert the number of vector iterations into a number of
3708 scalar iterations. */
3709 min_profitable_iters = (min_vec_niters * assumed_vf
3710 + peel_iters_prologue
3711 + peel_iters_epilogue);
3712 }
3713 else
3714 {
3715 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3716 * assumed_vf
3717 - vec_inside_cost * peel_iters_prologue
3718 - vec_inside_cost * peel_iters_epilogue);
3719 if (min_profitable_iters <= 0)
3720 min_profitable_iters = 0;
3721 else
3722 {
3723 min_profitable_iters /= saving_per_viter;
3724
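/* The truncating division above can land just below the break-even
point; if the scalar cost at the computed trip count still does not
exceed the vector cost, one more iteration is required.  */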
3725 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3726 <= (((int) vec_inside_cost * min_profitable_iters)
3727 + (((int) vec_outside_cost - scalar_outside_cost)
3728 * assumed_vf)))
3729 min_profitable_iters++;
3730 }
3731 }
3732
3733 if (dump_enabled_p ())
3734 dump_printf (MSG_NOTE,
3735 " Calculated minimum iters for profitability: %d\n",
3736 min_profitable_iters);
3737
3738 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3739 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3740 /* We want the vectorized loop to execute at least once. */
3741 min_profitable_iters = assumed_vf + peel_iters_prologue;
3742
3743 if (dump_enabled_p ())
3744 dump_printf_loc (MSG_NOTE, vect_location,
3745 " Runtime profitability threshold = %d\n",
3746 min_profitable_iters);
3747
3748 *ret_min_profitable_niters = min_profitable_iters;
3749
3750 /* Calculate number of iterations required to make the vector version
3751 profitable, relative to the loop bodies only.
3752
3753 The non-vectorized variant costs SIC * niters and must win over the
3754 vector variant on the expected loop trip count, i.e. the following must hold:
3755 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3756
3757 if (vec_outside_cost <= 0)
3758 min_profitable_estimate = 0;
3759 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3760 {
3761 /* This is a repeat of the code above, but with + SOC rather
3762 than - SOC. */
3763 int outside_overhead = (vec_outside_cost
3764 - scalar_single_iter_cost * peel_iters_prologue
3765 - scalar_single_iter_cost * peel_iters_epilogue
3766 + scalar_outside_cost);
3767 int min_vec_niters = 1;
3768 if (outside_overhead > 0)
3769 min_vec_niters = outside_overhead / saving_per_viter + 1;
3770
3771 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3772 {
3773 int threshold = (vec_inside_cost * min_vec_niters
3774 + vec_outside_cost
3775 + scalar_outside_cost);
3776 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3777 }
3778 else
3779 min_profitable_estimate = (min_vec_niters * assumed_vf
3780 + peel_iters_prologue
3781 + peel_iters_epilogue);
3782 }
3783 else
3784 {
3785 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3786 * assumed_vf
3787 - vec_inside_cost * peel_iters_prologue
3788 - vec_inside_cost * peel_iters_epilogue)
3789 / ((scalar_single_iter_cost * assumed_vf)
3790 - vec_inside_cost);
3791 }
3792 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3793 if (dump_enabled_p ())
3794 dump_printf_loc (MSG_NOTE, vect_location,
3795 " Static estimate profitability threshold = %d\n",
3796 min_profitable_estimate);
3797
3798 *ret_min_profitable_estimate = min_profitable_estimate;
3799 }
3800
3801 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3802 vector elements (not bits) for a vector with NELT elements. */
3803 static void
3804 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3805 vec_perm_builder *sel)
3806 {
3807 /* The encoding is a single stepped pattern. Any wrap-around is handled
3808 by vec_perm_indices. */
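/* Illustrative example (values invented): OFFSET == 2 and NELT == 8
produce the selector {2, 3, 4, 5, 6, 7, 8, 9}, i.e. the concatenation of
the two input vectors shifted down by two elements.  */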
3809 sel->new_vector (nelt, 1, 3);
3810 for (unsigned int i = 0; i < 3; i++)
3811 sel->quick_push (i + offset);
3812 }
3813
3814 /* Checks whether the target supports whole-vector shifts for vectors of mode
3815 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3816 it supports vec_perm_const with masks for all necessary shift amounts. */
3817 static bool
3818 have_whole_vector_shift (machine_mode mode)
3819 {
3820 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3821 return true;
3822
3823 /* Variable-length vectors should be handled via the optab. */
3824 unsigned int nelt;
3825 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3826 return false;
3827
3828 vec_perm_builder sel;
3829 vec_perm_indices indices;
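/* The loop below checks the power-of-two shift amounts NELT/2, NELT/4,
..., 1, which are the amounts the reduction epilogue may use; if any of
them cannot be done as a constant permute, give up.  */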
3830 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3831 {
3832 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3833 indices.new_vector (sel, 2, nelt);
3834 if (!can_vec_perm_const_p (mode, indices, false))
3835 return false;
3836 }
3837 return true;
3838 }
3839
3840 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3841 functions. Design better to avoid maintenance issues. */
3842
3843 /* Function vect_model_reduction_cost.
3844
3845 Models cost for a reduction operation, including the vector ops
3846 generated within the strip-mine loop, the initial definition before
3847 the loop, and the epilogue code that must be generated. */
3848
3849 static void
3850 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3851 int ncopies, stmt_vector_for_cost *cost_vec)
3852 {
3853 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3854 enum tree_code code;
3855 optab optab;
3856 tree vectype;
3857 machine_mode mode;
3858 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3859 struct loop *loop = NULL;
3860
3861 if (loop_vinfo)
3862 loop = LOOP_VINFO_LOOP (loop_vinfo);
3863
3864 /* Condition reductions generate two reductions in the loop. */
3865 vect_reduction_type reduction_type
3866 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3867 if (reduction_type == COND_REDUCTION)
3868 ncopies *= 2;
3869
3870 vectype = STMT_VINFO_VECTYPE (stmt_info);
3871 mode = TYPE_MODE (vectype);
3872 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3873
3874 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3875
3876 if (reduction_type == EXTRACT_LAST_REDUCTION
3877 || reduction_type == FOLD_LEFT_REDUCTION)
3878 {
3879 /* No extra instructions needed in the prologue. */
3880 prologue_cost = 0;
3881
3882 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3883 /* Count one reduction-like operation per vector. */
3884 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3885 stmt_info, 0, vect_body);
3886 else
3887 {
3888 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3889 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3890 inside_cost = record_stmt_cost (cost_vec, nelements,
3891 vec_to_scalar, stmt_info, 0,
3892 vect_body);
3893 inside_cost += record_stmt_cost (cost_vec, nelements,
3894 scalar_stmt, stmt_info, 0,
3895 vect_body);
3896 }
3897 }
3898 else
3899 {
3900 /* Add in cost for initial definition.
3901 For cond reduction we have four vectors: initial index, step,
3902 initial result of the data reduction, initial value of the index
3903 reduction. */
3904 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3905 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3906 scalar_to_vec, stmt_info, 0,
3907 vect_prologue);
3908
3909 /* Cost of reduction op inside loop. */
3910 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3911 stmt_info, 0, vect_body);
3912 }
3913
3914 /* Determine cost of epilogue code.
3915
3916 We have a reduction operator that will reduce the vector in one statement.
3917 Also requires scalar extract. */
3918
3919 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3920 {
3921 if (reduc_fn != IFN_LAST)
3922 {
3923 if (reduction_type == COND_REDUCTION)
3924 {
3925 /* An EQ stmt and a COND_EXPR stmt. */
3926 epilogue_cost += record_stmt_cost (cost_vec, 2,
3927 vector_stmt, stmt_info, 0,
3928 vect_epilogue);
3929 /* Reduction of the max index and a reduction of the found
3930 values. */
3931 epilogue_cost += record_stmt_cost (cost_vec, 2,
3932 vec_to_scalar, stmt_info, 0,
3933 vect_epilogue);
3934 /* A broadcast of the max value. */
3935 epilogue_cost += record_stmt_cost (cost_vec, 1,
3936 scalar_to_vec, stmt_info, 0,
3937 vect_epilogue);
3938 }
3939 else
3940 {
3941 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3942 stmt_info, 0, vect_epilogue);
3943 epilogue_cost += record_stmt_cost (cost_vec, 1,
3944 vec_to_scalar, stmt_info, 0,
3945 vect_epilogue);
3946 }
3947 }
3948 else if (reduction_type == COND_REDUCTION)
3949 {
3950 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3951 /* Extraction of scalar elements. */
3952 epilogue_cost += record_stmt_cost (cost_vec,
3953 2 * estimated_nunits,
3954 vec_to_scalar, stmt_info, 0,
3955 vect_epilogue);
3956 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3957 epilogue_cost += record_stmt_cost (cost_vec,
3958 2 * estimated_nunits - 3,
3959 scalar_stmt, stmt_info, 0,
3960 vect_epilogue);
3961 }
3962 else if (reduction_type == EXTRACT_LAST_REDUCTION
3963 || reduction_type == FOLD_LEFT_REDUCTION)
3964 /* No extra instructions are needed in the epilogue. */
3965 ;
3966 else
3967 {
3968 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3969 tree bitsize =
3970 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3971 int element_bitsize = tree_to_uhwi (bitsize);
3972 int nelements = vec_size_in_bits / element_bitsize;
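/* E.g. for a 128-bit vector of 32-bit elements (illustrative numbers
only) NELEMENTS is 4, so the whole-vector-shift scheme below costs
exact_log2 (4) * 2 == 4 vector statements plus one extract, and the
extract-and-operate fallback costs 4 + 3 statements.  */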
3973
3974 if (code == COND_EXPR)
3975 code = MAX_EXPR;
3976
3977 optab = optab_for_tree_code (code, vectype, optab_default);
3978
3979 /* We have a whole vector shift available. */
3980 if (optab != unknown_optab
3981 && VECTOR_MODE_P (mode)
3982 && optab_handler (optab, mode) != CODE_FOR_nothing
3983 && have_whole_vector_shift (mode))
3984 {
3985 /* Final reduction via vector shifts and the reduction operator.
3986 Also requires scalar extract. */
3987 epilogue_cost += record_stmt_cost (cost_vec,
3988 exact_log2 (nelements) * 2,
3989 vector_stmt, stmt_info, 0,
3990 vect_epilogue);
3991 epilogue_cost += record_stmt_cost (cost_vec, 1,
3992 vec_to_scalar, stmt_info, 0,
3993 vect_epilogue);
3994 }
3995 else
3996 /* Use extracts and reduction op for final reduction. For N
3997 elements, we have N extracts and N-1 reduction ops. */
3998 epilogue_cost += record_stmt_cost (cost_vec,
3999 nelements + nelements - 1,
4000 vector_stmt, stmt_info, 0,
4001 vect_epilogue);
4002 }
4003 }
4004
4005 if (dump_enabled_p ())
4006 dump_printf (MSG_NOTE,
4007 "vect_model_reduction_cost: inside_cost = %d, "
4008 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4009 prologue_cost, epilogue_cost);
4010 }
4011
4012
4013 /* Function vect_model_induction_cost.
4014
4015 Models cost for induction operations. */
4016
4017 static void
4018 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4019 stmt_vector_for_cost *cost_vec)
4020 {
4021 unsigned inside_cost, prologue_cost;
4022
4023 if (PURE_SLP_STMT (stmt_info))
4024 return;
4025
4026 /* loop cost for vec_loop. */
4027 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4028 stmt_info, 0, vect_body);
4029
4030 /* prologue cost for vec_init and vec_step. */
4031 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4032 stmt_info, 0, vect_prologue);
4033
4034 if (dump_enabled_p ())
4035 dump_printf_loc (MSG_NOTE, vect_location,
4036 "vect_model_induction_cost: inside_cost = %d, "
4037 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4038 }
4039
4040
4041
4042 /* Function get_initial_def_for_reduction
4043
4044 Input:
4045 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4046 INIT_VAL - the initial value of the reduction variable
4047
4048 Output:
4049 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4050 of the reduction (used for adjusting the epilog - see below).
4051 Return a vector variable, initialized according to the operation that
4052 STMT_VINFO performs. This vector will be used as the initial value
4053 of the vector of partial results.
4054
4055 Option1 (adjust in epilog): Initialize the vector as follows:
4056 add/bit or/xor: [0,0,...,0,0]
4057 mult/bit and: [1,1,...,1,1]
4058 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4059 and when necessary (e.g. add/mult case) let the caller know
4060 that it needs to adjust the result by init_val.
4061
4062 Option2: Initialize the vector as follows:
4063 add/bit or/xor: [init_val,0,0,...,0]
4064 mult/bit and: [init_val,1,1,...,1]
4065 min/max/cond_expr: [init_val,init_val,...,init_val]
4066 and no adjustments are needed.
4067
4068 For example, for the following code:
4069
4070 s = init_val;
4071 for (i=0;i<n;i++)
4072 s = s + a[i];
4073
4074 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4075 For a vector of 4 units, we want to return either [0,0,0,init_val],
4076 or [0,0,0,0] and let the caller know that it needs to adjust
4077 the result at the end by 'init_val'.
4078
4079 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4080 is not NULL, because this way the initialization vector is simpler (the
4081 same element in all entries); otherwise we use Option2.
4082
4083 A cost model should help decide between these two schemes. */
4084
4085 tree
4086 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4087 tree *adjustment_def)
4088 {
4089 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4090 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4091 tree scalar_type = TREE_TYPE (init_val);
4092 tree vectype = get_vectype_for_scalar_type (scalar_type);
4093 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4094 tree def_for_init;
4095 tree init_def;
4096 REAL_VALUE_TYPE real_init_val = dconst0;
4097 int int_init_val = 0;
4098 gimple_seq stmts = NULL;
4099
4100 gcc_assert (vectype);
4101
4102 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4103 || SCALAR_FLOAT_TYPE_P (scalar_type));
4104
4105 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4106 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4107
4108 vect_reduction_type reduction_type
4109 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4110
4111 switch (code)
4112 {
4113 case WIDEN_SUM_EXPR:
4114 case DOT_PROD_EXPR:
4115 case SAD_EXPR:
4116 case PLUS_EXPR:
4117 case MINUS_EXPR:
4118 case BIT_IOR_EXPR:
4119 case BIT_XOR_EXPR:
4120 case MULT_EXPR:
4121 case BIT_AND_EXPR:
4122 {
4123 /* ADJUSTMENT_DEF is NULL when called from
4124 vect_create_epilog_for_reduction to vectorize double reduction. */
4125 if (adjustment_def)
4126 *adjustment_def = init_val;
4127
4128 if (code == MULT_EXPR)
4129 {
4130 real_init_val = dconst1;
4131 int_init_val = 1;
4132 }
4133
4134 if (code == BIT_AND_EXPR)
4135 int_init_val = -1;
4136
4137 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4138 def_for_init = build_real (scalar_type, real_init_val);
4139 else
4140 def_for_init = build_int_cst (scalar_type, int_init_val);
4141
4142 if (adjustment_def)
4143 /* Option1: the first element is '0' or '1' as well. */
4144 init_def = gimple_build_vector_from_val (&stmts, vectype,
4145 def_for_init);
4146 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4147 {
4148 /* Option2 (variable length): the first element is INIT_VAL. */
4149 init_def = gimple_build_vector_from_val (&stmts, vectype,
4150 def_for_init);
4151 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4152 vectype, init_def, init_val);
4153 }
4154 else
4155 {
4156 /* Option2: the first element is INIT_VAL. */
4157 tree_vector_builder elts (vectype, 1, 2);
4158 elts.quick_push (init_val);
4159 elts.quick_push (def_for_init);
4160 init_def = gimple_build_vector (&stmts, &elts);
4161 }
4162 }
4163 break;
4164
4165 case MIN_EXPR:
4166 case MAX_EXPR:
4167 case COND_EXPR:
4168 {
4169 if (adjustment_def)
4170 {
4171 *adjustment_def = NULL_TREE;
4172 if (reduction_type != COND_REDUCTION
4173 && reduction_type != EXTRACT_LAST_REDUCTION)
4174 {
4175 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4176 break;
4177 }
4178 }
4179 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4180 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4181 }
4182 break;
4183
4184 default:
4185 gcc_unreachable ();
4186 }
4187
4188 if (stmts)
4189 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4190 return init_def;
4191 }
4192
4193 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4194 NUMBER_OF_VECTORS is the number of vector defs to create.
4195 If NEUTRAL_OP is nonnull, introducing extra elements of that
4196 value will not change the result. */
4197
4198 static void
4199 get_initial_defs_for_reduction (slp_tree slp_node,
4200 vec<tree> *vec_oprnds,
4201 unsigned int number_of_vectors,
4202 bool reduc_chain, tree neutral_op)
4203 {
4204 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4205 stmt_vec_info stmt_vinfo = stmts[0];
4206 unsigned HOST_WIDE_INT nunits;
4207 unsigned j, number_of_places_left_in_vector;
4208 tree vector_type;
4209 unsigned int group_size = stmts.length ();
4210 unsigned int i;
4211 struct loop *loop;
4212
4213 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4214
4215 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4216
4217 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4218 gcc_assert (loop);
4219 edge pe = loop_preheader_edge (loop);
4220
4221 gcc_assert (!reduc_chain || neutral_op);
4222
4223 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4224 created vectors. It is greater than 1 if unrolling is performed.
4225
4226 For example, we have two scalar operands, s1 and s2 (e.g., group of
4227 strided accesses of size two), while NUNITS is four (i.e., four scalars
4228 of this type can be packed in a vector). The output vector will contain
4229 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4230 will be 2).
4231
4232 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4233 vectors containing the operands.
4234
4235 For example, NUNITS is four as before, and the group size is 8
4236 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4237 {s5, s6, s7, s8}. */
4238
4239 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4240 nunits = group_size;
4241
4242 number_of_places_left_in_vector = nunits;
4243 bool constant_p = true;
4244 tree_vector_builder elts (vector_type, nunits, 1);
4245 elts.quick_grow (nunits);
4246 gimple_seq ctor_seq = NULL;
4247 for (j = 0; j < nunits * number_of_vectors; ++j)
4248 {
4249 tree op;
4250 i = j % group_size;
4251 stmt_vinfo = stmts[i];
4252
4253 /* Get the def before the loop. In a reduction chain we have only one
4254 initial value; otherwise we have as many initial values as PHIs. */
4255 if (reduc_chain)
4256 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4257 else if (((vec_oprnds->length () + 1) * nunits
4258 - number_of_places_left_in_vector >= group_size)
4259 && neutral_op)
4260 op = neutral_op;
4261 else
4262 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4263
4264 /* Create 'vect_ = {op0,op1,...,opn}'. */
4265 number_of_places_left_in_vector--;
4266 elts[nunits - number_of_places_left_in_vector - 1] = op;
4267 if (!CONSTANT_CLASS_P (op))
4268 constant_p = false;
4269
4270 if (number_of_places_left_in_vector == 0)
4271 {
4272 tree init;
4273 if (constant_p && !neutral_op
4274 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4275 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4276 /* Build the vector directly from ELTS. */
4277 init = gimple_build_vector (&ctor_seq, &elts);
4278 else if (neutral_op)
4279 {
4280 /* Build a vector of the neutral value and shift the
4281 other elements into place. */
4282 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4283 neutral_op);
4284 int k = nunits;
4285 while (k > 0 && elts[k - 1] == neutral_op)
4286 k -= 1;
4287 while (k > 0)
4288 {
4289 k -= 1;
4290 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4291 vector_type, init, elts[k]);
4292 }
4293 }
4294 else
4295 {
4296 /* First time round, duplicate ELTS to fill the
4297 required number of vectors. */
4298 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4299 number_of_vectors, *vec_oprnds);
4300 break;
4301 }
4302 vec_oprnds->quick_push (init);
4303
4304 number_of_places_left_in_vector = nunits;
4305 elts.new_vector (vector_type, nunits, 1);
4306 elts.quick_grow (nunits);
4307 constant_p = true;
4308 }
4309 }
4310 if (ctor_seq != NULL)
4311 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4312 }
4313
4314
4315 /* Function vect_create_epilog_for_reduction
4316
4317 Create code at the loop-epilog to finalize the result of a reduction
4318 computation.
4319
4320 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4321 reduction statements.
4322 STMT_INFO is the scalar reduction stmt that is being vectorized.
4323 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4324 number of elements that we can fit in a vectype (nunits). In this case
4325 we have to generate more than one vector stmt - i.e - we need to "unroll"
4326 the vector stmt by a factor VF/nunits. For more details see documentation
4327 in vectorizable_operation.
4328 REDUC_FN is the internal function for the epilog reduction.
4329 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4330 computation.
4331 REDUC_INDEX is the index of the operand in the right hand side of the
4332 statement that is defined by REDUCTION_PHI.
4333 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4334 SLP_NODE is an SLP node containing a group of reduction statements. The
4335 first one in this group is STMT_INFO.
4336 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4337 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4338 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4339 any value of the IV in the loop.
4340 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4341 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4342 null if this is not an SLP reduction.
4343
4344 This function:
4345 1. Creates the reduction def-use cycles: sets the arguments for
4346 REDUCTION_PHIS:
4347 The loop-entry argument is the vectorized initial-value of the reduction.
4348 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4349 sums.
4350 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4351 by calling the function specified by REDUC_FN if available, or by
4352 other means (whole-vector shifts or a scalar loop).
4353 The function also creates a new phi node at the loop exit to preserve
4354 loop-closed form, as illustrated below.
4355
4356 The flow at the entry to this function:
4357
4358 loop:
4359 vec_def = phi <null, null> # REDUCTION_PHI
4360 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4361 s_loop = scalar_stmt # (scalar) STMT_INFO
4362 loop_exit:
4363 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4364 use <s_out0>
4365 use <s_out0>
4366
4367 The above is transformed by this function into:
4368
4369 loop:
4370 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4371 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4372 s_loop = scalar_stmt # (scalar) STMT_INFO
4373 loop_exit:
4374 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4375 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4376 v_out2 = reduce <v_out1>
4377 s_out3 = extract_field <v_out2, 0>
4378 s_out4 = adjust_result <s_out3>
4379 use <s_out4>
4380 use <s_out4>
4381 */
4382
4383 static void
4384 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4385 stmt_vec_info stmt_info,
4386 gimple *reduc_def_stmt,
4387 int ncopies, internal_fn reduc_fn,
4388 vec<stmt_vec_info> reduction_phis,
4389 bool double_reduc,
4390 slp_tree slp_node,
4391 slp_instance slp_node_instance,
4392 tree induc_val, enum tree_code induc_code,
4393 tree neutral_op)
4394 {
4395 stmt_vec_info prev_phi_info;
4396 tree vectype;
4397 machine_mode mode;
4398 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4399 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4400 basic_block exit_bb;
4401 tree scalar_dest;
4402 tree scalar_type;
4403 gimple *new_phi = NULL, *phi;
4404 stmt_vec_info phi_info;
4405 gimple_stmt_iterator exit_gsi;
4406 tree vec_dest;
4407 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4408 gimple *epilog_stmt = NULL;
4409 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4410 gimple *exit_phi;
4411 tree bitsize;
4412 tree adjustment_def = NULL;
4413 tree vec_initial_def = NULL;
4414 tree expr, def, initial_def = NULL;
4415 tree orig_name, scalar_result;
4416 imm_use_iterator imm_iter, phi_imm_iter;
4417 use_operand_p use_p, phi_use_p;
4418 gimple *use_stmt;
4419 stmt_vec_info reduction_phi_info = NULL;
4420 bool nested_in_vect_loop = false;
4421 auto_vec<gimple *> new_phis;
4422 auto_vec<stmt_vec_info> inner_phis;
4423 int j, i;
4424 auto_vec<tree> scalar_results;
4425 unsigned int group_size = 1, k, ratio;
4426 auto_vec<tree> vec_initial_defs;
4427 auto_vec<gimple *> phis;
4428 bool slp_reduc = false;
4429 bool direct_slp_reduc;
4430 tree new_phi_result;
4431 stmt_vec_info inner_phi = NULL;
4432 tree induction_index = NULL_TREE;
4433
4434 if (slp_node)
4435 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4436
4437 if (nested_in_vect_loop_p (loop, stmt_info))
4438 {
4439 outer_loop = loop;
4440 loop = loop->inner;
4441 nested_in_vect_loop = true;
4442 gcc_assert (!slp_node);
4443 }
4444
4445 vectype = STMT_VINFO_VECTYPE (stmt_info);
4446 gcc_assert (vectype);
4447 mode = TYPE_MODE (vectype);
4448
4449 /* 1. Create the reduction def-use cycle:
4450 Set the arguments of REDUCTION_PHIS, i.e., transform
4451
4452 loop:
4453 vec_def = phi <null, null> # REDUCTION_PHI
4454 VECT_DEF = vector_stmt # vectorized form of STMT
4455 ...
4456
4457 into:
4458
4459 loop:
4460 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4461 VECT_DEF = vector_stmt # vectorized form of STMT
4462 ...
4463
4464 (in case of SLP, do it for all the phis). */
4465
4466 /* Get the loop-entry arguments. */
4467 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4468 if (slp_node)
4469 {
4470 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4471 vec_initial_defs.reserve (vec_num);
4472 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4473 &vec_initial_defs, vec_num,
4474 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4475 neutral_op);
4476 }
4477 else
4478 {
4479 /* Get at the scalar def before the loop, that defines the initial value
4480 of the reduction variable. */
4481 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4482 loop_preheader_edge (loop));
4483 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4484 and we can't use zero for induc_val, use initial_def. Similarly
4485 for REDUC_MIN and initial_def larger than the base. */
4486 if (TREE_CODE (initial_def) == INTEGER_CST
4487 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4488 == INTEGER_INDUC_COND_REDUCTION)
4489 && !integer_zerop (induc_val)
4490 && ((induc_code == MAX_EXPR
4491 && tree_int_cst_lt (initial_def, induc_val))
4492 || (induc_code == MIN_EXPR
4493 && tree_int_cst_lt (induc_val, initial_def))))
4494 induc_val = initial_def;
4495
4496 if (double_reduc)
4497 /* In case of double reduction we only create a vector variable
4498 to be put in the reduction phi node. The actual statement
4499 creation is done later in this function. */
4500 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4501 else if (nested_in_vect_loop)
4502 {
4503 /* Do not use an adjustment def as that case is not supported
4504 correctly if ncopies is not one. */
4505 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4506 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4507 stmt_info);
4508 }
4509 else
4510 vec_initial_def
4511 = get_initial_def_for_reduction (stmt_info, initial_def,
4512 &adjustment_def);
4513 vec_initial_defs.create (1);
4514 vec_initial_defs.quick_push (vec_initial_def);
4515 }
4516
4517 /* Set phi nodes arguments. */
4518 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4519 {
4520 tree vec_init_def = vec_initial_defs[i];
4521 tree def = vect_defs[i];
4522 for (j = 0; j < ncopies; j++)
4523 {
4524 if (j != 0)
4525 {
4526 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4527 if (nested_in_vect_loop)
4528 vec_init_def
4529 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4530 }
4531
4532 /* Set the loop-entry arg of the reduction-phi. */
4533
4534 gphi *phi = as_a <gphi *> (phi_info->stmt);
4535 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4536 == INTEGER_INDUC_COND_REDUCTION)
4537 {
4538 /* Initialise the reduction phi to zero. This prevents non-zero
4539 initial values from interfering with the reduction op. */
4540 gcc_assert (ncopies == 1);
4541 gcc_assert (i == 0);
4542
4543 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4544 tree induc_val_vec
4545 = build_vector_from_val (vec_init_def_type, induc_val);
4546
4547 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4548 UNKNOWN_LOCATION);
4549 }
4550 else
4551 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4552 UNKNOWN_LOCATION);
4553
4554 /* Set the loop-latch arg for the reduction-phi. */
4555 if (j > 0)
4556 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4557
4558 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4559
4560 if (dump_enabled_p ())
4561 dump_printf_loc (MSG_NOTE, vect_location,
4562 "transform reduction: created def-use cycle: %G%G",
4563 phi, SSA_NAME_DEF_STMT (def));
4564 }
4565 }
4566
4567 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4568 which is updated with the current index of the loop for every match of
4569 the original loop's cond_expr (VEC_STMT). This results in a vector
4570 containing the last time the condition passed for that vector lane.
4571 The first match will be a 1 to allow 0 to be used for non-matching
4572 indexes. If there are no matches at all then the vector will be all
4573 zeroes. */
4574 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4575 {
4576 tree indx_before_incr, indx_after_incr;
4577 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4578
4579 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4580 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4581
4582 int scalar_precision
4583 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4584 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4585 tree cr_index_vector_type = build_vector_type
4586 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4587
4588 /* First we create a simple vector induction variable which starts
4589 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4590 vector size (STEP). */
4591
4592 /* Create a {1,2,3,...} vector. */
4593 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4594
4595 /* Create a vector of the step value. */
4596 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4597 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4598
4599 /* Create an induction variable. */
4600 gimple_stmt_iterator incr_gsi;
4601 bool insert_after;
4602 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4603 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4604 insert_after, &indx_before_incr, &indx_after_incr);
4605
4606 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4607 filled with zeros (VEC_ZERO). */
4608
4609 /* Create a vector of 0s. */
4610 tree zero = build_zero_cst (cr_index_scalar_type);
4611 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4612
4613 /* Create a vector phi node. */
4614 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4615 new_phi = create_phi_node (new_phi_tree, loop->header);
4616 loop_vinfo->add_stmt (new_phi);
4617 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4618 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4619
4620 /* Now take the condition from the loop's original cond_expr
4621 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4622 every match uses values from the induction variable
4623 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4624 (NEW_PHI_TREE).
4625 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4626 the new cond_expr (INDEX_COND_EXPR). */
4627
4628 /* Duplicate the condition from vec_stmt. */
4629 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4630
4631 /* Create a conditional, where the condition is taken from vec_stmt
4632 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4633 else is the phi (NEW_PHI_TREE). */
4634 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4635 ccompare, indx_before_incr,
4636 new_phi_tree);
4637 induction_index = make_ssa_name (cr_index_vector_type);
4638 gimple *index_condition = gimple_build_assign (induction_index,
4639 index_cond_expr);
4640 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4641 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4642 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4643
4644 /* Update the phi with the vec cond. */
4645 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4646 loop_latch_edge (loop), UNKNOWN_LOCATION);
4647 }
4648
4649 /* 2. Create epilog code.
4650 The reduction epilog code operates across the elements of the vector
4651 of partial results computed by the vectorized loop.
4652 The reduction epilog code consists of:
4653
4654 step 1: compute the scalar result in a vector (v_out2)
4655 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4656 step 3: adjust the scalar result (s_out3) if needed.
4657
4658 Step 1 can be accomplished using one the following three schemes:
4659 (scheme 1) using reduc_fn, if available.
4660 (scheme 2) using whole-vector shifts, if available.
4661 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4662 combined.
4663
4664 The overall epilog code looks like this:
4665
4666 s_out0 = phi <s_loop> # original EXIT_PHI
4667 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4668 v_out2 = reduce <v_out1> # step 1
4669 s_out3 = extract_field <v_out2, 0> # step 2
4670 s_out4 = adjust_result <s_out3> # step 3
4671
4672 (step 3 is optional, and steps 1 and 2 may be combined).
4673 Lastly, the uses of s_out0 are replaced by s_out4. */
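/* As an illustration of scheme 2 (invented 4-element example):
v_out1 = {a, b, c, d} is reduced by v = v_out1 + shift (v_out1, 2)
= {a+c, b+d, c, d}, then v = v + shift (v, 1) = {a+b+c+d, ...},
and finally element 0 is extracted as the scalar result.  */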
4674
4675
4676 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4677 v_out1 = phi <VECT_DEF>
4678 Store them in NEW_PHIS. */
4679
4680 exit_bb = single_exit (loop)->dest;
4681 prev_phi_info = NULL;
4682 new_phis.create (vect_defs.length ());
4683 FOR_EACH_VEC_ELT (vect_defs, i, def)
4684 {
4685 for (j = 0; j < ncopies; j++)
4686 {
4687 tree new_def = copy_ssa_name (def);
4688 phi = create_phi_node (new_def, exit_bb);
4689 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4690 if (j == 0)
4691 new_phis.quick_push (phi);
4692 else
4693 {
4694 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4695 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4696 }
4697
4698 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4699 prev_phi_info = phi_info;
4700 }
4701 }
4702
4703 /* The epilogue is created for the outer-loop, i.e., for the loop being
4704 vectorized. Create exit phis for the outer loop. */
4705 if (double_reduc)
4706 {
4707 loop = outer_loop;
4708 exit_bb = single_exit (loop)->dest;
4709 inner_phis.create (vect_defs.length ());
4710 FOR_EACH_VEC_ELT (new_phis, i, phi)
4711 {
4712 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4713 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4714 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4715 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4716 PHI_RESULT (phi));
4717 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4718 inner_phis.quick_push (phi_info);
4719 new_phis[i] = outer_phi;
4720 while (STMT_VINFO_RELATED_STMT (phi_info))
4721 {
4722 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4723 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4724 outer_phi = create_phi_node (new_result, exit_bb);
4725 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4726 PHI_RESULT (phi_info->stmt));
4727 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4728 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4729 prev_phi_info = outer_phi_info;
4730 }
4731 }
4732 }
4733
4734 exit_gsi = gsi_after_labels (exit_bb);
4735
4736 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4737 (i.e. when reduc_fn is not available) and in the final adjustment
4738 code (if needed). Also get the original scalar reduction variable as
4739 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4740 represents a reduction pattern), the tree-code and scalar-def are
4741 taken from the original stmt that the pattern-stmt (STMT) replaces.
4742 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4743 are taken from STMT. */
4744
4745 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4746 if (orig_stmt_info != stmt_info)
4747 {
4748 /* Reduction pattern */
4749 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4750 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4751 }
4752
4753 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4754 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4755 partial results are added and not subtracted. */
4756 if (code == MINUS_EXPR)
4757 code = PLUS_EXPR;
4758
4759 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4760 scalar_type = TREE_TYPE (scalar_dest);
4761 scalar_results.create (group_size);
4762 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4763 bitsize = TYPE_SIZE (scalar_type);
4764
4765 /* In case this is a reduction in an inner-loop while vectorizing an outer
4766 loop - we don't need to extract a single scalar result at the end of the
4767 inner-loop (unless it is double reduction, i.e., the use of reduction is
4768 outside the outer-loop). The final vector of partial results will be used
4769 in the vectorized outer-loop, or reduced to a scalar result at the end of
4770 the outer-loop. */
4771 if (nested_in_vect_loop && !double_reduc)
4772 goto vect_finalize_reduction;
4773
4774 /* SLP reduction without reduction chain, e.g.,
4775 # a1 = phi <a2, a0>
4776 # b1 = phi <b2, b0>
4777 a2 = operation (a1)
4778 b2 = operation (b1) */
4779 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4780
4781 /* True if we should implement SLP_REDUC using native reduction operations
4782 instead of scalar operations. */
4783 direct_slp_reduc = (reduc_fn != IFN_LAST
4784 && slp_reduc
4785 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4786
4787 /* In case of reduction chain, e.g.,
4788 # a1 = phi <a3, a0>
4789 a2 = operation (a1)
4790 a3 = operation (a2),
4791
4792 we may end up with more than one vector result. Here we reduce them to
4793 one vector. */
4794 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4795 {
4796 tree first_vect = PHI_RESULT (new_phis[0]);
4797 gassign *new_vec_stmt = NULL;
4798 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4799 for (k = 1; k < new_phis.length (); k++)
4800 {
4801 gimple *next_phi = new_phis[k];
4802 tree second_vect = PHI_RESULT (next_phi);
4803 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4804 new_vec_stmt = gimple_build_assign (tem, code,
4805 first_vect, second_vect);
4806 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4807 first_vect = tem;
4808 }
4809
4810 new_phi_result = first_vect;
4811 if (new_vec_stmt)
4812 {
4813 new_phis.truncate (0);
4814 new_phis.safe_push (new_vec_stmt);
4815 }
4816 }
4817 /* Likewise if we couldn't use a single def-use cycle. */
4818 else if (ncopies > 1)
4819 {
4820 gcc_assert (new_phis.length () == 1);
4821 tree first_vect = PHI_RESULT (new_phis[0]);
4822 gassign *new_vec_stmt = NULL;
4823 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4824 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4825 for (int k = 1; k < ncopies; ++k)
4826 {
4827 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4828 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4829 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4830 new_vec_stmt = gimple_build_assign (tem, code,
4831 first_vect, second_vect);
4832 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4833 first_vect = tem;
4834 }
4835 new_phi_result = first_vect;
4836 new_phis.truncate (0);
4837 new_phis.safe_push (new_vec_stmt);
4838 }
4839 else
4840 new_phi_result = PHI_RESULT (new_phis[0]);
4841
4842 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4843 && reduc_fn != IFN_LAST)
4844 {
4845 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4846 various data values where the condition matched and another vector
4847 (INDUCTION_INDEX) containing all the indexes of those matches. We
4848 need to extract the last matching index (which will be the index with
4849 highest value) and use this to index into the data vector.
4850 For the case where there were no matches, the data vector will contain
4851 all default values and the index vector will be all zeros. */
4852
4853 /* Get various versions of the type of the vector of indexes. */
4854 tree index_vec_type = TREE_TYPE (induction_index);
4855 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4856 tree index_scalar_type = TREE_TYPE (index_vec_type);
4857 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4858 (index_vec_type);
4859
4860 /* Get an unsigned integer version of the type of the data vector. */
4861 int scalar_precision
4862 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4863 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4864 tree vectype_unsigned = build_vector_type
4865 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4866
4867 /* First we need to create a vector (ZERO_VEC) of zeros and another
4868 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4869 can create using a MAX reduction and then expanding.
4870 In the case where the loop never made any matches, the max index will
4871 be zero. */
4872
4873 /* Vector of {0, 0, 0,...}. */
4874 tree zero_vec = make_ssa_name (vectype);
4875 tree zero_vec_rhs = build_zero_cst (vectype);
4876 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4877 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4878
4879 /* Find maximum value from the vector of found indexes. */
4880 tree max_index = make_ssa_name (index_scalar_type);
4881 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4882 1, induction_index);
4883 gimple_call_set_lhs (max_index_stmt, max_index);
4884 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4885
4886 /* Vector of {max_index, max_index, max_index,...}. */
4887 tree max_index_vec = make_ssa_name (index_vec_type);
4888 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4889 max_index);
4890 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4891 max_index_vec_rhs);
4892 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4893
4894 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4895 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4896 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4897 otherwise. Only one value should match, resulting in a vector
4898 (VEC_COND) with one data value and the rest zeros.
4899 In the case where the loop never made any matches, every index will
4900 match, resulting in a vector with all data values (which will all be
4901 the default value). */
4902
4903 /* Compare the max index vector to the vector of found indexes to find
4904 the position of the max value. */
4905 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4906 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4907 induction_index,
4908 max_index_vec);
4909 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4910
4911 /* Use the compare to choose either values from the data vector or
4912 zero. */
4913 tree vec_cond = make_ssa_name (vectype);
4914 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4915 vec_compare, new_phi_result,
4916 zero_vec);
4917 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4918
4919 /* Finally we need to extract the data value from the vector (VEC_COND)
4920 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4921 reduction, but because this doesn't exist, we can use a MAX reduction
4922 instead. The data value might be signed or a float so we need to cast
4923 it first.
4924 In the case where the loop never made any matches, the data values are
4925 all identical, and so will reduce down correctly. */
4926
4927 /* Make the matched data values unsigned. */
4928 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4929 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4930 vec_cond);
4931 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4932 VIEW_CONVERT_EXPR,
4933 vec_cond_cast_rhs);
4934 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4935
4936 /* Reduce down to a scalar value. */
4937 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4938 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4939 1, vec_cond_cast);
4940 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4941 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4942
4943 /* Convert the reduced value back to the result type and set as the
4944 result. */
4945 gimple_seq stmts = NULL;
4946 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4947 data_reduc);
4948 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4949 scalar_results.safe_push (new_temp);
4950 }
4951 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4952 && reduc_fn == IFN_LAST)
4953 {
4954 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4955 idx = 0;
4956 idx_val = induction_index[0];
4957 val = data_reduc[0];
4958 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4959 if (induction_index[i] > idx_val)
4960 val = data_reduc[i], idx_val = induction_index[i];
4961 return val; */
4962
4963 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4964 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4965 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4966 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4967 /* Enforced by vectorizable_reduction, which ensures we have target
4968 support before allowing a conditional reduction on variable-length
4969 vectors. */
4970 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4971 tree idx_val = NULL_TREE, val = NULL_TREE;
4972 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4973 {
4974 tree old_idx_val = idx_val;
4975 tree old_val = val;
4976 idx_val = make_ssa_name (idx_eltype);
4977 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4978 build3 (BIT_FIELD_REF, idx_eltype,
4979 induction_index,
4980 bitsize_int (el_size),
4981 bitsize_int (off)));
4982 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4983 val = make_ssa_name (data_eltype);
4984 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4985 build3 (BIT_FIELD_REF,
4986 data_eltype,
4987 new_phi_result,
4988 bitsize_int (el_size),
4989 bitsize_int (off)));
4990 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4991 if (off != 0)
4992 {
4993 tree new_idx_val = idx_val;
4994 tree new_val = val;
4995 if (off != v_size - el_size)
4996 {
4997 new_idx_val = make_ssa_name (idx_eltype);
4998 epilog_stmt = gimple_build_assign (new_idx_val,
4999 MAX_EXPR, idx_val,
5000 old_idx_val);
5001 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5002 }
5003 new_val = make_ssa_name (data_eltype);
5004 epilog_stmt = gimple_build_assign (new_val,
5005 COND_EXPR,
5006 build2 (GT_EXPR,
5007 boolean_type_node,
5008 idx_val,
5009 old_idx_val),
5010 val, old_val);
5011 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5012 idx_val = new_idx_val;
5013 val = new_val;
5014 }
5015 }
5016 /* Convert the reduced value back to the result type and set as the
5017 result. */
5018 gimple_seq stmts = NULL;
5019 val = gimple_convert (&stmts, scalar_type, val);
5020 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5021 scalar_results.safe_push (val);
5022 }
5023
5024 /* 2.3 Create the reduction code, using one of the three schemes described
5025 above. In SLP we simply need to extract all the elements from the
5026 vector (without reducing them), so we use scalar shifts. */
5027 else if (reduc_fn != IFN_LAST && !slp_reduc)
5028 {
5029 tree tmp;
5030 tree vec_elem_type;
5031
5032 /* Case 1: Create:
5033 v_out2 = reduc_expr <v_out1> */
5034
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_NOTE, vect_location,
5037 "Reduce using direct vector reduction.\n");
5038
5039 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5040 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5041 {
5042 tree tmp_dest
5043 = vect_create_destination_var (scalar_dest, vec_elem_type);
5044 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5045 new_phi_result);
5046 gimple_set_lhs (epilog_stmt, tmp_dest);
5047 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5048 gimple_set_lhs (epilog_stmt, new_temp);
5049 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5050
5051 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5052 new_temp);
5053 }
5054 else
5055 {
5056 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5057 new_phi_result);
5058 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5059 }
5060
5061 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5062 gimple_set_lhs (epilog_stmt, new_temp);
5063 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5064
5065 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5066 == INTEGER_INDUC_COND_REDUCTION)
5067 && !operand_equal_p (initial_def, induc_val, 0))
5068 {
5069 /* Earlier we set the initial value to be a vector of induc_val
5070 values. Check the result and if it is induc_val then replace
5071 it with the original initial value, unless induc_val is
5072 the same as initial_def already. */
5073 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5074 induc_val);
5075
5076 tmp = make_ssa_name (new_scalar_dest);
5077 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5078 initial_def, new_temp);
5079 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5080 new_temp = tmp;
5081 }
5082
5083 scalar_results.safe_push (new_temp);
5084 }
5085 else if (direct_slp_reduc)
5086 {
5087 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5088 with the elements for other SLP statements replaced with the
5089 neutral value. We can then do a normal reduction on each vector. */
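  /* For example (assuming REDUC_GROUP_SIZE == 2 and a four-element vector
     holding the two interleaved accumulators { a0, b0, a1, b1 }): for i == 0
     we build { a0, neutral, a1, neutral } and for i == 1 we build
     { neutral, b0, neutral, b1 }, and each of these is then reduced to a
     scalar with REDUC_FN.  */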
5090
5091 /* Enforced by vectorizable_reduction. */
5092 gcc_assert (new_phis.length () == 1);
5093 gcc_assert (pow2p_hwi (group_size));
5094
5095 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5096 vec<stmt_vec_info> orig_phis
5097 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5098 gimple_seq seq = NULL;
5099
5100 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5101 and the same element size as VECTYPE. */
5102 tree index = build_index_vector (vectype, 0, 1);
5103 tree index_type = TREE_TYPE (index);
5104 tree index_elt_type = TREE_TYPE (index_type);
5105 tree mask_type = build_same_sized_truth_vector_type (index_type);
5106
5107 /* Create a vector that, for each element, identifies which of
5108 the REDUC_GROUP_SIZE results should use it. */
5109 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5110 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5111 build_vector_from_val (index_type, index_mask));
5112
5113 /* Get a neutral vector value. This is simply a splat of the neutral
5114 scalar value if we have one, otherwise the initial scalar value
5115 is itself a neutral value. */
5116 tree vector_identity = NULL_TREE;
5117 if (neutral_op)
5118 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5119 neutral_op);
5120 for (unsigned int i = 0; i < group_size; ++i)
5121 {
5122 /* If there's no universal neutral value, we can use the
5123 initial scalar value from the original PHI. This is used
5124 for MIN and MAX reduction, for example. */
5125 if (!neutral_op)
5126 {
5127 tree scalar_value
5128 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5129 loop_preheader_edge (loop));
5130 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5131 scalar_value);
5132 }
5133
5134 /* Calculate the equivalent of:
5135
5136 sel[j] = (index[j] == i);
5137
5138 which selects the elements of NEW_PHI_RESULT that should
5139 be included in the result. */
5140 tree compare_val = build_int_cst (index_elt_type, i);
5141 compare_val = build_vector_from_val (index_type, compare_val);
5142 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5143 index, compare_val);
5144
5145 /* Calculate the equivalent of:
5146
5147 vec = sel ? new_phi_result : vector_identity;
5148
5149 VEC is now suitable for a full vector reduction. */
5150 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5151 sel, new_phi_result, vector_identity);
5152
5153 /* Do the reduction and convert it to the appropriate type. */
5154 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5155 TREE_TYPE (vectype), vec);
5156 scalar = gimple_convert (&seq, scalar_type, scalar);
5157 scalar_results.safe_push (scalar);
5158 }
5159 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5160 }
5161 else
5162 {
5163 bool reduce_with_shift;
5164 tree vec_temp;
5165
5166 /* COND reductions all do the final reduction with MAX_EXPR
5167 or MIN_EXPR. */
5168 if (code == COND_EXPR)
5169 {
5170 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5171 == INTEGER_INDUC_COND_REDUCTION)
5172 code = induc_code;
5173 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5174 == CONST_COND_REDUCTION)
5175 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5176 else
5177 code = MAX_EXPR;
5178 }
5179
5180 /* See if the target wants to do the final (shift) reduction
5181 in a vector mode of smaller size and first reduce upper/lower
5182 halves against each other. */
5183 enum machine_mode mode1 = mode;
5184 tree vectype1 = vectype;
5185 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5186 unsigned sz1 = sz;
5187 if (!slp_reduc
5188 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5189 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5190
5191 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5192 reduce_with_shift = have_whole_vector_shift (mode1);
5193 if (!VECTOR_MODE_P (mode1))
5194 reduce_with_shift = false;
5195 else
5196 {
5197 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5198 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5199 reduce_with_shift = false;
5200 }
5201
5202 /* First reduce the vector to the desired vector size on which we
5203 should do the shift reduction, by combining upper and lower halves. */
5204 new_temp = new_phi_result;
5205 while (sz > sz1)
5206 {
5207 gcc_assert (!slp_reduc);
5208 sz /= 2;
5209 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5210
5211 /* The target has to make sure we support lowpart/highpart
5212 extraction, either via direct vector extract or through
5213 integer mode punning. */
5214 tree dst1, dst2;
5215 if (convert_optab_handler (vec_extract_optab,
5216 TYPE_MODE (TREE_TYPE (new_temp)),
5217 TYPE_MODE (vectype1))
5218 != CODE_FOR_nothing)
5219 {
5220 /* Extract sub-vectors directly once vec_extract becomes
5221 a conversion optab. */
5222 dst1 = make_ssa_name (vectype1);
5223 epilog_stmt
5224 = gimple_build_assign (dst1, BIT_FIELD_REF,
5225 build3 (BIT_FIELD_REF, vectype1,
5226 new_temp, TYPE_SIZE (vectype1),
5227 bitsize_int (0)));
5228 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5229 dst2 = make_ssa_name (vectype1);
5230 epilog_stmt
5231 = gimple_build_assign (dst2, BIT_FIELD_REF,
5232 build3 (BIT_FIELD_REF, vectype1,
5233 new_temp, TYPE_SIZE (vectype1),
5234 bitsize_int (sz * BITS_PER_UNIT)));
5235 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5236 }
5237 else
5238 {
5239 /* Extract via punning to an appropriately sized integer mode
5240 vector. */
5241 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5242 1);
5243 tree etype = build_vector_type (eltype, 2);
5244 gcc_assert (convert_optab_handler (vec_extract_optab,
5245 TYPE_MODE (etype),
5246 TYPE_MODE (eltype))
5247 != CODE_FOR_nothing);
5248 tree tem = make_ssa_name (etype);
5249 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5250 build1 (VIEW_CONVERT_EXPR,
5251 etype, new_temp));
5252 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5253 new_temp = tem;
5254 tem = make_ssa_name (eltype);
5255 epilog_stmt
5256 = gimple_build_assign (tem, BIT_FIELD_REF,
5257 build3 (BIT_FIELD_REF, eltype,
5258 new_temp, TYPE_SIZE (eltype),
5259 bitsize_int (0)));
5260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5261 dst1 = make_ssa_name (vectype1);
5262 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5263 build1 (VIEW_CONVERT_EXPR,
5264 vectype1, tem));
5265 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5266 tem = make_ssa_name (eltype);
5267 epilog_stmt
5268 = gimple_build_assign (tem, BIT_FIELD_REF,
5269 build3 (BIT_FIELD_REF, eltype,
5270 new_temp, TYPE_SIZE (eltype),
5271 bitsize_int (sz * BITS_PER_UNIT)));
5272 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5273 dst2 = make_ssa_name (vectype1);
5274 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5275 build1 (VIEW_CONVERT_EXPR,
5276 vectype1, tem));
5277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278 }
5279
5280 new_temp = make_ssa_name (vectype1);
5281 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5282 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 }
5284
5285 if (reduce_with_shift && !slp_reduc)
5286 {
5287 int element_bitsize = tree_to_uhwi (bitsize);
5288 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5289 for variable-length vectors and also requires direct target support
5290 for loop reductions. */
5291 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5292 int nelements = vec_size_in_bits / element_bitsize;
5293 vec_perm_builder sel;
5294 vec_perm_indices indices;
5295
5296 int elt_offset;
5297
5298 tree zero_vec = build_zero_cst (vectype1);
5299 /* Case 2: Create:
5300 for (offset = nelements/2; offset >= 1; offset/=2)
5301 {
5302 Create: va' = vec_shift <va, offset>
5303 Create: va = vop <va, va'>
5304 } */
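      /* For instance, for a four-element vector { a, b, c, d } and a PLUS
         reduction the loop above performs:
           shift by 2:  { c, d, 0, 0 }    va = { a+c, b+d, c, d }
           shift by 1:  { b+d, c, d, 0 }  va = { a+b+c+d, ... }
         so after the loop only element 0 of VA is meaningful and holds the
         full sum.  */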
5305
5306 tree rhs;
5307
5308 if (dump_enabled_p ())
5309 dump_printf_loc (MSG_NOTE, vect_location,
5310 "Reduce using vector shifts\n");
5311
5312 mode1 = TYPE_MODE (vectype1);
5313 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5314 for (elt_offset = nelements / 2;
5315 elt_offset >= 1;
5316 elt_offset /= 2)
5317 {
5318 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5319 indices.new_vector (sel, 2, nelements);
5320 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5321 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5322 new_temp, zero_vec, mask);
5323 new_name = make_ssa_name (vec_dest, epilog_stmt);
5324 gimple_assign_set_lhs (epilog_stmt, new_name);
5325 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5326
5327 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5328 new_temp);
5329 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5330 gimple_assign_set_lhs (epilog_stmt, new_temp);
5331 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5332 }
5333
5334 /* 2.4 Extract the final scalar result. Create:
5335 s_out3 = extract_field <v_out2, bitpos> */
5336
5337 if (dump_enabled_p ())
5338 dump_printf_loc (MSG_NOTE, vect_location,
5339 "extract scalar result\n");
5340
5341 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5342 bitsize, bitsize_zero_node);
5343 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5344 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5345 gimple_assign_set_lhs (epilog_stmt, new_temp);
5346 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347 scalar_results.safe_push (new_temp);
5348 }
5349 else
5350 {
5351 /* Case 3: Create:
5352 s = extract_field <v_out2, 0>
5353 for (offset = element_size;
5354 offset < vector_size;
5355 offset += element_size;)
5356 {
5357 Create: s' = extract_field <v_out2, offset>
5358 Create: s = op <s, s'> // For non SLP cases
5359 } */
5360
5361 if (dump_enabled_p ())
5362 dump_printf_loc (MSG_NOTE, vect_location,
5363 "Reduce using scalar code.\n");
5364
5365 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5366 int element_bitsize = tree_to_uhwi (bitsize);
5367 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5368 {
5369 int bit_offset;
5370 if (gimple_code (new_phi) == GIMPLE_PHI)
5371 vec_temp = PHI_RESULT (new_phi);
5372 else
5373 vec_temp = gimple_assign_lhs (new_phi);
5374 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5375 bitsize_zero_node);
5376 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5377 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5378 gimple_assign_set_lhs (epilog_stmt, new_temp);
5379 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5380
5381 /* In SLP we don't need to apply the reduction operation, so we just
5382 collect s' values in SCALAR_RESULTS. */
5383 if (slp_reduc)
5384 scalar_results.safe_push (new_temp);
5385
5386 for (bit_offset = element_bitsize;
5387 bit_offset < vec_size_in_bits;
5388 bit_offset += element_bitsize)
5389 {
5390 tree bitpos = bitsize_int (bit_offset);
5391 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5392 bitsize, bitpos);
5393
5394 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5395 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5396 gimple_assign_set_lhs (epilog_stmt, new_name);
5397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398
5399 if (slp_reduc)
5400 {
5401 /* In SLP we don't need to apply the reduction operation, so
5402 we just collect s' values in SCALAR_RESULTS. */
5403 new_temp = new_name;
5404 scalar_results.safe_push (new_name);
5405 }
5406 else
5407 {
5408 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5409 new_name, new_temp);
5410 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5411 gimple_assign_set_lhs (epilog_stmt, new_temp);
5412 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5413 }
5414 }
5415 }
5416
5417 /* The only case where we need to reduce scalar results in SLP is
5418 unrolling. If the size of SCALAR_RESULTS is greater than
5419 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5420 REDUC_GROUP_SIZE. */
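            /* E.g., with REDUC_GROUP_SIZE == 2 and four scalar results
               { r0, r1, r2, r3 } (the SLP instance was unrolled twice),
               the loop below combines r0 with r2 and r1 with r3, leaving
               the two per-group results in the first two slots.  */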
5421 if (slp_reduc)
5422 {
5423 tree res, first_res, new_res;
5424 gimple *new_stmt;
5425
5426 /* Reduce multiple scalar results in case of SLP unrolling. */
5427 for (j = group_size; scalar_results.iterate (j, &res);
5428 j++)
5429 {
5430 first_res = scalar_results[j % group_size];
5431 new_stmt = gimple_build_assign (new_scalar_dest, code,
5432 first_res, res);
5433 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5434 gimple_assign_set_lhs (new_stmt, new_res);
5435 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5436 scalar_results[j % group_size] = new_res;
5437 }
5438 }
5439 else
5440 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5441 scalar_results.safe_push (new_temp);
5442 }
5443
5444 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5445 == INTEGER_INDUC_COND_REDUCTION)
5446 && !operand_equal_p (initial_def, induc_val, 0))
5447 {
5448 /* Earlier we set the initial value to be a vector of induc_val
5449 values. Check the result and if it is induc_val then replace
5450 it with the original initial value, unless induc_val is
5451 the same as initial_def already. */
5452 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5453 induc_val);
5454
5455 tree tmp = make_ssa_name (new_scalar_dest);
5456 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5457 initial_def, new_temp);
5458 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5459 scalar_results[0] = tmp;
5460 }
5461 }
5462
5463 vect_finalize_reduction:
5464
5465 if (double_reduc)
5466 loop = loop->inner;
5467
5468 /* 2.5 Adjust the final result by the initial value of the reduction
5469 variable. (When such adjustment is not needed, then
5470 'adjustment_def' is zero). For example, if code is PLUS we create:
5471 new_temp = loop_exit_def + adjustment_def */
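   /* For example (a sketch): if the scalar reduction started from s = 10,
      the vector accumulator can be initialized with the neutral value
      { 0, 0, ... } and ADJUSTMENT_DEF set to 10, so that the code below
      emits new_temp = loop_exit_def + 10.  */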
5472
5473 if (adjustment_def)
5474 {
5475 gcc_assert (!slp_reduc);
5476 if (nested_in_vect_loop)
5477 {
5478 new_phi = new_phis[0];
5479 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5480 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5481 new_dest = vect_create_destination_var (scalar_dest, vectype);
5482 }
5483 else
5484 {
5485 new_temp = scalar_results[0];
5486 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5487 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5488 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5489 }
5490
5491 epilog_stmt = gimple_build_assign (new_dest, expr);
5492 new_temp = make_ssa_name (new_dest, epilog_stmt);
5493 gimple_assign_set_lhs (epilog_stmt, new_temp);
5494 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5495 if (nested_in_vect_loop)
5496 {
5497 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5498 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5499 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5500
5501 if (!double_reduc)
5502 scalar_results.quick_push (new_temp);
5503 else
5504 scalar_results[0] = new_temp;
5505 }
5506 else
5507 scalar_results[0] = new_temp;
5508
5509 new_phis[0] = epilog_stmt;
5510 }
5511
5512 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5513 phis with new adjusted scalar results, i.e., replace use <s_out0>
5514 with use <s_out4>.
5515
5516 Transform:
5517 loop_exit:
5518 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5519 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5520 v_out2 = reduce <v_out1>
5521 s_out3 = extract_field <v_out2, 0>
5522 s_out4 = adjust_result <s_out3>
5523 use <s_out0>
5524 use <s_out0>
5525
5526 into:
5527
5528 loop_exit:
5529 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5530 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5531 v_out2 = reduce <v_out1>
5532 s_out3 = extract_field <v_out2, 0>
5533 s_out4 = adjust_result <s_out3>
5534 use <s_out4>
5535 use <s_out4> */
5536
5537
5538 /* In an SLP reduction chain we reduce the vector results into one vector if
5539 necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5540 LHS of the last stmt in the reduction chain, since we are looking for
5541 the loop exit phi node. */
5542 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5543 {
5544 stmt_vec_info dest_stmt_info
5545 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5546 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5547 group_size = 1;
5548 }
5549
5550 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5551 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5552 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5553 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5554 correspond to the first vector stmt, etc.
5555 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5556 if (group_size > new_phis.length ())
5557 {
5558 ratio = group_size / new_phis.length ();
5559 gcc_assert (!(group_size % new_phis.length ()));
5560 }
5561 else
5562 ratio = 1;
5563
5564 stmt_vec_info epilog_stmt_info = NULL;
5565 for (k = 0; k < group_size; k++)
5566 {
5567 if (k % ratio == 0)
5568 {
5569 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5570 reduction_phi_info = reduction_phis[k / ratio];
5571 if (double_reduc)
5572 inner_phi = inner_phis[k / ratio];
5573 }
5574
5575 if (slp_reduc)
5576 {
5577 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5578
5579 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5580 /* SLP statements can't participate in patterns. */
5581 gcc_assert (!orig_stmt_info);
5582 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5583 }
5584
5585 phis.create (3);
5586 /* Find the loop-closed-use at the loop exit of the original scalar
5587 result. (The reduction result is expected to have two immediate uses -
5588 one at the latch block, and one at the loop exit). */
5589 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5590 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5591 && !is_gimple_debug (USE_STMT (use_p)))
5592 phis.safe_push (USE_STMT (use_p));
5593
5594 /* While we expect to have found an exit_phi because of loop-closed-ssa
5595 form, we can end up without one if the scalar cycle is dead. */
5596
5597 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5598 {
5599 if (outer_loop)
5600 {
5601 stmt_vec_info exit_phi_vinfo
5602 = loop_vinfo->lookup_stmt (exit_phi);
5603 gphi *vect_phi;
5604
5605 if (double_reduc)
5606 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5607 else
5608 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5609 if (!double_reduc
5610 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5611 != vect_double_reduction_def)
5612 continue;
5613
5614 /* Handle double reduction:
5615
5616 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5617 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5618 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5619 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5620
5621 At that point the regular reduction (stmt2 and stmt3) is
5622 already vectorized, as well as the exit phi node, stmt4.
5623 Here we vectorize the phi node of double reduction, stmt1, and
5624 update all relevant statements. */
5625
5626 /* Go through all the uses of s2 to find double reduction phi
5627 node, i.e., stmt1 above. */
5628 orig_name = PHI_RESULT (exit_phi);
5629 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5630 {
5631 stmt_vec_info use_stmt_vinfo;
5632 tree vect_phi_init, preheader_arg, vect_phi_res;
5633 basic_block bb = gimple_bb (use_stmt);
5634
5635 /* Check that USE_STMT is really a double reduction phi
5636 node. */
5637 if (gimple_code (use_stmt) != GIMPLE_PHI
5638 || gimple_phi_num_args (use_stmt) != 2
5639 || bb->loop_father != outer_loop)
5640 continue;
5641 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5642 if (!use_stmt_vinfo
5643 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5644 != vect_double_reduction_def)
5645 continue;
5646
5647 /* Create vector phi node for double reduction:
5648 vs1 = phi <vs0, vs2>
5649 vs1 was created previously in this function by a call to
5650 vect_get_vec_def_for_operand and is stored in
5651 vec_initial_def;
5652 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5653 vs0 is created here. */
5654
5655 /* Create vector phi node. */
5656 vect_phi = create_phi_node (vec_initial_def, bb);
5657 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5658
5659 /* Create vs0 - initial def of the double reduction phi. */
5660 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5661 loop_preheader_edge (outer_loop));
5662 vect_phi_init = get_initial_def_for_reduction
5663 (stmt_info, preheader_arg, NULL);
5664
5665 /* Update phi node arguments with vs0 and vs2. */
5666 add_phi_arg (vect_phi, vect_phi_init,
5667 loop_preheader_edge (outer_loop),
5668 UNKNOWN_LOCATION);
5669 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5670 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5671 if (dump_enabled_p ())
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 "created double reduction phi node: %G",
5674 vect_phi);
5675
5676 vect_phi_res = PHI_RESULT (vect_phi);
5677
5678 /* Replace the use, i.e., set the correct vs1 in the regular
5679 reduction phi node. FORNOW, NCOPIES is always 1, so the
5680 loop is redundant. */
5681 stmt_vec_info use_info = reduction_phi_info;
5682 for (j = 0; j < ncopies; j++)
5683 {
5684 edge pr_edge = loop_preheader_edge (loop);
5685 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5686 pr_edge->dest_idx, vect_phi_res);
5687 use_info = STMT_VINFO_RELATED_STMT (use_info);
5688 }
5689 }
5690 }
5691 }
5692
5693 phis.release ();
5694 if (nested_in_vect_loop)
5695 {
5696 if (double_reduc)
5697 loop = outer_loop;
5698 else
5699 continue;
5700 }
5701
5702 phis.create (3);
5703 /* Find the loop-closed-use at the loop exit of the original scalar
5704 result. (The reduction result is expected to have two immediate uses,
5705 one at the latch block, and one at the loop exit). For double
5706 reductions we are looking for exit phis of the outer loop. */
5707 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5708 {
5709 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5710 {
5711 if (!is_gimple_debug (USE_STMT (use_p)))
5712 phis.safe_push (USE_STMT (use_p));
5713 }
5714 else
5715 {
5716 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5717 {
5718 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5719
5720 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5721 {
5722 if (!flow_bb_inside_loop_p (loop,
5723 gimple_bb (USE_STMT (phi_use_p)))
5724 && !is_gimple_debug (USE_STMT (phi_use_p)))
5725 phis.safe_push (USE_STMT (phi_use_p));
5726 }
5727 }
5728 }
5729 }
5730
5731 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5732 {
5733 /* Replace the uses: */
5734 orig_name = PHI_RESULT (exit_phi);
5735 scalar_result = scalar_results[k];
5736 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5737 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5738 SET_USE (use_p, scalar_result);
5739 }
5740
5741 phis.release ();
5742 }
5743 }
5744
5745 /* Return a vector of type VECTYPE that is equal to the vector select
5746 operation "MASK ? VEC : IDENTITY". Insert the select statements
5747 before GSI. */
5748
5749 static tree
5750 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5751 tree vec, tree identity)
5752 {
5753 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5754 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5755 mask, vec, identity);
5756 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5757 return cond;
5758 }
5759
5760 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5761 order, starting with LHS. Insert the extraction statements before GSI and
5762 associate the new scalar SSA names with variable SCALAR_DEST.
5763 Return the SSA name for the result. */
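/* For example, for a four-element VECTOR_RHS { v0, v1, v2, v3 } the
   generated sequence computes (((LHS code v0) code v1) code v2) code v3,
   using one BIT_FIELD_REF extraction and one scalar statement per
   element.  */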
5764
5765 static tree
5766 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5767 tree_code code, tree lhs, tree vector_rhs)
5768 {
5769 tree vectype = TREE_TYPE (vector_rhs);
5770 tree scalar_type = TREE_TYPE (vectype);
5771 tree bitsize = TYPE_SIZE (scalar_type);
5772 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5773 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5774
5775 for (unsigned HOST_WIDE_INT bit_offset = 0;
5776 bit_offset < vec_size_in_bits;
5777 bit_offset += element_bitsize)
5778 {
5779 tree bitpos = bitsize_int (bit_offset);
5780 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5781 bitsize, bitpos);
5782
5783 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5784 rhs = make_ssa_name (scalar_dest, stmt);
5785 gimple_assign_set_lhs (stmt, rhs);
5786 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5787
5788 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5789 tree new_name = make_ssa_name (scalar_dest, stmt);
5790 gimple_assign_set_lhs (stmt, new_name);
5791 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5792 lhs = new_name;
5793 }
5794 return lhs;
5795 }
5796
5797 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5798 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5799 statement. CODE is the operation performed by STMT_INFO and OPS are
5800 its scalar operands. REDUC_INDEX is the index of the operand in
5801 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5802 implements in-order reduction, or IFN_LAST if we should open-code it.
5803 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5804 that should be used to control the operation in a fully-masked loop. */
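/* For instance, when vectorizing
     for (i = 0; i < n; ++i) s += a[i];
   as an in-order (fold-left) reduction, each vector copy becomes roughly
     s_new = FOLD_LEFT_PLUS (s_old, va);
   preserving the original left-to-right order of the additions (which
   matters for floating point); if no such internal function is available,
   the sum is open-coded element by element via vect_expand_fold_left.  */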
5805
5806 static bool
5807 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5808 gimple_stmt_iterator *gsi,
5809 stmt_vec_info *vec_stmt, slp_tree slp_node,
5810 gimple *reduc_def_stmt,
5811 tree_code code, internal_fn reduc_fn,
5812 tree ops[3], tree vectype_in,
5813 int reduc_index, vec_loop_masks *masks)
5814 {
5815 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5816 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5817 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5818 stmt_vec_info new_stmt_info = NULL;
5819
5820 int ncopies;
5821 if (slp_node)
5822 ncopies = 1;
5823 else
5824 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5825
5826 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5827 gcc_assert (ncopies == 1);
5828 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5829 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5830 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5831 == FOLD_LEFT_REDUCTION);
5832
5833 if (slp_node)
5834 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5835 TYPE_VECTOR_SUBPARTS (vectype_in)));
5836
5837 tree op0 = ops[1 - reduc_index];
5838
5839 int group_size = 1;
5840 stmt_vec_info scalar_dest_def_info;
5841 auto_vec<tree> vec_oprnds0;
5842 if (slp_node)
5843 {
5844 auto_vec<vec<tree> > vec_defs (2);
5845 auto_vec<tree> sops(2);
5846 sops.quick_push (ops[0]);
5847 sops.quick_push (ops[1]);
5848 vect_get_slp_defs (sops, slp_node, &vec_defs);
5849 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5850 vec_defs[0].release ();
5851 vec_defs[1].release ();
5852 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5853 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5854 }
5855 else
5856 {
5857 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5858 vec_oprnds0.create (1);
5859 vec_oprnds0.quick_push (loop_vec_def0);
5860 scalar_dest_def_info = stmt_info;
5861 }
5862
5863 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5864 tree scalar_type = TREE_TYPE (scalar_dest);
5865 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5866
5867 int vec_num = vec_oprnds0.length ();
5868 gcc_assert (vec_num == 1 || slp_node);
5869 tree vec_elem_type = TREE_TYPE (vectype_out);
5870 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5871
5872 tree vector_identity = NULL_TREE;
5873 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5874 vector_identity = build_zero_cst (vectype_out);
5875
5876 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5877 int i;
5878 tree def0;
5879 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5880 {
5881 gimple *new_stmt;
5882 tree mask = NULL_TREE;
5883 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5884 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5885
5886 /* Handle MINUS by adding the negative. */
5887 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5888 {
5889 tree negated = make_ssa_name (vectype_out);
5890 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5891 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5892 def0 = negated;
5893 }
5894
5895 if (mask)
5896 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5897 vector_identity);
5898
5899 /* On the first iteration the input is simply the scalar phi
5900 result, and for subsequent iterations it is the output of
5901 the preceding operation. */
5902 if (reduc_fn != IFN_LAST)
5903 {
5904 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5905 /* For chained SLP reductions the output of the previous reduction
5906 operation serves as the input of the next. For the final statement
5907 the output cannot be a temporary - we reuse the original
5908 scalar destination of the last statement. */
5909 if (i != vec_num - 1)
5910 {
5911 gimple_set_lhs (new_stmt, scalar_dest_var);
5912 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5913 gimple_set_lhs (new_stmt, reduc_var);
5914 }
5915 }
5916 else
5917 {
5918 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5919 reduc_var, def0);
5920 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5921 /* Remove the statement, so that we can use the same code paths
5922 as for statements that we've just created. */
5923 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5924 gsi_remove (&tmp_gsi, true);
5925 }
5926
5927 if (i == vec_num - 1)
5928 {
5929 gimple_set_lhs (new_stmt, scalar_dest);
5930 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5931 new_stmt);
5932 }
5933 else
5934 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5935 new_stmt, gsi);
5936
5937 if (slp_node)
5938 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5939 }
5940
5941 if (!slp_node)
5942 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5943
5944 return true;
5945 }
5946
5947 /* Function is_nonwrapping_integer_induction.
5948
5949 Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5950 that increments and does not cause overflow. */
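/* For example (a rough illustration): a 32-bit unsigned induction with base 0
   and step 4 that executes at most 2^20 times reaches at most 4 * 2^20, which
   needs only 23 bits of precision and therefore cannot wrap.  */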
5951
5952 static bool
5953 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5954 {
5955 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5956 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5957 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5958 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5959 widest_int ni, max_loop_value, lhs_max;
5960 wi::overflow_type overflow = wi::OVF_NONE;
5961
5962 /* Make sure the loop is integer based. */
5963 if (TREE_CODE (base) != INTEGER_CST
5964 || TREE_CODE (step) != INTEGER_CST)
5965 return false;
5966
5967 /* Check that the max size of the loop will not wrap. */
5968
5969 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5970 return true;
5971
5972 if (! max_stmt_executions (loop, &ni))
5973 return false;
5974
5975 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5976 &overflow);
5977 if (overflow)
5978 return false;
5979
5980 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5981 TYPE_SIGN (lhs_type), &overflow);
5982 if (overflow)
5983 return false;
5984
5985 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5986 <= TYPE_PRECISION (lhs_type));
5987 }
5988
5989 /* Function vectorizable_reduction.
5990
5991 Check if STMT_INFO performs a reduction operation that can be vectorized.
5992 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5993 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5994 Return true if STMT_INFO is vectorizable in this way.
5995
5996 This function also handles reduction idioms (patterns) that have been
5997 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5998 may be of this form:
5999 X = pattern_expr (arg0, arg1, ..., X)
6000 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6001 sequence that had been detected and replaced by the pattern-stmt
6002 (STMT_INFO).
6003
6004 This function also handles reduction of condition expressions, for example:
6005 for (int i = 0; i < N; i++)
6006 if (a[i] < value)
6007 last = a[i];
6008 This is handled by vectorising the loop and creating an additional vector
6009 containing the loop indexes for which "a[i] < value" was true. In the
6010 function epilogue this is reduced to a single max value and then used to
6011 index into the vector of results.
6012
6013 In some cases of reduction patterns, the type of the reduction variable X is
6014 different than the type of the other arguments of STMT_INFO.
6015 In such cases, the vectype that is used when transforming STMT_INFO into
6016 a vector stmt is different than the vectype that is used to determine the
6017 vectorization factor, because it consists of a different number of elements
6018 than the actual number of elements that are being operated upon in parallel.
6019
6020 For example, consider an accumulation of shorts into an int accumulator.
6021 On some targets it's possible to vectorize this pattern operating on 8
6022 shorts at a time (hence, the vectype for purposes of determining the
6023 vectorization factor should be V8HI); on the other hand, the vectype that
6024 is used to create the vector form is actually V4SI (the type of the result).
6025
6026 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6027 indicates what is the actual level of parallelism (V8HI in the example), so
6028 that the right vectorization factor would be derived. This vectype
6029 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6030 be used to create the vectorized stmt. The right vectype for the vectorized
6031 stmt is obtained from the type of the result X:
6032 get_vectype_for_scalar_type (TREE_TYPE (X))
6033
6034 This means that, contrary to "regular" reductions (or "regular" stmts in
6035 general), the following equation:
6036 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6037 does *NOT* necessarily hold for reduction patterns. */
6038
6039 bool
6040 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6041 stmt_vec_info *vec_stmt, slp_tree slp_node,
6042 slp_instance slp_node_instance,
6043 stmt_vector_for_cost *cost_vec)
6044 {
6045 tree vec_dest;
6046 tree scalar_dest;
6047 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6048 tree vectype_in = NULL_TREE;
6049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6050 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6051 enum tree_code code, orig_code;
6052 internal_fn reduc_fn;
6053 machine_mode vec_mode;
6054 int op_type;
6055 optab optab;
6056 tree new_temp = NULL_TREE;
6057 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6058 stmt_vec_info cond_stmt_vinfo = NULL;
6059 enum tree_code cond_reduc_op_code = ERROR_MARK;
6060 tree scalar_type;
6061 bool is_simple_use;
6062 int i;
6063 int ncopies;
6064 int epilog_copies;
6065 stmt_vec_info prev_stmt_info, prev_phi_info;
6066 bool single_defuse_cycle = false;
6067 stmt_vec_info new_stmt_info = NULL;
6068 int j;
6069 tree ops[3];
6070 enum vect_def_type dts[3];
6071 bool nested_cycle = false, found_nested_cycle_def = false;
6072 bool double_reduc = false;
6073 basic_block def_bb;
6074 struct loop * def_stmt_loop;
6075 tree def_arg;
6076 auto_vec<tree> vec_oprnds0;
6077 auto_vec<tree> vec_oprnds1;
6078 auto_vec<tree> vec_oprnds2;
6079 auto_vec<tree> vect_defs;
6080 auto_vec<stmt_vec_info> phis;
6081 int vec_num;
6082 tree def0, tem;
6083 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6084 tree cond_reduc_val = NULL_TREE;
6085
6086 /* Make sure it was already recognized as a reduction computation. */
6087 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6088 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6089 return false;
6090
6091 if (nested_in_vect_loop_p (loop, stmt_info))
6092 {
6093 loop = loop->inner;
6094 nested_cycle = true;
6095 }
6096
6097 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6098 gcc_assert (slp_node
6099 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6100
6101 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6102 {
6103 tree phi_result = gimple_phi_result (phi);
6104 /* Analysis is fully done on the reduction stmt invocation. */
6105 if (! vec_stmt)
6106 {
6107 if (slp_node)
6108 slp_node_instance->reduc_phis = slp_node;
6109
6110 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6111 return true;
6112 }
6113
6114 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6115 /* Leave the scalar phi in place. Note that checking
6116 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6117 for reductions involving a single statement. */
6118 return true;
6119
6120 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6121 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6122
6123 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6124 == EXTRACT_LAST_REDUCTION)
6125 /* Leave the scalar phi in place. */
6126 return true;
6127
6128 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6129 code = gimple_assign_rhs_code (reduc_stmt);
6130 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6131 {
6132 tree op = gimple_op (reduc_stmt, k);
6133 if (op == phi_result)
6134 continue;
6135 if (k == 1 && code == COND_EXPR)
6136 continue;
6137 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6138 gcc_assert (is_simple_use);
6139 if (dt == vect_constant_def || dt == vect_external_def)
6140 continue;
6141 if (!vectype_in
6142 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6143 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6144 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6145 break;
6146 }
6147 /* For a nested cycle we might end up with an operation like
6148 phi_result * phi_result. */
6149 if (!vectype_in)
6150 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6151 gcc_assert (vectype_in);
6152
6153 if (slp_node)
6154 ncopies = 1;
6155 else
6156 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6157
6158 stmt_vec_info use_stmt_info;
6159 if (ncopies > 1
6160 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6161 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6162 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6163 single_defuse_cycle = true;
6164
6165 /* Create the destination vector */
6166 scalar_dest = gimple_assign_lhs (reduc_stmt);
6167 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6168
6169 if (slp_node)
6170 /* The size vect_schedule_slp_instance computes is off for us. */
6171 vec_num = vect_get_num_vectors
6172 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6173 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6174 vectype_in);
6175 else
6176 vec_num = 1;
6177
6178 /* Generate the reduction PHIs upfront. */
6179 prev_phi_info = NULL;
6180 for (j = 0; j < ncopies; j++)
6181 {
6182 if (j == 0 || !single_defuse_cycle)
6183 {
6184 for (i = 0; i < vec_num; i++)
6185 {
6186 /* Create the reduction-phi that defines the reduction
6187 operand. */
6188 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6189 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6190
6191 if (slp_node)
6192 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6193 else
6194 {
6195 if (j == 0)
6196 STMT_VINFO_VEC_STMT (stmt_info)
6197 = *vec_stmt = new_phi_info;
6198 else
6199 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6200 prev_phi_info = new_phi_info;
6201 }
6202 }
6203 }
6204 }
6205
6206 return true;
6207 }
6208
6209 /* 1. Is vectorizable reduction? */
6210 /* Not supportable if the reduction variable is used in the loop, unless
6211 it's a reduction chain. */
6212 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6213 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6214 return false;
6215
6216 /* Reductions that are not used even in an enclosing outer-loop
6217 are expected to be "live" (used out of the loop). */
6218 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6219 && !STMT_VINFO_LIVE_P (stmt_info))
6220 return false;
6221
6222 /* 2. Has this been recognized as a reduction pattern?
6223
6224 Check if STMT represents a pattern that has been recognized
6225 in earlier analysis stages. For stmts that represent a pattern,
6226 the STMT_VINFO_RELATED_STMT field records the last stmt in
6227 the original sequence that constitutes the pattern. */
6228
6229 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6230 if (orig_stmt_info)
6231 {
6232 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6233 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6234 }
6235
6236 /* 3. Check the operands of the operation. The first operands are defined
6237 inside the loop body. The last operand is the reduction variable,
6238 which is defined by the loop-header-phi. */
6239
6240 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6241
6242 /* Flatten RHS. */
6243 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6244 {
6245 case GIMPLE_BINARY_RHS:
6246 code = gimple_assign_rhs_code (stmt);
6247 op_type = TREE_CODE_LENGTH (code);
6248 gcc_assert (op_type == binary_op);
6249 ops[0] = gimple_assign_rhs1 (stmt);
6250 ops[1] = gimple_assign_rhs2 (stmt);
6251 break;
6252
6253 case GIMPLE_TERNARY_RHS:
6254 code = gimple_assign_rhs_code (stmt);
6255 op_type = TREE_CODE_LENGTH (code);
6256 gcc_assert (op_type == ternary_op);
6257 ops[0] = gimple_assign_rhs1 (stmt);
6258 ops[1] = gimple_assign_rhs2 (stmt);
6259 ops[2] = gimple_assign_rhs3 (stmt);
6260 break;
6261
6262 case GIMPLE_UNARY_RHS:
6263 return false;
6264
6265 default:
6266 gcc_unreachable ();
6267 }
6268
6269 if (code == COND_EXPR && slp_node)
6270 return false;
6271
6272 scalar_dest = gimple_assign_lhs (stmt);
6273 scalar_type = TREE_TYPE (scalar_dest);
6274 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6275 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6276 return false;
6277
6278 /* Do not try to vectorize bit-precision reductions. */
6279 if (!type_has_mode_precision_p (scalar_type))
6280 return false;
6281
6282 /* All uses but the last are expected to be defined in the loop.
6283 The last use is the reduction variable. In case of nested cycle this
6284 assumption is not true: we use reduc_index to record the index of the
6285 reduction variable. */
6286 stmt_vec_info reduc_def_info;
6287 if (orig_stmt_info)
6288 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6289 else
6290 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6291 gcc_assert (reduc_def_info);
6292 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6293 tree reduc_def = PHI_RESULT (reduc_def_phi);
6294 int reduc_index = -1;
6295 for (i = 0; i < op_type; i++)
6296 {
6297 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6298 if (i == 0 && code == COND_EXPR)
6299 continue;
6300
6301 stmt_vec_info def_stmt_info;
6302 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6303 &def_stmt_info);
6304 dt = dts[i];
6305 gcc_assert (is_simple_use);
6306 if (dt == vect_reduction_def
6307 && ops[i] == reduc_def)
6308 {
6309 reduc_index = i;
6310 continue;
6311 }
6312 else if (tem)
6313 {
6314 /* To properly compute ncopies we are interested in the widest
6315 input type in case we're looking at a widening accumulation. */
6316 if (!vectype_in
6317 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6318 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6319 vectype_in = tem;
6320 }
6321
6322 if (dt != vect_internal_def
6323 && dt != vect_external_def
6324 && dt != vect_constant_def
6325 && dt != vect_induction_def
6326 && !(dt == vect_nested_cycle && nested_cycle))
6327 return false;
6328
6329 if (dt == vect_nested_cycle
6330 && ops[i] == reduc_def)
6331 {
6332 found_nested_cycle_def = true;
6333 reduc_index = i;
6334 }
6335
6336 if (i == 1 && code == COND_EXPR)
6337 {
6338 /* Record how value of COND_EXPR is defined. */
6339 if (dt == vect_constant_def)
6340 {
6341 cond_reduc_dt = dt;
6342 cond_reduc_val = ops[i];
6343 }
6344 if (dt == vect_induction_def
6345 && def_stmt_info
6346 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6347 {
6348 cond_reduc_dt = dt;
6349 cond_stmt_vinfo = def_stmt_info;
6350 }
6351 }
6352 }
6353
6354 if (!vectype_in)
6355 vectype_in = vectype_out;
6356
6357 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6358 directly used in stmt. */
6359 if (reduc_index == -1)
6360 {
6361 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6362 {
6363 if (dump_enabled_p ())
6364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6365 "in-order reduction chain without SLP.\n");
6366 return false;
6367 }
6368 }
6369
6370 if (!(reduc_index == -1
6371 || dts[reduc_index] == vect_reduction_def
6372 || dts[reduc_index] == vect_nested_cycle
6373 || ((dts[reduc_index] == vect_internal_def
6374 || dts[reduc_index] == vect_external_def
6375 || dts[reduc_index] == vect_constant_def
6376 || dts[reduc_index] == vect_induction_def)
6377 && nested_cycle && found_nested_cycle_def)))
6378 {
6379 /* For pattern recognized stmts, orig_stmt might be a reduction,
6380 but some helper statements for the pattern might not, or
6381 might be COND_EXPRs with reduction uses in the condition. */
6382 gcc_assert (orig_stmt_info);
6383 return false;
6384 }
6385
6386 /* PHIs should not participate in patterns. */
6387 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6388 enum vect_reduction_type v_reduc_type
6389 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6390 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6391
6392 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6393 /* If we have a condition reduction, see if we can simplify it further. */
6394 if (v_reduc_type == COND_REDUCTION)
6395 {
6396 /* TODO: We can't yet handle reduction chains, since we need to treat
6397 each COND_EXPR in the chain specially, not just the last one.
6398 E.g. for:
6399
6400 x_1 = PHI <x_3, ...>
6401 x_2 = a_2 ? ... : x_1;
6402 x_3 = a_3 ? ... : x_2;
6403
6404 we're interested in the last element in x_3 for which a_2 || a_3
6405 is true, whereas the current reduction chain handling would
6406 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6407 as a reduction operation. */
6408 if (reduc_index == -1)
6409 {
6410 if (dump_enabled_p ())
6411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6412 "conditional reduction chains not supported\n");
6413 return false;
6414 }
6415
6416 /* vect_is_simple_reduction ensured that operand 2 is the
6417 loop-carried operand. */
6418 gcc_assert (reduc_index == 2);
6419
6420 /* Loop peeling modifies the initial value of the reduction PHI, which
6421 makes the reduction stmt to be transformed different from the
6422 original stmt analyzed. We need to record the reduction code for a
6423 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6424 it can be used directly at the transform stage. */
6425 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6426 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6427 {
6428 /* Also set the reduction type to CONST_COND_REDUCTION. */
6429 gcc_assert (cond_reduc_dt == vect_constant_def);
6430 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6431 }
6432 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6433 vectype_in, OPTIMIZE_FOR_SPEED))
6434 {
6435 if (dump_enabled_p ())
6436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6437 "optimizing condition reduction with"
6438 " FOLD_EXTRACT_LAST.\n");
6439 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6440 }
6441 else if (cond_reduc_dt == vect_induction_def)
6442 {
6443 tree base
6444 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6445 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6446
6447 gcc_assert (TREE_CODE (base) == INTEGER_CST
6448 && TREE_CODE (step) == INTEGER_CST);
6449 cond_reduc_val = NULL_TREE;
6450 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6451 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6452 ;
6453 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6454 above base; punt if base is the minimum value of the type for
6455 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6456 else if (tree_int_cst_sgn (step) == -1)
6457 {
6458 cond_reduc_op_code = MIN_EXPR;
6459 if (tree_int_cst_sgn (base) == -1)
6460 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6461 else if (tree_int_cst_lt (base,
6462 TYPE_MAX_VALUE (TREE_TYPE (base))))
6463 cond_reduc_val
6464 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6465 }
6466 else
6467 {
6468 cond_reduc_op_code = MAX_EXPR;
6469 if (tree_int_cst_sgn (base) == 1)
6470 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6471 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6472 base))
6473 cond_reduc_val
6474 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6475 }
6476 if (cond_reduc_val)
6477 {
6478 if (dump_enabled_p ())
6479 dump_printf_loc (MSG_NOTE, vect_location,
6480 "condition expression based on "
6481 "integer induction.\n");
6482 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6483 = INTEGER_INDUC_COND_REDUCTION;
6484 }
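/* A worked example under assumed values: for the index-finding loop
   sketched above, the selected values form the induction 0, 1, 2, ...
   (base 0, positive step), so MAX_EXPR is chosen with
   cond_reduc_val = base - 1 = -1; the conditional reduction then
   amounts to  last = MAX (last, cond ? i : -1)  with -1 acting as
   the "no match" sentinel.  */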
6485 }
6486 else if (cond_reduc_dt == vect_constant_def)
6487 {
6488 enum vect_def_type cond_initial_dt;
6489 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6490 tree cond_initial_val
6491 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6492
6493 gcc_assert (cond_reduc_val != NULL_TREE);
6494 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6495 if (cond_initial_dt == vect_constant_def
6496 && types_compatible_p (TREE_TYPE (cond_initial_val),
6497 TREE_TYPE (cond_reduc_val)))
6498 {
6499 tree e = fold_binary (LE_EXPR, boolean_type_node,
6500 cond_initial_val, cond_reduc_val);
6501 if (e && (integer_onep (e) || integer_zerop (e)))
6502 {
6503 if (dump_enabled_p ())
6504 dump_printf_loc (MSG_NOTE, vect_location,
6505 "condition expression based on "
6506 "compile time constant.\n");
6507 /* Record reduction code at analysis stage. */
6508 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6509 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6510 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6511 = CONST_COND_REDUCTION;
6512 }
6513 }
6514 }
6515 }
6516
6517 if (orig_stmt_info)
6518 gcc_assert (tmp == orig_stmt_info
6519 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6520 else
6521 /* We changed STMT to be the first stmt in reduction chain, hence we
6522 check that in this case the first element in the chain is STMT. */
6523 gcc_assert (tmp == stmt_info
6524 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6525
6526 if (STMT_VINFO_LIVE_P (reduc_def_info))
6527 return false;
6528
6529 if (slp_node)
6530 ncopies = 1;
6531 else
6532 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6533
6534 gcc_assert (ncopies >= 1);
6535
6536 vec_mode = TYPE_MODE (vectype_in);
6537 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6538
6539 if (nested_cycle)
6540 {
6541 def_bb = gimple_bb (reduc_def_phi);
6542 def_stmt_loop = def_bb->loop_father;
6543 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6544 loop_preheader_edge (def_stmt_loop));
6545 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6546 if (def_arg_stmt_info
6547 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6548 == vect_double_reduction_def))
6549 double_reduc = true;
6550 }
6551
6552 vect_reduction_type reduction_type
6553 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6554 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6555 && ncopies > 1)
6556 {
6557 if (dump_enabled_p ())
6558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6559 "multiple types in double reduction or condition "
6560 "reduction.\n");
6561 return false;
6562 }
6563
6564 if (code == COND_EXPR)
6565 {
6566 /* Only call during the analysis stage, otherwise we'll lose
6567 STMT_VINFO_TYPE. */
6568 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6569 true, NULL, cost_vec))
6570 {
6571 if (dump_enabled_p ())
6572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 "unsupported condition in reduction\n");
6574 return false;
6575 }
6576 }
6577 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6578 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6579 {
6580 /* Only call during the analysis stage, otherwise we'll lose
6581 STMT_VINFO_TYPE. We only support this for nested cycles
6582 without double reductions at the moment. */
6583 if (!nested_cycle
6584 || double_reduc
6585 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6586 NULL, cost_vec)))
6587 {
6588 if (dump_enabled_p ())
6589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6590 "unsupported shift or rotation in reduction\n");
6591 return false;
6592 }
6593 }
6594 else
6595 {
6596 /* 4. Supportable by target? */
6597
6598 /* 4.1. check support for the operation in the loop */
6599 optab = optab_for_tree_code (code, vectype_in, optab_default);
6600 if (!optab)
6601 {
6602 if (dump_enabled_p ())
6603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6604 "no optab.\n");
6605
6606 return false;
6607 }
6608
6609 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6610 {
6611 if (dump_enabled_p ())
6612 dump_printf (MSG_NOTE, "op not supported by target.\n");
6613
6614 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6615 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6616 return false;
6617
6618 if (dump_enabled_p ())
6619 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6620 }
6621
6622 /* Worthwhile without SIMD support? */
6623 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6624 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6625 {
6626 if (dump_enabled_p ())
6627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6628 "not worthwhile without SIMD support.\n");
6629
6630 return false;
6631 }
6632 }
6633
6634 /* 4.2. Check support for the epilog operation.
6635
6636 If STMT represents a reduction pattern, then the type of the
6637 reduction variable may be different than the type of the rest
6638 of the arguments. For example, consider the case of accumulation
6639 of shorts into an int accumulator. The original code:
6640 S1: int_a = (int) short_a;
6641 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6642
6643 was replaced with:
6644 STMT: int_acc = widen_sum <short_a, int_acc>
6645
6646 This means that:
6647 1. The tree-code that is used to create the vector operation in the
6648 epilog code (that reduces the partial results) is not the
6649 tree-code of STMT, but is rather the tree-code of the original
6650 stmt from the pattern that STMT is replacing. I.e., in the example
6651 above we want to use 'widen_sum' in the loop, but 'plus' in the
6652 epilog.
6653 2. The type (mode) we use to check available target support
6654 for the vector operation to be created in the *epilog*, is
6655 determined by the type of the reduction variable (in the example
6656 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6657 However the type (mode) we use to check available target support
6658 for the vector operation to be created *inside the loop*, is
6659 determined by the type of the other arguments to STMT (in the
6660 example we'd check this: optab_handler (widen_sum_optab,
6661 vect_short_mode)).
6662
6663 This is contrary to "regular" reductions, in which the types of all
6664 the arguments are the same as the type of the reduction variable.
6665 For "regular" reductions we can therefore use the same vector type
6666 (and also the same tree-code) when generating the epilog code and
6667 when generating the code inside the loop. */
6668
6669 if (orig_stmt_info
6670 && (reduction_type == TREE_CODE_REDUCTION
6671 || reduction_type == FOLD_LEFT_REDUCTION))
6672 {
6673 /* This is a reduction pattern: get the vectype from the type of the
6674 reduction variable, and get the tree-code from orig_stmt. */
6675 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6676 gcc_assert (vectype_out);
6677 vec_mode = TYPE_MODE (vectype_out);
6678 }
6679 else
6680 {
6681 /* Regular reduction: the same vectype and tree-code as used for
6682 the vector code inside the loop can be used for the epilog code. */
6683 orig_code = code;
6684
6685 if (code == MINUS_EXPR)
6686 orig_code = PLUS_EXPR;
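/* Illustrative note: for a reduction such as  sum -= a[i]  each vector
   lane accumulates its partial result with a subtraction inside the
   loop, but the partial lane results themselves are combined by
   addition in the epilog, hence PLUS_EXPR here.  */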
6687
6688 /* For simple condition reductions, replace with the actual expression
6689 we want to base our reduction around. */
6690 if (reduction_type == CONST_COND_REDUCTION)
6691 {
6692 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6693 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6694 }
6695 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6696 orig_code = cond_reduc_op_code;
6697 }
6698
6699 reduc_fn = IFN_LAST;
6700
6701 if (reduction_type == TREE_CODE_REDUCTION
6702 || reduction_type == FOLD_LEFT_REDUCTION
6703 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6704 || reduction_type == CONST_COND_REDUCTION)
6705 {
6706 if (reduction_type == FOLD_LEFT_REDUCTION
6707 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6708 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6709 {
6710 if (reduc_fn != IFN_LAST
6711 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6712 OPTIMIZE_FOR_SPEED))
6713 {
6714 if (dump_enabled_p ())
6715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6716 "reduc op not supported by target.\n");
6717
6718 reduc_fn = IFN_LAST;
6719 }
6720 }
6721 else
6722 {
6723 if (!nested_cycle || double_reduc)
6724 {
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "no reduc code for scalar code.\n");
6728
6729 return false;
6730 }
6731 }
6732 }
6733 else if (reduction_type == COND_REDUCTION)
6734 {
6735 int scalar_precision
6736 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6737 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6738 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6739 nunits_out);
6740
6741 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6742 OPTIMIZE_FOR_SPEED))
6743 reduc_fn = IFN_REDUC_MAX;
6744 }
6745
6746 if (reduction_type != EXTRACT_LAST_REDUCTION
6747 && (!nested_cycle || double_reduc)
6748 && reduc_fn == IFN_LAST
6749 && !nunits_out.is_constant ())
6750 {
6751 if (dump_enabled_p ())
6752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753 "missing target support for reduction on"
6754 " variable-length vectors.\n");
6755 return false;
6756 }
6757
6758 /* For SLP reductions, see if there is a neutral value we can use. */
6759 tree neutral_op = NULL_TREE;
6760 if (slp_node)
6761 neutral_op = neutral_op_for_slp_reduction
6762 (slp_node_instance->reduc_phis, code,
6763 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
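/* As an illustrative summary (see neutral_op_for_slp_reduction for the
   authoritative list): the neutral value is 0 for PLUS/MINUS/IOR/XOR,
   1 for MULT and all-ones for BIT_AND, while MIN/MAX reductions only
   have a usable value in the reduction-chain case, where the single
   initial scalar value is used.  */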
6764
6765 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6766 {
6767 /* We can't support in-order reductions of code such as this:
6768
6769 for (int i = 0; i < n1; ++i)
6770 for (int j = 0; j < n2; ++j)
6771 l += a[j];
6772
6773 since GCC effectively transforms the loop when vectorizing:
6774
6775 for (int i = 0; i < n1 / VF; ++i)
6776 for (int j = 0; j < n2; ++j)
6777 for (int k = 0; k < VF; ++k)
6778 l += a[j];
6779
6780 which is a reassociation of the original operation. */
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 "in-order double reduction not supported.\n");
6784
6785 return false;
6786 }
6787
6788 if (reduction_type == FOLD_LEFT_REDUCTION
6789 && slp_node
6790 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6791 {
6792 /* We cannot use in-order reductions in this case because there is
6793 an implicit reassociation of the operations involved. */
6794 if (dump_enabled_p ())
6795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6796 "in-order unchained SLP reductions not supported.\n");
6797 return false;
6798 }
6799
6800 /* For double reductions, and for SLP reductions with a neutral value,
6801 we construct a variable-length initial vector by loading a vector
6802 full of the neutral value and then shift-and-inserting the start
6803 values into the low-numbered elements. */
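/* For example (an assumed shape), a variable-length sum reduction with
   start value S would build its initial vector as { S, 0, 0, ... }:
   a vector filled with the neutral value 0 into which S is
   shift-and-inserted at the low element.  */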
6804 if ((double_reduc || neutral_op)
6805 && !nunits_out.is_constant ()
6806 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6807 vectype_out, OPTIMIZE_FOR_SPEED))
6808 {
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "reduction on variable-length vectors requires"
6812 " target support for a vector-shift-and-insert"
6813 " operation.\n");
6814 return false;
6815 }
6816
6817 /* Check extra constraints for variable-length unchained SLP reductions. */
6818 if (STMT_SLP_TYPE (stmt_info)
6819 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6820 && !nunits_out.is_constant ())
6821 {
6822 /* We checked above that we could build the initial vector when
6823 there's a neutral element value. Check here for the case in
6824 which each SLP statement has its own initial value and in which
6825 that value needs to be repeated for every instance of the
6826 statement within the initial vector. */
6827 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6828 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6829 if (!neutral_op
6830 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6831 {
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6834 "unsupported form of SLP reduction for"
6835 " variable-length vectors: cannot build"
6836 " initial vector.\n");
6837 return false;
6838 }
6839 /* The epilogue code relies on the number of elements being a multiple
6840 of the group size. The duplicate-and-interleave approach to setting
6841 up the initial vector does too. */
6842 if (!multiple_p (nunits_out, group_size))
6843 {
6844 if (dump_enabled_p ())
6845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6846 "unsupported form of SLP reduction for"
6847 " variable-length vectors: the vector size"
6848 " is not a multiple of the number of results.\n");
6849 return false;
6850 }
6851 }
6852
6853 /* In case of widening multiplication by a constant, we update the type
6854 of the constant to be the type of the other operand. We check that the
6855 constant fits the type in the pattern recognition pass. */
6856 if (code == DOT_PROD_EXPR
6857 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6858 {
6859 if (TREE_CODE (ops[0]) == INTEGER_CST)
6860 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6861 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6862 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6863 else
6864 {
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "invalid types in dot-prod\n");
6868
6869 return false;
6870 }
6871 }
6872
6873 if (reduction_type == COND_REDUCTION)
6874 {
6875 widest_int ni;
6876
6877 if (! max_loop_iterations (loop, &ni))
6878 {
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_NOTE, vect_location,
6881 "loop count not known, cannot create cond "
6882 "reduction.\n");
6883 return false;
6884 }
6885 /* Convert backedges to iterations. */
6886 ni += 1;
6887
6888 /* The additional index will be the same type as the condition. Check
6889 that the loop iteration count fits into this type less one (because
6890 we'll use up the zero slot for when there are no matches). */
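/* A worked example with an assumed type: if cr_index_scalar_type is a
   16-bit unsigned type, max_index is 65535 and at most 65534 iterations
   can be handled, since index 0 is reserved for the "no match" case.  */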
6891 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6892 if (wi::geu_p (ni, wi::to_widest (max_index)))
6893 {
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_NOTE, vect_location,
6896 "loop size is greater than data size.\n");
6897 return false;
6898 }
6899 }
6900
6901 /* In case the vectorization factor (VF) is bigger than the number
6902 of elements that we can fit in a vectype (nunits), we have to generate
6903 more than one vector stmt - i.e - we need to "unroll" the
6904 vector stmt by a factor VF/nunits. For more details see documentation
6905 in vectorizable_operation. */
6906
6907 /* If the reduction is used in an outer loop we need to generate
6908 VF intermediate results, like so (e.g. for ncopies=2):
6909 r0 = phi (init, r0)
6910 r1 = phi (init, r1)
6911 r0 = x0 + r0;
6912 r1 = x1 + r1;
6913 (i.e. we generate VF results in 2 registers).
6914 In this case we have a separate def-use cycle for each copy, and therefore
6915 for each copy we get the vector def for the reduction variable from the
6916 respective phi node created for this copy.
6917
6918 Otherwise (the reduction is unused in the loop nest), we can combine
6919 together intermediate results, like so (e.g. for ncopies=2):
6920 r = phi (init, r)
6921 r = x0 + r;
6922 r = x1 + r;
6923 (i.e. we generate VF/2 results in a single register).
6924 In this case for each copy we get the vector def for the reduction variable
6925 from the vectorized reduction operation generated in the previous iteration.
6926
6927 This only works when we see both the reduction PHI and its only consumer
6928 in vectorizable_reduction and there are no intermediate stmts
6929 participating. */
6930 stmt_vec_info use_stmt_info;
6931 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6932 if (ncopies > 1
6933 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6934 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6935 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6936 {
6937 single_defuse_cycle = true;
6938 epilog_copies = 1;
6939 }
6940 else
6941 epilog_copies = ncopies;
6942
6943 /* If the reduction stmt is one of the patterns that have lane
6944 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6945 if ((ncopies > 1
6946 && ! single_defuse_cycle)
6947 && (code == DOT_PROD_EXPR
6948 || code == WIDEN_SUM_EXPR
6949 || code == SAD_EXPR))
6950 {
6951 if (dump_enabled_p ())
6952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6953 "multi def-use cycle not possible for lane-reducing "
6954 "reduction operation\n");
6955 return false;
6956 }
6957
6958 if (slp_node)
6959 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6960 else
6961 vec_num = 1;
6962
6963 internal_fn cond_fn = get_conditional_internal_fn (code);
6964 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6965
6966 if (!vec_stmt) /* transformation not required. */
6967 {
6968 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6969 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6970 {
6971 if (reduction_type != FOLD_LEFT_REDUCTION
6972 && (cond_fn == IFN_LAST
6973 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6974 OPTIMIZE_FOR_SPEED)))
6975 {
6976 if (dump_enabled_p ())
6977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6978 "can't use a fully-masked loop because no"
6979 " conditional operation is available.\n");
6980 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6981 }
6982 else if (reduc_index == -1)
6983 {
6984 if (dump_enabled_p ())
6985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6986 "can't use a fully-masked loop for chained"
6987 " reductions.\n");
6988 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6989 }
6990 else
6991 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6992 vectype_in);
6993 }
6994 if (dump_enabled_p ()
6995 && reduction_type == FOLD_LEFT_REDUCTION)
6996 dump_printf_loc (MSG_NOTE, vect_location,
6997 "using an in-order (fold-left) reduction.\n");
6998 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6999 return true;
7000 }
7001
7002 /* Transform. */
7003
7004 if (dump_enabled_p ())
7005 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7006
7007 /* FORNOW: Multiple types are not supported for condition. */
7008 if (code == COND_EXPR)
7009 gcc_assert (ncopies == 1);
7010
7011 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7012
7013 if (reduction_type == FOLD_LEFT_REDUCTION)
7014 return vectorize_fold_left_reduction
7015 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7016 reduc_fn, ops, vectype_in, reduc_index, masks);
7017
7018 if (reduction_type == EXTRACT_LAST_REDUCTION)
7019 {
7020 gcc_assert (!slp_node);
7021 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7022 true, NULL, NULL);
7023 }
7024
7025 /* Create the destination vector */
7026 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7027
7028 prev_stmt_info = NULL;
7029 prev_phi_info = NULL;
7030 if (!slp_node)
7031 {
7032 vec_oprnds0.create (1);
7033 vec_oprnds1.create (1);
7034 if (op_type == ternary_op)
7035 vec_oprnds2.create (1);
7036 }
7037
7038 phis.create (vec_num);
7039 vect_defs.create (vec_num);
7040 if (!slp_node)
7041 vect_defs.quick_push (NULL_TREE);
7042
7043 if (slp_node)
7044 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7045 else
7046 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7047
7048 for (j = 0; j < ncopies; j++)
7049 {
7050 if (code == COND_EXPR)
7051 {
7052 gcc_assert (!slp_node);
7053 vectorizable_condition (stmt_info, gsi, vec_stmt,
7054 true, NULL, NULL);
7055 break;
7056 }
7057 if (code == LSHIFT_EXPR
7058 || code == RSHIFT_EXPR)
7059 {
7060 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7061 break;
7062 }
7063
7064 /* Handle uses. */
7065 if (j == 0)
7066 {
7067 if (slp_node)
7068 {
7069 /* Get vec defs for all the operands except the reduction index,
7070 ensuring the ordering of the ops in the vector is kept. */
7071 auto_vec<tree, 3> slp_ops;
7072 auto_vec<vec<tree>, 3> vec_defs;
7073
7074 slp_ops.quick_push (ops[0]);
7075 slp_ops.quick_push (ops[1]);
7076 if (op_type == ternary_op)
7077 slp_ops.quick_push (ops[2]);
7078
7079 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7080
7081 vec_oprnds0.safe_splice (vec_defs[0]);
7082 vec_defs[0].release ();
7083 vec_oprnds1.safe_splice (vec_defs[1]);
7084 vec_defs[1].release ();
7085 if (op_type == ternary_op)
7086 {
7087 vec_oprnds2.safe_splice (vec_defs[2]);
7088 vec_defs[2].release ();
7089 }
7090 }
7091 else
7092 {
7093 vec_oprnds0.quick_push
7094 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7095 vec_oprnds1.quick_push
7096 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7097 if (op_type == ternary_op)
7098 vec_oprnds2.quick_push
7099 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7100 }
7101 }
7102 else
7103 {
7104 if (!slp_node)
7105 {
7106 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7107
7108 if (single_defuse_cycle && reduc_index == 0)
7109 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7110 else
7111 vec_oprnds0[0]
7112 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7113 vec_oprnds0[0]);
7114 if (single_defuse_cycle && reduc_index == 1)
7115 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7116 else
7117 vec_oprnds1[0]
7118 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7119 vec_oprnds1[0]);
7120 if (op_type == ternary_op)
7121 {
7122 if (single_defuse_cycle && reduc_index == 2)
7123 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7124 else
7125 vec_oprnds2[0]
7126 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7127 vec_oprnds2[0]);
7128 }
7129 }
7130 }
7131
7132 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7133 {
7134 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7135 if (masked_loop_p)
7136 {
7137 /* Make sure that the reduction accumulator is vop[0]. */
7138 if (reduc_index == 1)
7139 {
7140 gcc_assert (commutative_tree_code (code));
7141 std::swap (vop[0], vop[1]);
7142 }
7143 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7144 vectype_in, i * ncopies + j);
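/* E.g. for a PLUS_EXPR reduction this builds the call
   .COND_ADD (mask, acc, vec, acc), so inactive lanes simply keep the
   accumulator value unchanged.  */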
7145 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7146 vop[0], vop[1],
7147 vop[0]);
7148 new_temp = make_ssa_name (vec_dest, call);
7149 gimple_call_set_lhs (call, new_temp);
7150 gimple_call_set_nothrow (call, true);
7151 new_stmt_info
7152 = vect_finish_stmt_generation (stmt_info, call, gsi);
7153 }
7154 else
7155 {
7156 if (op_type == ternary_op)
7157 vop[2] = vec_oprnds2[i];
7158
7159 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7160 vop[0], vop[1], vop[2]);
7161 new_temp = make_ssa_name (vec_dest, new_stmt);
7162 gimple_assign_set_lhs (new_stmt, new_temp);
7163 new_stmt_info
7164 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7165 }
7166
7167 if (slp_node)
7168 {
7169 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7170 vect_defs.quick_push (new_temp);
7171 }
7172 else
7173 vect_defs[0] = new_temp;
7174 }
7175
7176 if (slp_node)
7177 continue;
7178
7179 if (j == 0)
7180 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7181 else
7182 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7183
7184 prev_stmt_info = new_stmt_info;
7185 }
7186
7187 /* Finalize the reduction-phi (set its arguments) and create the
7188 epilog reduction code. */
7189 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7190 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7191
7192 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7193 epilog_copies, reduc_fn, phis,
7194 double_reduc, slp_node, slp_node_instance,
7195 cond_reduc_val, cond_reduc_op_code,
7196 neutral_op);
7197
7198 return true;
7199 }
7200
7201 /* Function vect_min_worthwhile_factor.
7202
7203 For a loop where we could vectorize the operation indicated by CODE,
7204 return the minimum vectorization factor that makes it worthwhile
7205 to use generic vectors. */
7206 static unsigned int
7207 vect_min_worthwhile_factor (enum tree_code code)
7208 {
7209 switch (code)
7210 {
7211 case PLUS_EXPR:
7212 case MINUS_EXPR:
7213 case NEGATE_EXPR:
7214 return 4;
7215
7216 case BIT_AND_EXPR:
7217 case BIT_IOR_EXPR:
7218 case BIT_XOR_EXPR:
7219 case BIT_NOT_EXPR:
7220 return 2;
7221
7222 default:
7223 return INT_MAX;
7224 }
7225 }
7226
7227 /* Return true if VINFO indicates we are doing loop vectorization and if
7228 it is worth decomposing CODE operations into scalar operations for
7229 that loop's vectorization factor. */
7230
7231 bool
7232 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7233 {
7234 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7235 unsigned HOST_WIDE_INT value;
7236 return (loop_vinfo
7237 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7238 && value >= vect_min_worthwhile_factor (code));
7239 }
7240
7241 /* Function vectorizable_induction
7242
7243 Check if STMT_INFO performs an induction computation that can be vectorized.
7244 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7245 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7246 Return true if STMT_INFO is vectorizable in this way. */
7247
7248 bool
7249 vectorizable_induction (stmt_vec_info stmt_info,
7250 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7251 stmt_vec_info *vec_stmt, slp_tree slp_node,
7252 stmt_vector_for_cost *cost_vec)
7253 {
7254 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7255 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7256 unsigned ncopies;
7257 bool nested_in_vect_loop = false;
7258 struct loop *iv_loop;
7259 tree vec_def;
7260 edge pe = loop_preheader_edge (loop);
7261 basic_block new_bb;
7262 tree new_vec, vec_init, vec_step, t;
7263 tree new_name;
7264 gimple *new_stmt;
7265 gphi *induction_phi;
7266 tree induc_def, vec_dest;
7267 tree init_expr, step_expr;
7268 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7269 unsigned i;
7270 tree expr;
7271 gimple_seq stmts;
7272 imm_use_iterator imm_iter;
7273 use_operand_p use_p;
7274 gimple *exit_phi;
7275 edge latch_e;
7276 tree loop_arg;
7277 gimple_stmt_iterator si;
7278
7279 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7280 if (!phi)
7281 return false;
7282
7283 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7284 return false;
7285
7286 /* Make sure it was recognized as induction computation. */
7287 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7288 return false;
7289
7290 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7291 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7292
7293 if (slp_node)
7294 ncopies = 1;
7295 else
7296 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7297 gcc_assert (ncopies >= 1);
7298
7299 /* FORNOW. These restrictions should be relaxed. */
7300 if (nested_in_vect_loop_p (loop, stmt_info))
7301 {
7302 imm_use_iterator imm_iter;
7303 use_operand_p use_p;
7304 gimple *exit_phi;
7305 edge latch_e;
7306 tree loop_arg;
7307
7308 if (ncopies > 1)
7309 {
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 "multiple types in nested loop.\n");
7313 return false;
7314 }
7315
7316 /* FORNOW: outer loop induction with SLP not supported. */
7317 if (STMT_SLP_TYPE (stmt_info))
7318 return false;
7319
7320 exit_phi = NULL;
7321 latch_e = loop_latch_edge (loop->inner);
7322 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7323 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7324 {
7325 gimple *use_stmt = USE_STMT (use_p);
7326 if (is_gimple_debug (use_stmt))
7327 continue;
7328
7329 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7330 {
7331 exit_phi = use_stmt;
7332 break;
7333 }
7334 }
7335 if (exit_phi)
7336 {
7337 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7338 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7339 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7340 {
7341 if (dump_enabled_p ())
7342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7343 "inner-loop induction only used outside "
7344 "of the outer vectorized loop.\n");
7345 return false;
7346 }
7347 }
7348
7349 nested_in_vect_loop = true;
7350 iv_loop = loop->inner;
7351 }
7352 else
7353 iv_loop = loop;
7354 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7355
7356 if (slp_node && !nunits.is_constant ())
7357 {
7358 /* The current SLP code creates the initial value element-by-element. */
7359 if (dump_enabled_p ())
7360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7361 "SLP induction not supported for variable-length"
7362 " vectors.\n");
7363 return false;
7364 }
7365
7366 if (!vec_stmt) /* transformation not required. */
7367 {
7368 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7369 DUMP_VECT_SCOPE ("vectorizable_induction");
7370 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7371 return true;
7372 }
7373
7374 /* Transform. */
7375
7376 /* Compute a vector variable, initialized with the first VF values of
7377 the induction variable. E.g., for an iv with IV_PHI='X' and
7378 evolution S, for a vector of 4 units, we want to compute:
7379 [X, X + S, X + 2*S, X + 3*S]. */
7380
7381 if (dump_enabled_p ())
7382 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7383
7384 latch_e = loop_latch_edge (iv_loop);
7385 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7386
7387 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7388 gcc_assert (step_expr != NULL_TREE);
7389
7390 pe = loop_preheader_edge (iv_loop);
7391 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7392 loop_preheader_edge (iv_loop));
7393
7394 stmts = NULL;
7395 if (!nested_in_vect_loop)
7396 {
7397 /* Convert the initial value to the desired type. */
7398 tree new_type = TREE_TYPE (vectype);
7399 init_expr = gimple_convert (&stmts, new_type, init_expr);
7400
7401 /* If we are using the loop mask to "peel" for alignment then we need
7402 to adjust the start value here. */
7403 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7404 if (skip_niters != NULL_TREE)
7405 {
7406 if (FLOAT_TYPE_P (vectype))
7407 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7408 skip_niters);
7409 else
7410 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7411 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7412 skip_niters, step_expr);
7413 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7414 init_expr, skip_step);
7415 }
7416 }
7417
7418 /* Convert the step to the desired type. */
7419 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7420
7421 if (stmts)
7422 {
7423 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7424 gcc_assert (!new_bb);
7425 }
7426
7427 /* Find the first insertion point in the BB. */
7428 basic_block bb = gimple_bb (phi);
7429 si = gsi_after_labels (bb);
7430
7431 /* For SLP induction we have to generate several IVs as for example
7432 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7433 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7434 [VF*S, VF*S, VF*S, VF*S] for all. */
7435 if (slp_node)
7436 {
7437 /* Enforced above. */
7438 unsigned int const_nunits = nunits.to_constant ();
7439
7440 /* Generate [VF*S, VF*S, ... ]. */
7441 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7442 {
7443 expr = build_int_cst (integer_type_node, vf);
7444 expr = fold_convert (TREE_TYPE (step_expr), expr);
7445 }
7446 else
7447 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7448 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7449 expr, step_expr);
7450 if (! CONSTANT_CLASS_P (new_name))
7451 new_name = vect_init_vector (stmt_info, new_name,
7452 TREE_TYPE (step_expr), NULL);
7453 new_vec = build_vector_from_val (vectype, new_name);
7454 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7455
7456 /* Now generate the IVs. */
7457 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7458 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7459 unsigned elts = const_nunits * nvects;
7460 unsigned nivs = least_common_multiple (group_size,
7461 const_nunits) / const_nunits;
7462 gcc_assert (elts % group_size == 0);
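/* Working through the group-size-3, 4-element-vector example above
   (assumed figures): nivs = least_common_multiple (3, 4) / 4 = 3
   distinct IV vectors are built here; any further vectors are derived
   from them in the re-use code below by adding
   lcm (3, 4) / 3 = 4 steps.  */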
7463 tree elt = init_expr;
7464 unsigned ivn;
7465 for (ivn = 0; ivn < nivs; ++ivn)
7466 {
7467 tree_vector_builder elts (vectype, const_nunits, 1);
7468 stmts = NULL;
7469 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7470 {
7471 if (ivn*const_nunits + eltn >= group_size
7472 && (ivn * const_nunits + eltn) % group_size == 0)
7473 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7474 elt, step_expr);
7475 elts.quick_push (elt);
7476 }
7477 vec_init = gimple_build_vector (&stmts, &elts);
7478 if (stmts)
7479 {
7480 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7481 gcc_assert (!new_bb);
7482 }
7483
7484 /* Create the induction-phi that defines the induction-operand. */
7485 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7486 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7487 stmt_vec_info induction_phi_info
7488 = loop_vinfo->add_stmt (induction_phi);
7489 induc_def = PHI_RESULT (induction_phi);
7490
7491 /* Create the iv update inside the loop */
7492 vec_def = make_ssa_name (vec_dest);
7493 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7494 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7495 loop_vinfo->add_stmt (new_stmt);
7496
7497 /* Set the arguments of the phi node: */
7498 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7499 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7500 UNKNOWN_LOCATION);
7501
7502 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7503 }
7504
7505 /* Re-use IVs when we can. */
7506 if (ivn < nvects)
7507 {
7508 unsigned vfp
7509 = least_common_multiple (group_size, const_nunits) / group_size;
7510 /* Generate [VF'*S, VF'*S, ... ]. */
7511 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7512 {
7513 expr = build_int_cst (integer_type_node, vfp);
7514 expr = fold_convert (TREE_TYPE (step_expr), expr);
7515 }
7516 else
7517 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7518 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7519 expr, step_expr);
7520 if (! CONSTANT_CLASS_P (new_name))
7521 new_name = vect_init_vector (stmt_info, new_name,
7522 TREE_TYPE (step_expr), NULL);
7523 new_vec = build_vector_from_val (vectype, new_name);
7524 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7525 for (; ivn < nvects; ++ivn)
7526 {
7527 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7528 tree def;
7529 if (gimple_code (iv) == GIMPLE_PHI)
7530 def = gimple_phi_result (iv);
7531 else
7532 def = gimple_assign_lhs (iv);
7533 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7534 PLUS_EXPR,
7535 def, vec_step);
7536 if (gimple_code (iv) == GIMPLE_PHI)
7537 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7538 else
7539 {
7540 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7541 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7542 }
7543 SLP_TREE_VEC_STMTS (slp_node).quick_push
7544 (loop_vinfo->add_stmt (new_stmt));
7545 }
7546 }
7547
7548 return true;
7549 }
7550
7551 /* Create the vector that holds the initial_value of the induction. */
7552 if (nested_in_vect_loop)
7553 {
7554 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7555 been created during vectorization of previous stmts. We obtain it
7556 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7557 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7558 /* If the initial value is not of proper type, convert it. */
7559 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7560 {
7561 new_stmt
7562 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7563 vect_simple_var,
7564 "vec_iv_"),
7565 VIEW_CONVERT_EXPR,
7566 build1 (VIEW_CONVERT_EXPR, vectype,
7567 vec_init));
7568 vec_init = gimple_assign_lhs (new_stmt);
7569 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7570 new_stmt);
7571 gcc_assert (!new_bb);
7572 loop_vinfo->add_stmt (new_stmt);
7573 }
7574 }
7575 else
7576 {
7577 /* iv_loop is the loop to be vectorized. Create:
7578 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7579 stmts = NULL;
7580 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7581
7582 unsigned HOST_WIDE_INT const_nunits;
7583 if (nunits.is_constant (&const_nunits))
7584 {
7585 tree_vector_builder elts (vectype, const_nunits, 1);
7586 elts.quick_push (new_name);
7587 for (i = 1; i < const_nunits; i++)
7588 {
7589 /* Create: new_name_i = new_name + step_expr */
7590 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7591 new_name, step_expr);
7592 elts.quick_push (new_name);
7593 }
7594 /* Create a vector from [new_name_0, new_name_1, ...,
7595 new_name_nunits-1] */
7596 vec_init = gimple_build_vector (&stmts, &elts);
7597 }
7598 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7599 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7600 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7601 new_name, step_expr);
7602 else
7603 {
7604 /* Build:
7605 [base, base, base, ...]
7606 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7607 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7608 gcc_assert (flag_associative_math);
7609 tree index = build_index_vector (vectype, 0, 1);
7610 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7611 new_name);
7612 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7613 step_expr);
7614 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7615 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7616 vec_init, step_vec);
7617 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7618 vec_init, base_vec);
7619 }
7620
7621 if (stmts)
7622 {
7623 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7624 gcc_assert (!new_bb);
7625 }
7626 }
7627
7628
7629 /* Create the vector that holds the step of the induction. */
7630 if (nested_in_vect_loop)
7631 /* iv_loop is nested in the loop to be vectorized. Generate:
7632 vec_step = [S, S, S, S] */
7633 new_name = step_expr;
7634 else
7635 {
7636 /* iv_loop is the loop to be vectorized. Generate:
7637 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7638 gimple_seq seq = NULL;
7639 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7640 {
7641 expr = build_int_cst (integer_type_node, vf);
7642 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7643 }
7644 else
7645 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7646 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7647 expr, step_expr);
7648 if (seq)
7649 {
7650 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7651 gcc_assert (!new_bb);
7652 }
7653 }
7654
7655 t = unshare_expr (new_name);
7656 gcc_assert (CONSTANT_CLASS_P (new_name)
7657 || TREE_CODE (new_name) == SSA_NAME);
7658 new_vec = build_vector_from_val (vectype, t);
7659 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7660
7661
7662 /* Create the following def-use cycle:
7663 loop prolog:
7664 vec_init = ...
7665 vec_step = ...
7666 loop:
7667 vec_iv = PHI <vec_init, vec_loop>
7668 ...
7669 STMT
7670 ...
7671 vec_loop = vec_iv + vec_step; */
7672
7673 /* Create the induction-phi that defines the induction-operand. */
7674 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7675 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7676 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7677 induc_def = PHI_RESULT (induction_phi);
7678
7679 /* Create the iv update inside the loop */
7680 vec_def = make_ssa_name (vec_dest);
7681 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7682 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7683 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7684
7685 /* Set the arguments of the phi node: */
7686 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7687 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7688 UNKNOWN_LOCATION);
7689
7690 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7691
7692 /* In case the vectorization factor (VF) is bigger than the number
7693 of elements that we can fit in a vectype (nunits), we have to generate
7694 more than one vector stmt - i.e - we need to "unroll" the
7695 vector stmt by a factor VF/nunits. For more details see documentation
7696 in vectorizable_operation. */
7697
7698 if (ncopies > 1)
7699 {
7700 gimple_seq seq = NULL;
7701 stmt_vec_info prev_stmt_vinfo;
7702 /* FORNOW. This restriction should be relaxed. */
7703 gcc_assert (!nested_in_vect_loop);
7704
7705 /* Create the vector that holds the step of the induction. */
7706 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7707 {
7708 expr = build_int_cst (integer_type_node, nunits);
7709 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7710 }
7711 else
7712 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7713 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7714 expr, step_expr);
7715 if (seq)
7716 {
7717 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7718 gcc_assert (!new_bb);
7719 }
7720
7721 t = unshare_expr (new_name);
7722 gcc_assert (CONSTANT_CLASS_P (new_name)
7723 || TREE_CODE (new_name) == SSA_NAME);
7724 new_vec = build_vector_from_val (vectype, t);
7725 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7726
7727 vec_def = induc_def;
7728 prev_stmt_vinfo = induction_phi_info;
7729 for (i = 1; i < ncopies; i++)
7730 {
7731 /* vec_i = vec_prev + vec_step */
7732 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7733 vec_def, vec_step);
7734 vec_def = make_ssa_name (vec_dest, new_stmt);
7735 gimple_assign_set_lhs (new_stmt, vec_def);
7736
7737 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7738 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7739 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7740 prev_stmt_vinfo = new_stmt_info;
7741 }
7742 }
7743
7744 if (nested_in_vect_loop)
7745 {
7746 /* Find the loop-closed exit-phi of the induction, and record
7747 the final vector of induction results: */
7748 exit_phi = NULL;
7749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7750 {
7751 gimple *use_stmt = USE_STMT (use_p);
7752 if (is_gimple_debug (use_stmt))
7753 continue;
7754
7755 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7756 {
7757 exit_phi = use_stmt;
7758 break;
7759 }
7760 }
7761 if (exit_phi)
7762 {
7763 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7764 /* FORNOW. Currently not supporting the case that an inner-loop induction
7765 is not used in the outer-loop (i.e. only outside the outer-loop). */
7766 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7767 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7768
7769 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7770 if (dump_enabled_p ())
7771 dump_printf_loc (MSG_NOTE, vect_location,
7772 "vector of inductions after inner-loop:%G",
7773 new_stmt);
7774 }
7775 }
7776
7777
7778 if (dump_enabled_p ())
7779 dump_printf_loc (MSG_NOTE, vect_location,
7780 "transform induction: created def-use cycle: %G%G",
7781 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7782
7783 return true;
7784 }
7785
7786 /* Function vectorizable_live_operation.
7787
7788 STMT_INFO computes a value that is used outside the loop. Check if
7789 it can be supported. */
7790
7791 bool
7792 vectorizable_live_operation (stmt_vec_info stmt_info,
7793 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7794 slp_tree slp_node, int slp_index,
7795 stmt_vec_info *vec_stmt,
7796 stmt_vector_for_cost *)
7797 {
7798 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7799 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7800 imm_use_iterator imm_iter;
7801 tree lhs, lhs_type, bitsize, vec_bitsize;
7802 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7803 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7804 int ncopies;
7805 gimple *use_stmt;
7806 auto_vec<tree> vec_oprnds;
7807 int vec_entry = 0;
7808 poly_uint64 vec_index = 0;
7809
7810 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7811
7812 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7813 return false;
7814
7815 /* FORNOW. CHECKME. */
7816 if (nested_in_vect_loop_p (loop, stmt_info))
7817 return false;
7818
7819 /* If STMT is not relevant and it is a simple assignment and its inputs are
7820 invariant then it can remain in place, unvectorized. The original last
7821 scalar value that it computes will be used. */
7822 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7823 {
7824 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7825 if (dump_enabled_p ())
7826 dump_printf_loc (MSG_NOTE, vect_location,
7827 "statement is simple and uses invariant. Leaving in "
7828 "place.\n");
7829 return true;
7830 }
7831
7832 if (slp_node)
7833 ncopies = 1;
7834 else
7835 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7836
7837 if (slp_node)
7838 {
7839 gcc_assert (slp_index >= 0);
7840
7841 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7842 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7843
7844 /* Get the last occurrence of the scalar index from the concatenation of
7845 all the slp vectors. Calculate which slp vector it is and the index
7846 within. */
7847 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
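/* A worked example with assumed figures: with 3 scalar stmts and two
   4-element vectors the concatenation holds 8 results, the final
   occurrences being at positions 5, 6 and 7; for slp_index 1 this gives
   pos = 8 - 3 + 1 = 6, i.e. lane 2 of vector 1.  */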
7848
7849 /* Calculate which vector contains the result, and which lane of
7850 that vector we need. */
7851 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7852 {
7853 if (dump_enabled_p ())
7854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7855 "Cannot determine which vector holds the"
7856 " final result.\n");
7857 return false;
7858 }
7859 }
7860
7861 if (!vec_stmt)
7862 {
7863 /* No transformation required. */
7864 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7865 {
7866 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7867 OPTIMIZE_FOR_SPEED))
7868 {
7869 if (dump_enabled_p ())
7870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7871 "can't use a fully-masked loop because "
7872 "the target doesn't support extract last "
7873 "reduction.\n");
7874 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7875 }
7876 else if (slp_node)
7877 {
7878 if (dump_enabled_p ())
7879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7880 "can't use a fully-masked loop because an "
7881 "SLP statement is live after the loop.\n");
7882 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7883 }
7884 else if (ncopies > 1)
7885 {
7886 if (dump_enabled_p ())
7887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7888 "can't use a fully-masked loop because"
7889 " ncopies is greater than 1.\n");
7890 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7891 }
7892 else
7893 {
7894 gcc_assert (ncopies == 1 && !slp_node);
7895 vect_record_loop_mask (loop_vinfo,
7896 &LOOP_VINFO_MASKS (loop_vinfo),
7897 1, vectype);
7898 }
7899 }
7900 return true;
7901 }
7902
7903 /* Use the lhs of the original scalar statement. */
7904 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7905
7906 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7907 : gimple_get_lhs (stmt);
7908 lhs_type = TREE_TYPE (lhs);
7909
7910 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7911 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7912 : TYPE_SIZE (TREE_TYPE (vectype)));
7913 vec_bitsize = TYPE_SIZE (vectype);
7914
7915 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7916 tree vec_lhs, bitstart;
7917 if (slp_node)
7918 {
7919 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7920
7921 /* Get the correct slp vectorized stmt. */
7922 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7923 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7924 vec_lhs = gimple_phi_result (phi);
7925 else
7926 vec_lhs = gimple_get_lhs (vec_stmt);
7927
7928 /* Get entry to use. */
7929 bitstart = bitsize_int (vec_index);
7930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7931 }
7932 else
7933 {
7934 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7935 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7936 gcc_checking_assert (ncopies == 1
7937 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7938
7939 /* For multiple copies, get the last copy. */
7940 for (int i = 1; i < ncopies; ++i)
7941 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7942
7943 /* Get the last lane in the vector. */
7944 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7945 }
7946
7947 /* To ensure that the VEC_LHS for lane extraction stmts satisfies the
7948 loop-closed PHI requirement, insert one phi node for it. It looks like:
7949 loop;
7950 BB:
7951 # lhs' = PHI <lhs>
7952 ==>
7953 loop;
7954 BB:
7955 # vec_lhs' = PHI <vec_lhs>
7956 new_tree = lane_extract <vec_lhs', ...>;
7957 lhs' = new_tree; */
7958
7959 basic_block exit_bb = single_exit (loop)->dest;
7960 gcc_assert (single_pred_p (exit_bb));
7961
7962 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
7963 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
7964 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
7965
7966 gimple_seq stmts = NULL;
7967 tree new_tree;
7968 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7969 {
7970 /* Emit:
7971
7972 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7973
7974 where VEC_LHS is the vectorized live-out result and MASK is
7975 the loop mask for the final iteration. */
7976 gcc_assert (ncopies == 1 && !slp_node);
7977 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7978 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
7979 vectype, 0);
7980 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
7981 mask, vec_lhs_phi);
7982
7983 /* Convert the extracted vector element to the required scalar type. */
7984 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7985 }
7986 else
7987 {
7988 tree bftype = TREE_TYPE (vectype);
7989 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7990 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7991 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
7992 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7993 &stmts, true, NULL_TREE);
7994 }
7995
7996 if (stmts)
7997 {
7998 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
7999 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8000
8001 /* Remove the existing phi node for lhs and create a copy assignment from new_tree instead. */
8002 tree lhs_phi = NULL_TREE;
8003 gimple_stmt_iterator gsi;
8004 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8005 {
8006 gimple *phi = gsi_stmt (gsi);
8007 if ((gimple_phi_arg_def (phi, 0) == lhs))
8008 {
8009 remove_phi_node (&gsi, false);
8010 lhs_phi = gimple_phi_result (phi);
8011 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8012 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8013 break;
8014 }
8015 }
8016 }
8017
8018 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8019 single-arg PHI, just replace all uses of the PHI result. This is necessary
8020 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8021 use_operand_p use_p;
8022 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8023 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8024 && !is_gimple_debug (use_stmt))
8025 {
8026 if (gimple_code (use_stmt) == GIMPLE_PHI
8027 && gimple_phi_num_args (use_stmt) == 1)
8028 {
8029 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8030 }
8031 else
8032 {
8033 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8034 SET_USE (use_p, new_tree);
8035 }
8036 update_stmt (use_stmt);
8037 }
8038
8039 return true;
8040 }
8041
8042 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8043
8044 static void
8045 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8046 {
8047 ssa_op_iter op_iter;
8048 imm_use_iterator imm_iter;
8049 def_operand_p def_p;
8050 gimple *ustmt;
8051
8052 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8053 {
8054 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8055 {
8056 basic_block bb;
8057
8058 if (!is_gimple_debug (ustmt))
8059 continue;
8060
8061 bb = gimple_bb (ustmt);
8062
8063 if (!flow_bb_inside_loop_p (loop, bb))
8064 {
8065 if (gimple_debug_bind_p (ustmt))
8066 {
8067 if (dump_enabled_p ())
8068 dump_printf_loc (MSG_NOTE, vect_location,
8069 "killing debug use\n");
8070
8071 gimple_debug_bind_reset_value (ustmt);
8072 update_stmt (ustmt);
8073 }
8074 else
8075 gcc_unreachable ();
8076 }
8077 }
8078 }
8079 }
8080
8081 /* Given loop represented by LOOP_VINFO, return true if computation of
8082 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8083 otherwise. */
8084
8085 static bool
8086 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8087 {
8088 /* Constant case. */
8089 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8090 {
8091 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8092 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8093
8094 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8095 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8096 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8097 return true;
8098 }
8099
8100 widest_int max;
8101 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8102 /* Check the upper bound of loop niters. */
8103 if (get_max_loop_iterations (loop, &max))
8104 {
8105 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8106 signop sgn = TYPE_SIGN (type);
8107 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8108 if (max < type_max)
8109 return true;
8110 }
8111 return false;
8112 }
8113
8114 /* Return a mask type with half the number of elements as TYPE. */
8115
8116 tree
8117 vect_halve_mask_nunits (tree type)
8118 {
8119 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8120 return build_truth_vector_type (nunits, current_vector_size);
8121 }
8122
8123 /* Return a mask type with twice as many elements as TYPE. */
8124
8125 tree
8126 vect_double_mask_nunits (tree type)
8127 {
8128 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8129 return build_truth_vector_type (nunits, current_vector_size);
8130 }
8131
8132 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8133 contain a sequence of NVECTORS masks that each control a vector of type
8134 VECTYPE. */
8135
8136 void
8137 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8138 unsigned int nvectors, tree vectype)
8139 {
8140 gcc_assert (nvectors != 0);
8141 if (masks->length () < nvectors)
8142 masks->safe_grow_cleared (nvectors);
8143 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8144 /* The number of scalars per iteration and the number of vectors are
8145 both compile-time constants. */
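  /* For example, an rgroup that needs two V4SI vectors per iteration with a
     vectorization factor of 8 handles 2 * 4 / 8 = 1 scalar per iteration.  */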
8146 unsigned int nscalars_per_iter
8147 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8148 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8149 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8150 {
8151 rgm->max_nscalars_per_iter = nscalars_per_iter;
8152 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8153 }
8154 }
8155
8156 /* Given a complete set of masks MASKS, extract mask number INDEX
8157 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8158 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8159
8160 See the comment above vec_loop_masks for more details about the mask
8161 arrangement. */
8162
8163 tree
8164 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8165 unsigned int nvectors, tree vectype, unsigned int index)
8166 {
8167 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8168 tree mask_type = rgm->mask_type;
8169
8170 /* Populate the rgroup's mask array, if this is the first time we've
8171 used it. */
8172 if (rgm->masks.is_empty ())
8173 {
8174 rgm->masks.safe_grow_cleared (nvectors);
8175 for (unsigned int i = 0; i < nvectors; ++i)
8176 {
8177 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8178 /* Provide a dummy definition until the real one is available. */
8179 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8180 rgm->masks[i] = mask;
8181 }
8182 }
8183
8184 tree mask = rgm->masks[index];
8185 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8186 TYPE_VECTOR_SUBPARTS (vectype)))
8187 {
8188 /* A loop mask for data type X can be reused for data type Y
8189 if X has N times more elements than Y and if Y's elements
8190 are N times bigger than X's. In this case each sequence
8191 of N elements in the loop mask will be all-zero or all-one.
8192 We can then view-convert the mask so that each sequence of
8193 N elements is replaced by a single element. */
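      /* E.g. a mask built for eight 16-bit elements can be reused for four
	 32-bit elements: its elements come in identical pairs, so
	 view-converting it to a four-element mask of the same overall size
	 yields the required all-zero or all-one values.  */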
8194 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8195 TYPE_VECTOR_SUBPARTS (vectype)));
8196 gimple_seq seq = NULL;
8197 mask_type = build_same_sized_truth_vector_type (vectype);
8198 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8199 if (seq)
8200 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8201 }
8202 return mask;
8203 }
8204
8205 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8206    according to the estimated number of iterations.  */
8207
8208 static void
8209 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8210 {
8211 edge preheader = loop_preheader_edge (loop);
8212 /* Reduce loop iterations by the vectorization factor. */
8213 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8214 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8215
8216 if (freq_h.nonzero_p ())
8217 {
8218 profile_probability p;
8219
8220       /* Avoid dropping the loop body profile count to 0 because of a zero
8221 	 count in the loop's preheader.  */
8222 if (!(freq_e == profile_count::zero ()))
8223 freq_e = freq_e.force_nonzero ();
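      /* Rescale the loop body so that the header count becomes roughly
	 FREQ_E * (NEW_EST_NITER + 1): the loop is entered once per preheader
	 execution and iterates the new estimated number of times.  */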
8224 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8225 scale_loop_frequencies (loop, p);
8226 }
8227
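  /* The loop now exits roughly once per NEW_EST_NITER + 1 executions of the
     exit test, so give the exit edge probability 1 / (NEW_EST_NITER + 1).  */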
8228 edge exit_e = single_exit (loop);
8229 exit_e->probability = profile_probability::always ()
8230 .apply_scale (1, new_est_niter + 1);
8231
8232 edge exit_l = single_pred_edge (loop->latch);
8233 profile_probability prob = exit_l->probability;
8234 exit_l->probability = exit_e->probability.invert ();
8235 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8236 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8237 }
8238
8239 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8240 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8241 stmt_vec_info. */
8242
8243 static void
8244 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8245 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8246 {
8247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8248 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8249
8250 if (dump_enabled_p ())
8251 dump_printf_loc (MSG_NOTE, vect_location,
8252 "------>vectorizing statement: %G", stmt_info->stmt);
8253
8254 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8255 vect_loop_kill_debug_uses (loop, stmt_info);
8256
8257 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8258 && !STMT_VINFO_LIVE_P (stmt_info))
8259 return;
8260
8261 if (STMT_VINFO_VECTYPE (stmt_info))
8262 {
8263 poly_uint64 nunits
8264 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8265 if (!STMT_SLP_TYPE (stmt_info)
8266 && maybe_ne (nunits, vf)
8267 && dump_enabled_p ())
8268 	/* For SLP, VF is set according to the unrolling factor, not
8269 	   the vector size, hence this message is not valid for SLP.  */
8270 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8271 }
8272
8273 /* Pure SLP statements have already been vectorized. We still need
8274 to apply loop vectorization to hybrid SLP statements. */
8275 if (PURE_SLP_STMT (stmt_info))
8276 return;
8277
8278 if (dump_enabled_p ())
8279 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8280
8281 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8282 *seen_store = stmt_info;
8283 }
8284
8285 /* Function vect_transform_loop.
8286
8287 The analysis phase has determined that the loop is vectorizable.
8288    Vectorize the loop - create vectorized stmts to replace the scalar
8289 stmts in the loop, and update the loop exit condition.
8290 Returns scalar epilogue loop if any. */
8291
8292 struct loop *
8293 vect_transform_loop (loop_vec_info loop_vinfo)
8294 {
8295 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8296 struct loop *epilogue = NULL;
8297 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8298 int nbbs = loop->num_nodes;
8299 int i;
8300 tree niters_vector = NULL_TREE;
8301 tree step_vector = NULL_TREE;
8302 tree niters_vector_mult_vf = NULL_TREE;
8303 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8304 unsigned int lowest_vf = constant_lower_bound (vf);
8305 gimple *stmt;
8306 bool check_profitability = false;
8307 unsigned int th;
8308
8309 DUMP_VECT_SCOPE ("vec_transform_loop");
8310
8311 loop_vinfo->shared->check_datarefs ();
8312
8313   /* Use the more conservative vectorization threshold.  If the number
8314      of iterations is constant, assume the cost check has been performed
8315      by our caller.  If the threshold makes all loops profitable that
8316      run at least the (estimated) vectorization factor number of times,
8317      checking is pointless, too.  */
8318 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8319 if (th >= vect_vf_for_cost (loop_vinfo)
8320 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8321 {
8322 if (dump_enabled_p ())
8323 dump_printf_loc (MSG_NOTE, vect_location,
8324 "Profitability threshold is %d loop iterations.\n",
8325 th);
8326 check_profitability = true;
8327 }
8328
8329 /* Make sure there exists a single-predecessor exit bb. Do this before
8330 versioning. */
8331 edge e = single_exit (loop);
8332 if (! single_pred_p (e->dest))
8333 {
8334 split_loop_exit_edge (e, true);
8335 if (dump_enabled_p ())
8336 dump_printf (MSG_NOTE, "split exit edge\n");
8337 }
8338
8339 /* Version the loop first, if required, so the profitability check
8340 comes first. */
8341
8342 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8343 {
8344 poly_uint64 versioning_threshold
8345 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
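      /* If both a cost-model threshold and a versioning threshold apply,
	 fold the cost check into the versioning check by taking the larger
	 of the two bounds, so only one runtime comparison is needed.  */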
8346 if (check_profitability
8347 && ordered_p (poly_uint64 (th), versioning_threshold))
8348 {
8349 versioning_threshold = ordered_max (poly_uint64 (th),
8350 versioning_threshold);
8351 check_profitability = false;
8352 }
8353 struct loop *sloop
8354 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8355 versioning_threshold);
8356 sloop->force_vectorize = false;
8357 check_profitability = false;
8358 }
8359
8360   /* Make sure there exists a single-predecessor exit bb also on the
8361      scalar loop copy.  Do this after versioning but before peeling
8362      so the CFG structure is fine for both the scalar and the if-converted
8363      loop, and so slpeel_duplicate_current_defs_from_edges sees matched
8364      loop-closed PHI nodes on the exit.  */
8365 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8366 {
8367 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8368 if (! single_pred_p (e->dest))
8369 {
8370 split_loop_exit_edge (e, true);
8371 if (dump_enabled_p ())
8372 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8373 }
8374 }
8375
8376 tree niters = vect_build_loop_niters (loop_vinfo);
8377 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8378 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8379 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8380 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8381 &step_vector, &niters_vector_mult_vf, th,
8382 check_profitability, niters_no_overflow);
8383
8384 if (niters_vector == NULL_TREE)
8385 {
8386 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8387 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8388 && known_eq (lowest_vf, vf))
8389 {
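	  /* With a compile-time iteration count and a constant VF, the
	     number of vector iterations is simply NITERS / VF, stepped by
	     one: e.g. 16 scalar iterations with VF = 4 give 4 vector
	     iterations.  */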
8390 niters_vector
8391 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8392 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8393 step_vector = build_one_cst (TREE_TYPE (niters));
8394 }
8395 else
8396 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8397 &step_vector, niters_no_overflow);
8398 }
8399
8400 /* 1) Make sure the loop header has exactly two entries
8401 2) Make sure we have a preheader basic block. */
8402
8403 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8404
8405 split_edge (loop_preheader_edge (loop));
8406
8407 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8408 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8409 /* This will deal with any possible peeling. */
8410 vect_prepare_for_masked_peels (loop_vinfo);
8411
8412 /* Schedule the SLP instances first, then handle loop vectorization
8413 below. */
8414 if (!loop_vinfo->slp_instances.is_empty ())
8415 {
8416 DUMP_VECT_SCOPE ("scheduling SLP instances");
8417 vect_schedule_slp (loop_vinfo);
8418 }
8419
8420   /* FORNOW: the vectorizer supports only loops whose body consists
8421      of one basic block (header + empty latch).  When the vectorizer
8422      supports more involved loop forms, the order in which the BBs are
8423      traversed will need to be reconsidered.  */
8424
8425 for (i = 0; i < nbbs; i++)
8426 {
8427 basic_block bb = bbs[i];
8428 stmt_vec_info stmt_info;
8429
8430 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8431 gsi_next (&si))
8432 {
8433 gphi *phi = si.phi ();
8434 if (dump_enabled_p ())
8435 dump_printf_loc (MSG_NOTE, vect_location,
8436 "------>vectorizing phi: %G", phi);
8437 stmt_info = loop_vinfo->lookup_stmt (phi);
8438 if (!stmt_info)
8439 continue;
8440
8441 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8442 vect_loop_kill_debug_uses (loop, stmt_info);
8443
8444 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8445 && !STMT_VINFO_LIVE_P (stmt_info))
8446 continue;
8447
8448 if (STMT_VINFO_VECTYPE (stmt_info)
8449 && (maybe_ne
8450 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8451 && dump_enabled_p ())
8452 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8453
8454 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8455 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8456 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8457 && ! PURE_SLP_STMT (stmt_info))
8458 {
8459 if (dump_enabled_p ())
8460 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8461 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8462 }
8463 }
8464
8465 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8466 !gsi_end_p (si);)
8467 {
8468 stmt = gsi_stmt (si);
8469 /* During vectorization remove existing clobber stmts. */
8470 if (gimple_clobber_p (stmt))
8471 {
8472 unlink_stmt_vdef (stmt);
8473 gsi_remove (&si, true);
8474 release_defs (stmt);
8475 }
8476 else
8477 {
8478 stmt_info = loop_vinfo->lookup_stmt (stmt);
8479
8480 /* vector stmts created in the outer-loop during vectorization of
8481 stmts in an inner-loop may not have a stmt_info, and do not
8482 need to be vectorized. */
8483 stmt_vec_info seen_store = NULL;
8484 if (stmt_info)
8485 {
8486 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8487 {
8488 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8489 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8490 !gsi_end_p (subsi); gsi_next (&subsi))
8491 {
8492 stmt_vec_info pat_stmt_info
8493 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8494 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8495 &si, &seen_store);
8496 }
8497 stmt_vec_info pat_stmt_info
8498 = STMT_VINFO_RELATED_STMT (stmt_info);
8499 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8500 &seen_store);
8501 }
8502 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8503 &seen_store);
8504 }
8505 gsi_next (&si);
8506 if (seen_store)
8507 {
8508 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8509 		    /* Interleaving.  The vectorization of the
8510 		       interleaving chain was completed; free all
8511 		       the stores in the chain.  */
8512 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8513 else
8514 /* Free the attached stmt_vec_info and remove the stmt. */
8515 loop_vinfo->remove_stmt (stmt_info);
8516 }
8517 }
8518 }
8519
8520 /* Stub out scalar statements that must not survive vectorization.
8521 Doing this here helps with grouped statements, or statements that
8522 are involved in patterns. */
8523 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8524 !gsi_end_p (gsi); gsi_next (&gsi))
8525 {
8526 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8527 if (!call || !gimple_call_internal_p (call))
8528 continue;
8529 internal_fn ifn = gimple_call_internal_fn (call);
8530 if (ifn == IFN_MASK_LOAD)
8531 {
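	      /* A scalar MASK_LOAD left over from a group or a pattern must
		 not survive vectorization; its result is unused, so replace
		 it with a zero of the appropriate type.  */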
8532 tree lhs = gimple_get_lhs (call);
8533 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8534 {
8535 tree zero = build_zero_cst (TREE_TYPE (lhs));
8536 gimple *new_stmt = gimple_build_assign (lhs, zero);
8537 gsi_replace (&gsi, new_stmt, true);
8538 }
8539 }
8540 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
8541 {
8542 tree lhs = gimple_get_lhs (call);
8543 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8544 {
8545 tree else_arg
8546 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
8547 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
8548 gsi_replace (&gsi, new_stmt, true);
8549 }
8550 }
8551 }
8552 } /* BBs in loop */
8553
8554   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8555      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8556 if (integer_onep (step_vector))
8557 niters_no_overflow = true;
8558 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8559 niters_vector_mult_vf, !niters_no_overflow);
8560
8561 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8562 scale_profile_for_vect_loop (loop, assumed_vf);
8563
8564 /* True if the final iteration might not handle a full vector's
8565 worth of scalar iterations. */
8566 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8567 /* The minimum number of iterations performed by the epilogue. This
8568 is 1 when peeling for gaps because we always need a final scalar
8569 iteration. */
8570 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8571 /* +1 to convert latch counts to loop iteration counts,
8572 -min_epilogue_iters to remove iterations that cannot be performed
8573 by the vector code. */
8574 int bias_for_lowest = 1 - min_epilogue_iters;
8575 int bias_for_assumed = bias_for_lowest;
8576 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8577 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8578 {
8579 /* When the amount of peeling is known at compile time, the first
8580 iteration will have exactly alignment_npeels active elements.
8581 In the worst case it will have at least one. */
8582 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8583 bias_for_lowest += lowest_vf - min_first_active;
8584 bias_for_assumed += assumed_vf - min_first_active;
8585 }
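  /* For example, with VF = 4, no epilogue iterations and no peeling,
     bias_for_lowest is 1, so a latch-count upper bound of 11 (12 scalar
     iterations) becomes floor ((11 + 1) / 4) - 1 = 2, i.e. at most 3
     vector iterations.  */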
8586 /* In these calculations the "- 1" converts loop iteration counts
8587 back to latch counts. */
8588 if (loop->any_upper_bound)
8589 loop->nb_iterations_upper_bound
8590 = (final_iter_may_be_partial
8591 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8592 lowest_vf) - 1
8593 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8594 lowest_vf) - 1);
8595 if (loop->any_likely_upper_bound)
8596 loop->nb_iterations_likely_upper_bound
8597 = (final_iter_may_be_partial
8598 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8599 + bias_for_lowest, lowest_vf) - 1
8600 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8601 + bias_for_lowest, lowest_vf) - 1);
8602 if (loop->any_estimate)
8603 loop->nb_iterations_estimate
8604 = (final_iter_may_be_partial
8605 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8606 assumed_vf) - 1
8607 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8608 assumed_vf) - 1);
8609
8610 if (dump_enabled_p ())
8611 {
8612 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8613 {
8614 dump_printf_loc (MSG_NOTE, vect_location,
8615 "LOOP VECTORIZED\n");
8616 if (loop->inner)
8617 dump_printf_loc (MSG_NOTE, vect_location,
8618 "OUTER LOOP VECTORIZED\n");
8619 dump_printf (MSG_NOTE, "\n");
8620 }
8621 else
8622 {
8623 dump_printf_loc (MSG_NOTE, vect_location,
8624 "LOOP EPILOGUE VECTORIZED (VS=");
8625 dump_dec (MSG_NOTE, current_vector_size);
8626 dump_printf (MSG_NOTE, ")\n");
8627 }
8628 }
8629
8630 /* Loops vectorized with a variable factor won't benefit from
8631 unrolling/peeling. */
8632 if (!vf.is_constant ())
8633 {
8634 loop->unroll = 1;
8635 if (dump_enabled_p ())
8636 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8637 " variable-length vectorization factor\n");
8638 }
8639 /* Free SLP instances here because otherwise stmt reference counting
8640 won't work. */
8641 slp_instance instance;
8642 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8643 vect_free_slp_instance (instance, true);
8644 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8645   /* Clear the safelen field since its value is invalid after vectorization:
8646      the vectorized loop can have loop-carried dependencies.  */
8647 loop->safelen = 0;
8648
8649 /* Don't vectorize epilogue for epilogue. */
8650 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8651 epilogue = NULL;
8652
8653 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8654 epilogue = NULL;
8655
8656 if (epilogue)
8657 {
8658 auto_vector_sizes vector_sizes;
8659 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8660 unsigned int next_size = 0;
8661
8662 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8663      on niters already adjusted for the iterations of the prologue.  */
8664 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8665 && known_eq (vf, lowest_vf))
8666 {
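	  /* For example, with 23 known scalar iterations, VF = 8 and no
	     peeling for gaps, the epilogue executes at most 23 % 8 = 7
	     iterations (a latch-count upper bound of 6).  */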
8667 unsigned HOST_WIDE_INT eiters
8668 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8669 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8670 eiters
8671 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8672 epilogue->nb_iterations_upper_bound = eiters - 1;
8673 epilogue->any_upper_bound = true;
8674
8675 unsigned int ratio;
8676 while (next_size < vector_sizes.length ()
8677 && !(constant_multiple_p (current_vector_size,
8678 vector_sizes[next_size], &ratio)
8679 && eiters >= lowest_vf / ratio))
8680 next_size += 1;
8681 }
8682 else
8683 while (next_size < vector_sizes.length ()
8684 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8685 next_size += 1;
8686
8687 if (next_size == vector_sizes.length ())
8688 epilogue = NULL;
8689 }
8690
8691 if (epilogue)
8692 {
8693 epilogue->force_vectorize = loop->force_vectorize;
8694 epilogue->safelen = loop->safelen;
8695 epilogue->dont_vectorize = false;
8696
8697 /* We may need to if-convert epilogue to vectorize it. */
8698 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8699 tree_if_conversion (epilogue);
8700 }
8701
8702 return epilogue;
8703 }
8704
8705 /* The code below tries to perform a simple optimization - revert
8706    if-conversion for masked stores: if the mask of a store is zero, skip
8707    the store and, where possible, the producers of the stored values too.
8708 For example,
8709 for (i=0; i<n; i++)
8710 if (c[i])
8711 {
8712 p1[i] += 1;
8713 p2[i] = p3[i] +2;
8714 }
8715 this transformation will produce the following semi-hammock:
8716
8717 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8718 {
8719 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8720 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8721 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8722 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8723 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8724 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8725 }
8726 */
8727
8728 void
8729 optimize_mask_stores (struct loop *loop)
8730 {
8731 basic_block *bbs = get_loop_body (loop);
8732 unsigned nbbs = loop->num_nodes;
8733 unsigned i;
8734 basic_block bb;
8735 struct loop *bb_loop;
8736 gimple_stmt_iterator gsi;
8737 gimple *stmt;
8738 auto_vec<gimple *> worklist;
8739 auto_purge_vect_location sentinel;
8740
8741 vect_location = find_loop_location (loop);
8742 /* Pick up all masked stores in loop if any. */
8743 for (i = 0; i < nbbs; i++)
8744 {
8745 bb = bbs[i];
8746 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8747 gsi_next (&gsi))
8748 {
8749 stmt = gsi_stmt (gsi);
8750 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8751 worklist.safe_push (stmt);
8752 }
8753 }
8754
8755 free (bbs);
8756 if (worklist.is_empty ())
8757 return;
8758
8759 /* Loop has masked stores. */
8760 while (!worklist.is_empty ())
8761 {
8762 gimple *last, *last_store;
8763 edge e, efalse;
8764 tree mask;
8765 basic_block store_bb, join_bb;
8766 gimple_stmt_iterator gsi_to;
8767 tree vdef, new_vdef;
8768 gphi *phi;
8769 tree vectype;
8770 tree zero;
8771
8772 last = worklist.pop ();
8773 mask = gimple_call_arg (last, 2);
8774 bb = gimple_bb (last);
8775       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8776 	 the same loop as if_bb.  It could differ from LOOP when a
8777 	 two-level loop nest is vectorized and the mask_store belongs to
8778 	 the inner one.  */
8779 e = split_block (bb, last);
8780 bb_loop = bb->loop_father;
8781 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8782 join_bb = e->dest;
8783 store_bb = create_empty_bb (bb);
8784 add_bb_to_loop (store_bb, bb_loop);
8785 e->flags = EDGE_TRUE_VALUE;
8786 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8787       /* Put STORE_BB on the unlikely side of the branch.  */
8788 efalse->probability = profile_probability::unlikely ();
8789 store_bb->count = efalse->count ();
8790 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8791 if (dom_info_available_p (CDI_DOMINATORS))
8792 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8793 if (dump_enabled_p ())
8794 dump_printf_loc (MSG_NOTE, vect_location,
8795 "Create new block %d to sink mask stores.",
8796 store_bb->index);
8797 /* Create vector comparison with boolean result. */
8798 vectype = TREE_TYPE (mask);
8799 zero = build_zero_cst (vectype);
8800 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8801 gsi = gsi_last_bb (bb);
8802 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8803 /* Create new PHI node for vdef of the last masked store:
8804 .MEM_2 = VDEF <.MEM_1>
8805 will be converted to
8806 .MEM.3 = VDEF <.MEM_1>
8807 and new PHI node will be created in join bb
8808 .MEM_2 = PHI <.MEM_1, .MEM_3>
8809 */
8810 vdef = gimple_vdef (last);
8811 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8812 gimple_set_vdef (last, new_vdef);
8813 phi = create_phi_node (vdef, join_bb);
8814 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8815
8816 /* Put all masked stores with the same mask to STORE_BB if possible. */
8817 while (true)
8818 {
8819 gimple_stmt_iterator gsi_from;
8820 gimple *stmt1 = NULL;
8821
8822 /* Move masked store to STORE_BB. */
8823 last_store = last;
8824 gsi = gsi_for_stmt (last);
8825 gsi_from = gsi;
8826 /* Shift GSI to the previous stmt for further traversal. */
8827 gsi_prev (&gsi);
8828 gsi_to = gsi_start_bb (store_bb);
8829 gsi_move_before (&gsi_from, &gsi_to);
8830 	  /* Set GSI_TO to the start of the now non-empty block.  */
8831 gsi_to = gsi_start_bb (store_bb);
8832 if (dump_enabled_p ())
8833 dump_printf_loc (MSG_NOTE, vect_location,
8834 "Move stmt to created bb\n%G", last);
8835 /* Move all stored value producers if possible. */
8836 while (!gsi_end_p (gsi))
8837 {
8838 tree lhs;
8839 imm_use_iterator imm_iter;
8840 use_operand_p use_p;
8841 bool res;
8842
8843 /* Skip debug statements. */
8844 if (is_gimple_debug (gsi_stmt (gsi)))
8845 {
8846 gsi_prev (&gsi);
8847 continue;
8848 }
8849 stmt1 = gsi_stmt (gsi);
8850 	      /* Do not consider statements writing to memory or having
8851 		 a volatile operand.  */
8852 if (gimple_vdef (stmt1)
8853 || gimple_has_volatile_ops (stmt1))
8854 break;
8855 gsi_from = gsi;
8856 gsi_prev (&gsi);
8857 lhs = gimple_get_lhs (stmt1);
8858 if (!lhs)
8859 break;
8860
8861 /* LHS of vectorized stmt must be SSA_NAME. */
8862 if (TREE_CODE (lhs) != SSA_NAME)
8863 break;
8864
8865 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8866 {
8867 /* Remove dead scalar statement. */
8868 if (has_zero_uses (lhs))
8869 {
8870 gsi_remove (&gsi_from, true);
8871 continue;
8872 }
8873 }
8874
8875 /* Check that LHS does not have uses outside of STORE_BB. */
8876 res = true;
8877 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8878 {
8879 gimple *use_stmt;
8880 use_stmt = USE_STMT (use_p);
8881 if (is_gimple_debug (use_stmt))
8882 continue;
8883 if (gimple_bb (use_stmt) != store_bb)
8884 {
8885 res = false;
8886 break;
8887 }
8888 }
8889 if (!res)
8890 break;
8891
8892 if (gimple_vuse (stmt1)
8893 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8894 break;
8895
8896 /* Can move STMT1 to STORE_BB. */
8897 if (dump_enabled_p ())
8898 dump_printf_loc (MSG_NOTE, vect_location,
8899 "Move stmt to created bb\n%G", stmt1);
8900 gsi_move_before (&gsi_from, &gsi_to);
8901 /* Shift GSI_TO for further insertion. */
8902 gsi_prev (&gsi_to);
8903 }
8904 /* Put other masked stores with the same mask to STORE_BB. */
8905 if (worklist.is_empty ()
8906 || gimple_call_arg (worklist.last (), 2) != mask
8907 || worklist.last () != stmt1)
8908 break;
8909 last = worklist.pop ();
8910 }
8911 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8912 }
8913 }
8914