1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69    as if it had been manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133    Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138    Currently the only target-specific information used is the size of the
139    vector (in bytes), given by "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140    Targets that can support different vector sizes will, for now, need to
141    specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142    flexibility will be added in the future.
143
144    Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147    machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149    we can't vectorize the stmt (a small sketch of this check follows this comment).
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
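
/* For illustration only (a hedged sketch, not the exact code of this pass):
   the optab query described in the comment above is roughly of the following
   shape.  "vectype" here stands for whatever vector type the statement was
   assigned; the real checks live in the vectorizable_* routines and in
   optabs-tree.c.

     machine_mode vmode = TYPE_MODE (vectype);        // e.g. V8HImode
     if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
       return false;                                  // no target support

   If the handler is anything other than CODE_FOR_nothing the target has an
   instruction for the operation in that mode and the stmt can be considered
   for vectorization.  */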
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188	/* The only case when a vectype has already been set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else
195 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
196 }
197
198 if (nunits_vectype)
199 vect_update_max_nunits (vf, nunits_vectype);
200
201 return opt_result::success ();
202 }
203
204 /* Subroutine of vect_determine_vectorization_factor. Set the vector
205 types of STMT_INFO and all attached pattern statements and update
206 the vectorization factor VF accordingly. Return true on success
207 or false if something prevented vectorization. */
208
209 static opt_result
210 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
211 {
212 vec_info *vinfo = stmt_info->vinfo;
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
215 stmt_info->stmt);
216 opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
217 if (!res)
218 return res;
219
220 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
221 && STMT_VINFO_RELATED_STMT (stmt_info))
222 {
223 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
224 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
225
226 /* If a pattern statement has def stmts, analyze them too. */
227 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
228 !gsi_end_p (si); gsi_next (&si))
229 {
230 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
231 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "==> examining pattern def stmt: %G",
234 def_stmt_info->stmt);
235 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
236 if (!res)
237 return res;
238 }
239
240 if (dump_enabled_p ())
241 dump_printf_loc (MSG_NOTE, vect_location,
242 "==> examining pattern statement: %G",
243 stmt_info->stmt);
244 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
245 if (!res)
246 return res;
247 }
248
249 return opt_result::success ();
250 }
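
/* A concrete (hedged) example of the "pattern statement" case handled above;
   the actual pattern recognizers live in tree-vect-patterns.c.  For source
   like

     short a, b;
     int c = (int) a * (int) b;

   the widening-multiply idiom can be represented by a pattern statement
   (e.g. a WIDEN_MULT_EXPR), possibly with helper statements in its
   PATTERN_DEF_SEQ.  The function above examines the original statement, the
   pattern def sequence and the main pattern statement, so that the vector
   types of all of them contribute to the vectorization factor.  */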
251
252 /* Function vect_determine_vectorization_factor
253
254 Determine the vectorization factor (VF). VF is the number of data elements
255 that are operated upon in parallel in a single iteration of the vectorized
256    loop. For example, when vectorizing a loop that operates on 4-byte elements,
257    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
258 elements can fit in a single vector register.
259
260 We currently support vectorization of loops in which all types operated upon
261 are of the same size. Therefore this function currently sets VF according to
262 the size of the types operated upon, and fails if there are multiple sizes
263 in the loop.
264
265 VF is also the factor by which the loop iterations are strip-mined, e.g.:
266 original loop:
267 for (i=0; i<N; i++){
268 a[i] = b[i] + c[i];
269 }
270
271 vectorized loop:
272 for (i=0; i<N; i+=VF){
273 a[i:VF] = b[i:VF] + c[i:VF];
274 }
275 */
276
277 static opt_result
278 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
279 {
280 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
281 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
282 unsigned nbbs = loop->num_nodes;
283 poly_uint64 vectorization_factor = 1;
284 tree scalar_type = NULL_TREE;
285 gphi *phi;
286 tree vectype;
287 stmt_vec_info stmt_info;
288 unsigned i;
289
290 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
291
292 for (i = 0; i < nbbs; i++)
293 {
294 basic_block bb = bbs[i];
295
296 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
297 gsi_next (&si))
298 {
299 phi = si.phi ();
300 stmt_info = loop_vinfo->lookup_stmt (phi);
301 if (dump_enabled_p ())
302 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
303 phi);
304
305 gcc_assert (stmt_info);
306
307 if (STMT_VINFO_RELEVANT_P (stmt_info)
308 || STMT_VINFO_LIVE_P (stmt_info))
309 {
310 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
311 scalar_type = TREE_TYPE (PHI_RESULT (phi));
312
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location,
315 "get vectype for scalar type: %T\n",
316 scalar_type);
317
318 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
319 if (!vectype)
320 return opt_result::failure_at (phi,
321 "not vectorized: unsupported "
322 "data-type %T\n",
323 scalar_type);
324 STMT_VINFO_VECTYPE (stmt_info) = vectype;
325
326 if (dump_enabled_p ())
327 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
328 vectype);
329
330 if (dump_enabled_p ())
331 {
332 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
333 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
334 dump_printf (MSG_NOTE, "\n");
335 }
336
337 vect_update_max_nunits (&vectorization_factor, vectype);
338 }
339 }
340
341 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
342 gsi_next (&si))
343 {
344 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
345 opt_result res
346 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
347 if (!res)
348 return res;
349 }
350 }
351
352 /* TODO: Analyze cost. Decide if worth while to vectorize. */
353 if (dump_enabled_p ())
354 {
355 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
356 dump_dec (MSG_NOTE, vectorization_factor);
357 dump_printf (MSG_NOTE, "\n");
358 }
359
360 if (known_le (vectorization_factor, 1U))
361 return opt_result::failure_at (vect_location,
362 "not vectorized: unsupported data-type\n");
363 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
364 return opt_result::success ();
365 }
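
/* Illustration of the result computed above (the numbers are only an
   example): with 4-byte ints and 16-byte vectors, each vector holds 4
   elements, so the vectorization factor is 4 and a loop of N scalar
   iterations becomes roughly

     for (i = 0; i + 4 <= N; i += 4)
       a[i:4] = b[i:4] + c[i:4];      // one vector iteration

   with the remaining N % 4 iterations handled separately (by an epilogue
   loop or, on fully-masked targets, by masking).  */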
366
367
368 /* Function vect_is_simple_iv_evolution.
369
370    FORNOW: A simple evolution of an induction variable in the loop is
371 considered a polynomial evolution. */
372
373 static bool
374 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
375 tree * step)
376 {
377 tree init_expr;
378 tree step_expr;
379 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
380 basic_block bb;
381
382 /* When there is no evolution in this loop, the evolution function
383 is not "simple". */
384 if (evolution_part == NULL_TREE)
385 return false;
386
387 /* When the evolution is a polynomial of degree >= 2
388 the evolution function is not "simple". */
389 if (tree_is_chrec (evolution_part))
390 return false;
391
392 step_expr = evolution_part;
393 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
394
395 if (dump_enabled_p ())
396 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
397 step_expr, init_expr);
398
399 *init = init_expr;
400 *step = step_expr;
401
402 if (TREE_CODE (step_expr) != INTEGER_CST
403 && (TREE_CODE (step_expr) != SSA_NAME
404 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
405 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
406 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
407 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
408 || !flag_associative_math)))
409 && (TREE_CODE (step_expr) != REAL_CST
410 || !flag_associative_math))
411 {
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414 "step unknown.\n");
415 return false;
416 }
417
418 return true;
419 }
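
/* A hedged example for the check above: for a loop like

     for (i = 0; i < N; i++)
       a[i] = ...;

   the scalar evolution of i in that loop is the chrec {0, +, 1}_loop, so
   INIT is 0 and STEP is 1; a secondary IV such as p = p0 + 4*i would have
   evolution {p0, +, 4}_loop.  Evolutions whose step is itself a chrec
   (polynomials of degree >= 2) or an SSA name defined inside the loop are
   rejected above.  */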
420
421 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
422 what we are assuming is a double reduction. For example, given
423 a structure like this:
424
425 outer1:
426 x_1 = PHI <x_4(outer2), ...>;
427 ...
428
429 inner:
430 x_2 = PHI <x_1(outer1), ...>;
431 ...
432 x_3 = ...;
433 ...
434
435 outer2:
436 x_4 = PHI <x_3(inner)>;
437 ...
438
439 outer loop analysis would treat x_1 as a double reduction phi and
440 this function would then return true for x_2. */
441
442 static bool
443 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
444 {
445 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
446 use_operand_p use_p;
447 ssa_op_iter op_iter;
448 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
449 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
450 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
451 return true;
452 return false;
453 }
454
455 /* Function vect_analyze_scalar_cycles_1.
456
457 Examine the cross iteration def-use cycles of scalar variables
458 in LOOP. LOOP_VINFO represents the loop that is now being
459 considered for vectorization (can be LOOP, or an outer-loop
460 enclosing LOOP). */
461
462 static void
463 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
464 {
465 basic_block bb = loop->header;
466 tree init, step;
467 auto_vec<stmt_vec_info, 64> worklist;
468 gphi_iterator gsi;
469 bool double_reduc, reduc_chain;
470
471 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
472
473 /* First - identify all inductions. Reduction detection assumes that all the
474      inductions have been identified; therefore, this order must not be
475 changed. */
476 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
477 {
478 gphi *phi = gsi.phi ();
479 tree access_fn = NULL;
480 tree def = PHI_RESULT (phi);
481 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
482
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
485
486 /* Skip virtual phi's. The data dependences that are associated with
487 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
488 if (virtual_operand_p (def))
489 continue;
490
491 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
492
493 /* Analyze the evolution function. */
494 access_fn = analyze_scalar_evolution (loop, def);
495 if (access_fn)
496 {
497 STRIP_NOPS (access_fn);
498 if (dump_enabled_p ())
499 dump_printf_loc (MSG_NOTE, vect_location,
500 "Access function of PHI: %T\n", access_fn);
501 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
502 = initial_condition_in_loop_num (access_fn, loop->num);
503 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
504 = evolution_part_in_loop_num (access_fn, loop->num);
505 }
506
507 if (!access_fn
508 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
509 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
510 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
511 && TREE_CODE (step) != INTEGER_CST))
512 {
513 worklist.safe_push (stmt_vinfo);
514 continue;
515 }
516
517 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
518 != NULL_TREE);
519 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
520
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
523 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
524 }
525
526
527 /* Second - identify all reductions and nested cycles. */
528 while (worklist.length () > 0)
529 {
530 stmt_vec_info stmt_vinfo = worklist.pop ();
531 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
532 tree def = PHI_RESULT (phi);
533
534 if (dump_enabled_p ())
535 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
536
537 gcc_assert (!virtual_operand_p (def)
538 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
539
540 stmt_vec_info reduc_stmt_info
541 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
542 &reduc_chain);
543 if (reduc_stmt_info)
544 {
545 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
546 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
547 if (double_reduc)
548 {
549 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location,
551 "Detected double reduction.\n");
552
553 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
554 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
555 }
556 else
557 {
558 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
559 {
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE, vect_location,
562 "Detected vectorizable nested cycle.\n");
563
564 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
565 }
566 else
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
574 /* Store the reduction cycles for possible vectorization in
575		     loop-aware SLP if it was not detected as a reduction
576 chain. */
577 if (! reduc_chain)
578 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
579 (reduc_stmt_info);
580 }
581 }
582 }
583 else
584 if (dump_enabled_p ())
585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
586 "Unknown def-use cycle pattern.\n");
587 }
588 }
589
590
591 /* Function vect_analyze_scalar_cycles.
592
593 Examine the cross iteration def-use cycles of scalar variables, by
594 analyzing the loop-header PHIs of scalar variables. Classify each
595 cycle as one of the following: invariant, induction, reduction, unknown.
596    We do that for the loop represented by LOOP_VINFO, and also for its
597    inner-loop, if one exists.
598 Examples for scalar cycles:
599
600 Example1: reduction:
601
602 loop1:
603 for (i=0; i<N; i++)
604 sum += a[i];
605
606 Example2: induction:
607
608 loop2:
609 for (i=0; i<N; i++)
610 a[i] = i; */
611
612 static void
613 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
614 {
615 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
616
617 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
618
619 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
620      Reductions in such an inner-loop therefore have different properties than
621 the reductions in the nest that gets vectorized:
622 1. When vectorized, they are executed in the same order as in the original
623 scalar loop, so we can't change the order of computation when
624 vectorizing them.
625 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
626 current checks are too strict. */
627
628 if (loop->inner)
629 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
630 }
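
/* A hedged example of the cycles classified above.  A single-loop
   accumulation

     for (i = 0; i < N; i++)
       sum += a[i];

   is the ordinary vect_reduction_def case, while a nested accumulation

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
         sum += a[i][j];

   is the "double reduction" shape: the value of sum is carried around both
   the inner and the outer loop, and the outer-loop header PHI for sum is
   what gets marked vect_double_reduction_def.  */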
631
632 /* Transfer group and reduction information from STMT_INFO to its
633 pattern stmt. */
634
635 static void
636 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
637 {
638 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
639 stmt_vec_info stmtp;
640 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
641 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
642 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
643 do
644 {
645 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
646 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
647 == STMT_VINFO_DEF_TYPE (stmt_info));
648 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
649 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
650 if (stmt_info)
651 REDUC_GROUP_NEXT_ELEMENT (stmtp)
652 = STMT_VINFO_RELATED_STMT (stmt_info);
653 }
654 while (stmt_info);
655 }
656
657 /* Fixup scalar cycles that now have their stmts detected as patterns. */
658
659 static void
660 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
661 {
662 stmt_vec_info first;
663 unsigned i;
664
665 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
666 if (STMT_VINFO_IN_PATTERN_P (first))
667 {
668 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
669 while (next)
670 {
671 if (! STMT_VINFO_IN_PATTERN_P (next)
672 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
673 break;
674 next = REDUC_GROUP_NEXT_ELEMENT (next);
675 }
676       /* If not all stmts in the chain are patterns, or if we failed
677	 to update STMT_VINFO_REDUC_IDX, try to handle the chain
678	 without patterns.  */
679 if (! next
680 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
681 {
682 vect_fixup_reduc_chain (first);
683 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
684 = STMT_VINFO_RELATED_STMT (first);
685 }
686 }
687 }
688
689 /* Function vect_get_loop_niters.
690
691    Determine the number of iterations the loop executes and place it
692 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
693 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
694 niter information holds in ASSUMPTIONS.
695
696 Return the loop exit condition. */
697
698
699 static gcond *
700 vect_get_loop_niters (class loop *loop, tree *assumptions,
701 tree *number_of_iterations, tree *number_of_iterationsm1)
702 {
703 edge exit = single_exit (loop);
704 class tree_niter_desc niter_desc;
705 tree niter_assumptions, niter, may_be_zero;
706 gcond *cond = get_loop_exit_condition (loop);
707
708 *assumptions = boolean_true_node;
709 *number_of_iterationsm1 = chrec_dont_know;
710 *number_of_iterations = chrec_dont_know;
711 DUMP_VECT_SCOPE ("get_loop_niters");
712
713 if (!exit)
714 return cond;
715
716 may_be_zero = NULL_TREE;
717 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
718 || chrec_contains_undetermined (niter_desc.niter))
719 return cond;
720
721 niter_assumptions = niter_desc.assumptions;
722 may_be_zero = niter_desc.may_be_zero;
723 niter = niter_desc.niter;
724
725 if (may_be_zero && integer_zerop (may_be_zero))
726 may_be_zero = NULL_TREE;
727
728 if (may_be_zero)
729 {
730 if (COMPARISON_CLASS_P (may_be_zero))
731 {
732 /* Try to combine may_be_zero with assumptions, this can simplify
733 computation of niter expression. */
734 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
735 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
736 niter_assumptions,
737 fold_build1 (TRUTH_NOT_EXPR,
738 boolean_type_node,
739 may_be_zero));
740 else
741 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
742 build_int_cst (TREE_TYPE (niter), 0),
743 rewrite_to_non_trapping_overflow (niter));
744
745 may_be_zero = NULL_TREE;
746 }
747 else if (integer_nonzerop (may_be_zero))
748 {
749 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
750 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
751 return cond;
752 }
753 else
754 return cond;
755 }
756
757 *assumptions = niter_assumptions;
758 *number_of_iterationsm1 = niter;
759
760 /* We want the number of loop header executions which is the number
761 of latch executions plus one.
762 ??? For UINT_MAX latch executions this number overflows to zero
763 for loops like do { n++; } while (n != 0); */
764 if (niter && !chrec_contains_undetermined (niter))
765 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
766 build_int_cst (TREE_TYPE (niter), 1));
767 *number_of_iterations = niter;
768
769 return cond;
770 }
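
/* Note on the two counts produced above (numbers are only an example): for
   a loop whose body executes 100 times, the latch is taken 99 times, so the
   analysis yields

     NUMBER_OF_ITERATIONSM1 = 99     // latch executions
     NUMBER_OF_ITERATIONS   = 100    // header executions (latch + 1)

   possibly guarded by ASSUMPTIONS when the niter expression is only valid
   under some runtime condition.  */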
771
772 /* Function bb_in_loop_p
773
774 Used as predicate for dfs order traversal of the loop bbs. */
775
776 static bool
777 bb_in_loop_p (const_basic_block bb, const void *data)
778 {
779 const class loop *const loop = (const class loop *)data;
780 if (flow_bb_inside_loop_p (loop, bb))
781 return true;
782 return false;
783 }
784
785
786 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
787 stmt_vec_info structs for all the stmts in LOOP_IN. */
788
789 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
790 : vec_info (vec_info::loop, init_cost (loop_in), shared),
791 loop (loop_in),
792 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
793 num_itersm1 (NULL_TREE),
794 num_iters (NULL_TREE),
795 num_iters_unchanged (NULL_TREE),
796 num_iters_assumptions (NULL_TREE),
797 th (0),
798 versioning_threshold (0),
799 vectorization_factor (0),
800 max_vectorization_factor (0),
801 mask_skip_niters (NULL_TREE),
802 mask_compare_type (NULL_TREE),
803 simd_if_cond (NULL_TREE),
804 unaligned_dr (NULL),
805 peeling_for_alignment (0),
806 ptr_mask (0),
807 ivexpr_map (NULL),
808 scan_map (NULL),
809 slp_unrolling_factor (1),
810 single_scalar_iteration_cost (0),
811 vec_outside_cost (0),
812 vec_inside_cost (0),
813 vectorizable (false),
814 can_fully_mask_p (true),
815 fully_masked_p (false),
816 peeling_for_gaps (false),
817 peeling_for_niter (false),
818 no_data_dependencies (false),
819 has_mask_store (false),
820 scalar_loop_scaling (profile_probability::uninitialized ()),
821 scalar_loop (NULL),
822 orig_loop_info (NULL)
823 {
824 /* CHECKME: We want to visit all BBs before their successors (except for
825 latch blocks, for which this assertion wouldn't hold). In the simple
826      case of the loop forms we allow, a dfs order of the BBs would be the same
827 as reversed postorder traversal, so we are safe. */
828
829 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
830 bbs, loop->num_nodes, loop);
831 gcc_assert (nbbs == loop->num_nodes);
832
833 for (unsigned int i = 0; i < nbbs; i++)
834 {
835 basic_block bb = bbs[i];
836 gimple_stmt_iterator si;
837
838 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
839 {
840 gimple *phi = gsi_stmt (si);
841 gimple_set_uid (phi, 0);
842 add_stmt (phi);
843 }
844
845 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
846 {
847 gimple *stmt = gsi_stmt (si);
848 gimple_set_uid (stmt, 0);
849 add_stmt (stmt);
850	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
851	     third argument is the #pragma omp simd if (x) condition: when it is 0,
852	     the loop shouldn't be vectorized; when it is a non-zero constant, it
853	     should be vectorized normally; otherwise the loop is versioned, with
854	     the vectorized copy used if the condition is non-zero at runtime.  */
855 if (loop_in->simduid
856 && is_gimple_call (stmt)
857 && gimple_call_internal_p (stmt)
858 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
859 && gimple_call_num_args (stmt) >= 3
860 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
861 && (loop_in->simduid
862 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
863 {
864 tree arg = gimple_call_arg (stmt, 2);
865 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
866 simd_if_cond = arg;
867 else
868 gcc_assert (integer_nonzerop (arg));
869 }
870 }
871 }
872
873 epilogue_vinfos.create (6);
874 }
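
/* A hedged example of the simd_if_cond handling in the constructor above:
   for

     #pragma omp simd if (x)
     for (i = 0; i < n; i++)
       a[i] += b[i];

   the OpenMP lowering emits an IFN_GOMP_SIMD_LANE call whose first argument
   refers to the loop's simduid and whose third argument carries the
   "if (x)" operand; that operand ends up in simd_if_cond, where a constant
   zero disables vectorization and an SSA name leads to versioning on the
   runtime value.  */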
875
876 /* Free all levels of MASKS. */
877
878 void
879 release_vec_loop_masks (vec_loop_masks *masks)
880 {
881 rgroup_masks *rgm;
882 unsigned int i;
883 FOR_EACH_VEC_ELT (*masks, i, rgm)
884 rgm->masks.release ();
885 masks->release ();
886 }
887
888 /* Free all memory used by the _loop_vec_info, as well as all the
889 stmt_vec_info structs of all the stmts in the loop. */
890
891 _loop_vec_info::~_loop_vec_info ()
892 {
893 free (bbs);
894
895 release_vec_loop_masks (&masks);
896 delete ivexpr_map;
897 delete scan_map;
898 epilogue_vinfos.release ();
899
900 loop->aux = NULL;
901 }
902
903 /* Return an invariant or register for EXPR and emit necessary
904 computations in the LOOP_VINFO loop preheader. */
905
906 tree
907 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
908 {
909 if (is_gimple_reg (expr)
910 || is_gimple_min_invariant (expr))
911 return expr;
912
913 if (! loop_vinfo->ivexpr_map)
914 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
915 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
916 if (! cached)
917 {
918 gimple_seq stmts = NULL;
919 cached = force_gimple_operand (unshare_expr (expr),
920 &stmts, true, NULL_TREE);
921 if (stmts)
922 {
923 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
924 gsi_insert_seq_on_edge_immediate (e, stmts);
925 }
926 }
927 return cached;
928 }
929
930 /* Return true if we can use CMP_TYPE as the comparison type to produce
931 all masks required to mask LOOP_VINFO. */
932
933 static bool
934 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
935 {
936 rgroup_masks *rgm;
937 unsigned int i;
938 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
939 if (rgm->mask_type != NULL_TREE
940 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
941 cmp_type, rgm->mask_type,
942 OPTIMIZE_FOR_SPEED))
943 return false;
944 return true;
945 }
946
947 /* Calculate the maximum number of scalars per iteration for every
948 rgroup in LOOP_VINFO. */
949
950 static unsigned int
951 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
952 {
953 unsigned int res = 1;
954 unsigned int i;
955 rgroup_masks *rgm;
956 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
957 res = MAX (res, rgm->max_nscalars_per_iter);
958 return res;
959 }
960
961 /* Each statement in LOOP_VINFO can be masked where necessary. Check
962 whether we can actually generate the masks required. Return true if so,
963    storing the comparison type in LOOP_VINFO_MASK_COMPARE_TYPE and the IV type in LOOP_VINFO_MASK_IV_TYPE.  */
964
965 static bool
966 vect_verify_full_masking (loop_vec_info loop_vinfo)
967 {
968 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
969 unsigned int min_ni_width;
970 unsigned int max_nscalars_per_iter
971 = vect_get_max_nscalars_per_iter (loop_vinfo);
972
973 /* Use a normal loop if there are no statements that need masking.
974 This only happens in rare degenerate cases: it means that the loop
975 has no loads, no stores, and no live-out values. */
976 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
977 return false;
978
979 /* Get the maximum number of iterations that is representable
980 in the counter type. */
981 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
982 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
983
984 /* Get a more refined estimate for the number of iterations. */
985 widest_int max_back_edges;
986 if (max_loop_iterations (loop, &max_back_edges))
987 max_ni = wi::smin (max_ni, max_back_edges + 1);
988
989 /* Account for rgroup masks, in which each bit is replicated N times. */
990 max_ni *= max_nscalars_per_iter;
991
992 /* Work out how many bits we need to represent the limit. */
993 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
994
995 /* Find a scalar mode for which WHILE_ULT is supported. */
996 opt_scalar_int_mode cmp_mode_iter;
997 tree cmp_type = NULL_TREE;
998 tree iv_type = NULL_TREE;
999 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1000 unsigned int iv_precision = UINT_MAX;
1001
1002 if (iv_limit != -1)
1003 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1004 UNSIGNED);
1005
1006 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1007 {
1008 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1009 if (cmp_bits >= min_ni_width
1010 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1011 {
1012 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1013 if (this_type
1014 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1015 {
1016 /* Although we could stop as soon as we find a valid mode,
1017 there are at least two reasons why that's not always the
1018 best choice:
1019
1020 - An IV that's Pmode or wider is more likely to be reusable
1021 in address calculations than an IV that's narrower than
1022 Pmode.
1023
1024 - Doing the comparison in IV_PRECISION or wider allows
1025 a natural 0-based IV, whereas using a narrower comparison
1026 type requires mitigations against wrap-around.
1027
1028 Conversely, if the IV limit is variable, doing the comparison
1029 in a wider type than the original type can introduce
1030 unnecessary extensions, so picking the widest valid mode
1031 is not always a good choice either.
1032
1033 Here we prefer the first IV type that's Pmode or wider,
1034 and the first comparison type that's IV_PRECISION or wider.
1035 (The comparison type must be no wider than the IV type,
1036 to avoid extensions in the vector loop.)
1037
1038 ??? We might want to try continuing beyond Pmode for ILP32
1039 targets if CMP_BITS < IV_PRECISION. */
1040 iv_type = this_type;
1041 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1042 cmp_type = this_type;
1043 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1044 break;
1045 }
1046 }
1047 }
1048
1049 if (!cmp_type)
1050 return false;
1051
1052 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1053 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1054 return true;
1055 }
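
/* An illustration (hedged) of what the check above is for: with a 4-lane
   vector and 10 scalar iterations, a fully-masked loop would use a
   WHILE_ULT-style sequence along the lines of

     mask = WHILE_ULT (0, 10)   -> {1,1,1,1}
     mask = WHILE_ULT (4, 10)   -> {1,1,1,1}
     mask = WHILE_ULT (8, 10)   -> {1,1,0,0}

   i.e. lane J of WHILE_ULT (I, N) is set iff I + J < N.  The function above
   only establishes that some comparison type exists for which the target
   supports IFN_WHILE_ULT for every rgroup mask type; the masks themselves
   are generated later, at transform time.  */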
1056
1057 /* Calculate the cost of one scalar iteration of the loop. */
1058 static void
1059 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1060 {
1061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1063 int nbbs = loop->num_nodes, factor;
1064 int innerloop_iters, i;
1065
1066 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1067
1068 /* Gather costs for statements in the scalar loop. */
1069
1070 /* FORNOW. */
1071 innerloop_iters = 1;
1072 if (loop->inner)
1073 innerloop_iters = 50; /* FIXME */
1074
1075 for (i = 0; i < nbbs; i++)
1076 {
1077 gimple_stmt_iterator si;
1078 basic_block bb = bbs[i];
1079
1080 if (bb->loop_father == loop->inner)
1081 factor = innerloop_iters;
1082 else
1083 factor = 1;
1084
1085 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1086 {
1087 gimple *stmt = gsi_stmt (si);
1088 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1089
1090 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1091 continue;
1092
1093 /* Skip stmts that are not vectorized inside the loop. */
1094 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1095 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1096 && (!STMT_VINFO_LIVE_P (vstmt_info)
1097 || !VECTORIZABLE_CYCLE_DEF
1098 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1099 continue;
1100
1101 vect_cost_for_stmt kind;
1102 if (STMT_VINFO_DATA_REF (stmt_info))
1103 {
1104 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1105 kind = scalar_load;
1106 else
1107 kind = scalar_store;
1108 }
1109 else if (vect_nop_conversion_p (stmt_info))
1110 continue;
1111 else
1112 kind = scalar_stmt;
1113
1114 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1115 factor, kind, stmt_info, 0, vect_prologue);
1116 }
1117 }
1118
1119 /* Now accumulate cost. */
1120 void *target_cost_data = init_cost (loop);
1121 stmt_info_for_cost *si;
1122 int j;
1123 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1124 j, si)
1125 (void) add_stmt_cost (target_cost_data, si->count,
1126 si->kind, si->stmt_info, si->misalign,
1127 vect_body);
1128 unsigned dummy, body_cost = 0;
1129 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1130 destroy_cost_data (target_cost_data);
1131 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1132 }
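
/* A hedged note on the accumulation above: for a scalar body like

     a[i] = b[i] + c[i];

   the recorded costs would be two scalar_load entries, one scalar_stmt for
   the addition and one scalar_store, each weighted by the target cost hooks
   (and by the inner-loop factor when the statement sits in a nested loop).
   The resulting sum is what the later profitability checks compare against
   the cost of one vector iteration.  */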
1133
1134
1135 /* Function vect_analyze_loop_form_1.
1136
1137 Verify that certain CFG restrictions hold, including:
1138 - the loop has a pre-header
1139 - the loop has a single entry and exit
1140 - the loop exit condition is simple enough
1141      - the number of iterations can be analyzed, i.e., a countable loop. The
1142 niter could be analyzed under some assumptions. */
1143
1144 opt_result
1145 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1146 tree *assumptions, tree *number_of_iterationsm1,
1147 tree *number_of_iterations, gcond **inner_loop_cond)
1148 {
1149 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1150
1151 /* Different restrictions apply when we are considering an inner-most loop,
1152 vs. an outer (nested) loop.
1153 (FORNOW. May want to relax some of these restrictions in the future). */
1154
1155 if (!loop->inner)
1156 {
1157 /* Inner-most loop. We currently require that the number of BBs is
1158 exactly 2 (the header and latch). Vectorizable inner-most loops
1159 look like this:
1160
1161 (pre-header)
1162 |
1163 header <--------+
1164 | | |
1165 | +--> latch --+
1166 |
1167 (exit-bb) */
1168
1169 if (loop->num_nodes != 2)
1170 return opt_result::failure_at (vect_location,
1171 "not vectorized:"
1172 " control flow in loop.\n");
1173
1174 if (empty_block_p (loop->header))
1175 return opt_result::failure_at (vect_location,
1176 "not vectorized: empty loop.\n");
1177 }
1178 else
1179 {
1180 class loop *innerloop = loop->inner;
1181 edge entryedge;
1182
1183 /* Nested loop. We currently require that the loop is doubly-nested,
1184 contains a single inner loop, and the number of BBs is exactly 5.
1185 Vectorizable outer-loops look like this:
1186
1187 (pre-header)
1188 |
1189 header <---+
1190 | |
1191 inner-loop |
1192 | |
1193 tail ------+
1194 |
1195 (exit-bb)
1196
1197 The inner-loop has the properties expected of inner-most loops
1198 as described above. */
1199
1200 if ((loop->inner)->inner || (loop->inner)->next)
1201 return opt_result::failure_at (vect_location,
1202 "not vectorized:"
1203 " multiple nested loops.\n");
1204
1205 if (loop->num_nodes != 5)
1206 return opt_result::failure_at (vect_location,
1207 "not vectorized:"
1208 " control flow in loop.\n");
1209
1210 entryedge = loop_preheader_edge (innerloop);
1211 if (entryedge->src != loop->header
1212 || !single_exit (innerloop)
1213 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1214 return opt_result::failure_at (vect_location,
1215 "not vectorized:"
1216 " unsupported outerloop form.\n");
1217
1218 /* Analyze the inner-loop. */
1219 tree inner_niterm1, inner_niter, inner_assumptions;
1220 opt_result res
1221 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1222 &inner_assumptions, &inner_niterm1,
1223 &inner_niter, NULL);
1224 if (!res)
1225 {
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "not vectorized: Bad inner loop.\n");
1229 return res;
1230 }
1231
1232       /* We don't support analyzing the niter under assumptions for the
1233	 inner loop.  */
1234 if (!integer_onep (inner_assumptions))
1235 return opt_result::failure_at (vect_location,
1236 "not vectorized: Bad inner loop.\n");
1237
1238 if (!expr_invariant_in_loop_p (loop, inner_niter))
1239 return opt_result::failure_at (vect_location,
1240 "not vectorized: inner-loop count not"
1241 " invariant.\n");
1242
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_NOTE, vect_location,
1245 "Considering outer-loop vectorization.\n");
1246 }
1247
1248 if (!single_exit (loop))
1249 return opt_result::failure_at (vect_location,
1250 "not vectorized: multiple exits.\n");
1251 if (EDGE_COUNT (loop->header->preds) != 2)
1252 return opt_result::failure_at (vect_location,
1253 "not vectorized:"
1254 " too many incoming edges.\n");
1255
1256   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1257 that the loop is represented as a do-while (with a proper if-guard
1258 before the loop if needed), where the loop header contains all the
1259 executable statements, and the latch is empty. */
1260 if (!empty_block_p (loop->latch)
1261 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1262 return opt_result::failure_at (vect_location,
1263 "not vectorized: latch block not empty.\n");
1264
1265 /* Make sure the exit is not abnormal. */
1266 edge e = single_exit (loop);
1267 if (e->flags & EDGE_ABNORMAL)
1268 return opt_result::failure_at (vect_location,
1269 "not vectorized:"
1270 " abnormal loop exit edge.\n");
1271
1272 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1273 number_of_iterationsm1);
1274 if (!*loop_cond)
1275 return opt_result::failure_at
1276 (vect_location,
1277 "not vectorized: complicated exit condition.\n");
1278
1279 if (integer_zerop (*assumptions)
1280 || !*number_of_iterations
1281 || chrec_contains_undetermined (*number_of_iterations))
1282 return opt_result::failure_at
1283 (*loop_cond,
1284 "not vectorized: number of iterations cannot be computed.\n");
1285
1286 if (integer_zerop (*number_of_iterations))
1287 return opt_result::failure_at
1288 (*loop_cond,
1289 "not vectorized: number of iterations = 0.\n");
1290
1291 return opt_result::success ();
1292 }
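
/* A hedged example of a loop rejected by the form checks above:

     for (i = 0; i < n; i++)
       {
         if (a[i] == key)
           break;              // second exit edge out of the loop
         b[i] = a[i];
       }

   The early break means single_exit (loop) fails, so we bail out with
   "not vectorized: multiple exits"; any extra control flow inside the body
   likewise shows up as num_nodes != 2 and is rejected as "control flow in
   loop" (unless if-conversion removed it earlier).  */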
1293
1294 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1295
1296 opt_loop_vec_info
1297 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1298 {
1299 tree assumptions, number_of_iterations, number_of_iterationsm1;
1300 gcond *loop_cond, *inner_loop_cond = NULL;
1301
1302 opt_result res
1303 = vect_analyze_loop_form_1 (loop, &loop_cond,
1304 &assumptions, &number_of_iterationsm1,
1305 &number_of_iterations, &inner_loop_cond);
1306 if (!res)
1307 return opt_loop_vec_info::propagate_failure (res);
1308
1309 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1310 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1311 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1312 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1313 if (!integer_onep (assumptions))
1314 {
1315       /* We consider vectorizing this loop by versioning it under
1316 some assumptions. In order to do this, we need to clear
1317 existing information computed by scev and niter analyzer. */
1318 scev_reset_htab ();
1319 free_numbers_of_iterations_estimates (loop);
1320       /* Also set a flag for this loop so that subsequent scev and niter
1321	 analyses are done under the assumptions.  */
1322 loop_constraint_set (loop, LOOP_C_FINITE);
1323 /* Also record the assumptions for versioning. */
1324 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1325 }
1326
1327 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1328 {
1329 if (dump_enabled_p ())
1330 {
1331 dump_printf_loc (MSG_NOTE, vect_location,
1332 "Symbolic number of iterations is ");
1333 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1334 dump_printf (MSG_NOTE, "\n");
1335 }
1336 }
1337
1338 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1339 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1340 if (inner_loop_cond)
1341 {
1342 stmt_vec_info inner_loop_cond_info
1343 = loop_vinfo->lookup_stmt (inner_loop_cond);
1344 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1345 }
1346
1347 gcc_assert (!loop->aux);
1348 loop->aux = loop_vinfo;
1349 return opt_loop_vec_info::success (loop_vinfo);
1350 }
1351
1352
1353
1354 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1355    statements, update the vectorization factor.  */
1356
1357 static void
1358 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1359 {
1360 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1361 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1362 int nbbs = loop->num_nodes;
1363 poly_uint64 vectorization_factor;
1364 int i;
1365
1366 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1367
1368 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1369 gcc_assert (known_ne (vectorization_factor, 0U));
1370
1371   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1372      vectorization factor of the loop is the unrolling factor required by the
1373      SLP instances.  If that unrolling factor is 1, we say that we perform
1374      pure SLP on the loop; cross-iteration parallelism is not
1375      exploited.  */
1376 bool only_slp_in_loop = true;
1377 for (i = 0; i < nbbs; i++)
1378 {
1379 basic_block bb = bbs[i];
1380 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1381 gsi_next (&si))
1382 {
1383 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1384 if (!stmt_info)
1385 continue;
1386 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1387 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1388 && !PURE_SLP_STMT (stmt_info))
1389 /* STMT needs both SLP and loop-based vectorization. */
1390 only_slp_in_loop = false;
1391 }
1392 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1393 gsi_next (&si))
1394 {
1395 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1396 stmt_info = vect_stmt_to_vectorize (stmt_info);
1397 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399 && !PURE_SLP_STMT (stmt_info))
1400 /* STMT needs both SLP and loop-based vectorization. */
1401 only_slp_in_loop = false;
1402 }
1403 }
1404
1405 if (only_slp_in_loop)
1406 {
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Loop contains only SLP stmts\n");
1410 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1411 }
1412 else
1413 {
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Loop contains SLP and non-SLP stmts\n");
1417 /* Both the vectorization factor and unroll factor have the form
1418 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1419 so they must have a common multiple. */
1420 vectorization_factor
1421 = force_common_multiple (vectorization_factor,
1422 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1423 }
1424
1425 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1426 if (dump_enabled_p ())
1427 {
1428 dump_printf_loc (MSG_NOTE, vect_location,
1429 "Updating vectorization factor to ");
1430 dump_dec (MSG_NOTE, vectorization_factor);
1431 dump_printf (MSG_NOTE, ".\n");
1432 }
1433 }
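
/* A hedged note on the adjustment above: if the loop-based analysis picked
   a vectorization factor of 2 but the SLP instances need an unrolling
   factor of 4, the common multiple is 4, so the loop is effectively
   unrolled four times so that both the SLP and the non-SLP parts operate
   on whole vectors.  With pure SLP the SLP unrolling factor is used
   directly.  */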
1434
1435 /* Return true if STMT_INFO describes a double reduction phi and if
1436 the other phi in the reduction is also relevant for vectorization.
1437 This rejects cases such as:
1438
1439 outer1:
1440 x_1 = PHI <x_3(outer2), ...>;
1441 ...
1442
1443 inner:
1444 x_2 = ...;
1445 ...
1446
1447 outer2:
1448 x_3 = PHI <x_2(inner)>;
1449
1450 if nothing in x_2 or elsewhere makes x_1 relevant. */
1451
1452 static bool
1453 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1454 {
1455 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1456 return false;
1457
1458 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1459 }
1460
1461 /* Function vect_analyze_loop_operations.
1462
1463 Scan the loop stmts and make sure they are all vectorizable. */
1464
1465 static opt_result
1466 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1467 {
1468 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1469 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1470 int nbbs = loop->num_nodes;
1471 int i;
1472 stmt_vec_info stmt_info;
1473 bool need_to_vectorize = false;
1474 bool ok;
1475
1476 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1477
1478 auto_vec<stmt_info_for_cost> cost_vec;
1479
1480 for (i = 0; i < nbbs; i++)
1481 {
1482 basic_block bb = bbs[i];
1483
1484 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1485 gsi_next (&si))
1486 {
1487 gphi *phi = si.phi ();
1488 ok = true;
1489
1490 stmt_info = loop_vinfo->lookup_stmt (phi);
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1493 if (virtual_operand_p (gimple_phi_result (phi)))
1494 continue;
1495
1496 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1497 (i.e., a phi in the tail of the outer-loop). */
1498 if (! is_loop_header_bb_p (bb))
1499 {
1500	      /* FORNOW: we currently don't support the case that these phis
1501		 are not used in the outer loop (unless it is a double reduction,
1502		 i.e., this phi is vect_reduction_def), because this case would
1503		 require us to actually do something here.  */
1504 if (STMT_VINFO_LIVE_P (stmt_info)
1505 && !vect_active_double_reduction_p (stmt_info))
1506 return opt_result::failure_at (phi,
1507 "Unsupported loop-closed phi"
1508 " in outer-loop.\n");
1509
1510 /* If PHI is used in the outer loop, we check that its operand
1511 is defined in the inner loop. */
1512 if (STMT_VINFO_RELEVANT_P (stmt_info))
1513 {
1514 tree phi_op;
1515
1516 if (gimple_phi_num_args (phi) != 1)
1517 return opt_result::failure_at (phi, "unsupported phi");
1518
1519 phi_op = PHI_ARG_DEF (phi, 0);
1520 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1521 if (!op_def_info)
1522 return opt_result::failure_at (phi, "unsupported phi\n");
1523
1524 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1525 && (STMT_VINFO_RELEVANT (op_def_info)
1526 != vect_used_in_outer_by_reduction))
1527 return opt_result::failure_at (phi, "unsupported phi\n");
1528
1529 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1530 || (STMT_VINFO_DEF_TYPE (stmt_info)
1531 == vect_double_reduction_def))
1532 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1533 return opt_result::failure_at (phi, "unsupported phi\n");
1534 }
1535
1536 continue;
1537 }
1538
1539 gcc_assert (stmt_info);
1540
1541 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1542 || STMT_VINFO_LIVE_P (stmt_info))
1543 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1544 /* A scalar-dependence cycle that we don't support. */
1545 return opt_result::failure_at (phi,
1546 "not vectorized:"
1547 " scalar dependence cycle.\n");
1548
1549 if (STMT_VINFO_RELEVANT_P (stmt_info))
1550 {
1551 need_to_vectorize = true;
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1553 && ! PURE_SLP_STMT (stmt_info))
1554 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1555 &cost_vec);
1556 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1557 || (STMT_VINFO_DEF_TYPE (stmt_info)
1558 == vect_double_reduction_def)
1559 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1560 && ! PURE_SLP_STMT (stmt_info))
1561 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1562 }
1563
1564 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1565 if (ok
1566 && STMT_VINFO_LIVE_P (stmt_info)
1567 && !PURE_SLP_STMT (stmt_info))
1568 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1569 -1, false, &cost_vec);
1570
1571 if (!ok)
1572 return opt_result::failure_at (phi,
1573 "not vectorized: relevant phi not "
1574 "supported: %G",
1575 static_cast <gimple *> (phi));
1576 }
1577
1578 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 gsi_next (&si))
1580 {
1581 gimple *stmt = gsi_stmt (si);
1582 if (!gimple_clobber_p (stmt))
1583 {
1584 opt_result res
1585 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1586 &need_to_vectorize,
1587 NULL, NULL, &cost_vec);
1588 if (!res)
1589 return res;
1590 }
1591 }
1592 } /* bbs */
1593
1594 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1595
1596   /* All operations in the loop are either irrelevant (they deal with loop
1597      control, or are dead), or are only used outside the loop and can be moved
1598 out of the loop (e.g. invariants, inductions). The loop can be
1599 optimized away by scalar optimizations. We're better off not
1600 touching this loop. */
1601 if (!need_to_vectorize)
1602 {
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_NOTE, vect_location,
1605 "All the computation can be taken out of the loop.\n");
1606 return opt_result::failure_at
1607 (vect_location,
1608 "not vectorized: redundant loop. no profit to vectorize.\n");
1609 }
1610
1611 return opt_result::success ();
1612 }
1613
1614 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1615 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1616 definitely no, or -1 if it's worth retrying. */
1617
1618 static int
1619 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1620 {
1621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1622 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1623
1624 /* Only fully-masked loops can have iteration counts less than the
1625 vectorization factor. */
1626 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1627 {
1628 HOST_WIDE_INT max_niter;
1629
1630 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1631 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1632 else
1633 max_niter = max_stmt_executions_int (loop);
1634
1635 if (max_niter != -1
1636 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1637 {
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640 "not vectorized: iteration count smaller than "
1641 "vectorization factor.\n");
1642 return 0;
1643 }
1644 }
1645
1646 int min_profitable_iters, min_profitable_estimate;
1647 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1648 &min_profitable_estimate);
1649
1650 if (min_profitable_iters < 0)
1651 {
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "not vectorized: vectorization not profitable.\n");
1655 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657 "not vectorized: vector version will never be "
1658 "profitable.\n");
1659 return -1;
1660 }
1661
1662 int min_scalar_loop_bound = (param_min_vect_loop_bound
1663 * assumed_vf);
1664
1665   /* Use the cost model only if it is more conservative than the
1666      user-specified threshold.  */
1667 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1668 min_profitable_iters);
1669
1670 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
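/* Worked example with made-up numbers: assumed_vf == 4,
   param_min_vect_loop_bound == 2 and min_profitable_iters == 12 give
   min_scalar_loop_bound == 8 and th == MAX (8, 12) == 12, so a loop
   with a known iteration count below 12 is rejected just below.  */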
1671
1672 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1673 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1674 {
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "not vectorized: vectorization not profitable.\n");
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_NOTE, vect_location,
1680 "not vectorized: iteration count smaller than user "
1681 "specified loop bound parameter or minimum profitable "
1682 "iterations (whichever is more conservative).\n");
1683 return 0;
1684 }
1685
1686 /* The static profitability threshold min_profitable_estimate includes
1687 the cost of having to check at runtime whether the scalar loop
1688 should be used instead. If it turns out that we don't need or want
1689 such a check, the threshold we should use for the static estimate
1690 is simply the point at which the vector loop becomes more profitable
1691 than the scalar loop. */
1692 if (min_profitable_estimate > min_profitable_iters
1693 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1694 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1695 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1696 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1697 {
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1700 " choice between the scalar and vector loops\n");
1701 min_profitable_estimate = min_profitable_iters;
1702 }
1703
1704 HOST_WIDE_INT estimated_niter;
1705
1706 /* If we are vectorizing an epilogue then we know the maximum number of
1707 scalar iterations it will cover is at least one lower than the
1708 vectorization factor of the main loop. */
1709 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1710 estimated_niter
1711 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1712 else
1713 {
1714 estimated_niter = estimated_stmt_executions_int (loop);
1715 if (estimated_niter == -1)
1716 estimated_niter = likely_max_stmt_executions_int (loop);
1717 }
1718 if (estimated_niter != -1
1719 && ((unsigned HOST_WIDE_INT) estimated_niter
1720 < MAX (th, (unsigned) min_profitable_estimate)))
1721 {
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: estimated iteration count too "
1725 "small.\n");
1726 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_NOTE, vect_location,
1728 "not vectorized: estimated iteration count smaller "
1729 "than specified loop bound parameter or minimum "
1730 "profitable iterations (whichever is more "
1731 "conservative).\n");
1732 return -1;
1733 }
1734
1735 return 1;
1736 }
1737
1738 static opt_result
1739 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1740 vec<data_reference_p> *datarefs,
1741 unsigned int *n_stmts)
1742 {
1743 *n_stmts = 0;
1744 for (unsigned i = 0; i < loop->num_nodes; i++)
1745 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1746 !gsi_end_p (gsi); gsi_next (&gsi))
1747 {
1748 gimple *stmt = gsi_stmt (gsi);
1749 if (is_gimple_debug (stmt))
1750 continue;
1751 ++(*n_stmts);
1752 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1753 if (!res)
1754 {
1755 if (is_gimple_call (stmt) && loop->safelen)
1756 {
1757 tree fndecl = gimple_call_fndecl (stmt), op;
1758 if (fndecl != NULL_TREE)
1759 {
1760 cgraph_node *node = cgraph_node::get (fndecl);
1761 if (node != NULL && node->simd_clones != NULL)
1762 {
1763 unsigned int j, n = gimple_call_num_args (stmt);
1764 for (j = 0; j < n; j++)
1765 {
1766 op = gimple_call_arg (stmt, j);
1767 if (DECL_P (op)
1768 || (REFERENCE_CLASS_P (op)
1769 && get_base_address (op)))
1770 break;
1771 }
1772 op = gimple_call_lhs (stmt);
1773 /* Ignore #pragma omp declare simd functions
1774 if they don't have data references in the
1775 call stmt itself. */
1776 if (j == n
1777 && !(op
1778 && (DECL_P (op)
1779 || (REFERENCE_CLASS_P (op)
1780 && get_base_address (op)))))
1781 continue;
1782 }
1783 }
1784 }
1785 return res;
1786 }
1787 /* If dependence analysis would give up due to the limit on the
1788 number of datarefs, stop here and fail fatally. */
1789 if (datarefs->length ()
1790 > (unsigned)param_loop_max_datarefs_for_datadeps)
1791 return opt_result::failure_at (stmt, "exceeded param "
1792 "loop-max-datarefs-for-datadeps\n");
1793 }
1794 return opt_result::success ();
1795 }
1796
1797 /* Look for SLP-only access groups and turn each individual access into its own
1798 group. */
1799 static void
1800 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1801 {
1802 unsigned int i;
1803 struct data_reference *dr;
1804
1805 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1806
1807 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1808 FOR_EACH_VEC_ELT (datarefs, i, dr)
1809 {
1810 gcc_assert (DR_REF (dr));
1811 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1812
1813 /* Check if the access is part of an interleaving chain. */
1814 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1815 {
1816 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1817 unsigned int group_size = DR_GROUP_SIZE (first_element);
1818
1819 /* Check if this is an SLP-only group. */
1820 if (!STMT_SLP_TYPE (stmt_info)
1821 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1822 {
1823 /* Dissolve the group. */
1824 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
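/* Illustration: a group of four interleaved accesses becomes four
   single-element groups below, each keeping a gap of three so that the
   step of the original group is preserved (strided groups get a gap of
   zero instead).  */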
1825
1826 stmt_vec_info vinfo = first_element;
1827 while (vinfo)
1828 {
1829 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1830 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1831 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1832 DR_GROUP_SIZE (vinfo) = 1;
1833 if (STMT_VINFO_STRIDED_P (first_element))
1834 DR_GROUP_GAP (vinfo) = 0;
1835 else
1836 DR_GROUP_GAP (vinfo) = group_size - 1;
1837 vinfo = next;
1838 }
1839 }
1840 }
1841 }
1842 }
1843
1844
1845 /* Decides whether we need to create an epilogue loop to handle
1846 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1847
1848 void
1849 determine_peel_for_niter (loop_vec_info loop_vinfo)
1850 {
1851 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1852
1853 unsigned HOST_WIDE_INT const_vf;
1854 HOST_WIDE_INT max_niter
1855 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1856
1857 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1858 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1859 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1860 (loop_vinfo));
1861
1862 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1863 /* The main loop handles all iterations. */
1864 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1865 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1866 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1867 {
1868 /* Work out the (constant) number of iterations that need to be
1869 peeled for reasons other than niters. */
1870 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1871 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1872 peel_niter += 1;
1873 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1874 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1875 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
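/* Illustrative numbers: with 100 known iterations, a VF of 8, no
   alignment peeling and one iteration peeled for gaps, 99 is not a
   multiple of 8, so an epilogue loop is required for the remainder.  */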
1876 }
1877 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1878 /* ??? When peeling for gaps but not alignment, we could
1879 try to check whether the (variable) niters is known to be
1880 VF * N + 1. That's something of a niche case though. */
1881 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1882 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1883 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1884 < (unsigned) exact_log2 (const_vf))
1885 /* In case of versioning, check if the maximum number of
1886 iterations is greater than th. If they are identical,
1887 the epilogue is unnecessary. */
1888 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1889 || ((unsigned HOST_WIDE_INT) max_niter
1890 > (th / const_vf) * const_vf))))
1891 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1892 }
1893
1894
1895 /* Function vect_analyze_loop_2.
1896
1897 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1898 for it. The different analyses will record information in the
1899 loop_vec_info struct. */
1900 static opt_result
1901 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1902 {
1903 opt_result ok = opt_result::success ();
1904 int res;
1905 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1906 poly_uint64 min_vf = 2;
1907 loop_vec_info orig_loop_vinfo = NULL;
1908
1909 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1910 loop_vec_info of the first vectorized loop. */
1911 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1912 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1913 else
1914 orig_loop_vinfo = loop_vinfo;
1915 gcc_assert (orig_loop_vinfo);
1916
1917 /* The first group of checks is independent of the vector size. */
1918 fatal = true;
1919
1920 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1921 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1922 return opt_result::failure_at (vect_location,
1923 "not vectorized: simd if(0)\n");
1924
1925 /* Find all data references in the loop (which correspond to vdefs/vuses)
1926 and analyze their evolution in the loop. */
1927
1928 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1929
1930 /* Gather the data references and count stmts in the loop. */
1931 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1932 {
1933 opt_result res
1934 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1935 &LOOP_VINFO_DATAREFS (loop_vinfo),
1936 n_stmts);
1937 if (!res)
1938 {
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "not vectorized: loop contains function "
1942 "calls or data references that cannot "
1943 "be analyzed\n");
1944 return res;
1945 }
1946 loop_vinfo->shared->save_datarefs ();
1947 }
1948 else
1949 loop_vinfo->shared->check_datarefs ();
1950
1951 /* Analyze the data references and also adjust the minimal
1952 vectorization factor according to the loads and stores. */
1953
1954 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1955 if (!ok)
1956 {
1957 if (dump_enabled_p ())
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 "bad data references.\n");
1960 return ok;
1961 }
1962
1963 /* Classify all cross-iteration scalar data-flow cycles.
1964 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1965 vect_analyze_scalar_cycles (loop_vinfo);
1966
1967 vect_pattern_recog (loop_vinfo);
1968
1969 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1970
1971 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1972 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1973
1974 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1975 if (!ok)
1976 {
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1979 "bad data access.\n");
1980 return ok;
1981 }
1982
1983 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1984
1985 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1986 if (!ok)
1987 {
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990 "unexpected pattern.\n");
1991 return ok;
1992 }
1993
1994 /* The rest of the analysis below depends on the vector size in some way,
so failures from this point on are no longer fatal. */
1995 fatal = false;
1996
1997 /* Analyze data dependences between the data-refs in the loop
1998 and adjust the maximum vectorization factor according to
1999 the dependences.
2000 FORNOW: fail at the first data dependence that we encounter. */
2001
2002 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2003 if (!ok)
2004 {
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2007 "bad data dependence.\n");
2008 return ok;
2009 }
2010 if (max_vf != MAX_VECTORIZATION_FACTOR
2011 && maybe_lt (max_vf, min_vf))
2012 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2014
2015 ok = vect_determine_vectorization_factor (loop_vinfo);
2016 if (!ok)
2017 {
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "can't determine vectorization factor.\n");
2021 return ok;
2022 }
2023 if (max_vf != MAX_VECTORIZATION_FACTOR
2024 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2025 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2026
2027 /* Compute the scalar iteration cost. */
2028 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2029
2030 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2031
2032 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2033 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2034 if (!ok)
2035 return ok;
2036
2037 /* If there are any SLP instances mark them as pure_slp. */
2038 bool slp = vect_make_slp_decision (loop_vinfo);
2039 if (slp)
2040 {
2041 /* Find stmts that need to be both vectorized and SLPed. */
2042 vect_detect_hybrid_slp (loop_vinfo);
2043
2044 /* Update the vectorization factor based on the SLP decision. */
2045 vect_update_vf_for_slp (loop_vinfo);
2046 }
2047
2048 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2049
2050 /* We don't expect to have to roll back to anything other than an empty
2051 set of rgroups. */
2052 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2053
2054 /* This is the point where we can re-start analysis with SLP forced off. */
2055 start_over:
2056
2057 /* Now the vectorization factor is final. */
2058 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059 gcc_assert (known_ne (vectorization_factor, 0U));
2060
2061 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2062 {
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "vectorization_factor = ");
2065 dump_dec (MSG_NOTE, vectorization_factor);
2066 dump_printf (MSG_NOTE, ", niters = %wd\n",
2067 LOOP_VINFO_INT_NITERS (loop_vinfo));
2068 }
2069
2070 /* Analyze the alignment of the data-refs in the loop.
2071 Fail if a data reference is found that cannot be vectorized. */
2072
2073 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2074 if (!ok)
2075 {
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "bad data alignment.\n");
2079 return ok;
2080 }
2081
2082 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2083 It is important to call pruning after vect_analyze_data_ref_accesses,
2084 since we use grouping information gathered by interleaving analysis. */
2085 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2086 if (!ok)
2087 return ok;
2088
2089 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2090 vectorization, since we do not want to add extra peeling or
2091 versioning for alignment. */
2092 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2093 /* This pass will decide on using loop versioning and/or loop peeling in
2094 order to enhance the alignment of data references in the loop. */
2095 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2096 else
2097 ok = vect_verify_datarefs_alignment (loop_vinfo);
2098 if (!ok)
2099 return ok;
2100
2101 if (slp)
2102 {
2103 /* Analyze operations in the SLP instances. Note this may
2104 remove unsupported SLP instances which makes the above
2105 SLP kind detection invalid. */
2106 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2107 vect_slp_analyze_operations (loop_vinfo);
2108 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2109 {
2110 ok = opt_result::failure_at (vect_location,
2111 "unsupported SLP instances\n");
2112 goto again;
2113 }
2114 }
2115
2116 /* Dissolve SLP-only groups. */
2117 vect_dissolve_slp_only_groups (loop_vinfo);
2118
2119 /* Scan all the remaining operations in the loop that are not subject
2120 to SLP and make sure they are vectorizable. */
2121 ok = vect_analyze_loop_operations (loop_vinfo);
2122 if (!ok)
2123 {
2124 if (dump_enabled_p ())
2125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2126 "bad operation or unsupported loop bound.\n");
2127 return ok;
2128 }
2129
2130 /* Decide whether to use a fully-masked loop for this vectorization
2131 factor. */
2132 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2133 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2134 && vect_verify_full_masking (loop_vinfo));
2135 if (dump_enabled_p ())
2136 {
2137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "using a fully-masked loop.\n");
2140 else
2141 dump_printf_loc (MSG_NOTE, vect_location,
2142 "not using a fully-masked loop.\n");
2143 }
2144
2145 /* If an epilogue loop is required because of data accesses with gaps,
2146 one additional iteration needs to be peeled. Check if there are
2147 enough iterations for vectorization. */
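/* Illustrative case: a loop with 4 known iterations and a VF of 4 has
   LOOP_VINFO_NITERSM1 == 3 < 4; once one iteration is reserved for the
   gap, too few iterations remain for a single vector iteration.  */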
2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2149 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2150 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2151 {
2152 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2153 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2154
2155 if (known_lt (wi::to_widest (scalar_niters), vf))
2156 return opt_result::failure_at (vect_location,
2157 "loop has no enough iterations to"
2158 " support peeling for gaps.\n");
2159 }
2160
2161 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2162 loop or a loop that has a lower VF than the main loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2164 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2165 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2166 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2167 return opt_result::failure_at (vect_location,
2168 "Vectorization factor too high for"
2169 " epilogue loop.\n");
2170
2171 /* Check that the costings of the loop make vectorizing worthwhile. */
2172 res = vect_analyze_loop_costing (loop_vinfo);
2173 if (res < 0)
2174 {
2175 ok = opt_result::failure_at (vect_location,
2176 "Loop costings may not be worthwhile.\n");
2177 goto again;
2178 }
2179 if (!res)
2180 return opt_result::failure_at (vect_location,
2181 "Loop costings not worthwhile.\n");
2182
2183 determine_peel_for_niter (loop_vinfo);
2184 /* If an epilogue loop is required make sure we can create one. */
2185 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2186 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2187 {
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2190 if (!vect_can_advance_ivs_p (loop_vinfo)
2191 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2192 single_exit (LOOP_VINFO_LOOP
2193 (loop_vinfo))))
2194 {
2195 ok = opt_result::failure_at (vect_location,
2196 "not vectorized: can't create required "
2197 "epilog loop\n");
2198 goto again;
2199 }
2200 }
2201
2202 /* During peeling, we need to check whether the number of loop iterations
2203 is enough for both the peeled prolog loop and the vector loop. This
2204 check can be merged with the threshold check of loop versioning, so
2205 increase the threshold for this case if necessary.
2206
2207 If we are analyzing an epilogue we still want to check what its
2208 versioning threshold would be. If we decide to vectorize the epilogues we
2209 will want to use the lowest versioning threshold of all epilogues and main
2210 loop. This will enable us to enter a vectorized epilogue even when
2211 versioning the loop. We can't simply check whether the epilogue requires
2212 versioning though since we may have skipped some versioning checks when
2213 analyzing the epilogue. For instance, checks for alias versioning will be
2214 skipped when dealing with epilogues as we assume we already checked them
2215 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2216 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2217 {
2218 poly_uint64 niters_th = 0;
2219 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2220
2221 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2222 {
2223 /* Niters for peeled prolog loop. */
2224 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2225 {
2226 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2227 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2228 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2229 }
2230 else
2231 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2232 }
2233
2234 /* Niters for at least one iteration of vectorized loop. */
2235 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2236 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2237 /* One additional iteration because of peeling for gap. */
2238 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2239 niters_th += 1;
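/* Illustrative totals (made-up numbers): 3 iterations peeled for
   alignment, a VF of 4 and one gap iteration give niters_th == 8; if
   the runtime profitability check applies and th == 12, the final
   versioning threshold below becomes 12 instead.  */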
2240
2241 /* Use the same condition as vect_transform_loop to decide when to use
2242 the cost to determine a versioning threshold. */
2243 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2244 && ordered_p (th, niters_th))
2245 niters_th = ordered_max (poly_uint64 (th), niters_th);
2246
2247 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2248 }
2249
2250 gcc_assert (known_eq (vectorization_factor,
2251 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2252
2253 /* Ok to vectorize! */
2254 return opt_result::success ();
2255
2256 again:
2257 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2258 gcc_assert (!ok);
2259
2260 /* Try again with SLP forced off but if we didn't do any SLP there is
2261 no point in re-trying. */
2262 if (!slp)
2263 return ok;
2264
2265 /* If there are reduction chains re-trying will fail anyway. */
2266 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2267 return ok;
2268
2269 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2270 via interleaving or lane instructions. */
2271 slp_instance instance;
2272 slp_tree node;
2273 unsigned i, j;
2274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2275 {
2276 stmt_vec_info vinfo;
2277 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2278 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2279 continue;
2280 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2281 unsigned int size = DR_GROUP_SIZE (vinfo);
2282 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2283 if (! vect_store_lanes_supported (vectype, size, false)
2284 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2285 && ! vect_grouped_store_supported (vectype, size))
2286 return opt_result::failure_at (vinfo->stmt,
2287 "unsupported grouped store\n");
2288 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2289 {
2290 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2291 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2292 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2293 size = DR_GROUP_SIZE (vinfo);
2294 vectype = STMT_VINFO_VECTYPE (vinfo);
2295 if (! vect_load_lanes_supported (vectype, size, false)
2296 && ! vect_grouped_load_supported (vectype, single_element_p,
2297 size))
2298 return opt_result::failure_at (vinfo->stmt,
2299 "unsupported grouped load\n");
2300 }
2301 }
2302
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "re-trying with SLP disabled\n");
2306
2307 /* Roll back state appropriately. No SLP this time. */
2308 slp = false;
2309 /* Restore the vectorization factor as it was without SLP. */
2310 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2311 /* Free the SLP instances. */
2312 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2313 vect_free_slp_instance (instance, false);
2314 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2315 /* Reset SLP type to loop_vect on all stmts. */
2316 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2317 {
2318 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2319 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2320 !gsi_end_p (si); gsi_next (&si))
2321 {
2322 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2323 STMT_SLP_TYPE (stmt_info) = loop_vect;
2324 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2325 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2326 {
2327 /* vectorizable_reduction adjusts reduction stmt def-types,
2328 restore them to that of the PHI. */
2329 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2330 = STMT_VINFO_DEF_TYPE (stmt_info);
2331 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2332 (STMT_VINFO_REDUC_DEF (stmt_info)))
2333 = STMT_VINFO_DEF_TYPE (stmt_info);
2334 }
2335 }
2336 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2337 !gsi_end_p (si); gsi_next (&si))
2338 {
2339 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2340 STMT_SLP_TYPE (stmt_info) = loop_vect;
2341 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2342 {
2343 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2344 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2345 STMT_SLP_TYPE (stmt_info) = loop_vect;
2346 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2347 !gsi_end_p (pi); gsi_next (&pi))
2348 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2349 = loop_vect;
2350 }
2351 }
2352 }
2353 /* Free optimized alias test DDRS. */
2354 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2355 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2356 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2357 /* Reset target cost data. */
2358 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2359 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2360 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2361 /* Reset accumulated rgroup information. */
2362 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2363 /* Reset assorted flags. */
2364 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2365 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2366 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2367 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2368 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2369
2370 goto start_over;
2371 }
2372
2373 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2374 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2375 OLD_LOOP_VINFO is better unless something specifically indicates
2376 otherwise.
2377
2378 Note that this deliberately isn't a partial order. */
2379
2380 static bool
2381 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2382 loop_vec_info old_loop_vinfo)
2383 {
2384 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2385 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2386
2387 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2388 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2389
2390 /* Always prefer a VF of loop->simdlen over any other VF. */
2391 if (loop->simdlen)
2392 {
2393 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2394 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2395 if (new_simdlen_p != old_simdlen_p)
2396 return new_simdlen_p;
2397 }
2398
2399 /* Limit the VFs to what is likely to be the maximum number of iterations,
2400 to handle cases in which at least one loop_vinfo is fully-masked. */
2401 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2402 if (estimated_max_niter != -1)
2403 {
2404 if (known_le (estimated_max_niter, new_vf))
2405 new_vf = estimated_max_niter;
2406 if (known_le (estimated_max_niter, old_vf))
2407 old_vf = estimated_max_niter;
2408 }
2409
2410 /* Check whether the (fractional) cost per scalar iteration is lower
2411 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2412 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2413 * poly_widest_int (old_vf));
2414 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2415 * poly_widest_int (new_vf));
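/* Worked example with made-up costs: new_inside_cost == 10 at new_vf
   == 2 versus old_inside_cost == 16 at old_vf == 4 gives rel_new == 40
   and rel_old == 32, i.e. 5 versus 4 per scalar iteration, so the old
   loop_vinfo is kept.  */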
2416 if (maybe_lt (rel_old, rel_new))
2417 {
2418 /* When old_loop_vinfo uses a variable vectorization factor,
2419 we know that it has a lower cost for at least one runtime VF.
2420 However, we don't know how likely that VF is.
2421
2422 One option would be to compare the costs for the estimated VFs.
2423 The problem is that that can put too much pressure on the cost
2424 model. E.g. if the estimated VF is also the lowest possible VF,
2425 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2426 for the estimated VF, we'd then choose new_loop_vinfo even
2427 though (a) new_loop_vinfo might not actually be better than
2428 old_loop_vinfo for that VF and (b) it would be significantly
2429 worse at larger VFs.
2430
2431 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2432 no more expensive than old_loop_vinfo even after doubling the
2433 estimated old_loop_vinfo VF. For all but trivial loops, this
2434 ensures that we only pick new_loop_vinfo if it is significantly
2435 better than old_loop_vinfo at the estimated VF. */
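/* A numeric sketch of the compromise: if new_loop_vinfo costs 9 per
   iteration at an estimated VF of 4 and old_loop_vinfo costs 5 at an
   estimated VF of 2, then estimated_rel_new == 18 and
   estimated_rel_old == 20; since 18 * 2 > 20, we stick with
   old_loop_vinfo.  */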
2436 if (rel_new.is_constant ())
2437 return false;
2438
2439 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2440 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2441 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2442 * widest_int (old_estimated_vf));
2443 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2444 * widest_int (new_estimated_vf));
2445 return estimated_rel_new * 2 <= estimated_rel_old;
2446 }
2447 if (known_lt (rel_new, rel_old))
2448 return true;
2449
2450 /* If there's nothing to choose between the loop bodies, see whether
2451 there's a difference in the prologue and epilogue costs. */
2452 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2453 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2454
2455 return false;
2456 }
2457
2458 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2459 true if we should. */
2460
2461 static bool
2462 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2463 loop_vec_info old_loop_vinfo)
2464 {
2465 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2466 return false;
2467
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_NOTE, vect_location,
2470 "***** Preferring vector mode %s to vector mode %s\n",
2471 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2472 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2473 return true;
2474 }
2475
2476 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2477 try to reanalyze it as a main loop. Return the loop_vinfo on success
2478 and null on failure. */
2479
2480 static loop_vec_info
2481 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2482 {
2483 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2484 return loop_vinfo;
2485
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "***** Reanalyzing as a main loop with vector mode %s\n",
2489 GET_MODE_NAME (loop_vinfo->vector_mode));
2490
2491 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2492 vec_info_shared *shared = loop_vinfo->shared;
2493 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2494 gcc_assert (main_loop_vinfo);
2495
2496 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2497
2498 bool fatal = false;
2499 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2500 loop->aux = NULL;
2501 if (!res)
2502 {
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_NOTE, vect_location,
2505 "***** Failed to analyze main loop with vector"
2506 " mode %s\n",
2507 GET_MODE_NAME (loop_vinfo->vector_mode));
2508 delete main_loop_vinfo;
2509 return NULL;
2510 }
2511 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2512 return main_loop_vinfo;
2513 }
2514
2515 /* Function vect_analyze_loop.
2516
2517 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2518 for it. The different analyses will record information in the
2519 loop_vec_info struct. */
2520 opt_loop_vec_info
2521 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2522 {
2523 auto_vector_modes vector_modes;
2524
2525 /* Autodetect first vector size we try. */
2526 unsigned int autovec_flags
2527 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2528 loop->simdlen != 0);
2529 unsigned int mode_i = 0;
2530
2531 DUMP_VECT_SCOPE ("analyze_loop_nest");
2532
2533 if (loop_outer (loop)
2534 && loop_vec_info_for_loop (loop_outer (loop))
2535 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2536 return opt_loop_vec_info::failure_at (vect_location,
2537 "outer-loop already vectorized.\n");
2538
2539 if (!find_loop_nest (loop, &shared->loop_nest))
2540 return opt_loop_vec_info::failure_at
2541 (vect_location,
2542 "not vectorized: loop nest containing two or more consecutive inner"
2543 " loops cannot be vectorized\n");
2544
2545 unsigned n_stmts = 0;
2546 machine_mode autodetected_vector_mode = VOIDmode;
2547 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2548 machine_mode next_vector_mode = VOIDmode;
2549 poly_uint64 lowest_th = 0;
2550 unsigned vectorized_loops = 0;
2551 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2552 && !unlimited_cost_model (loop));
2553
2554 bool vect_epilogues = false;
2555 opt_result res = opt_result::success ();
2556 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2557 while (1)
2558 {
2559 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2560 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2561 if (!loop_vinfo)
2562 {
2563 if (dump_enabled_p ())
2564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2565 "bad loop form.\n");
2566 gcc_checking_assert (first_loop_vinfo == NULL);
2567 return loop_vinfo;
2568 }
2569 loop_vinfo->vector_mode = next_vector_mode;
2570
2571 bool fatal = false;
2572
2573 /* When pick_lowest_cost_p is true, we should in principle iterate
2574 over all the loop_vec_infos that LOOP_VINFO could replace and
2575 try to vectorize LOOP_VINFO under the same conditions.
2576 E.g. when trying to replace an epilogue loop, we should vectorize
2577 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2578 to replace the main loop, we should vectorize LOOP_VINFO as a main
2579 loop too.
2580
2581 However, autovectorize_vector_modes is usually sorted as follows:
2582
2583 - Modes that naturally produce lower VFs usually follow modes that
2584 naturally produce higher VFs.
2585
2586 - When modes naturally produce the same VF, maskable modes
2587 usually follow unmaskable ones, so that the maskable mode
2588 can be used to vectorize the epilogue of the unmaskable mode.
2589
2590 This order is preferred because it leads to the maximum
2591 epilogue vectorization opportunities. Targets should only use
2592 a different order if they want to make wide modes available while
2593 disparaging them relative to earlier, smaller modes. The assumption
2594 in that case is that the wider modes are more expensive in some
2595 way that isn't reflected directly in the costs.
2596
2597 There should therefore be few interesting cases in which
2598 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2599 treated as a standalone loop, and ends up being genuinely cheaper
2600 than FIRST_LOOP_VINFO. */
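/* A hypothetical ordering as an example: a 16-byte unmaskable mode
   (VF 16 for chars), then an 8-byte mode (VF 8), then a maskable mode
   of the same width, so that each later entry can vectorize the
   epilogue left behind by an earlier one.  */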
2601 if (vect_epilogues)
2602 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2603
2604 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2605 if (mode_i == 0)
2606 autodetected_vector_mode = loop_vinfo->vector_mode;
2607 if (dump_enabled_p ())
2608 {
2609 if (res)
2610 dump_printf_loc (MSG_NOTE, vect_location,
2611 "***** Analysis succeeded with vector mode %s\n",
2612 GET_MODE_NAME (loop_vinfo->vector_mode));
2613 else
2614 dump_printf_loc (MSG_NOTE, vect_location,
2615 "***** Analysis failed with vector mode %s\n",
2616 GET_MODE_NAME (loop_vinfo->vector_mode));
2617 }
2618
2619 loop->aux = NULL;
2620
2621 if (!fatal)
2622 while (mode_i < vector_modes.length ()
2623 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2624 {
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_NOTE, vect_location,
2627 "***** The result for vector mode %s would"
2628 " be the same\n",
2629 GET_MODE_NAME (vector_modes[mode_i]));
2630 mode_i += 1;
2631 }
2632
2633 if (res)
2634 {
2635 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2636 vectorized_loops++;
2637
2638 /* Once we hit the desired simdlen for the first time,
2639 discard any previous attempts. */
2640 if (simdlen
2641 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2642 {
2643 delete first_loop_vinfo;
2644 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2645 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2646 simdlen = 0;
2647 }
2648 else if (pick_lowest_cost_p && first_loop_vinfo)
2649 {
2650 /* Keep trying to roll back vectorization attempts while the
2651 loop_vec_infos they produced were worse than this one. */
2652 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2653 while (!vinfos.is_empty ()
2654 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2655 {
2656 gcc_assert (vect_epilogues);
2657 delete vinfos.pop ();
2658 }
2659 if (vinfos.is_empty ()
2660 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2661 {
2662 loop_vec_info main_loop_vinfo
2663 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
2664 if (main_loop_vinfo == loop_vinfo)
2665 {
2666 delete first_loop_vinfo;
2667 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2668 }
2669 else if (main_loop_vinfo
2670 && vect_joust_loop_vinfos (main_loop_vinfo,
2671 first_loop_vinfo))
2672 {
2673 delete first_loop_vinfo;
2674 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2675 delete loop_vinfo;
2676 loop_vinfo
2677 = opt_loop_vec_info::success (main_loop_vinfo);
2678 }
2679 else
2680 delete main_loop_vinfo;
2681 }
2682 }
2683
2684 if (first_loop_vinfo == NULL)
2685 {
2686 first_loop_vinfo = loop_vinfo;
2687 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2688 }
2689 else if (vect_epilogues
2690 /* For now only allow one epilogue loop. */
2691 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2692 {
2693 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2694 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2695 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2696 || maybe_ne (lowest_th, 0U));
2697 /* Keep track of the known smallest versioning
2698 threshold. */
2699 if (ordered_p (lowest_th, th))
2700 lowest_th = ordered_min (lowest_th, th);
2701 }
2702 else
2703 delete loop_vinfo;
2704
2705 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2706 enabled, SIMDUID is not set, it is the innermost loop and we have
2707 either already found the loop's SIMDLEN or there was no SIMDLEN to
2708 begin with.
2709 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2710 vect_epilogues = (!simdlen
2711 && loop->inner == NULL
2712 && param_vect_epilogues_nomask
2713 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2714 && !loop->simduid
2715 /* For now only allow one epilogue loop, but allow
2716 pick_lowest_cost_p to replace it. */
2717 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2718 || pick_lowest_cost_p));
2719
2720 /* Commit to first_loop_vinfo if we have no reason to try
2721 alternatives. */
2722 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2723 break;
2724 }
2725 else
2726 {
2727 delete loop_vinfo;
2728 if (fatal)
2729 {
2730 gcc_checking_assert (first_loop_vinfo == NULL);
2731 break;
2732 }
2733 }
2734
2735 if (mode_i < vector_modes.length ()
2736 && VECTOR_MODE_P (autodetected_vector_mode)
2737 && (related_vector_mode (vector_modes[mode_i],
2738 GET_MODE_INNER (autodetected_vector_mode))
2739 == autodetected_vector_mode)
2740 && (related_vector_mode (autodetected_vector_mode,
2741 GET_MODE_INNER (vector_modes[mode_i]))
2742 == vector_modes[mode_i]))
2743 {
2744 if (dump_enabled_p ())
2745 dump_printf_loc (MSG_NOTE, vect_location,
2746 "***** Skipping vector mode %s, which would"
2747 " repeat the analysis for %s\n",
2748 GET_MODE_NAME (vector_modes[mode_i]),
2749 GET_MODE_NAME (autodetected_vector_mode));
2750 mode_i += 1;
2751 }
2752
2753 if (mode_i == vector_modes.length ()
2754 || autodetected_vector_mode == VOIDmode)
2755 break;
2756
2757 /* Try the next biggest vector size. */
2758 next_vector_mode = vector_modes[mode_i++];
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_NOTE, vect_location,
2761 "***** Re-trying analysis with vector mode %s\n",
2762 GET_MODE_NAME (next_vector_mode));
2763 }
2764
2765 if (first_loop_vinfo)
2766 {
2767 loop->aux = (loop_vec_info) first_loop_vinfo;
2768 if (dump_enabled_p ())
2769 dump_printf_loc (MSG_NOTE, vect_location,
2770 "***** Choosing vector mode %s\n",
2771 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2772 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2773 return first_loop_vinfo;
2774 }
2775
2776 return opt_loop_vec_info::propagate_failure (res);
2777 }
2778
2779 /* Return true if there is an in-order reduction function for CODE, storing
2780 it in *REDUC_FN if so. */
2781
2782 static bool
2783 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2784 {
2785 switch (code)
2786 {
2787 case PLUS_EXPR:
2788 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2789 return true;
2790
2791 default:
2792 return false;
2793 }
2794 }
2795
2796 /* Function reduction_fn_for_scalar_code
2797
2798 Input:
2799 CODE - tree_code of a reduction operation.
2800
2801 Output:
2802 REDUC_FN - the corresponding internal function to be used to reduce the
2803 vector of partial results into a single scalar result, or IFN_LAST
2804 if the operation is a supported reduction operation, but does not have
2805 such an internal function.
2806
2807 Return FALSE if CODE currently cannot be vectorized as reduction. */
2808
2809 static bool
2810 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2811 {
2812 switch (code)
2813 {
2814 case MAX_EXPR:
2815 *reduc_fn = IFN_REDUC_MAX;
2816 return true;
2817
2818 case MIN_EXPR:
2819 *reduc_fn = IFN_REDUC_MIN;
2820 return true;
2821
2822 case PLUS_EXPR:
2823 *reduc_fn = IFN_REDUC_PLUS;
2824 return true;
2825
2826 case BIT_AND_EXPR:
2827 *reduc_fn = IFN_REDUC_AND;
2828 return true;
2829
2830 case BIT_IOR_EXPR:
2831 *reduc_fn = IFN_REDUC_IOR;
2832 return true;
2833
2834 case BIT_XOR_EXPR:
2835 *reduc_fn = IFN_REDUC_XOR;
2836 return true;
2837
2838 case MULT_EXPR:
2839 case MINUS_EXPR:
2840 *reduc_fn = IFN_LAST;
2841 return true;
2842
2843 default:
2844 return false;
2845 }
2846 }
2847
2848 /* If there is a neutral value X such that SLP reduction NODE would not
2849 be affected by the introduction of additional X elements, return that X,
2850 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2851 is the vector type that would hold element X. REDUC_CHAIN is true if
2852 the SLP statements perform a single reduction, false if each statement
2853 performs an independent reduction. */
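/* For example, a PLUS_EXPR or BIT_XOR_EXPR reduction can be padded
   with extra zero elements without changing its result, a MULT_EXPR
   reduction with ones, and a BIT_AND_EXPR reduction with all-ones
   values, which is what the switch below returns.  */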
2854
2855 static tree
2856 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2857 tree_code code, bool reduc_chain)
2858 {
2859 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2860 stmt_vec_info stmt_vinfo = stmts[0];
2861 tree scalar_type = TREE_TYPE (vector_type);
2862 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2863 gcc_assert (loop);
2864
2865 switch (code)
2866 {
2867 case WIDEN_SUM_EXPR:
2868 case DOT_PROD_EXPR:
2869 case SAD_EXPR:
2870 case PLUS_EXPR:
2871 case MINUS_EXPR:
2872 case BIT_IOR_EXPR:
2873 case BIT_XOR_EXPR:
2874 return build_zero_cst (scalar_type);
2875
2876 case MULT_EXPR:
2877 return build_one_cst (scalar_type);
2878
2879 case BIT_AND_EXPR:
2880 return build_all_ones_cst (scalar_type);
2881
2882 case MAX_EXPR:
2883 case MIN_EXPR:
2884 /* For MIN/MAX the initial values are neutral. A reduction chain
2885 has only a single initial value, so that value is neutral for
2886 all statements. */
2887 if (reduc_chain)
2888 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2889 loop_preheader_edge (loop));
2890 return NULL_TREE;
2891
2892 default:
2893 return NULL_TREE;
2894 }
2895 }
2896
2897 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2898 STMT is printed with a message MSG. */
2899
2900 static void
2901 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2902 {
2903 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2904 }
2905
2906 /* Return true if we need an in-order (fold-left) reduction for
2907 operation CODE on type TYPE, i.e. if reassociating the reduction
2908 could change the observable result. */
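/* For example, a float PLUS_EXPR reduction compiled without
   -fassociative-math must be reduced in the original (fold-left)
   order, whereas float MIN_EXPR/MAX_EXPR reductions and additions on
   wrapping unsigned types can be reassociated freely.  */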
2909
2910 bool
2911 needs_fold_left_reduction_p (tree type, tree_code code)
2912 {
2913 /* CHECKME: check for !flag_finite_math_only too? */
2914 if (SCALAR_FLOAT_TYPE_P (type))
2915 switch (code)
2916 {
2917 case MIN_EXPR:
2918 case MAX_EXPR:
2919 return false;
2920
2921 default:
2922 return !flag_associative_math;
2923 }
2924
2925 if (INTEGRAL_TYPE_P (type))
2926 {
2927 if (!operation_no_trapping_overflow (type, code))
2928 return true;
2929 return false;
2930 }
2931
2932 if (SAT_FIXED_POINT_TYPE_P (type))
2933 return true;
2934
2935 return false;
2936 }
2937
2938 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2939 has a handled computation expression. Store the main reduction
2940 operation in *CODE. */
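/* Sketch of the common case: for an accumulation such as
   sum_2 = _1 + sum_1 with sum_1 = PHI <init, sum_2>, the walk below
   records the single PLUS_EXPR statement, sets *CODE to PLUS_EXPR and
   returns true; value-preserving conversions along the path are
   tolerated.  */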
2941
2942 static bool
2943 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2944 tree loop_arg, enum tree_code *code,
2945 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2946 {
2947 auto_bitmap visited;
2948 tree lookfor = PHI_RESULT (phi);
2949 ssa_op_iter curri;
2950 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2951 while (USE_FROM_PTR (curr) != loop_arg)
2952 curr = op_iter_next_use (&curri);
2953 curri.i = curri.numops;
2954 do
2955 {
2956 path.safe_push (std::make_pair (curri, curr));
2957 tree use = USE_FROM_PTR (curr);
2958 if (use == lookfor)
2959 break;
2960 gimple *def = SSA_NAME_DEF_STMT (use);
2961 if (gimple_nop_p (def)
2962 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2963 {
2964 pop:
2965 do
2966 {
2967 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2968 curri = x.first;
2969 curr = x.second;
2970 do
2971 curr = op_iter_next_use (&curri);
2972 /* Skip already visited or non-SSA operands (from iterating
2973 over PHI args). */
2974 while (curr != NULL_USE_OPERAND_P
2975 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2976 || ! bitmap_set_bit (visited,
2977 SSA_NAME_VERSION
2978 (USE_FROM_PTR (curr)))));
2979 }
2980 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2981 if (curr == NULL_USE_OPERAND_P)
2982 break;
2983 }
2984 else
2985 {
2986 if (gimple_code (def) == GIMPLE_PHI)
2987 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2988 else
2989 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2990 while (curr != NULL_USE_OPERAND_P
2991 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2992 || ! bitmap_set_bit (visited,
2993 SSA_NAME_VERSION
2994 (USE_FROM_PTR (curr)))))
2995 curr = op_iter_next_use (&curri);
2996 if (curr == NULL_USE_OPERAND_P)
2997 goto pop;
2998 }
2999 }
3000 while (1);
3001 if (dump_file && (dump_flags & TDF_DETAILS))
3002 {
3003 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3004 unsigned i;
3005 std::pair<ssa_op_iter, use_operand_p> *x;
3006 FOR_EACH_VEC_ELT (path, i, x)
3007 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3008 dump_printf (MSG_NOTE, "\n");
3009 }
3010
3011 /* Check whether the reduction path detected is valid. */
3012 bool fail = path.length () == 0;
3013 bool neg = false;
3014 int sign = -1;
3015 *code = ERROR_MARK;
3016 for (unsigned i = 1; i < path.length (); ++i)
3017 {
3018 gimple *use_stmt = USE_STMT (path[i].second);
3019 tree op = USE_FROM_PTR (path[i].second);
3020 if (! is_gimple_assign (use_stmt)
3021 /* The following makes sure we can compute the operand index
3022 easily; it also mostly disallows chaining via COND_EXPR condition
3023 operands. */
3024 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3025 && (gimple_num_ops (use_stmt) <= 2
3026 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3027 && (gimple_num_ops (use_stmt) <= 3
3028 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3029 {
3030 fail = true;
3031 break;
3032 }
3033 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3034 if (use_code == MINUS_EXPR)
3035 {
3036 use_code = PLUS_EXPR;
3037 /* Track whether we negate the reduction value each iteration. */
3038 if (gimple_assign_rhs2 (use_stmt) == op)
3039 neg = ! neg;
3040 }
3041 if (CONVERT_EXPR_CODE_P (use_code)
3042 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3043 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3044 ;
3045 else if (*code == ERROR_MARK)
3046 {
3047 *code = use_code;
3048 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3049 }
3050 else if (use_code != *code)
3051 {
3052 fail = true;
3053 break;
3054 }
3055 else if ((use_code == MIN_EXPR
3056 || use_code == MAX_EXPR)
3057 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3058 {
3059 fail = true;
3060 break;
3061 }
3062 /* Check that there's only a single stmt the op is used on. For the
3063 value-preserving tail and the last stmt, allow out-of-loop uses.
3064 ??? We could relax this and handle arbitrary live stmts by
3065 forcing a scalar epilogue for example. */
3066 imm_use_iterator imm_iter;
3067 gimple *op_use_stmt;
3068 unsigned cnt = 0;
3069 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3070 if (!is_gimple_debug (op_use_stmt)
3071 && (*code != ERROR_MARK
3072 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3073 {
3074 /* We want to allow x + x but not x < 1 ? x : 2. */
3075 if (is_gimple_assign (op_use_stmt)
3076 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3077 {
3078 use_operand_p use_p;
3079 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3080 cnt++;
3081 }
3082 else
3083 cnt++;
3084 }
3085 if (cnt != 1)
3086 {
3087 fail = true;
3088 break;
3089 }
3090 }
3091 return ! fail && ! neg && *code != ERROR_MARK;
3092 }
3093
3094 bool
3095 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3096 tree loop_arg, enum tree_code code)
3097 {
3098 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3099 enum tree_code code_;
3100 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3101 && code_ == code);
3102 }
3103
3104
3105
3106 /* Function vect_is_simple_reduction
3107
3108 (1) Detect a cross-iteration def-use cycle that represents a simple
3109 reduction computation. We look for the following pattern:
3110
3111 loop_header:
3112 a1 = phi < a0, a2 >
3113 a3 = ...
3114 a2 = operation (a3, a1)
3115
3116 or
3117
3118 a3 = ...
3119 loop_header:
3120 a1 = phi < a0, a2 >
3121 a2 = operation (a3, a1)
3122
3123 such that:
3124 1. operation is commutative and associative and it is safe to
3125 change the order of the computation
3126 2. no uses for a2 in the loop (a2 is used out of the loop)
3127 3. no uses of a1 in the loop besides the reduction operation
3128 4. no uses of a1 outside the loop.
3129
3130 Conditions 1,4 are tested here.
3131 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3132
3133 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3134 nested cycles.
3135
3136 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3137 reductions:
3138
3139 a1 = phi < a0, a2 >
3140 inner loop (def of a3)
3141 a2 = phi < a3 >
3142
3143 (4) Detect condition expressions, i.e.:
3144 for (int i = 0; i < N; i++)
3145 if (a[i] < val)
3146 ret_val = a[i];
3147
3148 */
3149
3150 static stmt_vec_info
3151 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3152 bool *double_reduc, bool *reduc_chain_p)
3153 {
3154 gphi *phi = as_a <gphi *> (phi_info->stmt);
3155 gimple *phi_use_stmt = NULL;
3156 imm_use_iterator imm_iter;
3157 use_operand_p use_p;
3158
3159 *double_reduc = false;
3160 *reduc_chain_p = false;
3161 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3162
3163 tree phi_name = PHI_RESULT (phi);
3164 /* ??? If there are no uses of the PHI result the inner loop reduction
3165 won't be detected as possibly double-reduction by vectorizable_reduction
3166 because that tries to walk the PHI arg from the preheader edge which
3167 can be constant. See PR60382. */
3168 if (has_zero_uses (phi_name))
3169 return NULL;
3170 class loop *loop = (gimple_bb (phi))->loop_father;
3171 unsigned nphi_def_loop_uses = 0;
3172 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3173 {
3174 gimple *use_stmt = USE_STMT (use_p);
3175 if (is_gimple_debug (use_stmt))
3176 continue;
3177
3178 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3179 {
3180 if (dump_enabled_p ())
3181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3182 "intermediate value used outside loop.\n");
3183
3184 return NULL;
3185 }
3186
3187 nphi_def_loop_uses++;
3188 phi_use_stmt = use_stmt;
3189 }
3190
3191 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3192 if (TREE_CODE (latch_def) != SSA_NAME)
3193 {
3194 if (dump_enabled_p ())
3195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3196 "reduction: not ssa_name: %T\n", latch_def);
3197 return NULL;
3198 }
3199
3200 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3201 if (!def_stmt_info
3202 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3203 return NULL;
3204
3205 bool nested_in_vect_loop
3206 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3207 unsigned nlatch_def_loop_uses = 0;
3208 auto_vec<gphi *, 3> lcphis;
3209 bool inner_loop_of_double_reduc = false;
3210 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3211 {
3212 gimple *use_stmt = USE_STMT (use_p);
3213 if (is_gimple_debug (use_stmt))
3214 continue;
3215 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3216 nlatch_def_loop_uses++;
3217 else
3218 {
3219 /* We can have more than one loop-closed PHI. */
3220 lcphis.safe_push (as_a <gphi *> (use_stmt));
3221 if (nested_in_vect_loop
3222 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3223 == vect_double_reduction_def))
3224 inner_loop_of_double_reduc = true;
3225 }
3226 }
3227
3228 /* If we are vectorizing an inner reduction, we execute it in the
3229 original order only when we are not dealing with a double
3230 reduction. */
3231 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3232 {
3233 if (dump_enabled_p ())
3234 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3235 "detected nested cycle: ");
3236 return def_stmt_info;
3237 }
3238
3239 /* When the inner loop of a double reduction ends up with more than
3240 one loop-closed PHI we have failed to classify alternate such
3241 PHIs as double reduction, leading to wrong code. See PR103237. */
3242 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3243 {
3244 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3246 "unhandle double reduction\n");
3247 return NULL;
3248 }
3249
3250 /* If this isn't a nested cycle, or if the nested cycle reduction value
3251 is used outside of the inner loop, we cannot handle uses of the
3252 reduction value. */
3253 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3254 {
3255 if (dump_enabled_p ())
3256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3257 "reduction used in loop.\n");
3258 return NULL;
3259 }
3260
3261 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3262 defined in the inner loop. */
3263 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3264 {
3265 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3266 if (gimple_phi_num_args (def_stmt) != 1
3267 || TREE_CODE (op1) != SSA_NAME)
3268 {
3269 if (dump_enabled_p ())
3270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3271 "unsupported phi node definition.\n");
3272
3273 return NULL;
3274 }
3275
3276 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3277 if (gimple_bb (def1)
3278 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3279 && loop->inner
3280 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3281 && is_gimple_assign (def1)
3282 && is_a <gphi *> (phi_use_stmt)
3283 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3284 {
3285 if (dump_enabled_p ())
3286 report_vect_op (MSG_NOTE, def_stmt,
3287 "detected double reduction: ");
3288
3289 *double_reduc = true;
3290 return def_stmt_info;
3291 }
3292
3293 return NULL;
3294 }
3295
3296 /* Look for the expression computing latch_def from the loop PHI result. */
3297 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3298 enum tree_code code;
3299 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3300 path))
3301 {
3302 STMT_VINFO_REDUC_CODE (phi_info) = code;
3303 if (code == COND_EXPR && !nested_in_vect_loop)
3304 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3305
3306 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3307 reduction chain for which the additional restriction is that
3308 all operations in the chain are the same. */
3309 auto_vec<stmt_vec_info, 8> reduc_chain;
3310 unsigned i;
3311 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3312 for (i = path.length () - 1; i >= 1; --i)
3313 {
3314 gimple *stmt = USE_STMT (path[i].second);
3315 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3316 STMT_VINFO_REDUC_IDX (stmt_info)
3317 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3318 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3319 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3320 && (i == 1 || i == path.length () - 1));
3321 if ((stmt_code != code && !leading_conversion)
3322 /* We can only handle the final value in epilogue
3323 generation for reduction chains. */
3324 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3325 is_slp_reduc = false;
3326 /* For reduction chains we support trailing/leading
3327 conversions. We do not store those in the actual chain. */
3328 if (leading_conversion)
3329 continue;
3330 reduc_chain.safe_push (stmt_info);
3331 }
3332 if (is_slp_reduc && reduc_chain.length () > 1)
3333 {
3334 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3335 {
3336 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3337 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3338 }
3339 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3340 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3341
3342 /* Save the chain for further analysis in SLP detection. */
3343 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3344 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3345
3346 *reduc_chain_p = true;
3347 if (dump_enabled_p ())
3348 dump_printf_loc (MSG_NOTE, vect_location,
3349 "reduction: detected reduction chain\n");
3350 }
3351 else if (dump_enabled_p ())
3352 dump_printf_loc (MSG_NOTE, vect_location,
3353 "reduction: detected reduction\n");
3354
3355 return def_stmt_info;
3356 }
3357
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "reduction: unknown pattern\n");
3361
3362 return NULL;
3363 }
3364
3365 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
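/* A worked example with illustrative numbers only (none of them come from
   a real target): with LOOP_VINFO_INT_NITERS == 100, an assumed VF of 8
   and PEEL_ITERS_PROLOGUE == 3, the epilogue gets (100 - 3) % 8 == 1
   iteration, and every entry of SCALAR_COST_VEC is accounted three times
   in PROLOGUE_COST_VEC and once in EPILOGUE_COST_VEC.  When the iteration
   count is unknown the epilogue count instead defaults to VF/2 and a
   taken branch is costed for each of the two peeled loops.  */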
3366 int
3367 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3368 int *peel_iters_epilogue,
3369 stmt_vector_for_cost *scalar_cost_vec,
3370 stmt_vector_for_cost *prologue_cost_vec,
3371 stmt_vector_for_cost *epilogue_cost_vec)
3372 {
3373 int retval = 0;
3374 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3375
3376 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3377 {
3378 *peel_iters_epilogue = assumed_vf / 2;
3379 if (dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE, vect_location,
3381 "cost model: epilogue peel iters set to vf/2 "
3382 "because loop iterations are unknown .\n");
3383
3384 /* If peeled iterations are known but the number of scalar loop
3385 iterations is unknown, count a taken branch per peeled loop. */
3386 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3387 NULL, 0, vect_prologue);
3388 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3389 NULL, 0, vect_epilogue);
3390 }
3391 else
3392 {
3393 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3394 peel_iters_prologue = niters < peel_iters_prologue ?
3395 niters : peel_iters_prologue;
3396 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3397 /* If we need to peel for gaps, but no epilogue peeling would otherwise
3398 be required, we have to peel VF iterations. */
3399 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3400 *peel_iters_epilogue = assumed_vf;
3401 }
3402
3403 stmt_info_for_cost *si;
3404 int j;
3405 if (peel_iters_prologue)
3406 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3407 retval += record_stmt_cost (prologue_cost_vec,
3408 si->count * peel_iters_prologue,
3409 si->kind, si->stmt_info, si->misalign,
3410 vect_prologue);
3411 if (*peel_iters_epilogue)
3412 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3413 retval += record_stmt_cost (epilogue_cost_vec,
3414 si->count * *peel_iters_epilogue,
3415 si->kind, si->stmt_info, si->misalign,
3416 vect_epilogue);
3417
3418 return retval;
3419 }
3420
3421 /* Function vect_estimate_min_profitable_iters
3422
3423 Return the number of iterations required for the vector version of the
3424 loop to be profitable relative to the cost of the scalar version of the
3425 loop.
3426
3427 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3428 of iterations for vectorization. -1 value means loop vectorization
3429 is not profitable. This returned value may be used for dynamic
3430 profitability check.
3431
3432 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3433 for static check against estimated number of iterations. */
3434
3435 static void
3436 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3437 int *ret_min_profitable_niters,
3438 int *ret_min_profitable_estimate)
3439 {
3440 int min_profitable_iters;
3441 int min_profitable_estimate;
3442 int peel_iters_prologue;
3443 int peel_iters_epilogue;
3444 unsigned vec_inside_cost = 0;
3445 int vec_outside_cost = 0;
3446 unsigned vec_prologue_cost = 0;
3447 unsigned vec_epilogue_cost = 0;
3448 int scalar_single_iter_cost = 0;
3449 int scalar_outside_cost = 0;
3450 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3451 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3452 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3453
3454 /* Cost model disabled. */
3455 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3456 {
3457 if (dump_enabled_p ())
3458 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3459 *ret_min_profitable_niters = 0;
3460 *ret_min_profitable_estimate = 0;
3461 return;
3462 }
3463
3464 /* Requires loop versioning tests to handle misalignment. */
3465 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3466 {
3467 /* FIXME: Make cost depend on complexity of individual check. */
3468 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3469 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3470 vect_prologue);
3471 if (dump_enabled_p ())
3472 dump_printf (MSG_NOTE,
3473 "cost model: Adding cost of checks for loop "
3474 "versioning to treat misalignment.\n");
3475 }
3476
3477 /* Requires loop versioning with alias checks. */
3478 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3479 {
3480 /* FIXME: Make cost depend on complexity of individual check. */
3481 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3482 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3483 vect_prologue);
3484 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3485 if (len)
3486 /* Count LEN - 1 ANDs and LEN comparisons. */
3487 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3488 NULL, 0, vect_prologue);
3489 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3490 if (len)
3491 {
3492 /* Count LEN - 1 ANDs and LEN comparisons. */
3493 unsigned int nstmts = len * 2 - 1;
3494 /* +1 for each bias that needs adding. */
3495 for (unsigned int i = 0; i < len; ++i)
3496 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3497 nstmts += 1;
3498 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3499 NULL, 0, vect_prologue);
3500 }
3501 if (dump_enabled_p ())
3502 dump_printf (MSG_NOTE,
3503 "cost model: Adding cost of checks for loop "
3504 "versioning aliasing.\n");
3505 }
3506
3507 /* Requires loop versioning with niter checks. */
3508 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3509 {
3510 /* FIXME: Make cost depend on complexity of individual check. */
3511 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3512 vect_prologue);
3513 if (dump_enabled_p ())
3514 dump_printf (MSG_NOTE,
3515 "cost model: Adding cost of checks for loop "
3516 "versioning niters.\n");
3517 }
3518
3519 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3520 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3521 vect_prologue);
3522
3523 /* Count statements in scalar loop. Using this as scalar cost for a single
3524 iteration for now.
3525
3526 TODO: Add outer loop support.
3527
3528 TODO: Consider assigning different costs to different scalar
3529 statements. */
3530
3531 scalar_single_iter_cost
3532 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3533
3534 /* Add additional cost for the peeled instructions in prologue and epilogue
3535 loop. (For fully-masked loops there will be no peeling.)
3536
3537 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3538 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3539
3540 TODO: Build an expression that represents peel_iters for prologue and
3541 epilogue to be used in a run-time test. */
3542
3543 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3544 {
3545 peel_iters_prologue = 0;
3546 peel_iters_epilogue = 0;
3547
3548 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3549 {
3550 /* We need to peel exactly one iteration. */
3551 peel_iters_epilogue += 1;
3552 stmt_info_for_cost *si;
3553 int j;
3554 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3555 j, si)
3556 (void) add_stmt_cost (target_cost_data, si->count,
3557 si->kind, si->stmt_info, si->misalign,
3558 vect_epilogue);
3559 }
3560
3561 /* Calculate how many masks we need to generate. */
3562 unsigned int num_masks = 0;
3563 rgroup_masks *rgm;
3564 unsigned int num_vectors_m1;
3565 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3566 if (rgm->mask_type)
3567 num_masks += num_vectors_m1 + 1;
3568 gcc_assert (num_masks > 0);
3569
3570 /* In the worst case, we need to generate each mask in the prologue
3571 and in the loop body. One of the loop body mask instructions
3572 replaces the comparison in the scalar loop, and since we don't
3573 count the scalar comparison against the scalar body, we shouldn't
3574 count that vector instruction against the vector body either.
3575
3576 Sometimes we can use unpacks instead of generating prologue
3577 masks and sometimes the prologue mask will fold to a constant,
3578 so the actual prologue cost might be smaller. However, it's
3579 simpler and safer to use the worst-case cost; if this ends up
3580 being the tie-breaker between vectorizing or not, then it's
3581 probably better not to vectorize. */
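/* For illustration, with a hypothetical rgroup layout needing one mask in
   one rgroup and two in another, NUM_MASKS is 3: three vector stmts are
   charged to the prologue and two to the body, the uncounted body mask
   standing in for the scalar loop's comparison as explained above.  */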
3582 (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3583 NULL, 0, vect_prologue);
3584 (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3585 NULL, 0, vect_body);
3586 }
3587 else if (npeel < 0)
3588 {
3589 peel_iters_prologue = assumed_vf / 2;
3590 if (dump_enabled_p ())
3591 dump_printf (MSG_NOTE, "cost model: "
3592 "prologue peel iters set to vf/2.\n");
3593
3594 /* If peeling for alignment is unknown, the loop bound of the main loop
3595 becomes unknown. */
3596 peel_iters_epilogue = assumed_vf / 2;
3597 if (dump_enabled_p ())
3598 dump_printf (MSG_NOTE, "cost model: "
3599 "epilogue peel iters set to vf/2 because "
3600 "peeling for alignment is unknown.\n");
3601
3602 /* If peeled iterations are unknown, count a taken branch and a not taken
3603 branch per peeled loop. Even if scalar loop iterations are known,
3604 vector iterations are not known since peeled prologue iterations are
3605 not known. Hence guards remain the same. */
3606 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3607 NULL, 0, vect_prologue);
3608 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3609 NULL, 0, vect_prologue);
3610 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3611 NULL, 0, vect_epilogue);
3612 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3613 NULL, 0, vect_epilogue);
3614 stmt_info_for_cost *si;
3615 int j;
3616 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3617 {
3618 (void) add_stmt_cost (target_cost_data,
3619 si->count * peel_iters_prologue,
3620 si->kind, si->stmt_info, si->misalign,
3621 vect_prologue);
3622 (void) add_stmt_cost (target_cost_data,
3623 si->count * peel_iters_epilogue,
3624 si->kind, si->stmt_info, si->misalign,
3625 vect_epilogue);
3626 }
3627 }
3628 else
3629 {
3630 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3631 stmt_info_for_cost *si;
3632 int j;
3633 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3634
3635 prologue_cost_vec.create (2);
3636 epilogue_cost_vec.create (2);
3637 peel_iters_prologue = npeel;
3638
3639 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3640 &peel_iters_epilogue,
3641 &LOOP_VINFO_SCALAR_ITERATION_COST
3642 (loop_vinfo),
3643 &prologue_cost_vec,
3644 &epilogue_cost_vec);
3645
3646 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3647 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3648 si->misalign, vect_prologue);
3649
3650 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3651 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3652 si->misalign, vect_epilogue);
3653
3654 prologue_cost_vec.release ();
3655 epilogue_cost_vec.release ();
3656 }
3657
3658 /* FORNOW: The scalar outside cost is incremented in one of the
3659 following ways:
3660
3661 1. The vectorizer checks for alignment and aliasing and generates
3662 a condition that allows dynamic vectorization. A cost model
3663 check is ANDed with the versioning condition. Hence the scalar code
3664 path now has the added cost of the versioning check.
3665
3666 if (cost > th & versioning_check)
3667 jmp to vector code
3668
3669 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3670
3671 2. The vectorizer then checks if a prologue is required. If the
3672 cost model check was not done before during versioning, it has to
3673 be done before the prologue check.
3674
3675 if (cost <= th)
3676 prologue = scalar_iters
3677 if (prologue == 0)
3678 jmp to vector code
3679 else
3680 execute prologue
3681 if (prologue == num_iters)
3682 go to exit
3683
3684 Hence the run-time scalar cost is incremented by a taken branch,
3685 plus a not-taken branch, plus a taken branch cost.
3686
3687 3. The vectorizer then checks if an epilogue is required. If the
3688 cost model check was not done before during prologue check, it
3689 has to be done with the epilogue check.
3690
3691 if (prologue == 0)
3692 jmp to vector code
3693 else
3694 execute prologue
3695 if (prologue == num_iters)
3696 go to exit
3697 vector code:
3698 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3699 jmp to epilogue
3700
3701 Hence the run-time scalar cost should be incremented by 2 taken
3702 branches.
3703
3704 TODO: The back end may reorder the BBs differently and reverse
3705 conditions/branch directions. Change the estimates below to
3706 something more reasonable. */
3707
3708 /* If the number of iterations is known and we do not do versioning, we can
3709 decide whether to vectorize at compile time. Hence the scalar version
3710 does not carry cost model guard costs. */
3711 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3712 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3713 {
3714 /* Cost model check occurs at versioning. */
3715 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3716 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3717 else
3718 {
3719 /* Cost model check occurs at prologue generation. */
3720 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3721 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3722 + vect_get_stmt_cost (cond_branch_not_taken);
3723 /* Cost model check occurs at epilogue generation. */
3724 else
3725 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3726 }
3727 }
3728
3729 /* Complete the target-specific cost calculations. */
3730 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3731 &vec_inside_cost, &vec_epilogue_cost);
3732
3733 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3734
3735 /* Stash the costs so that we can compare two loop_vec_infos. */
3736 loop_vinfo->vec_inside_cost = vec_inside_cost;
3737 loop_vinfo->vec_outside_cost = vec_outside_cost;
3738
3739 if (dump_enabled_p ())
3740 {
3741 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3742 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3743 vec_inside_cost);
3744 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3745 vec_prologue_cost);
3746 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3747 vec_epilogue_cost);
3748 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3749 scalar_single_iter_cost);
3750 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3751 scalar_outside_cost);
3752 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3753 vec_outside_cost);
3754 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3755 peel_iters_prologue);
3756 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3757 peel_iters_epilogue);
3758 }
3759
3760 /* Calculate number of iterations required to make the vector version
3761 profitable, relative to the loop bodies only. The following condition
3762 must hold true:
3763 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3764 where
3765 SIC = scalar iteration cost, VIC = vector iteration cost,
3766 VOC = vector outside cost, VF = vectorization factor,
3767 NPEEL = prologue iterations + epilogue iterations,
3768 SOC = scalar outside cost for run time cost model check. */
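/* A purely illustrative instance of this inequality (made-up costs, not
   produced by any target hook): with SIC = 4, VIC = 6, VF = 4, NPEEL = 0,
   VOC = 20 and SOC = 0 it becomes
       4 * niters > 6 * (niters / 4) + 20
   i.e. 2.5 * niters > 20, so vectorization starts to pay off once niters
   exceeds 8.  The code below derives the same kind of bound from the
   per-vector-iteration saving SIC * VF - VIC.  */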
3769
3770 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3771 - vec_inside_cost);
3772 if (saving_per_viter <= 0)
3773 {
3774 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3775 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3776 "vectorization did not happen for a simd loop");
3777
3778 if (dump_enabled_p ())
3779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3780 "cost model: the vector iteration cost = %d "
3781 "divided by the scalar iteration cost = %d "
3782 "is greater or equal to the vectorization factor = %d"
3783 ".\n",
3784 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3785 *ret_min_profitable_niters = -1;
3786 *ret_min_profitable_estimate = -1;
3787 return;
3788 }
3789
3790 /* ??? The "if" arm is written to handle all cases; see below for what
3791 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3792 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3793 {
3794 /* Rewriting the condition above in terms of the number of
3795 vector iterations (vniters) rather than the number of
3796 scalar iterations (niters) gives:
3797
3798 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3799
3800 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3801
3802 For integer N, X and Y when X > 0:
3803
3804 N * X > Y <==> N >= (Y /[floor] X) + 1. */
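/* For instance X = 3, Y = 10: N * 3 > 10 exactly when
   N >= 10 / 3 + 1 == 4 in integer arithmetic, which is how
   MIN_VEC_NITERS is computed just below.  */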
3805 int outside_overhead = (vec_outside_cost
3806 - scalar_single_iter_cost * peel_iters_prologue
3807 - scalar_single_iter_cost * peel_iters_epilogue
3808 - scalar_outside_cost);
3809 /* We're only interested in cases that require at least one
3810 vector iteration. */
3811 int min_vec_niters = 1;
3812 if (outside_overhead > 0)
3813 min_vec_niters = outside_overhead / saving_per_viter + 1;
3814
3815 if (dump_enabled_p ())
3816 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3817 min_vec_niters);
3818
3819 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3820 {
3821 /* Now that we know the minimum number of vector iterations,
3822 find the minimum niters for which the scalar cost is larger:
3823
3824 SIC * niters > VIC * vniters + VOC - SOC
3825
3826 We know that the minimum niters is no more than
3827 vniters * VF + NPEEL, but it might be (and often is) less
3828 than that if a partial vector iteration is cheaper than the
3829 equivalent scalar code. */
3830 int threshold = (vec_inside_cost * min_vec_niters
3831 + vec_outside_cost
3832 - scalar_outside_cost);
3833 if (threshold <= 0)
3834 min_profitable_iters = 1;
3835 else
3836 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3837 }
3838 else
3839 /* Convert the number of vector iterations into a number of
3840 scalar iterations. */
3841 min_profitable_iters = (min_vec_niters * assumed_vf
3842 + peel_iters_prologue
3843 + peel_iters_epilogue);
3844 }
3845 else
3846 {
3847 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3848 * assumed_vf
3849 - vec_inside_cost * peel_iters_prologue
3850 - vec_inside_cost * peel_iters_epilogue);
3851 if (min_profitable_iters <= 0)
3852 min_profitable_iters = 0;
3853 else
3854 {
3855 min_profitable_iters /= saving_per_viter;
3856
3857 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3858 <= (((int) vec_inside_cost * min_profitable_iters)
3859 + (((int) vec_outside_cost - scalar_outside_cost)
3860 * assumed_vf)))
3861 min_profitable_iters++;
3862 }
3863 }
3864
3865 if (dump_enabled_p ())
3866 dump_printf (MSG_NOTE,
3867 " Calculated minimum iters for profitability: %d\n",
3868 min_profitable_iters);
3869
3870 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3871 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3872 /* We want the vectorized loop to execute at least once. */
3873 min_profitable_iters = assumed_vf + peel_iters_prologue;
3874
3875 if (dump_enabled_p ())
3876 dump_printf_loc (MSG_NOTE, vect_location,
3877 " Runtime profitability threshold = %d\n",
3878 min_profitable_iters);
3879
3880 *ret_min_profitable_niters = min_profitable_iters;
3881
3882 /* Calculate number of iterations required to make the vector version
3883 profitable, relative to the loop bodies only.
3884
3885 Non-vectorized variant is SIC * niters and it must win over vector
3886 variant on the expected loop trip count. The following condition must hold true:
3887 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3888
3889 if (vec_outside_cost <= 0)
3890 min_profitable_estimate = 0;
3891 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3892 {
3893 /* This is a repeat of the code above, but with + SOC rather
3894 than - SOC. */
3895 int outside_overhead = (vec_outside_cost
3896 - scalar_single_iter_cost * peel_iters_prologue
3897 - scalar_single_iter_cost * peel_iters_epilogue
3898 + scalar_outside_cost);
3899 int min_vec_niters = 1;
3900 if (outside_overhead > 0)
3901 min_vec_niters = outside_overhead / saving_per_viter + 1;
3902
3903 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3904 {
3905 int threshold = (vec_inside_cost * min_vec_niters
3906 + vec_outside_cost
3907 + scalar_outside_cost);
3908 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3909 }
3910 else
3911 min_profitable_estimate = (min_vec_niters * assumed_vf
3912 + peel_iters_prologue
3913 + peel_iters_epilogue);
3914 }
3915 else
3916 {
3917 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3918 * assumed_vf
3919 - vec_inside_cost * peel_iters_prologue
3920 - vec_inside_cost * peel_iters_epilogue)
3921 / ((scalar_single_iter_cost * assumed_vf)
3922 - vec_inside_cost);
3923 }
3924 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3925 if (dump_enabled_p ())
3926 dump_printf_loc (MSG_NOTE, vect_location,
3927 " Static estimate profitability threshold = %d\n",
3928 min_profitable_estimate);
3929
3930 *ret_min_profitable_estimate = min_profitable_estimate;
3931 }
3932
3933 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3934 vector elements (not bits) for a vector with NELT elements. */
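/* For example (an arbitrary choice of sizes), OFFSET == 2 and NELT == 8
   encode the selector {2, 3, ..., 9}: lanes 2..7 of the first input
   followed by the first two lanes of the second input, i.e. a shift right
   by two elements with the vacated lanes filled from the second input.
   Only the first three indices are pushed; the stepped encoding implies
   the rest.  */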
3935 static void
3936 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3937 vec_perm_builder *sel)
3938 {
3939 /* The encoding is a single stepped pattern. Any wrap-around is handled
3940 by vec_perm_indices. */
3941 sel->new_vector (nelt, 1, 3);
3942 for (unsigned int i = 0; i < 3; i++)
3943 sel->quick_push (i + offset);
3944 }
3945
3946 /* Checks whether the target supports whole-vector shifts for vectors of mode
3947 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3948 it supports vec_perm_const with masks for all necessary shift amounts. */
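/* Sketch of what the loop below checks, assuming an 8-element vector:
   only the power-of-two offsets 4, 2 and 1 are tried, because those are
   the shift amounts the reduction epilogue uses when repeatedly halving
   the vector; if any of the corresponding permutations is unsupported we
   report no whole-vector shift capability.  */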
3949 static bool
3950 have_whole_vector_shift (machine_mode mode)
3951 {
3952 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3953 return true;
3954
3955 /* Variable-length vectors should be handled via the optab. */
3956 unsigned int nelt;
3957 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3958 return false;
3959
3960 vec_perm_builder sel;
3961 vec_perm_indices indices;
3962 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3963 {
3964 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3965 indices.new_vector (sel, 2, nelt);
3966 if (!can_vec_perm_const_p (mode, indices, false))
3967 return false;
3968 }
3969 return true;
3970 }
3971
3972 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3973 functions. Design better to avoid maintenance issues. */
3974
3975 /* Function vect_model_reduction_cost.
3976
3977 Models cost for a reduction operation, including the vector ops
3978 generated within the strip-mine loop in some cases, the initial
3979 definition before the loop, and the epilogue code that must be generated. */
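/* A rough illustration of the bookkeeping below (the exact counts depend
   on the reduction type and target): a COND_REDUCTION handled through a
   reduc_fn doubles NCOPIES, pays for four scalar_to_vec broadcasts in the
   prologue (initial index, step, initial data value and initial
   index-reduction value) and, in the epilogue, two vector stmts, two
   vec_to_scalar reductions and one scalar_to_vec broadcast of the maximum
   index.  */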
3980
3981 static void
3982 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3983 vect_reduction_type reduction_type,
3984 int ncopies, stmt_vector_for_cost *cost_vec)
3985 {
3986 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3987 enum tree_code code;
3988 optab optab;
3989 tree vectype;
3990 machine_mode mode;
3991 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3992 class loop *loop = NULL;
3993
3994 if (loop_vinfo)
3995 loop = LOOP_VINFO_LOOP (loop_vinfo);
3996
3997 /* Condition reductions generate two reductions in the loop. */
3998 if (reduction_type == COND_REDUCTION)
3999 ncopies *= 2;
4000
4001 vectype = STMT_VINFO_VECTYPE (stmt_info);
4002 mode = TYPE_MODE (vectype);
4003 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4004
4005 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4006
4007 if (reduction_type == EXTRACT_LAST_REDUCTION)
4008 /* No extra instructions are needed in the prologue. The loop body
4009 operations are costed in vectorizable_condition. */
4010 inside_cost = 0;
4011 else if (reduction_type == FOLD_LEFT_REDUCTION)
4012 {
4013 /* No extra instructions needed in the prologue. */
4014 prologue_cost = 0;
4015
4016 if (reduc_fn != IFN_LAST)
4017 /* Count one reduction-like operation per vector. */
4018 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4019 stmt_info, 0, vect_body);
4020 else
4021 {
4022 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4023 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4024 inside_cost = record_stmt_cost (cost_vec, nelements,
4025 vec_to_scalar, stmt_info, 0,
4026 vect_body);
4027 inside_cost += record_stmt_cost (cost_vec, nelements,
4028 scalar_stmt, stmt_info, 0,
4029 vect_body);
4030 }
4031 }
4032 else
4033 {
4034 /* Add in cost for initial definition.
4035 For cond reduction we have four vectors: initial index, step,
4036 initial result of the data reduction, initial value of the index
4037 reduction. */
4038 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4039 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4040 scalar_to_vec, stmt_info, 0,
4041 vect_prologue);
4042 }
4043
4044 /* Determine cost of epilogue code.
4045
4046 We have a reduction operator that will reduce the vector in one statement.
4047 Also requires scalar extract. */
4048
4049 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4050 {
4051 if (reduc_fn != IFN_LAST)
4052 {
4053 if (reduction_type == COND_REDUCTION)
4054 {
4055 /* An EQ stmt and a COND_EXPR stmt. */
4056 epilogue_cost += record_stmt_cost (cost_vec, 2,
4057 vector_stmt, stmt_info, 0,
4058 vect_epilogue);
4059 /* Reduction of the max index and a reduction of the found
4060 values. */
4061 epilogue_cost += record_stmt_cost (cost_vec, 2,
4062 vec_to_scalar, stmt_info, 0,
4063 vect_epilogue);
4064 /* A broadcast of the max value. */
4065 epilogue_cost += record_stmt_cost (cost_vec, 1,
4066 scalar_to_vec, stmt_info, 0,
4067 vect_epilogue);
4068 }
4069 else
4070 {
4071 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4072 stmt_info, 0, vect_epilogue);
4073 epilogue_cost += record_stmt_cost (cost_vec, 1,
4074 vec_to_scalar, stmt_info, 0,
4075 vect_epilogue);
4076 }
4077 }
4078 else if (reduction_type == COND_REDUCTION)
4079 {
4080 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4081 /* Extraction of scalar elements. */
4082 epilogue_cost += record_stmt_cost (cost_vec,
4083 2 * estimated_nunits,
4084 vec_to_scalar, stmt_info, 0,
4085 vect_epilogue);
4086 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4087 epilogue_cost += record_stmt_cost (cost_vec,
4088 2 * estimated_nunits - 3,
4089 scalar_stmt, stmt_info, 0,
4090 vect_epilogue);
4091 }
4092 else if (reduction_type == EXTRACT_LAST_REDUCTION
4093 || reduction_type == FOLD_LEFT_REDUCTION)
4094 /* No extra instructions are needed in the epilogue. */
4095 ;
4096 else
4097 {
4098 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4099 tree bitsize =
4100 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4101 int element_bitsize = tree_to_uhwi (bitsize);
4102 int nelements = vec_size_in_bits / element_bitsize;
4103
4104 if (code == COND_EXPR)
4105 code = MAX_EXPR;
4106
4107 optab = optab_for_tree_code (code, vectype, optab_default);
4108
4109 /* We have a whole vector shift available. */
4110 if (optab != unknown_optab
4111 && VECTOR_MODE_P (mode)
4112 && optab_handler (optab, mode) != CODE_FOR_nothing
4113 && have_whole_vector_shift (mode))
4114 {
4115 /* Final reduction via vector shifts and the reduction operator.
4116 Also requires scalar extract. */
4117 epilogue_cost += record_stmt_cost (cost_vec,
4118 exact_log2 (nelements) * 2,
4119 vector_stmt, stmt_info, 0,
4120 vect_epilogue);
4121 epilogue_cost += record_stmt_cost (cost_vec, 1,
4122 vec_to_scalar, stmt_info, 0,
4123 vect_epilogue);
4124 }
4125 else
4126 /* Use extracts and reduction op for final reduction. For N
4127 elements, we have N extracts and N-1 reduction ops. */
4128 epilogue_cost += record_stmt_cost (cost_vec,
4129 nelements + nelements - 1,
4130 vector_stmt, stmt_info, 0,
4131 vect_epilogue);
4132 }
4133 }
4134
4135 if (dump_enabled_p ())
4136 dump_printf (MSG_NOTE,
4137 "vect_model_reduction_cost: inside_cost = %d, "
4138 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4139 prologue_cost, epilogue_cost);
4140 }
4141
4142
4143 /* Function vect_model_induction_cost.
4144
4145 Models cost for induction operations. */
4146
4147 static void
4148 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4149 stmt_vector_for_cost *cost_vec)
4150 {
4151 unsigned inside_cost, prologue_cost;
4152
4153 if (PURE_SLP_STMT (stmt_info))
4154 return;
4155
4156 /* loop cost for vec_loop. */
4157 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4158 stmt_info, 0, vect_body);
4159
4160 /* prologue cost for vec_init and vec_step. */
4161 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4162 stmt_info, 0, vect_prologue);
4163
4164 if (dump_enabled_p ())
4165 dump_printf_loc (MSG_NOTE, vect_location,
4166 "vect_model_induction_cost: inside_cost = %d, "
4167 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4168 }
4169
4170
4171
4172 /* Function get_initial_def_for_reduction
4173
4174 Input:
4175 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4176 INIT_VAL - the initial value of the reduction variable
4177
4178 Output:
4179 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4180 of the reduction (used for adjusting the epilog - see below).
4181 Return a vector variable, initialized according to the operation that
4182 STMT_VINFO performs. This vector will be used as the initial value
4183 of the vector of partial results.
4184
4185 Option1 (adjust in epilog): Initialize the vector as follows:
4186 add/bit or/xor: [0,0,...,0,0]
4187 mult/bit and: [1,1,...,1,1]
4188 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4189 and when necessary (e.g. add/mult case) let the caller know
4190 that it needs to adjust the result by init_val.
4191
4192 Option2: Initialize the vector as follows:
4193 add/bit or/xor: [init_val,0,0,...,0]
4194 mult/bit and: [init_val,1,1,...,1]
4195 min/max/cond_expr: [init_val,init_val,...,init_val]
4196 and no adjustments are needed.
4197
4198 For example, for the following code:
4199
4200 s = init_val;
4201 for (i=0;i<n;i++)
4202 s = s + a[i];
4203
4204 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4205 For a vector of 4 units, we want to return either [0,0,0,init_val],
4206 or [0,0,0,0] and let the caller know that it needs to adjust
4207 the result at the end by 'init_val'.
4208
4209 FORNOW, we use the 'adjust in epilog' scheme (Option1) when
4210 ADJUSTMENT_DEF is not NULL, because its initialization vector is
4211 simpler (same element in all entries), and Option2 otherwise.
4212
4213 A cost model should help decide between these two schemes. */
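/* Another small example, this time for a product reduction
   (s = init_val; for (i=0;i<n;i++) s = s * a[i];), again with 4 units per
   vector: Option1 returns [1,1,1,1] and tells the caller through
   ADJUSTMENT_DEF to fold init_val into the final result (here by a
   multiplication), whereas Option2 returns [init_val,1,1,1] and needs no
   adjustment.  */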
4214
4215 static tree
4216 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4217 enum tree_code code, tree init_val,
4218 tree *adjustment_def)
4219 {
4220 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4221 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4222 tree scalar_type = TREE_TYPE (init_val);
4223 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4224 tree def_for_init;
4225 tree init_def;
4226 REAL_VALUE_TYPE real_init_val = dconst0;
4227 int int_init_val = 0;
4228 gimple_seq stmts = NULL;
4229
4230 gcc_assert (vectype);
4231
4232 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4233 || SCALAR_FLOAT_TYPE_P (scalar_type));
4234
4235 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4236 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4237
4238 /* ADJUSTMENT_DEF is NULL when called from
4239 vect_create_epilog_for_reduction to vectorize double reduction. */
4240 if (adjustment_def)
4241 *adjustment_def = NULL;
4242
4243 switch (code)
4244 {
4245 case WIDEN_SUM_EXPR:
4246 case DOT_PROD_EXPR:
4247 case SAD_EXPR:
4248 case PLUS_EXPR:
4249 case MINUS_EXPR:
4250 case BIT_IOR_EXPR:
4251 case BIT_XOR_EXPR:
4252 case MULT_EXPR:
4253 case BIT_AND_EXPR:
4254 {
4255 if (code == MULT_EXPR)
4256 {
4257 real_init_val = dconst1;
4258 int_init_val = 1;
4259 }
4260
4261 if (code == BIT_AND_EXPR)
4262 int_init_val = -1;
4263
4264 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4265 def_for_init = build_real (scalar_type, real_init_val);
4266 else
4267 def_for_init = build_int_cst (scalar_type, int_init_val);
4268
4269 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4270 {
4271 /* Option1: the first element is '0' or '1' as well. */
4272 if (!operand_equal_p (def_for_init, init_val, 0))
4273 *adjustment_def = init_val;
4274 init_def = gimple_build_vector_from_val (&stmts, vectype,
4275 def_for_init);
4276 }
4277 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4278 {
4279 /* Option2 (variable length): the first element is INIT_VAL. */
4280 init_def = gimple_build_vector_from_val (&stmts, vectype,
4281 def_for_init);
4282 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4283 vectype, init_def, init_val);
4284 }
4285 else
4286 {
4287 /* Option2: the first element is INIT_VAL. */
4288 tree_vector_builder elts (vectype, 1, 2);
4289 elts.quick_push (init_val);
4290 elts.quick_push (def_for_init);
4291 init_def = gimple_build_vector (&stmts, &elts);
4292 }
4293 }
4294 break;
4295
4296 case MIN_EXPR:
4297 case MAX_EXPR:
4298 case COND_EXPR:
4299 {
4300 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4301 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4302 }
4303 break;
4304
4305 default:
4306 gcc_unreachable ();
4307 }
4308
4309 if (stmts)
4310 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4311 return init_def;
4312 }
4313
4314 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4315 NUMBER_OF_VECTORS is the number of vector defs to create.
4316 If NEUTRAL_OP is nonnull, introducing extra elements of that
4317 value will not change the result. */
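/* NEUTRAL_OP is a value that can be fed into the reduction without
   changing its result, typically the identity of the operation, e.g. 0
   for PLUS_EXPR, BIT_IOR_EXPR and BIT_XOR_EXPR, 1 for MULT_EXPR and
   all-ones for BIT_AND_EXPR (compare get_initial_def_for_reduction
   above).  */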
4318
4319 static void
4320 get_initial_defs_for_reduction (slp_tree slp_node,
4321 vec<tree> *vec_oprnds,
4322 unsigned int number_of_vectors,
4323 bool reduc_chain, tree neutral_op)
4324 {
4325 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4326 stmt_vec_info stmt_vinfo = stmts[0];
4327 vec_info *vinfo = stmt_vinfo->vinfo;
4328 unsigned HOST_WIDE_INT nunits;
4329 unsigned j, number_of_places_left_in_vector;
4330 tree vector_type;
4331 unsigned int group_size = stmts.length ();
4332 unsigned int i;
4333 class loop *loop;
4334
4335 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4336
4337 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4338
4339 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4340 gcc_assert (loop);
4341 edge pe = loop_preheader_edge (loop);
4342
4343 gcc_assert (!reduc_chain || neutral_op);
4344
4345 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4346 created vectors. It is greater than 1 if unrolling is performed.
4347
4348 For example, we have two scalar operands, s1 and s2 (e.g., group of
4349 strided accesses of size two), while NUNITS is four (i.e., four scalars
4350 of this type can be packed in a vector). The output vector will contain
4351 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4352 will be 2).
4353
4354 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4355 vectors containing the operands.
4356
4357 For example, NUNITS is four as before, and the group size is 8
4358 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4359 {s5, s6, s7, s8}. */
4360
4361 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4362 nunits = group_size;
4363
4364 number_of_places_left_in_vector = nunits;
4365 bool constant_p = true;
4366 tree_vector_builder elts (vector_type, nunits, 1);
4367 elts.quick_grow (nunits);
4368 gimple_seq ctor_seq = NULL;
4369 for (j = 0; j < nunits * number_of_vectors; ++j)
4370 {
4371 tree op;
4372 i = j % group_size;
4373 stmt_vinfo = stmts[i];
4374
4375 /* Get the def before the loop. In a reduction chain there is only one
4376 initial value; otherwise there are as many initial values as PHIs in the group. */
4377 if (reduc_chain)
4378 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4379 else if (((vec_oprnds->length () + 1) * nunits
4380 - number_of_places_left_in_vector >= group_size)
4381 && neutral_op)
4382 op = neutral_op;
4383 else
4384 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4385
4386 /* Create 'vect_ = {op0,op1,...,opn}'. */
4387 number_of_places_left_in_vector--;
4388 elts[nunits - number_of_places_left_in_vector - 1] = op;
4389 if (!CONSTANT_CLASS_P (op))
4390 constant_p = false;
4391
4392 if (number_of_places_left_in_vector == 0)
4393 {
4394 tree init;
4395 if (constant_p && !neutral_op
4396 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4397 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4398 /* Build the vector directly from ELTS. */
4399 init = gimple_build_vector (&ctor_seq, &elts);
4400 else if (neutral_op)
4401 {
4402 /* Build a vector of the neutral value and shift the
4403 other elements into place. */
4404 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4405 neutral_op);
4406 int k = nunits;
4407 while (k > 0 && elts[k - 1] == neutral_op)
4408 k -= 1;
4409 while (k > 0)
4410 {
4411 k -= 1;
4412 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4413 vector_type, init, elts[k]);
4414 }
4415 }
4416 else
4417 {
4418 /* First time round, duplicate ELTS to fill the
4419 required number of vectors. */
4420 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4421 number_of_vectors, *vec_oprnds);
4422 break;
4423 }
4424 vec_oprnds->quick_push (init);
4425
4426 number_of_places_left_in_vector = nunits;
4427 elts.new_vector (vector_type, nunits, 1);
4428 elts.quick_grow (nunits);
4429 constant_p = true;
4430 }
4431 }
4432 if (ctor_seq != NULL)
4433 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4434 }
4435
4436 /* For a statement STMT_INFO taking part in a reduction operation return
4437 the stmt_vec_info the meta information is stored on. */
4438
4439 stmt_vec_info
4440 info_for_reduction (stmt_vec_info stmt_info)
4441 {
4442 stmt_info = vect_orig_stmt (stmt_info);
4443 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4444 if (!is_a <gphi *> (stmt_info->stmt)
4445 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4446 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4447 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4448 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4449 {
4450 if (gimple_phi_num_args (phi) == 1)
4451 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4452 }
4453 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4454 {
4455 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4456 stmt_vec_info info
4457 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4458 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4459 stmt_info = info;
4460 }
4461 return stmt_info;
4462 }
4463
4464 /* Function vect_create_epilog_for_reduction
4465
4466 Create code at the loop-epilog to finalize the result of a reduction
4467 computation.
4468
4469 STMT_INFO is the scalar reduction stmt that is being vectorized.
4470 SLP_NODE is an SLP node containing a group of reduction statements. The
4471 first one in this group is STMT_INFO.
4472 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4473 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4474 (counting from 0)
4475
4476 This function:
4477 1. Completes the reduction def-use cycles.
4478 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4479 by calling the function specified by REDUC_FN if available, or by
4480 other means (whole-vector shifts or a scalar loop).
4481 The function also creates a new phi node at the loop exit to preserve
4482 loop-closed form, as illustrated below.
4483
4484 The flow at the entry to this function:
4485
4486 loop:
4487 vec_def = phi <vec_init, null> # REDUCTION_PHI
4488 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4489 s_loop = scalar_stmt # (scalar) STMT_INFO
4490 loop_exit:
4491 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4492 use <s_out0>
4493 use <s_out0>
4494
4495 The above is transformed by this function into:
4496
4497 loop:
4498 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4499 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4500 s_loop = scalar_stmt # (scalar) STMT_INFO
4501 loop_exit:
4502 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4503 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4504 v_out2 = reduce <v_out1>
4505 s_out3 = extract_field <v_out2, 0>
4506 s_out4 = adjust_result <s_out3>
4507 use <s_out4>
4508 use <s_out4>
4509 */
4510
4511 static void
4512 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4513 slp_tree slp_node,
4514 slp_instance slp_node_instance)
4515 {
4516 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4517 gcc_assert (reduc_info->is_reduc_info);
4518 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4519 /* For double reductions we need to get at the inner loop reduction
4520 stmt which has the meta info attached. Our stmt_info is that of the
4521 loop-closed PHI of the inner loop which we remember as
4522 def for the reduction PHI generation. */
4523 bool double_reduc = false;
4524 stmt_vec_info rdef_info = stmt_info;
4525 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4526 {
4527 gcc_assert (!slp_node);
4528 double_reduc = true;
4529 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4530 (stmt_info->stmt, 0));
4531 stmt_info = vect_stmt_to_vectorize (stmt_info);
4532 }
4533 gphi *reduc_def_stmt
4534 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4535 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4536 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4537 stmt_vec_info prev_phi_info;
4538 tree vectype;
4539 machine_mode mode;
4540 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4541 basic_block exit_bb;
4542 tree scalar_dest;
4543 tree scalar_type;
4544 gimple *new_phi = NULL, *phi;
4545 stmt_vec_info phi_info;
4546 gimple_stmt_iterator exit_gsi;
4547 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4548 gimple *epilog_stmt = NULL;
4549 gimple *exit_phi;
4550 tree bitsize;
4551 tree def;
4552 tree orig_name, scalar_result;
4553 imm_use_iterator imm_iter, phi_imm_iter;
4554 use_operand_p use_p, phi_use_p;
4555 gimple *use_stmt;
4556 bool nested_in_vect_loop = false;
4557 auto_vec<gimple *> new_phis;
4558 int j, i;
4559 auto_vec<tree> scalar_results;
4560 unsigned int group_size = 1, k;
4561 auto_vec<gimple *> phis;
4562 bool slp_reduc = false;
4563 bool direct_slp_reduc;
4564 tree new_phi_result;
4565 tree induction_index = NULL_TREE;
4566
4567 if (slp_node)
4568 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4569
4570 if (nested_in_vect_loop_p (loop, stmt_info))
4571 {
4572 outer_loop = loop;
4573 loop = loop->inner;
4574 nested_in_vect_loop = true;
4575 gcc_assert (!slp_node);
4576 }
4577 gcc_assert (!nested_in_vect_loop || double_reduc);
4578
4579 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4580 gcc_assert (vectype);
4581 mode = TYPE_MODE (vectype);
4582
4583 tree initial_def = NULL;
4584 tree induc_val = NULL_TREE;
4585 tree adjustment_def = NULL;
4586 if (slp_node)
4587 ;
4588 else
4589 {
4590 /* Get at the scalar def before the loop, that defines the initial value
4591 of the reduction variable. */
4592 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4593 loop_preheader_edge (loop));
4594 /* Optimize: for induction condition reduction, if we can't use zero
4595 for induc_val, use initial_def. */
4596 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4597 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4598 else if (double_reduc)
4599 ;
4600 else if (nested_in_vect_loop)
4601 ;
4602 else
4603 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4604 }
4605
4606 unsigned vec_num;
4607 int ncopies;
4608 if (slp_node)
4609 {
4610 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4611 ncopies = 1;
4612 }
4613 else
4614 {
4615 vec_num = 1;
4616 ncopies = 0;
4617 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4618 do
4619 {
4620 ncopies++;
4621 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4622 }
4623 while (phi_info);
4624 }
4625
4626 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4627 which is updated with the current index of the loop for every match of
4628 the original loop's cond_expr (VEC_STMT). This results in a vector
4629 containing the last time the condition passed for that vector lane.
4630 The first match will be a 1 to allow 0 to be used for non-matching
4631 indexes. If there are no matches at all then the vector will be all
4632 zeroes.
4633
4634 PR92772: This algorithm is broken for architectures that support
4635 masked vectors, but do not provide fold_extract_last. */
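/* A hypothetical trace with VF == 4: the induction variable starts as
   {1,2,3,4} and steps by 4.  If lanes 0 and 2 match in the first vector
   iteration the index vector becomes {1,0,3,0}; if only lane 1 matches in
   the second iteration (IV now {5,6,7,8}) it becomes {1,6,3,0}.  Each
   lane thus records the last iteration index at which its condition held,
   with 0 marking lanes that never matched.  */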
4636 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4637 {
4638 auto_vec<std::pair<tree, bool>, 2> ccompares;
4639 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4640 cond_info = vect_stmt_to_vectorize (cond_info);
4641 while (cond_info != reduc_info)
4642 {
4643 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4644 {
4645 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4646 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4647 ccompares.safe_push
4648 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4649 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4650 }
4651 cond_info
4652 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4653 1 + STMT_VINFO_REDUC_IDX
4654 (cond_info)));
4655 cond_info = vect_stmt_to_vectorize (cond_info);
4656 }
4657 gcc_assert (ccompares.length () != 0);
4658
4659 tree indx_before_incr, indx_after_incr;
4660 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4661 int scalar_precision
4662 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4663 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4664 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4665 (TYPE_MODE (vectype), cr_index_scalar_type,
4666 TYPE_VECTOR_SUBPARTS (vectype));
4667
4668 /* First we create a simple vector induction variable which starts
4669 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4670 vector size (STEP). */
4671
4672 /* Create a {1,2,3,...} vector. */
4673 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4674
4675 /* Create a vector of the step value. */
4676 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4677 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4678
4679 /* Create an induction variable. */
4680 gimple_stmt_iterator incr_gsi;
4681 bool insert_after;
4682 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4683 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4684 insert_after, &indx_before_incr, &indx_after_incr);
4685
4686 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4687 filled with zeros (VEC_ZERO). */
4688
4689 /* Create a vector of 0s. */
4690 tree zero = build_zero_cst (cr_index_scalar_type);
4691 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4692
4693 /* Create a vector phi node. */
4694 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4695 new_phi = create_phi_node (new_phi_tree, loop->header);
4696 loop_vinfo->add_stmt (new_phi);
4697 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4698 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4699
4700 /* Now take the condition from the loop's original cond_exprs
4701 and produce a new cond_expr (INDEX_COND_EXPR) which for
4702 every match uses values from the induction variable
4703 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4704 (NEW_PHI_TREE).
4705 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4706 the new cond_expr (INDEX_COND_EXPR). */
4707 gimple_seq stmts = NULL;
4708 for (int i = ccompares.length () - 1; i != -1; --i)
4709 {
4710 tree ccompare = ccompares[i].first;
4711 if (ccompares[i].second)
4712 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4713 cr_index_vector_type,
4714 ccompare,
4715 indx_before_incr, new_phi_tree);
4716 else
4717 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4718 cr_index_vector_type,
4719 ccompare,
4720 new_phi_tree, indx_before_incr);
4721 }
4722 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4723 stmt_vec_info index_vec_info
4724 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4725 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4726
4727 /* Update the phi with the vec cond. */
4728 induction_index = new_phi_tree;
4729 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4730 loop_latch_edge (loop), UNKNOWN_LOCATION);
4731 }
4732
4733 /* 2. Create epilog code.
4734 The reduction epilog code operates across the elements of the vector
4735 of partial results computed by the vectorized loop.
4736 The reduction epilog code consists of:
4737
4738 step 1: compute the scalar result in a vector (v_out2)
4739 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4740 step 3: adjust the scalar result (s_out3) if needed.
4741
4742 Step 1 can be accomplished using one of the following three schemes:
4743 (scheme 1) using reduc_fn, if available.
4744 (scheme 2) using whole-vector shifts, if available.
4745 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4746 combined.
4747
4748 The overall epilog code looks like this:
4749
4750 s_out0 = phi <s_loop> # original EXIT_PHI
4751 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4752 v_out2 = reduce <v_out1> # step 1
4753 s_out3 = extract_field <v_out2, 0> # step 2
4754 s_out4 = adjust_result <s_out3> # step 3
4755
4756 (step 3 is optional, and steps 1 and 2 may be combined).
4757 Lastly, the uses of s_out0 are replaced by s_out4. */
4758
4759
4760 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4761 v_out1 = phi <VECT_DEF>
4762 Store them in NEW_PHIS. */
4763 if (double_reduc)
4764 loop = outer_loop;
4765 exit_bb = single_exit (loop)->dest;
4766 prev_phi_info = NULL;
4767 new_phis.create (slp_node ? vec_num : ncopies);
4768 for (unsigned i = 0; i < vec_num; i++)
4769 {
4770 if (slp_node)
4771 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4772 else
4773 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4774 for (j = 0; j < ncopies; j++)
4775 {
4776 tree new_def = copy_ssa_name (def);
4777 phi = create_phi_node (new_def, exit_bb);
4778 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4779 if (j == 0)
4780 new_phis.quick_push (phi);
4781 else
4782 {
4783 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4784 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4785 }
4786
4787 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4788 prev_phi_info = phi_info;
4789 }
4790 }
4791
4792 exit_gsi = gsi_after_labels (exit_bb);
4793
4794 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4795 (i.e. when reduc_fn is not available) and in the final adjustment
4796 code (if needed). Also get the original scalar reduction variable as
4797 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4798 represents a reduction pattern), the tree-code and scalar-def are
4799 taken from the original stmt that the pattern-stmt (STMT) replaces.
4800 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4801 are taken from STMT. */
4802
4803 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4804 if (orig_stmt_info != stmt_info)
4805 {
4806 /* Reduction pattern */
4807 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4808 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4809 }
4810
4811 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4812 scalar_type = TREE_TYPE (scalar_dest);
4813 scalar_results.create (group_size);
4814 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4815 bitsize = TYPE_SIZE (scalar_type);
4816
4817 /* SLP reduction without reduction chain, e.g.,
4818 # a1 = phi <a2, a0>
4819 # b1 = phi <b2, b0>
4820 a2 = operation (a1)
4821 b2 = operation (b1) */
4822 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4823
4824 /* True if we should implement SLP_REDUC using native reduction operations
4825 instead of scalar operations. */
4826 direct_slp_reduc = (reduc_fn != IFN_LAST
4827 && slp_reduc
4828 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4829
4830 /* In case of reduction chain, e.g.,
4831 # a1 = phi <a3, a0>
4832 a2 = operation (a1)
4833 a3 = operation (a2),
4834
4835 we may end up with more than one vector result. Here we reduce them to
4836 one vector. */
4837 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4838 {
4839 gimple_seq stmts = NULL;
4840 tree first_vect = PHI_RESULT (new_phis[0]);
4841 first_vect = gimple_convert (&stmts, vectype, first_vect);
4842 for (k = 1; k < new_phis.length (); k++)
4843 {
4844 gimple *next_phi = new_phis[k];
4845 tree second_vect = PHI_RESULT (next_phi);
4846 second_vect = gimple_convert (&stmts, vectype, second_vect);
4847 first_vect = gimple_build (&stmts, code, vectype,
4848 first_vect, second_vect);
4849 }
4850 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4851
4852 new_phi_result = first_vect;
4853 new_phis.truncate (0);
4854 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4855 }
4856 /* Likewise if we couldn't use a single defuse cycle. */
4857 else if (ncopies > 1)
4858 {
4859 gcc_assert (new_phis.length () == 1);
4860 gimple_seq stmts = NULL;
4861 tree first_vect = PHI_RESULT (new_phis[0]);
4862 first_vect = gimple_convert (&stmts, vectype, first_vect);
4863 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4864 for (int k = 1; k < ncopies; ++k)
4865 {
4866 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4867 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4868 second_vect = gimple_convert (&stmts, vectype, second_vect);
4869 first_vect = gimple_build (&stmts, code, vectype,
4870 first_vect, second_vect);
4871 }
4872 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4873 new_phi_result = first_vect;
4874 new_phis.truncate (0);
4875 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4876 }
4877 else
4878 new_phi_result = PHI_RESULT (new_phis[0]);
4879
4880 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4881 && reduc_fn != IFN_LAST)
4882 {
4883 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4884 various data values where the condition matched and another vector
4885 (INDUCTION_INDEX) containing all the indexes of those matches. We
4886 need to extract the last matching index (which will be the index with
4887 highest value) and use this to index into the data vector.
4888 For the case where there were no matches, the data vector will contain
4889 all default values and the index vector will be all zeros. */
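
      /* A worked example with made-up values: with four lanes, a data
	 vector {0, 7, 0, 9} and an index vector {0, 2, 0, 4}, the last
	 match has index 4.  REDUC_MAX over the indexes gives 4; comparing
	 {4, 4, 4, 4} with {0, 2, 0, 4} selects only the last lane, the
	 VEC_COND keeps {0, 0, 0, 9}, and a final MAX reduction yields 9.
	 With no matches at all, the index vector is all zeros, every lane
	 compares equal, and reducing the identical default values still
	 produces the default value.  */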
4890
4891 /* Get various versions of the type of the vector of indexes. */
4892 tree index_vec_type = TREE_TYPE (induction_index);
4893 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4894 tree index_scalar_type = TREE_TYPE (index_vec_type);
4895 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4896
4897 /* Get an unsigned integer version of the type of the data vector. */
4898 int scalar_precision
4899 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4900 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4901 tree vectype_unsigned = build_vector_type
4902 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4903
4904 /* First we need to create a vector (ZERO_VEC) of zeros and another
4905 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4906 can create using a MAX reduction and then expanding.
4907 In the case where the loop never made any matches, the max index will
4908 be zero. */
4909
4910 /* Vector of {0, 0, 0,...}. */
4911 tree zero_vec = build_zero_cst (vectype);
4912
4913 gimple_seq stmts = NULL;
4914 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4915 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4916
4917 /* Find maximum value from the vector of found indexes. */
4918 tree max_index = make_ssa_name (index_scalar_type);
4919 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4920 1, induction_index);
4921 gimple_call_set_lhs (max_index_stmt, max_index);
4922 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4923
4924 /* Vector of {max_index, max_index, max_index,...}. */
4925 tree max_index_vec = make_ssa_name (index_vec_type);
4926 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4927 max_index);
4928 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4929 max_index_vec_rhs);
4930 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4931
4932 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4933 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4934 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4935 otherwise. Only one value should match, resulting in a vector
4936 (VEC_COND) with one data value and the rest zeros.
4937 In the case where the loop never made any matches, every index will
4938 match, resulting in a vector with all data values (which will all be
4939 the default value). */
4940
4941 /* Compare the max index vector to the vector of found indexes to find
4942 the position of the max value. */
4943 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4944 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4945 induction_index,
4946 max_index_vec);
4947 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4948
4949 /* Use the compare to choose either values from the data vector or
4950 zero. */
4951 tree vec_cond = make_ssa_name (vectype);
4952 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4953 vec_compare, new_phi_result,
4954 zero_vec);
4955 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4956
4957 /* Finally we need to extract the data value from the vector (VEC_COND)
4958 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4959 	 reduction, but since no such reduction exists, we use a MAX reduction
4960 instead. The data value might be signed or a float so we need to cast
4961 it first.
4962 In the case where the loop never made any matches, the data values are
4963 all identical, and so will reduce down correctly. */
4964
4965 /* Make the matched data values unsigned. */
4966 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4967 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4968 vec_cond);
4969 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4970 VIEW_CONVERT_EXPR,
4971 vec_cond_cast_rhs);
4972 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4973
4974 /* Reduce down to a scalar value. */
4975 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4976 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4977 1, vec_cond_cast);
4978 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4979 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4980
4981 /* Convert the reduced value back to the result type and set as the
4982 result. */
4983 stmts = NULL;
4984 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4985 data_reduc);
4986 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4987 scalar_results.safe_push (new_temp);
4988 }
4989 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4990 && reduc_fn == IFN_LAST)
4991 {
4992 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4993 idx = 0;
4994 idx_val = induction_index[0];
4995 val = data_reduc[0];
4996 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4997 if (induction_index[i] > idx_val)
4998 val = data_reduc[i], idx_val = induction_index[i];
4999 return val; */
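
      /* Since the number of elements is a compile-time constant here, the
	 loop below emits that computation fully unrolled.  For four
	 elements it is equivalent to (sketch only; names are illustrative
	 and data[] stands for the elements of NEW_PHI_RESULT):

	   i0 = induction_index[0];                      v0 = data[0];
	   v1 = induction_index[1] > i0 ? data[1] : v0;  i1 = MAX (induction_index[1], i0);
	   v2 = induction_index[2] > i1 ? data[2] : v1;  i2 = MAX (induction_index[2], i1);
	   v3 = induction_index[3] > i2 ? data[3] : v2;
	   return v3;  */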
5000
5001 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5002 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5003 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5004 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5005 /* Enforced by vectorizable_reduction, which ensures we have target
5006 support before allowing a conditional reduction on variable-length
5007 vectors. */
5008 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5009 tree idx_val = NULL_TREE, val = NULL_TREE;
5010 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5011 {
5012 tree old_idx_val = idx_val;
5013 tree old_val = val;
5014 idx_val = make_ssa_name (idx_eltype);
5015 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5016 build3 (BIT_FIELD_REF, idx_eltype,
5017 induction_index,
5018 bitsize_int (el_size),
5019 bitsize_int (off)));
5020 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5021 val = make_ssa_name (data_eltype);
5022 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5023 build3 (BIT_FIELD_REF,
5024 data_eltype,
5025 new_phi_result,
5026 bitsize_int (el_size),
5027 bitsize_int (off)));
5028 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5029 if (off != 0)
5030 {
5031 tree new_idx_val = idx_val;
5032 if (off != v_size - el_size)
5033 {
5034 new_idx_val = make_ssa_name (idx_eltype);
5035 epilog_stmt = gimple_build_assign (new_idx_val,
5036 MAX_EXPR, idx_val,
5037 old_idx_val);
5038 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5039 }
5040 tree new_val = make_ssa_name (data_eltype);
5041 epilog_stmt = gimple_build_assign (new_val,
5042 COND_EXPR,
5043 build2 (GT_EXPR,
5044 boolean_type_node,
5045 idx_val,
5046 old_idx_val),
5047 val, old_val);
5048 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5049 idx_val = new_idx_val;
5050 val = new_val;
5051 }
5052 }
5053 /* Convert the reduced value back to the result type and set as the
5054 result. */
5055 gimple_seq stmts = NULL;
5056 val = gimple_convert (&stmts, scalar_type, val);
5057 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5058 scalar_results.safe_push (val);
5059 }
5060
5061 /* 2.3 Create the reduction code, using one of the three schemes described
5062      above.  In SLP we simply need to extract all the elements from the
5063      vector (without reducing them), so we extract the scalars one by one.  */
5064 else if (reduc_fn != IFN_LAST && !slp_reduc)
5065 {
5066 tree tmp;
5067 tree vec_elem_type;
5068
5069 /* Case 1: Create:
5070 v_out2 = reduc_expr <v_out1> */
5071
5072 if (dump_enabled_p ())
5073 dump_printf_loc (MSG_NOTE, vect_location,
5074 "Reduce using direct vector reduction.\n");
5075
5076 gimple_seq stmts = NULL;
5077 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5078 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5079 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5080 vec_elem_type, new_phi_result);
5081 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5082 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5083
5084 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5085 && induc_val)
5086 {
5087 	  /* Earlier we set the initial value to be a vector of induc_val
5088 values. Check the result and if it is induc_val then replace
5089 with the original initial value, unless induc_val is
5090 the same as initial_def already. */
5091 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5092 induc_val);
5093
5094 tmp = make_ssa_name (new_scalar_dest);
5095 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5096 initial_def, new_temp);
5097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098 new_temp = tmp;
5099 }
5100
5101 scalar_results.safe_push (new_temp);
5102 }
5103 else if (direct_slp_reduc)
5104 {
5105 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5106 with the elements for other SLP statements replaced with the
5107 neutral value. We can then do a normal reduction on each vector. */
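
      /* Illustration only (a fixed four-lane width is shown for clarity,
	 although this path is used for variable-length vectors).  With two
	 SLP reductions A and B and NEW_PHI_RESULT = {a0, b0, a1, b1}:

	   index = {0, 1, 2, 3} & {1, 1, 1, 1} = {0, 1, 0, 1}
	   i == 0:  sel = {1, 0, 1, 0}   vec = {a0, id, a1, id}
	   i == 1:  sel = {0, 1, 0, 1}   vec = {id, b0, id, b1}

	 where "id" is the neutral value, so each of the two reductions
	 sees only its own lanes.  */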
5108
5109 /* Enforced by vectorizable_reduction. */
5110 gcc_assert (new_phis.length () == 1);
5111 gcc_assert (pow2p_hwi (group_size));
5112
5113 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5114 vec<stmt_vec_info> orig_phis
5115 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5116 gimple_seq seq = NULL;
5117
5118 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5119 and the same element size as VECTYPE. */
5120 tree index = build_index_vector (vectype, 0, 1);
5121 tree index_type = TREE_TYPE (index);
5122 tree index_elt_type = TREE_TYPE (index_type);
5123 tree mask_type = truth_type_for (index_type);
5124
5125 /* Create a vector that, for each element, identifies which of
5126 the REDUC_GROUP_SIZE results should use it. */
5127 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5128 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5129 build_vector_from_val (index_type, index_mask));
5130
5131 /* Get a neutral vector value. This is simply a splat of the neutral
5132 scalar value if we have one, otherwise the initial scalar value
5133 is itself a neutral value. */
5134 tree vector_identity = NULL_TREE;
5135 tree neutral_op = NULL_TREE;
5136 if (slp_node)
5137 {
5138 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5139 neutral_op
5140 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5141 vectype, code, first != NULL);
5142 }
5143 if (neutral_op)
5144 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5145 neutral_op);
5146 for (unsigned int i = 0; i < group_size; ++i)
5147 {
5148 	  /* If there's no universal neutral value, we can use the
5149 initial scalar value from the original PHI. This is used
5150 for MIN and MAX reduction, for example. */
5151 if (!neutral_op)
5152 {
5153 tree scalar_value
5154 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5155 loop_preheader_edge (loop));
5156 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5157 scalar_value);
5158 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5159 scalar_value);
5160 }
5161
5162 /* Calculate the equivalent of:
5163
5164 sel[j] = (index[j] == i);
5165
5166 which selects the elements of NEW_PHI_RESULT that should
5167 be included in the result. */
5168 tree compare_val = build_int_cst (index_elt_type, i);
5169 compare_val = build_vector_from_val (index_type, compare_val);
5170 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5171 index, compare_val);
5172
5173 /* Calculate the equivalent of:
5174
5175 	       vec = sel ? new_phi_result : vector_identity;
5176
5177 VEC is now suitable for a full vector reduction. */
5178 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5179 sel, new_phi_result, vector_identity);
5180
5181 /* Do the reduction and convert it to the appropriate type. */
5182 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5183 TREE_TYPE (vectype), vec);
5184 scalar = gimple_convert (&seq, scalar_type, scalar);
5185 scalar_results.safe_push (scalar);
5186 }
5187 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5188 }
5189 else
5190 {
5191 bool reduce_with_shift;
5192 tree vec_temp;
5193
5194 gcc_assert (slp_reduc || new_phis.length () == 1);
5195
5196 /* See if the target wants to do the final (shift) reduction
5197 in a vector mode of smaller size and first reduce upper/lower
5198 halves against each other. */
5199 enum machine_mode mode1 = mode;
5200 tree stype = TREE_TYPE (vectype);
5201 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5202 unsigned nunits1 = nunits;
5203 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5204 && new_phis.length () == 1)
5205 {
5206 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5207 /* For SLP reductions we have to make sure lanes match up, but
5208 	     since we're doing individual element final reduction, reducing
5209 vector width here is even more important.
5210 ??? We can also separate lanes with permutes, for the common
5211 case of power-of-two group-size odd/even extracts would work. */
5212 if (slp_reduc && nunits != nunits1)
5213 {
5214 nunits1 = least_common_multiple (nunits1, group_size);
5215 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5216 }
5217 }
5218 if (!slp_reduc
5219 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5220 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5221
5222 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5223 stype, nunits1);
5224 reduce_with_shift = have_whole_vector_shift (mode1);
5225 if (!VECTOR_MODE_P (mode1))
5226 reduce_with_shift = false;
5227 else
5228 {
5229 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5230 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5231 reduce_with_shift = false;
5232 }
5233
5234       /* First reduce the vector to the vector size we should do the shift
5235 	 reduction on, by combining upper and lower halves.  */
5236 new_temp = new_phi_result;
5237 while (nunits > nunits1)
5238 {
5239 nunits /= 2;
5240 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5241 stype, nunits);
5242 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5243
5244 /* The target has to make sure we support lowpart/highpart
5245 extraction, either via direct vector extract or through
5246 an integer mode punning. */
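
	  /* For example (sketch only): to split a 128-bit V8HI NEW_TEMP
	     into two V4HI halves on a target without a direct vec_extract
	     of V4HI from V8HI, the fallback below view-converts NEW_TEMP
	     to a two-element vector of 64-bit integers, extracts each
	     64-bit element with a BIT_FIELD_REF, and view-converts the
	     two pieces back to V4HI.  */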
5247 tree dst1, dst2;
5248 if (convert_optab_handler (vec_extract_optab,
5249 TYPE_MODE (TREE_TYPE (new_temp)),
5250 TYPE_MODE (vectype1))
5251 != CODE_FOR_nothing)
5252 {
5253 /* Extract sub-vectors directly once vec_extract becomes
5254 a conversion optab. */
5255 dst1 = make_ssa_name (vectype1);
5256 epilog_stmt
5257 = gimple_build_assign (dst1, BIT_FIELD_REF,
5258 build3 (BIT_FIELD_REF, vectype1,
5259 new_temp, TYPE_SIZE (vectype1),
5260 bitsize_int (0)));
5261 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5262 dst2 = make_ssa_name (vectype1);
5263 epilog_stmt
5264 = gimple_build_assign (dst2, BIT_FIELD_REF,
5265 build3 (BIT_FIELD_REF, vectype1,
5266 new_temp, TYPE_SIZE (vectype1),
5267 bitsize_int (bitsize)));
5268 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5269 }
5270 else
5271 {
5272 /* Extract via punning to appropriately sized integer mode
5273 vector. */
5274 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5275 tree etype = build_vector_type (eltype, 2);
5276 gcc_assert (convert_optab_handler (vec_extract_optab,
5277 TYPE_MODE (etype),
5278 TYPE_MODE (eltype))
5279 != CODE_FOR_nothing);
5280 tree tem = make_ssa_name (etype);
5281 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5282 build1 (VIEW_CONVERT_EXPR,
5283 etype, new_temp));
5284 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5285 new_temp = tem;
5286 tem = make_ssa_name (eltype);
5287 epilog_stmt
5288 = gimple_build_assign (tem, BIT_FIELD_REF,
5289 build3 (BIT_FIELD_REF, eltype,
5290 new_temp, TYPE_SIZE (eltype),
5291 bitsize_int (0)));
5292 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293 dst1 = make_ssa_name (vectype1);
5294 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5295 build1 (VIEW_CONVERT_EXPR,
5296 vectype1, tem));
5297 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5298 tem = make_ssa_name (eltype);
5299 epilog_stmt
5300 = gimple_build_assign (tem, BIT_FIELD_REF,
5301 build3 (BIT_FIELD_REF, eltype,
5302 new_temp, TYPE_SIZE (eltype),
5303 bitsize_int (bitsize)));
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 dst2 = make_ssa_name (vectype1);
5306 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5307 build1 (VIEW_CONVERT_EXPR,
5308 vectype1, tem));
5309 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5310 }
5311
5312 new_temp = make_ssa_name (vectype1);
5313 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5314 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5315 new_phis[0] = epilog_stmt;
5316 }
5317
5318 if (reduce_with_shift && !slp_reduc)
5319 {
5320 int element_bitsize = tree_to_uhwi (bitsize);
5321 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5322 for variable-length vectors and also requires direct target support
5323 for loop reductions. */
5324 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5325 int nelements = vec_size_in_bits / element_bitsize;
5326 vec_perm_builder sel;
5327 vec_perm_indices indices;
5328
5329 int elt_offset;
5330
5331 tree zero_vec = build_zero_cst (vectype1);
5332 /* Case 2: Create:
5333 for (offset = nelements/2; offset >= 1; offset/=2)
5334 {
5335 Create: va' = vec_shift <va, offset>
5336 Create: va = vop <va, va'>
5337 } */
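
	  /* For example (illustration only), for a four-element addition
	     with va = {a, b, c, d}:

	       offset 2:  va' = {c, d, 0, 0}     va = {a+c, b+d, ., .}
	       offset 1:  va' = {b+d, ., ., .}   va = {a+b+c+d, ., ., .}

	     and the scalar sum is then extracted from element 0 below.  */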
5338
5339 tree rhs;
5340
5341 if (dump_enabled_p ())
5342 dump_printf_loc (MSG_NOTE, vect_location,
5343 "Reduce using vector shifts\n");
5344
5345 gimple_seq stmts = NULL;
5346 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5347 for (elt_offset = nelements / 2;
5348 elt_offset >= 1;
5349 elt_offset /= 2)
5350 {
5351 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5352 indices.new_vector (sel, 2, nelements);
5353 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5354 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5355 new_temp, zero_vec, mask);
5356 new_temp = gimple_build (&stmts, code,
5357 vectype1, new_name, new_temp);
5358 }
5359 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5360
5361 /* 2.4 Extract the final scalar result. Create:
5362 s_out3 = extract_field <v_out2, bitpos> */
5363
5364 if (dump_enabled_p ())
5365 dump_printf_loc (MSG_NOTE, vect_location,
5366 "extract scalar result\n");
5367
5368 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5369 bitsize, bitsize_zero_node);
5370 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5371 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5372 gimple_assign_set_lhs (epilog_stmt, new_temp);
5373 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5374 scalar_results.safe_push (new_temp);
5375 }
5376 else
5377 {
5378 /* Case 3: Create:
5379 s = extract_field <v_out2, 0>
5380 for (offset = element_size;
5381 offset < vector_size;
5382 offset += element_size;)
5383 {
5384 Create: s' = extract_field <v_out2, offset>
5385 Create: s = op <s, s'> // For non SLP cases
5386 } */
5387
5388 if (dump_enabled_p ())
5389 dump_printf_loc (MSG_NOTE, vect_location,
5390 "Reduce using scalar code.\n");
5391
5392 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5393 int element_bitsize = tree_to_uhwi (bitsize);
5394 tree compute_type = TREE_TYPE (vectype);
5395 gimple_seq stmts = NULL;
5396 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5397 {
5398 int bit_offset;
5399 if (gimple_code (new_phi) == GIMPLE_PHI)
5400 vec_temp = PHI_RESULT (new_phi);
5401 else
5402 vec_temp = gimple_assign_lhs (new_phi);
5403 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5404 vec_temp, bitsize, bitsize_zero_node);
5405
5406 	      /* In SLP we don't need to apply the reduction operation, so we just
5407 collect s' values in SCALAR_RESULTS. */
5408 if (slp_reduc)
5409 scalar_results.safe_push (new_temp);
5410
5411 for (bit_offset = element_bitsize;
5412 bit_offset < vec_size_in_bits;
5413 bit_offset += element_bitsize)
5414 {
5415 tree bitpos = bitsize_int (bit_offset);
5416 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5417 compute_type, vec_temp,
5418 bitsize, bitpos);
5419 if (slp_reduc)
5420 {
5421 		      /* In SLP we don't need to apply the reduction operation, so
5422 we just collect s' values in SCALAR_RESULTS. */
5423 new_temp = new_name;
5424 scalar_results.safe_push (new_name);
5425 }
5426 else
5427 new_temp = gimple_build (&stmts, code, compute_type,
5428 new_name, new_temp);
5429 }
5430 }
5431
5432 	  /* The only case where we need to reduce scalar results in SLP is
5433 unrolling. If the size of SCALAR_RESULTS is greater than
5434 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5435 REDUC_GROUP_SIZE. */
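
	  /* For instance (illustrative), with REDUC_GROUP_SIZE == 2 and
	     SCALAR_RESULTS == {a0, b0, a1, b1} extracted from two vector
	     stmts, the loop below leaves {a0 op a1, b0 op b1} in the
	     first two slots.  */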
5436 if (slp_reduc)
5437 {
5438 tree res, first_res, new_res;
5439
5440 /* Reduce multiple scalar results in case of SLP unrolling. */
5441 for (j = group_size; scalar_results.iterate (j, &res);
5442 j++)
5443 {
5444 first_res = scalar_results[j % group_size];
5445 new_res = gimple_build (&stmts, code, compute_type,
5446 first_res, res);
5447 scalar_results[j % group_size] = new_res;
5448 }
5449 for (k = 0; k < group_size; k++)
5450 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5451 scalar_results[k]);
5452 }
5453 else
5454 {
5455 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5456 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5457 scalar_results.safe_push (new_temp);
5458 }
5459
5460 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5461 }
5462
5463 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5464 && induc_val)
5465 {
5466 	  /* Earlier we set the initial value to be a vector of induc_val
5467 values. Check the result and if it is induc_val then replace
5468 with the original initial value, unless induc_val is
5469 the same as initial_def already. */
5470 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5471 induc_val);
5472
5473 tree tmp = make_ssa_name (new_scalar_dest);
5474 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5475 initial_def, new_temp);
5476 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5477 scalar_results[0] = tmp;
5478 }
5479 }
5480
5481 /* 2.5 Adjust the final result by the initial value of the reduction
5482 variable. (When such adjustment is not needed, then
5483 'adjustment_def' is zero). For example, if code is PLUS we create:
5484 new_temp = loop_exit_def + adjustment_def */
5485
5486 if (adjustment_def)
5487 {
5488 gcc_assert (!slp_reduc);
5489 gimple_seq stmts = NULL;
5490 if (nested_in_vect_loop)
5491 {
5492 new_phi = new_phis[0];
5493 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5494 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5495 new_temp = gimple_build (&stmts, code, vectype,
5496 PHI_RESULT (new_phi), adjustment_def);
5497 }
5498 else
5499 {
5500 new_temp = scalar_results[0];
5501 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5502 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5503 new_temp = gimple_build (&stmts, code, scalar_type,
5504 new_temp, adjustment_def);
5505 }
5506
5507 epilog_stmt = gimple_seq_last_stmt (stmts);
5508 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5509 if (nested_in_vect_loop)
5510 {
5511 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5512 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5513 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5514
5515 if (!double_reduc)
5516 scalar_results.quick_push (new_temp);
5517 else
5518 scalar_results[0] = new_temp;
5519 }
5520 else
5521 scalar_results[0] = new_temp;
5522
5523 new_phis[0] = epilog_stmt;
5524 }
5525
5526 if (double_reduc)
5527 loop = loop->inner;
5528
5529 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5530 phis with new adjusted scalar results, i.e., replace use <s_out0>
5531 with use <s_out4>.
5532
5533 Transform:
5534 loop_exit:
5535 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5536 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5537 v_out2 = reduce <v_out1>
5538 s_out3 = extract_field <v_out2, 0>
5539 s_out4 = adjust_result <s_out3>
5540 use <s_out0>
5541 use <s_out0>
5542
5543 into:
5544
5545 loop_exit:
5546 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5547 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5548 v_out2 = reduce <v_out1>
5549 s_out3 = extract_field <v_out2, 0>
5550 s_out4 = adjust_result <s_out3>
5551 use <s_out4>
5552 use <s_out4> */
5553
5554
5555   /* In an SLP reduction chain we reduce the vector results into one vector
5556      if necessary; hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5557 LHS of the last stmt in the reduction chain, since we are looking for
5558 the loop exit phi node. */
5559 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5560 {
5561 stmt_vec_info dest_stmt_info
5562 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5563 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5564 group_size = 1;
5565 }
5566
5567 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5568      case REDUC_GROUP_SIZE is greater than the vectorization factor).
5569 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5570 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5571 correspond to the first vector stmt, etc.
5572 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
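
  /* For example (illustrative), with REDUC_GROUP_SIZE == 4 and two new
     vector stmts, RATIO == 2: scalar_results[0] and [1] correspond to the
     first vector stmt and scalar_results[2] and [3] to the second.  */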
5573 if (group_size > new_phis.length ())
5574 gcc_assert (!(group_size % new_phis.length ()));
5575
5576 for (k = 0; k < group_size; k++)
5577 {
5578 if (slp_reduc)
5579 {
5580 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5581
5582 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5583 /* SLP statements can't participate in patterns. */
5584 gcc_assert (!orig_stmt_info);
5585 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5586 }
5587
5588 if (nested_in_vect_loop)
5589 {
5590 if (double_reduc)
5591 loop = outer_loop;
5592 else
5593 gcc_unreachable ();
5594 }
5595
5596 phis.create (3);
5597 /* Find the loop-closed-use at the loop exit of the original scalar
5598 result. (The reduction result is expected to have two immediate uses,
5599 one at the latch block, and one at the loop exit). For double
5600 reductions we are looking for exit phis of the outer loop. */
5601 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5602 {
5603 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5604 {
5605 if (!is_gimple_debug (USE_STMT (use_p)))
5606 phis.safe_push (USE_STMT (use_p));
5607 }
5608 else
5609 {
5610 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5611 {
5612 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5613
5614 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5615 {
5616 if (!flow_bb_inside_loop_p (loop,
5617 gimple_bb (USE_STMT (phi_use_p)))
5618 && !is_gimple_debug (USE_STMT (phi_use_p)))
5619 phis.safe_push (USE_STMT (phi_use_p));
5620 }
5621 }
5622 }
5623 }
5624
5625 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5626 {
5627 /* Replace the uses: */
5628 orig_name = PHI_RESULT (exit_phi);
5629 scalar_result = scalar_results[k];
5630 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5631 {
5632 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5633 SET_USE (use_p, scalar_result);
5634 update_stmt (use_stmt);
5635 }
5636 }
5637
5638 phis.release ();
5639 }
5640 }
5641
5642 /* Return a vector of type VECTYPE that is equal to the vector select
5643 operation "MASK ? VEC : IDENTITY". Insert the select statements
5644 before GSI. */
5645
5646 static tree
5647 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5648 tree vec, tree identity)
5649 {
5650 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5651 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5652 mask, vec, identity);
5653 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5654 return cond;
5655 }
5656
5657 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5658 order, starting with LHS. Insert the extraction statements before GSI and
5659 associate the new scalar SSA names with variable SCALAR_DEST.
5660 Return the SSA name for the result. */
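
/* For example (sketch only), with CODE == PLUS_EXPR, LHS the incoming
   accumulator and a four-element VECTOR_RHS v, the emitted statements
   compute

     t0 = lhs + v[0];  t1 = t0 + v[1];  t2 = t1 + v[2];  t3 = t2 + v[3];

   and t3 is returned, preserving the strict left-to-right order.  */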
5661
5662 static tree
5663 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5664 tree_code code, tree lhs, tree vector_rhs)
5665 {
5666 tree vectype = TREE_TYPE (vector_rhs);
5667 tree scalar_type = TREE_TYPE (vectype);
5668 tree bitsize = TYPE_SIZE (scalar_type);
5669 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5670 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5671
5672 for (unsigned HOST_WIDE_INT bit_offset = 0;
5673 bit_offset < vec_size_in_bits;
5674 bit_offset += element_bitsize)
5675 {
5676 tree bitpos = bitsize_int (bit_offset);
5677 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5678 bitsize, bitpos);
5679
5680 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5681 rhs = make_ssa_name (scalar_dest, stmt);
5682 gimple_assign_set_lhs (stmt, rhs);
5683 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5684
5685 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5686 tree new_name = make_ssa_name (scalar_dest, stmt);
5687 gimple_assign_set_lhs (stmt, new_name);
5688 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5689 lhs = new_name;
5690 }
5691 return lhs;
5692 }
5693
5694 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5695 type of the vector input. */
5696
5697 static internal_fn
5698 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5699 {
5700 internal_fn mask_reduc_fn;
5701
5702 switch (reduc_fn)
5703 {
5704 case IFN_FOLD_LEFT_PLUS:
5705 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5706 break;
5707
5708 default:
5709 return IFN_LAST;
5710 }
5711
5712 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5713 OPTIMIZE_FOR_SPEED))
5714 return mask_reduc_fn;
5715 return IFN_LAST;
5716 }
5717
5718 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5719 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5720 statement. CODE is the operation performed by STMT_INFO and OPS are
5721 its scalar operands. REDUC_INDEX is the index of the operand in
5722 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5723 implements in-order reduction, or IFN_LAST if we should open-code it.
5724 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5725 that should be used to control the operation in a fully-masked loop. */
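
/* For example (illustration only), an in-order floating-point sum

     for (i = 0; i < n; i++) res += a[i];

   becomes, per vectorized copy, a call of the form

     res = .FOLD_LEFT_PLUS (res, vec_a);

   or .MASK_FOLD_LEFT_PLUS (res, vec_a, mask) in a fully-masked loop, and
   if no such internal function is available the elements are instead
   added one at a time via vect_expand_fold_left, so the additions always
   happen in the original source order.  */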
5726
5727 static bool
5728 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5729 gimple_stmt_iterator *gsi,
5730 stmt_vec_info *vec_stmt, slp_tree slp_node,
5731 gimple *reduc_def_stmt,
5732 tree_code code, internal_fn reduc_fn,
5733 tree ops[3], tree vectype_in,
5734 int reduc_index, vec_loop_masks *masks)
5735 {
5736 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5737 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5738 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5739 stmt_vec_info new_stmt_info = NULL;
5740 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5741
5742 int ncopies;
5743 if (slp_node)
5744 ncopies = 1;
5745 else
5746 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5747
5748 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5749 gcc_assert (ncopies == 1);
5750 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5751
5752 if (slp_node)
5753 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5754 TYPE_VECTOR_SUBPARTS (vectype_in)));
5755
5756 tree op0 = ops[1 - reduc_index];
5757
5758 int group_size = 1;
5759 stmt_vec_info scalar_dest_def_info;
5760 auto_vec<tree> vec_oprnds0;
5761 if (slp_node)
5762 {
5763 auto_vec<vec<tree> > vec_defs (2);
5764 vect_get_slp_defs (slp_node, &vec_defs);
5765 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5766 vec_defs[0].release ();
5767 vec_defs[1].release ();
5768 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5769 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5770 }
5771 else
5772 {
5773 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5774 vec_oprnds0.create (1);
5775 vec_oprnds0.quick_push (loop_vec_def0);
5776 scalar_dest_def_info = stmt_info;
5777 }
5778
5779 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5780 tree scalar_type = TREE_TYPE (scalar_dest);
5781 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5782
5783 int vec_num = vec_oprnds0.length ();
5784 gcc_assert (vec_num == 1 || slp_node);
5785 tree vec_elem_type = TREE_TYPE (vectype_out);
5786 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5787
5788 tree vector_identity = NULL_TREE;
5789 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5790 vector_identity = build_zero_cst (vectype_out);
5791
5792 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5793 int i;
5794 tree def0;
5795 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5796 {
5797 gimple *new_stmt;
5798 tree mask = NULL_TREE;
5799 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5800 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5801
5802 /* Handle MINUS by adding the negative. */
5803 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5804 {
5805 tree negated = make_ssa_name (vectype_out);
5806 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5807 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5808 def0 = negated;
5809 }
5810
5811 if (mask && mask_reduc_fn == IFN_LAST)
5812 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5813 vector_identity);
5814
5815 /* On the first iteration the input is simply the scalar phi
5816 result, and for subsequent iterations it is the output of
5817 the preceding operation. */
5818 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5819 {
5820 if (mask && mask_reduc_fn != IFN_LAST)
5821 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5822 def0, mask);
5823 else
5824 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5825 def0);
5826 /* For chained SLP reductions the output of the previous reduction
5827 operation serves as the input of the next. For the final statement
5828 the output cannot be a temporary - we reuse the original
5829 scalar destination of the last statement. */
5830 if (i != vec_num - 1)
5831 {
5832 gimple_set_lhs (new_stmt, scalar_dest_var);
5833 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5834 gimple_set_lhs (new_stmt, reduc_var);
5835 }
5836 }
5837 else
5838 {
5839 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5840 reduc_var, def0);
5841 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5842 /* Remove the statement, so that we can use the same code paths
5843 as for statements that we've just created. */
5844 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5845 gsi_remove (&tmp_gsi, true);
5846 }
5847
5848 if (i == vec_num - 1)
5849 {
5850 gimple_set_lhs (new_stmt, scalar_dest);
5851 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5852 new_stmt);
5853 }
5854 else
5855 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5856 new_stmt, gsi);
5857
5858 if (slp_node)
5859 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5860 }
5861
5862 if (!slp_node)
5863 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5864
5865 return true;
5866 }
5867
5868 /* Function is_nonwrapping_integer_induction.
5869
5870    Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5871    that only increments and cannot overflow.  */
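
/* For example (illustration only): an unsigned short IV with base 0 and
   step 4 in a loop known to execute at most 1000 times reaches at most
   roughly 4000, which fits in 16 bits, so it is known not to wrap.  A
   non-constant base or step makes us answer "no"; otherwise, types with
   undefined overflow trivially qualify, and wrapping types additionally
   need a known iteration bound.  */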
5872
5873 static bool
5874 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5875 {
5876 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5877 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5878 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5879 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5880 widest_int ni, max_loop_value, lhs_max;
5881 wi::overflow_type overflow = wi::OVF_NONE;
5882
5883 /* Make sure the loop is integer based. */
5884 if (TREE_CODE (base) != INTEGER_CST
5885 || TREE_CODE (step) != INTEGER_CST)
5886 return false;
5887
5888 /* Check that the max size of the loop will not wrap. */
5889
5890 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5891 return true;
5892
5893 if (! max_stmt_executions (loop, &ni))
5894 return false;
5895
5896 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5897 &overflow);
5898 if (overflow)
5899 return false;
5900
5901 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5902 TYPE_SIGN (lhs_type), &overflow);
5903 if (overflow)
5904 return false;
5905
5906 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5907 <= TYPE_PRECISION (lhs_type));
5908 }
5909
5910 /* Check if masking can be supported by inserting a conditional expression.
5911 CODE is the code for the operation. COND_FN is the conditional internal
5912 function, if it exists. VECTYPE_IN is the type of the vector input. */
5913 static bool
5914 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5915 tree vectype_in)
5916 {
5917 if (cond_fn != IFN_LAST
5918 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5919 OPTIMIZE_FOR_SPEED))
5920 return false;
5921
5922 switch (code)
5923 {
5924 case DOT_PROD_EXPR:
5925 case SAD_EXPR:
5926 return true;
5927
5928 default:
5929 return false;
5930 }
5931 }
5932
5933 /* Insert a conditional expression to enable masked vectorization. CODE is the
5934 code for the operation. VOP is the array of operands. MASK is the loop
5935 mask. GSI is a statement iterator used to place the new conditional
5936 expression. */
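
/* For example (sketch only), for a masked DOT_PROD_EXPR accumulation the
   second operand is rewritten as

     masked_op1 = MASK ? op1 : {0, ...};

   so inactive lanes contribute 0 to the dot product, while for SAD_EXPR
   the inactive lanes of op1 are replaced by the corresponding lanes of
   op0, making their absolute difference 0.  */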
5937 static void
5938 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5939 gimple_stmt_iterator *gsi)
5940 {
5941 switch (code)
5942 {
5943 case DOT_PROD_EXPR:
5944 {
5945 tree vectype = TREE_TYPE (vop[1]);
5946 tree zero = build_zero_cst (vectype);
5947 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5948 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5949 mask, vop[1], zero);
5950 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5951 vop[1] = masked_op1;
5952 break;
5953 }
5954
5955 case SAD_EXPR:
5956 {
5957 tree vectype = TREE_TYPE (vop[1]);
5958 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5959 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5960 mask, vop[1], vop[0]);
5961 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5962 vop[1] = masked_op1;
5963 break;
5964 }
5965
5966 default:
5967 gcc_unreachable ();
5968 }
5969 }
5970
5971 /* Function vectorizable_reduction.
5972
5973 Check if STMT_INFO performs a reduction operation that can be vectorized.
5974 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5975 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5976 Return true if STMT_INFO is vectorizable in this way.
5977
5978 This function also handles reduction idioms (patterns) that have been
5979 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5980 may be of this form:
5981 X = pattern_expr (arg0, arg1, ..., X)
5982 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5983 sequence that had been detected and replaced by the pattern-stmt
5984 (STMT_INFO).
5985
5986 This function also handles reduction of condition expressions, for example:
5987 for (int i = 0; i < N; i++)
5988 if (a[i] < value)
5989 last = a[i];
5990 This is handled by vectorising the loop and creating an additional vector
5991 containing the loop indexes for which "a[i] < value" was true. In the
5992 function epilogue this is reduced to a single max value and then used to
5993 index into the vector of results.
5994
5995 In some cases of reduction patterns, the type of the reduction variable X is
5996 different than the type of the other arguments of STMT_INFO.
5997 In such cases, the vectype that is used when transforming STMT_INFO into
5998 a vector stmt is different than the vectype that is used to determine the
5999 vectorization factor, because it consists of a different number of elements
6000 than the actual number of elements that are being operated upon in parallel.
6001
6002 For example, consider an accumulation of shorts into an int accumulator.
6003 On some targets it's possible to vectorize this pattern operating on 8
6004 shorts at a time (hence, the vectype for purposes of determining the
6005 vectorization factor should be V8HI); on the other hand, the vectype that
6006 is used to create the vector form is actually V4SI (the type of the result).
6007
6008 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6009 indicates what is the actual level of parallelism (V8HI in the example), so
6010 that the right vectorization factor would be derived. This vectype
6011 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6012 be used to create the vectorized stmt. The right vectype for the vectorized
6013 stmt is obtained from the type of the result X:
6014 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6015
6016 This means that, contrary to "regular" reductions (or "regular" stmts in
6017 general), the following equation:
6018 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6019 does *NOT* necessarily hold for reduction patterns. */
6020
6021 bool
6022 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
6023 slp_instance slp_node_instance,
6024 stmt_vector_for_cost *cost_vec)
6025 {
6026 tree scalar_dest;
6027 tree vectype_in = NULL_TREE;
6028 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6029 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6030 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6031 stmt_vec_info cond_stmt_vinfo = NULL;
6032 tree scalar_type;
6033 int i;
6034 int ncopies;
6035 bool single_defuse_cycle = false;
6036 bool nested_cycle = false;
6037 bool double_reduc = false;
6038 int vec_num;
6039 tree tem;
6040 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6041 tree cond_reduc_val = NULL_TREE;
6042
6043 /* Make sure it was already recognized as a reduction computation. */
6044 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6045 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6046 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6047 return false;
6048
6049 /* The stmt we store reduction analysis meta on. */
6050 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6051 reduc_info->is_reduc_info = true;
6052
6053 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6054 {
6055 if (is_a <gphi *> (stmt_info->stmt))
6056 	/* Analysis for double-reduction is done on the outer
6057 	   loop PHI; nested cycles have no further restrictions.  */
6058 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6059 else
6060 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6061 return true;
6062 }
6063
6064 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6065 stmt_vec_info phi_info = stmt_info;
6066 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6067 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6068 {
6069 if (!is_a <gphi *> (stmt_info->stmt))
6070 {
6071 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6072 return true;
6073 }
6074 if (slp_node)
6075 {
6076 slp_node_instance->reduc_phis = slp_node;
6077 /* ??? We're leaving slp_node to point to the PHIs, we only
6078 need it to get at the number of vector stmts which wasn't
6079 yet initialized for the instance root. */
6080 }
6081 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6082 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6083 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6084 {
6085 use_operand_p use_p;
6086 gimple *use_stmt;
6087 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6088 &use_p, &use_stmt);
6089 gcc_assert (res);
6090 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6091 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6092 }
6093 }
6094
6095 /* PHIs should not participate in patterns. */
6096 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6097 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6098
6099 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6100 and compute the reduction chain length. */
6101 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6102 loop_latch_edge (loop));
6103 unsigned reduc_chain_length = 0;
6104 bool only_slp_reduc_chain = true;
6105 stmt_info = NULL;
6106 while (reduc_def != PHI_RESULT (reduc_def_phi))
6107 {
6108 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6109 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6110 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6111 {
6112 if (dump_enabled_p ())
6113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6114 "reduction chain broken by patterns.\n");
6115 return false;
6116 }
6117 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6118 only_slp_reduc_chain = false;
6119 /* ??? For epilogue generation live members of the chain need
6120 to point back to the PHI via their original stmt for
6121 info_for_reduction to work. */
6122 if (STMT_VINFO_LIVE_P (vdef))
6123 STMT_VINFO_REDUC_DEF (def) = phi_info;
6124 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6125 if (!assign)
6126 {
6127 if (dump_enabled_p ())
6128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6129 "reduction chain includes calls.\n");
6130 return false;
6131 }
6132 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6133 {
6134 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6135 TREE_TYPE (gimple_assign_rhs1 (assign))))
6136 {
6137 if (dump_enabled_p ())
6138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6139 "conversion in the reduction chain.\n");
6140 return false;
6141 }
6142 }
6143 else if (!stmt_info)
6144 /* First non-conversion stmt. */
6145 stmt_info = vdef;
6146 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6147 reduc_chain_length++;
6148 }
6149 /* PHIs should not participate in patterns. */
6150 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6151
6152 if (nested_in_vect_loop_p (loop, stmt_info))
6153 {
6154 loop = loop->inner;
6155 nested_cycle = true;
6156 }
6157
6158 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6159 element. */
6160 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6161 {
6162 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6163 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6164 }
6165 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6166 gcc_assert (slp_node
6167 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6168
6169 /* 1. Is vectorizable reduction? */
6170 /* Not supportable if the reduction variable is used in the loop, unless
6171 it's a reduction chain. */
6172 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6173 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6174 return false;
6175
6176 /* Reductions that are not used even in an enclosing outer-loop,
6177 are expected to be "live" (used out of the loop). */
6178 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6179 && !STMT_VINFO_LIVE_P (stmt_info))
6180 return false;
6181
6182 /* 2. Has this been recognized as a reduction pattern?
6183
6184 Check if STMT represents a pattern that has been recognized
6185 in earlier analysis stages. For stmts that represent a pattern,
6186 the STMT_VINFO_RELATED_STMT field records the last stmt in
6187 the original sequence that constitutes the pattern. */
6188
6189 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6190 if (orig_stmt_info)
6191 {
6192 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6193 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6194 }
6195
6196 /* 3. Check the operands of the operation. The first operands are defined
6197 inside the loop body. The last operand is the reduction variable,
6198 which is defined by the loop-header-phi. */
6199
6200 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6201 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6202 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6203 enum tree_code code = gimple_assign_rhs_code (stmt);
6204 bool lane_reduc_code_p
6205 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6206 int op_type = TREE_CODE_LENGTH (code);
6207
6208 scalar_dest = gimple_assign_lhs (stmt);
6209 scalar_type = TREE_TYPE (scalar_dest);
6210 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6211 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6212 return false;
6213
6214 /* Do not try to vectorize bit-precision reductions. */
6215 if (!type_has_mode_precision_p (scalar_type))
6216 return false;
6217
6218 /* For lane-reducing ops we're reducing the number of reduction PHIs
6219      which means the only use of such a PHI may be in the lane-reducing operation.  */
6220 if (lane_reduc_code_p
6221 && reduc_chain_length != 1
6222 && !only_slp_reduc_chain)
6223 {
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226 "lane-reducing reduction with extra stmts.\n");
6227 return false;
6228 }
6229
6230 /* All uses but the last are expected to be defined in the loop.
6231 The last use is the reduction variable. In case of nested cycle this
6232 assumption is not true: we use reduc_index to record the index of the
6233 reduction variable. */
6234 reduc_def = PHI_RESULT (reduc_def_phi);
6235 for (i = 0; i < op_type; i++)
6236 {
6237 tree op = gimple_op (stmt, i + 1);
6238 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6239 if (i == 0 && code == COND_EXPR)
6240 continue;
6241
6242 stmt_vec_info def_stmt_info;
6243 enum vect_def_type dt;
6244 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6245 &def_stmt_info))
6246 {
6247 if (dump_enabled_p ())
6248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6249 "use not simple.\n");
6250 return false;
6251 }
6252 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6253 continue;
6254
6255 /* There should be only one cycle def in the stmt, the one
6256 leading to reduc_def. */
6257 if (VECTORIZABLE_CYCLE_DEF (dt))
6258 return false;
6259
6260 /* To properly compute ncopies we are interested in the widest
6261 non-reduction input type in case we're looking at a widening
6262 accumulation that we later handle in vect_transform_reduction. */
6263 if (lane_reduc_code_p
6264 && tem
6265 && (!vectype_in
6266 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6267 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6268 vectype_in = tem;
6269
6270 if (code == COND_EXPR)
6271 {
6272 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6273 if (dt == vect_constant_def)
6274 {
6275 cond_reduc_dt = dt;
6276 cond_reduc_val = op;
6277 }
6278 if (dt == vect_induction_def
6279 && def_stmt_info
6280 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6281 {
6282 cond_reduc_dt = dt;
6283 cond_stmt_vinfo = def_stmt_info;
6284 }
6285 }
6286 }
6287 if (!vectype_in)
6288 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6289 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6290
6291 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6292 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6293 /* If we have a condition reduction, see if we can simplify it further. */
6294 if (v_reduc_type == COND_REDUCTION)
6295 {
6296 if (slp_node)
6297 return false;
6298
6299 /* When the condition uses the reduction value in the condition, fail. */
6300 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6301 {
6302 if (dump_enabled_p ())
6303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6304 "condition depends on previous iteration\n");
6305 return false;
6306 }
6307
6308 if (reduc_chain_length == 1
6309 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6310 vectype_in, OPTIMIZE_FOR_SPEED))
6311 {
6312 if (dump_enabled_p ())
6313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6314 "optimizing condition reduction with"
6315 " FOLD_EXTRACT_LAST.\n");
6316 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6317 }
6318 else if (cond_reduc_dt == vect_induction_def)
6319 {
6320 tree base
6321 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6322 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6323
6324 gcc_assert (TREE_CODE (base) == INTEGER_CST
6325 && TREE_CODE (step) == INTEGER_CST);
6326 cond_reduc_val = NULL_TREE;
6327 enum tree_code cond_reduc_op_code = ERROR_MARK;
6328 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6329 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6330 ;
6331 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6332 above base; punt if base is the minimum value of the type for
6333 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
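/* As a concrete illustration of the branches below: a decreasing
   induction with base 10 takes the MIN_EXPR branch and ends up with the
   value 11, while an increasing induction with base -5 takes the
   MAX_EXPR branch and ends up with -6; in both cases the chosen value
   lies just outside the range of values the induction can produce. */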
6334 else if (tree_int_cst_sgn (step) == -1)
6335 {
6336 cond_reduc_op_code = MIN_EXPR;
6337 if (tree_int_cst_sgn (base) == -1)
6338 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6339 else if (tree_int_cst_lt (base,
6340 TYPE_MAX_VALUE (TREE_TYPE (base))))
6341 cond_reduc_val
6342 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6343 }
6344 else
6345 {
6346 cond_reduc_op_code = MAX_EXPR;
6347 if (tree_int_cst_sgn (base) == 1)
6348 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6349 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6350 base))
6351 cond_reduc_val
6352 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6353 }
6354 if (cond_reduc_val)
6355 {
6356 if (dump_enabled_p ())
6357 dump_printf_loc (MSG_NOTE, vect_location,
6358 "condition expression based on "
6359 "integer induction.\n");
6360 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6361 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6362 = cond_reduc_val;
6363 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6364 }
6365 }
6366 else if (cond_reduc_dt == vect_constant_def)
6367 {
6368 enum vect_def_type cond_initial_dt;
6369 tree cond_initial_val
6370 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6371
6372 gcc_assert (cond_reduc_val != NULL_TREE);
6373 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6374 if (cond_initial_dt == vect_constant_def
6375 && types_compatible_p (TREE_TYPE (cond_initial_val),
6376 TREE_TYPE (cond_reduc_val)))
6377 {
6378 tree e = fold_binary (LE_EXPR, boolean_type_node,
6379 cond_initial_val, cond_reduc_val);
6380 if (e && (integer_onep (e) || integer_zerop (e)))
6381 {
6382 if (dump_enabled_p ())
6383 dump_printf_loc (MSG_NOTE, vect_location,
6384 "condition expression based on "
6385 "compile time constant.\n");
6386 /* Record reduction code at analysis stage. */
6387 STMT_VINFO_REDUC_CODE (reduc_info)
6388 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6389 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6390 }
6391 }
6392 }
6393 }
6394
6395 if (STMT_VINFO_LIVE_P (phi_info))
6396 return false;
6397
6398 if (slp_node)
6399 ncopies = 1;
6400 else
6401 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6402
6403 gcc_assert (ncopies >= 1);
6404
6405 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6406
6407 if (nested_cycle)
6408 {
6409 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6410 == vect_double_reduction_def);
6411 double_reduc = true;
6412 }
6413
6414 /* 4.2. Check support for the epilog operation.
6415
6416 If STMT represents a reduction pattern, then the type of the
6417 reduction variable may be different than the type of the rest
6418 of the arguments. For example, consider the case of accumulation
6419 of shorts into an int accumulator. The original code:
6420 S1: int_a = (int) short_a;
6421 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6422
6423 was replaced with:
6424 STMT: int_acc = widen_sum <short_a, int_acc>
6425
6426 This means that:
6427 1. The tree-code that is used to create the vector operation in the
6428 epilog code (that reduces the partial results) is not the
6429 tree-code of STMT, but is rather the tree-code of the original
6430 stmt from the pattern that STMT is replacing. I.e., in the example
6431 above we want to use 'widen_sum' in the loop, but 'plus' in the
6432 epilog.
6433 2. The type (mode) we use to check available target support
6434 for the vector operation to be created in the *epilog*, is
6435 determined by the type of the reduction variable (in the example
6436 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6437 However the type (mode) we use to check available target support
6438 for the vector operation to be created *inside the loop*, is
6439 determined by the type of the other arguments to STMT (in the
6440 example we'd check this: optab_handler (widen_sum_optab,
6441 vect_short_mode)).
6442
6443 This is contrary to "regular" reductions, in which the types of all
6444 the arguments are the same as the type of the reduction variable.
6445 For "regular" reductions we can therefore use the same vector type
6446 (and also the same tree-code) when generating the epilog code and
6447 when generating the code inside the loop. */
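/* As an illustration only (a sketch, not code this function emits):
   with 8 shorts per vector and 4 ints per vector the example above
   conceptually becomes
     loop:   vec_acc = widen_sum <vec_short, vec_acc>
             (support checked on the short vector mode)
     epilog: acc = vec_acc[0] + vec_acc[1] + vec_acc[2] + vec_acc[3]
             (support checked on the int vector mode)
   which is why the loop-body check and the epilog check use different
   vector modes. */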
6448
6449 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6450 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6451
6452 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6453 if (reduction_type == TREE_CODE_REDUCTION)
6454 {
6455 /* Check whether it's ok to change the order of the computation.
6456 Generally, when vectorizing a reduction we change the order of the
6457 computation. This may change the behavior of the program in some
6458 cases, so we need to check that this is ok. One exception is when
6459 vectorizing an outer-loop: the inner-loop is executed sequentially,
6460 and therefore vectorizing reductions in the inner-loop during
6461 outer-loop vectorization is safe. */
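/* For example (illustrative only), with a vectorization factor of 4 a
   sum reduction
     for (i = 0; i < N; i++)
       sum += a[i];
   is effectively evaluated as
     (a[0] + a[4] + ...) + (a[1] + a[5] + ...) + ...
   i.e. the additions are reassociated. For floating-point types this
   is typically only acceptable under -fassociative-math; otherwise the
   in-order FOLD_LEFT_REDUCTION scheme selected below has to be used. */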
6462 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6463 {
6464 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6465 is not directly used in stmt. */
6466 if (!only_slp_reduc_chain
6467 && reduc_chain_length != 1)
6468 {
6469 if (dump_enabled_p ())
6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471 "in-order reduction chain without SLP.\n");
6472 return false;
6473 }
6474 STMT_VINFO_REDUC_TYPE (reduc_info)
6475 = reduction_type = FOLD_LEFT_REDUCTION;
6476 }
6477 else if (!commutative_tree_code (orig_code)
6478 || !associative_tree_code (orig_code))
6479 {
6480 if (dump_enabled_p ())
6481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6482 "reduction: not commutative/associative");
6483 return false;
6484 }
6485 }
6486
6487 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6488 && ncopies > 1)
6489 {
6490 if (dump_enabled_p ())
6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 "multiple types in double reduction or condition "
6493 "reduction or fold-left reduction.\n");
6494 return false;
6495 }
6496
6497 internal_fn reduc_fn = IFN_LAST;
6498 if (reduction_type == TREE_CODE_REDUCTION
6499 || reduction_type == FOLD_LEFT_REDUCTION
6500 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6501 || reduction_type == CONST_COND_REDUCTION)
6502 {
6503 if (reduction_type == FOLD_LEFT_REDUCTION
6504 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6505 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6506 {
6507 if (reduc_fn != IFN_LAST
6508 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6509 OPTIMIZE_FOR_SPEED))
6510 {
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 "reduc op not supported by target.\n");
6514
6515 reduc_fn = IFN_LAST;
6516 }
6517 }
6518 else
6519 {
6520 if (!nested_cycle || double_reduc)
6521 {
6522 if (dump_enabled_p ())
6523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6524 "no reduc code for scalar code.\n");
6525
6526 return false;
6527 }
6528 }
6529 }
6530 else if (reduction_type == COND_REDUCTION)
6531 {
6532 int scalar_precision
6533 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6534 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6535 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6536 nunits_out);
6537
6538 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6539 OPTIMIZE_FOR_SPEED))
6540 reduc_fn = IFN_REDUC_MAX;
6541 }
6542 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6543
6544 if (reduction_type != EXTRACT_LAST_REDUCTION
6545 && (!nested_cycle || double_reduc)
6546 && reduc_fn == IFN_LAST
6547 && !nunits_out.is_constant ())
6548 {
6549 if (dump_enabled_p ())
6550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551 "missing target support for reduction on"
6552 " variable-length vectors.\n");
6553 return false;
6554 }
6555
6556 /* For SLP reductions, see if there is a neutral value we can use. */
6557 tree neutral_op = NULL_TREE;
6558 if (slp_node)
6559 neutral_op = neutral_op_for_slp_reduction
6560 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6561 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6562
6563 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6564 {
6565 /* We can't support in-order reductions of code such as this:
6566
6567 for (int i = 0; i < n1; ++i)
6568 for (int j = 0; j < n2; ++j)
6569 l += a[j];
6570
6571 since GCC effectively transforms the loop when vectorizing:
6572
6573 for (int i = 0; i < n1 / VF; ++i)
6574 for (int j = 0; j < n2; ++j)
6575 for (int k = 0; k < VF; ++k)
6576 l += a[j];
6577
6578 which is a reassociation of the original operation. */
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "in-order double reduction not supported.\n");
6582
6583 return false;
6584 }
6585
6586 if (reduction_type == FOLD_LEFT_REDUCTION
6587 && slp_node
6588 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6589 {
6590 /* We cannot use in-order reductions in this case because there is
6591 an implicit reassociation of the operations involved. */
6592 if (dump_enabled_p ())
6593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6594 "in-order unchained SLP reductions not supported.\n");
6595 return false;
6596 }
6597
6598 /* For double reductions, and for SLP reductions with a neutral value,
6599 we construct a variable-length initial vector by loading a vector
6600 full of the neutral value and then shift-and-inserting the start
6601 values into the low-numbered elements. */
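/* E.g. (a sketch of the idea) for an add reduction with start value s
   the initial vector is conceptually built as
     tmp  = { 0, 0, ..., 0 }          <- neutral value for PLUS
     init = VEC_SHL_INSERT <tmp, s>   <- s lands in a low-numbered lane
   hence the requirement on IFN_VEC_SHL_INSERT checked below. */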
6602 if ((double_reduc || neutral_op)
6603 && !nunits_out.is_constant ()
6604 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6605 vectype_out, OPTIMIZE_FOR_SPEED))
6606 {
6607 if (dump_enabled_p ())
6608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609 "reduction on variable-length vectors requires"
6610 " target support for a vector-shift-and-insert"
6611 " operation.\n");
6612 return false;
6613 }
6614
6615 /* Check extra constraints for variable-length unchained SLP reductions. */
6616 if (STMT_SLP_TYPE (stmt_info)
6617 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6618 && !nunits_out.is_constant ())
6619 {
6620 /* We checked above that we could build the initial vector when
6621 there's a neutral element value. Check here for the case in
6622 which each SLP statement has its own initial value and in which
6623 that value needs to be repeated for every instance of the
6624 statement within the initial vector. */
6625 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6626 if (!neutral_op
6627 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6628 TREE_TYPE (vectype_out)))
6629 {
6630 if (dump_enabled_p ())
6631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6632 "unsupported form of SLP reduction for"
6633 " variable-length vectors: cannot build"
6634 " initial vector.\n");
6635 return false;
6636 }
6637 /* The epilogue code relies on the number of elements being a multiple
6638 of the group size. The duplicate-and-interleave approach to setting
6639 up the initial vector does too. */
6640 if (!multiple_p (nunits_out, group_size))
6641 {
6642 if (dump_enabled_p ())
6643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6644 "unsupported form of SLP reduction for"
6645 " variable-length vectors: the vector size"
6646 " is not a multiple of the number of results.\n");
6647 return false;
6648 }
6649 }
6650
6651 if (reduction_type == COND_REDUCTION)
6652 {
6653 widest_int ni;
6654
6655 if (! max_loop_iterations (loop, &ni))
6656 {
6657 if (dump_enabled_p ())
6658 dump_printf_loc (MSG_NOTE, vect_location,
6659 "loop count not known, cannot create cond "
6660 "reduction.\n");
6661 return false;
6662 }
6663 /* Convert backedges to iterations. */
6664 ni += 1;
6665
6666 /* The additional index will be the same type as the condition. Check
6667 that the loop iteration count fits into this type less one (the zero
6668 slot is reserved for the case in which there are no matches). */
6669 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6670 if (wi::geu_p (ni, wi::to_widest (max_index)))
6671 {
6672 if (dump_enabled_p ())
6673 dump_printf_loc (MSG_NOTE, vect_location,
6674 "loop size is greater than data size.\n");
6675 return false;
6676 }
6677 }
6678
6679 /* In case the vectorization factor (VF) is bigger than the number
6680 of elements that we can fit in a vectype (nunits), we have to generate
6681 more than one vector stmt - i.e - we need to "unroll" the
6682 vector stmt by a factor VF/nunits. For more details see documentation
6683 in vectorizable_operation. */
6684
6685 /* If the reduction is used in an outer loop we need to generate
6686 VF intermediate results, like so (e.g. for ncopies=2):
6687 r0 = phi (init, r0)
6688 r1 = phi (init, r1)
6689 r0 = x0 + r0;
6690 r1 = x1 + r1;
6691 (i.e. we generate VF results in 2 registers).
6692 In this case we have a separate def-use cycle for each copy, and therefore
6693 for each copy we get the vector def for the reduction variable from the
6694 respective phi node created for this copy.
6695
6696 Otherwise (the reduction is unused in the loop nest), we can combine
6697 together intermediate results, like so (e.g. for ncopies=2):
6698 r = phi (init, r)
6699 r = x0 + r;
6700 r = x1 + r;
6701 (i.e. we generate VF/2 results in a single register).
6702 In this case for each copy we get the vector def for the reduction variable
6703 from the vectorized reduction operation generated in the previous iteration.
6704
6705 This only works when we see both the reduction PHI and its only consumer
6706 in vectorizable_reduction and there are no intermediate stmts
6707 participating. */
6708 if (ncopies > 1
6709 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6710 && reduc_chain_length == 1)
6711 single_defuse_cycle = true;
6712
6713 if (single_defuse_cycle || lane_reduc_code_p)
6714 {
6715 gcc_assert (code != COND_EXPR);
6716
6717 /* 4. Supportable by target? */
6718 bool ok = true;
6719
6720 /* 4.1. check support for the operation in the loop */
6721 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6722 if (!optab)
6723 {
6724 if (dump_enabled_p ())
6725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6726 "no optab.\n");
6727 ok = false;
6728 }
6729
6730 machine_mode vec_mode = TYPE_MODE (vectype_in);
6731 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6732 {
6733 if (dump_enabled_p ())
6734 dump_printf (MSG_NOTE, "op not supported by target.\n");
6735 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6736 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6737 ok = false;
6738 else
6739 if (dump_enabled_p ())
6740 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6741 }
6742
6743 /* Worthwhile without SIMD support? */
6744 if (ok
6745 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6746 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6747 {
6748 if (dump_enabled_p ())
6749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6750 "not worthwhile without SIMD support.\n");
6751 ok = false;
6752 }
6753
6754 /* lane-reducing operations have to go through vect_transform_reduction.
6755 For the other cases try without the single cycle optimization. */
6756 if (!ok)
6757 {
6758 if (lane_reduc_code_p)
6759 return false;
6760 else
6761 single_defuse_cycle = false;
6762 }
6763 }
6764 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6765
6766 /* If the reduction stmt is one of the patterns that have lane
6767 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6768 if ((ncopies > 1 && ! single_defuse_cycle)
6769 && lane_reduc_code_p)
6770 {
6771 if (dump_enabled_p ())
6772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6773 "multi def-use cycle not possible for lane-reducing "
6774 "reduction operation\n");
6775 return false;
6776 }
6777
6778 if (slp_node)
6779 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6780 else
6781 vec_num = 1;
6782
6783 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6784 cost_vec);
6785 /* Cost the reduction op inside the loop if transformed via
6786 vect_transform_reduction. Otherwise this is costed by the
6787 separate vectorizable_* routines. */
6788 if (single_defuse_cycle
6789 || code == DOT_PROD_EXPR
6790 || code == WIDEN_SUM_EXPR
6791 || code == SAD_EXPR)
6792 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
6793
6794 if (dump_enabled_p ()
6795 && reduction_type == FOLD_LEFT_REDUCTION)
6796 dump_printf_loc (MSG_NOTE, vect_location,
6797 "using an in-order (fold-left) reduction.\n");
6798 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6799 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6800 reductions go through their own vectorizable_* routines. */
6801 if (!single_defuse_cycle
6802 && code != DOT_PROD_EXPR
6803 && code != WIDEN_SUM_EXPR
6804 && code != SAD_EXPR
6805 && reduction_type != FOLD_LEFT_REDUCTION)
6806 {
6807 stmt_vec_info tem
6808 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6809 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6810 {
6811 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6812 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6813 }
6814 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6815 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6816 }
6817 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6818 {
6819 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6820 internal_fn cond_fn = get_conditional_internal_fn (code);
6821
6822 if (reduction_type != FOLD_LEFT_REDUCTION
6823 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6824 && (cond_fn == IFN_LAST
6825 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6826 OPTIMIZE_FOR_SPEED)))
6827 {
6828 if (dump_enabled_p ())
6829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6830 "can't use a fully-masked loop because no"
6831 " conditional operation is available.\n");
6832 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6833 }
6834 else if (reduction_type == FOLD_LEFT_REDUCTION
6835 && reduc_fn == IFN_LAST
6836 && !expand_vec_cond_expr_p (vectype_in,
6837 truth_type_for (vectype_in),
6838 SSA_NAME))
6839 {
6840 if (dump_enabled_p ())
6841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842 "can't use a fully-masked loop because no"
6843 " conditional operation is available.\n");
6844 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6845 }
6846 else
6847 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6848 vectype_in, NULL);
6849 }
6850 return true;
6851 }
6852
6853 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6854 value. */
6855
6856 bool
6857 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6858 stmt_vec_info *vec_stmt, slp_tree slp_node)
6859 {
6860 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6861 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6862 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6863 int i;
6864 int ncopies;
6865 int j;
6866 int vec_num;
6867
6868 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6869 gcc_assert (reduc_info->is_reduc_info);
6870
6871 if (nested_in_vect_loop_p (loop, stmt_info))
6872 {
6873 loop = loop->inner;
6874 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6875 }
6876
6877 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6878 enum tree_code code = gimple_assign_rhs_code (stmt);
6879 int op_type = TREE_CODE_LENGTH (code);
6880
6881 /* Flatten RHS. */
6882 tree ops[3];
6883 switch (get_gimple_rhs_class (code))
6884 {
6885 case GIMPLE_TERNARY_RHS:
6886 ops[2] = gimple_assign_rhs3 (stmt);
6887 /* Fall thru. */
6888 case GIMPLE_BINARY_RHS:
6889 ops[0] = gimple_assign_rhs1 (stmt);
6890 ops[1] = gimple_assign_rhs2 (stmt);
6891 break;
6892 default:
6893 gcc_unreachable ();
6894 }
6895
6896 /* All uses but the last are expected to be defined in the loop.
6897 The last use is the reduction variable. In case of nested cycle this
6898 assumption is not true: we use reduc_index to record the index of the
6899 reduction variable. */
6900 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6901 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6902 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6903 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6904
6905 if (slp_node)
6906 {
6907 ncopies = 1;
6908 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6909 }
6910 else
6911 {
6912 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6913 vec_num = 1;
6914 }
6915
6916 internal_fn cond_fn = get_conditional_internal_fn (code);
6917 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6918 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6919
6920 /* Transform. */
6921 stmt_vec_info new_stmt_info = NULL;
6922 stmt_vec_info prev_stmt_info;
6923 tree new_temp = NULL_TREE;
6924 auto_vec<tree> vec_oprnds0;
6925 auto_vec<tree> vec_oprnds1;
6926 auto_vec<tree> vec_oprnds2;
6927 tree def0;
6928
6929 if (dump_enabled_p ())
6930 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6931
6932 /* FORNOW: Multiple types are not supported for condition. */
6933 if (code == COND_EXPR)
6934 gcc_assert (ncopies == 1);
6935
6936 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6937
6938 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6939 if (reduction_type == FOLD_LEFT_REDUCTION)
6940 {
6941 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6942 return vectorize_fold_left_reduction
6943 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6944 reduc_fn, ops, vectype_in, reduc_index, masks);
6945 }
6946
6947 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6948 gcc_assert (single_defuse_cycle
6949 || code == DOT_PROD_EXPR
6950 || code == WIDEN_SUM_EXPR
6951 || code == SAD_EXPR);
6952
6953 /* Create the destination vector */
6954 tree scalar_dest = gimple_assign_lhs (stmt);
6955 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6956
6957 prev_stmt_info = NULL;
6958 if (!slp_node)
6959 {
6960 vec_oprnds0.create (1);
6961 vec_oprnds1.create (1);
6962 if (op_type == ternary_op)
6963 vec_oprnds2.create (1);
6964 }
6965
6966 for (j = 0; j < ncopies; j++)
6967 {
6968 /* Handle uses. */
6969 if (j == 0)
6970 {
6971 if (slp_node)
6972 {
6973 /* Get vec defs for all the operands except the reduction index,
6974 ensuring the ordering of the ops in the vector is kept. */
6975 auto_vec<vec<tree>, 3> vec_defs;
6976 vect_get_slp_defs (slp_node, &vec_defs);
6977 vec_oprnds0.safe_splice (vec_defs[0]);
6978 vec_defs[0].release ();
6979 vec_oprnds1.safe_splice (vec_defs[1]);
6980 vec_defs[1].release ();
6981 if (op_type == ternary_op)
6982 {
6983 vec_oprnds2.safe_splice (vec_defs[2]);
6984 vec_defs[2].release ();
6985 }
6986 }
6987 else
6988 {
6989 vec_oprnds0.quick_push
6990 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6991 vec_oprnds1.quick_push
6992 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6993 if (op_type == ternary_op)
6994 vec_oprnds2.quick_push
6995 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6996 }
6997 }
6998 else
6999 {
7000 if (!slp_node)
7001 {
7002 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7003
7004 if (single_defuse_cycle && reduc_index == 0)
7005 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7006 else
7007 vec_oprnds0[0]
7008 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7009 vec_oprnds0[0]);
7010 if (single_defuse_cycle && reduc_index == 1)
7011 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7012 else
7013 vec_oprnds1[0]
7014 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7015 vec_oprnds1[0]);
7016 if (op_type == ternary_op)
7017 {
7018 if (single_defuse_cycle && reduc_index == 2)
7019 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7020 else
7021 vec_oprnds2[0]
7022 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7023 vec_oprnds2[0]);
7024 }
7025 }
7026 }
7027
7028 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7029 {
7030 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7031 if (masked_loop_p && !mask_by_cond_expr)
7032 {
7033 /* Make sure that the reduction accumulator is vop[0]. */
7034 if (reduc_index == 1)
7035 {
7036 gcc_assert (commutative_tree_code (code));
7037 std::swap (vop[0], vop[1]);
7038 }
7039 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7040 vectype_in, i * ncopies + j);
7041 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7042 vop[0], vop[1],
7043 vop[0]);
7044 new_temp = make_ssa_name (vec_dest, call);
7045 gimple_call_set_lhs (call, new_temp);
7046 gimple_call_set_nothrow (call, true);
7047 new_stmt_info
7048 = vect_finish_stmt_generation (stmt_info, call, gsi);
7049 }
7050 else
7051 {
7052 if (op_type == ternary_op)
7053 vop[2] = vec_oprnds2[i];
7054
7055 if (masked_loop_p && mask_by_cond_expr)
7056 {
7057 tree mask = vect_get_loop_mask (gsi, masks,
7058 vec_num * ncopies,
7059 vectype_in, i * ncopies + j);
7060 build_vect_cond_expr (code, vop, mask, gsi);
7061 }
7062
7063 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7064 vop[0], vop[1], vop[2]);
7065 new_temp = make_ssa_name (vec_dest, new_stmt);
7066 gimple_assign_set_lhs (new_stmt, new_temp);
7067 new_stmt_info
7068 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7069 }
7070
7071 if (slp_node)
7072 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7073 }
7074
7075 if (slp_node || single_defuse_cycle)
7076 continue;
7077
7078 if (j == 0)
7079 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7080 else
7081 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7082
7083 prev_stmt_info = new_stmt_info;
7084 }
7085
7086 if (single_defuse_cycle && !slp_node)
7087 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7088
7089 return true;
7090 }
7091
7092 /* Transform phase of a cycle PHI. */
7093
7094 bool
7095 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7096 slp_tree slp_node, slp_instance slp_node_instance)
7097 {
7098 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7099 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7100 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7101 int i;
7102 int ncopies;
7103 stmt_vec_info prev_phi_info;
7104 int j;
7105 bool nested_cycle = false;
7106 int vec_num;
7107
7108 if (nested_in_vect_loop_p (loop, stmt_info))
7109 {
7110 loop = loop->inner;
7111 nested_cycle = true;
7112 }
7113
7114 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7115 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7116 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7117 gcc_assert (reduc_info->is_reduc_info);
7118
7119 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7120 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7121 /* Leave the scalar phi in place. */
7122 return true;
7123
7124 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7125 /* For a nested cycle we do not fill the above. */
7126 if (!vectype_in)
7127 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7128 gcc_assert (vectype_in);
7129
7130 if (slp_node)
7131 {
7132 /* The size vect_schedule_slp_instance computes is off for us. */
7133 vec_num = vect_get_num_vectors
7134 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7135 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7136 ncopies = 1;
7137 }
7138 else
7139 {
7140 vec_num = 1;
7141 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7142 }
7143
7144 /* Check whether we should use a single PHI node and accumulate
7145 vectors to one before the backedge. */
7146 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7147 ncopies = 1;
7148
7149 /* Create the destination vector */
7150 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7151 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7152 vectype_out);
7153
7154 /* Get the loop-entry arguments. */
7155 tree vec_initial_def;
7156 auto_vec<tree> vec_initial_defs;
7157 if (slp_node)
7158 {
7159 vec_initial_defs.reserve (vec_num);
7160 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7161 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7162 tree neutral_op
7163 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7164 STMT_VINFO_REDUC_CODE (reduc_info),
7165 first != NULL);
7166 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7167 &vec_initial_defs, vec_num,
7168 first != NULL, neutral_op);
7169 }
7170 else
7171 {
7172 /* Get at the scalar def before the loop, that defines the initial
7173 value of the reduction variable. */
7174 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7175 loop_preheader_edge (loop));
7176 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7177 and we can't use zero for induc_val, use initial_def. Similarly
7178 for REDUC_MIN and initial_def larger than the base. */
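/* Concretely (illustrative numbers only): for a MAX_EXPR reduction with
   induc_val 7 and a constant initial_def of 3, the test below replaces
   induc_val by 3 and clears STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL so
   that epilogue generation knows the initial_def itself was used as the
   fill value. */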
7179 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7180 {
7181 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7182 if (TREE_CODE (initial_def) == INTEGER_CST
7183 && !integer_zerop (induc_val)
7184 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7185 && tree_int_cst_lt (initial_def, induc_val))
7186 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7187 && tree_int_cst_lt (induc_val, initial_def))))
7188 {
7189 induc_val = initial_def;
7190 /* Communicate we used the initial_def to epilogue
7191 generation. */
7192 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7193 }
7194 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7195 }
7196 else if (nested_cycle)
7197 {
7198 /* Do not use an adjustment def as that case is not supported
7199 correctly if ncopies is not one. */
7200 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7201 reduc_stmt_info);
7202 }
7203 else
7204 {
7205 tree adjustment_def = NULL_TREE;
7206 tree *adjustment_defp = &adjustment_def;
7207 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7208 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7209 adjustment_defp = NULL;
7210 vec_initial_def
7211 = get_initial_def_for_reduction (reduc_stmt_info, code,
7212 initial_def, adjustment_defp);
7213 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7214 }
7215 vec_initial_defs.create (1);
7216 vec_initial_defs.quick_push (vec_initial_def);
7217 }
7218
7219 /* Generate the reduction PHIs upfront. */
7220 prev_phi_info = NULL;
7221 for (i = 0; i < vec_num; i++)
7222 {
7223 tree vec_init_def = vec_initial_defs[i];
7224 for (j = 0; j < ncopies; j++)
7225 {
7226 /* Create the reduction-phi that defines the reduction
7227 operand. */
7228 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7229 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7230
7231 /* Set the loop-entry arg of the reduction-phi. */
7232 if (j != 0 && nested_cycle)
7233 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7234 vec_init_def);
7235 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7236 UNKNOWN_LOCATION);
7237
7238 /* The loop-latch arg is set in epilogue processing. */
7239
7240 if (slp_node)
7241 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7242 else
7243 {
7244 if (j == 0)
7245 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7246 else
7247 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7248 prev_phi_info = new_phi_info;
7249 }
7250 }
7251 }
7252
7253 return true;
7254 }
7255
7256 /* Vectorizes LC PHIs. */
7257
7258 bool
7259 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7260 slp_tree slp_node)
7261 {
7262 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7263 if (!loop_vinfo
7264 || !is_a <gphi *> (stmt_info->stmt)
7265 || gimple_phi_num_args (stmt_info->stmt) != 1)
7266 return false;
7267
7268 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7269 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7270 return false;
7271
7272 if (!vec_stmt) /* transformation not required. */
7273 {
7274 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7275 return true;
7276 }
7277
7278 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7279 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7280 basic_block bb = gimple_bb (stmt_info->stmt);
7281 edge e = single_pred_edge (bb);
7282 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7283 vec<tree> vec_oprnds = vNULL;
7284 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7285 stmt_info, &vec_oprnds, NULL, slp_node);
7286 if (slp_node)
7287 {
7288 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7289 gcc_assert (vec_oprnds.length () == vec_num);
7290 for (unsigned i = 0; i < vec_num; i++)
7291 {
7292 /* Create the vectorized LC PHI node. */
7293 gphi *new_phi = create_phi_node (vec_dest, bb);
7294 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7295 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7296 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7297 }
7298 }
7299 else
7300 {
7301 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7302 stmt_vec_info prev_phi_info = NULL;
7303 for (unsigned i = 0; i < ncopies; i++)
7304 {
7305 if (i != 0)
7306 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7307 /* Create the vectorized LC PHI node. */
7308 gphi *new_phi = create_phi_node (vec_dest, bb);
7309 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7310 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7311 if (i == 0)
7312 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7313 else
7314 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7315 prev_phi_info = new_phi_info;
7316 }
7317 }
7318 vec_oprnds.release ();
7319
7320 return true;
7321 }
7322
7323
7324 /* Function vect_min_worthwhile_factor.
7325
7326 For a loop where we could vectorize the operation indicated by CODE,
7327 return the minimum vectorization factor that makes it worthwhile
7328 to use generic vectors. */
7329 static unsigned int
7330 vect_min_worthwhile_factor (enum tree_code code)
7331 {
7332 switch (code)
7333 {
7334 case PLUS_EXPR:
7335 case MINUS_EXPR:
7336 case NEGATE_EXPR:
7337 return 4;
7338
7339 case BIT_AND_EXPR:
7340 case BIT_IOR_EXPR:
7341 case BIT_XOR_EXPR:
7342 case BIT_NOT_EXPR:
7343 return 2;
7344
7345 default:
7346 return INT_MAX;
7347 }
7348 }
7349
7350 /* Return true if VINFO indicates we are doing loop vectorization and if
7351 it is worth decomposing CODE operations into scalar operations for
7352 that loop's vectorization factor. */
7353
7354 bool
7355 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7356 {
7357 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7358 unsigned HOST_WIDE_INT value;
7359 return (loop_vinfo
7360 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7361 && value >= vect_min_worthwhile_factor (code));
7362 }
7363
7364 /* Function vectorizable_induction
7365
7366 Check if STMT_INFO performs an induction computation that can be vectorized.
7367 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7368 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7369 Return true if STMT_INFO is vectorizable in this way. */
7370
7371 bool
7372 vectorizable_induction (stmt_vec_info stmt_info,
7373 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7374 stmt_vec_info *vec_stmt, slp_tree slp_node,
7375 stmt_vector_for_cost *cost_vec)
7376 {
7377 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7378 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7379 unsigned ncopies;
7380 bool nested_in_vect_loop = false;
7381 class loop *iv_loop;
7382 tree vec_def;
7383 edge pe = loop_preheader_edge (loop);
7384 basic_block new_bb;
7385 tree new_vec, vec_init, vec_step, t;
7386 tree new_name;
7387 gimple *new_stmt;
7388 gphi *induction_phi;
7389 tree induc_def, vec_dest;
7390 tree init_expr, step_expr;
7391 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7392 unsigned i;
7393 tree expr;
7394 gimple_seq stmts;
7395 imm_use_iterator imm_iter;
7396 use_operand_p use_p;
7397 gimple *exit_phi;
7398 edge latch_e;
7399 tree loop_arg;
7400 gimple_stmt_iterator si;
7401
7402 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7403 if (!phi)
7404 return false;
7405
7406 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7407 return false;
7408
7409 /* Make sure it was recognized as induction computation. */
7410 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7411 return false;
7412
7413 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7414 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7415
7416 if (slp_node)
7417 ncopies = 1;
7418 else
7419 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7420 gcc_assert (ncopies >= 1);
7421
7422 /* FORNOW. These restrictions should be relaxed. */
7423 if (nested_in_vect_loop_p (loop, stmt_info))
7424 {
7425 imm_use_iterator imm_iter;
7426 use_operand_p use_p;
7427 gimple *exit_phi;
7428 edge latch_e;
7429 tree loop_arg;
7430
7431 if (ncopies > 1)
7432 {
7433 if (dump_enabled_p ())
7434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7435 "multiple types in nested loop.\n");
7436 return false;
7437 }
7438
7439 /* FORNOW: outer loop induction with SLP not supported. */
7440 if (STMT_SLP_TYPE (stmt_info))
7441 return false;
7442
7443 exit_phi = NULL;
7444 latch_e = loop_latch_edge (loop->inner);
7445 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7446 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7447 {
7448 gimple *use_stmt = USE_STMT (use_p);
7449 if (is_gimple_debug (use_stmt))
7450 continue;
7451
7452 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7453 {
7454 exit_phi = use_stmt;
7455 break;
7456 }
7457 }
7458 if (exit_phi)
7459 {
7460 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7461 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7462 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7463 {
7464 if (dump_enabled_p ())
7465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7466 "inner-loop induction only used outside "
7467 "of the outer vectorized loop.\n");
7468 return false;
7469 }
7470 }
7471
7472 nested_in_vect_loop = true;
7473 iv_loop = loop->inner;
7474 }
7475 else
7476 iv_loop = loop;
7477 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7478
7479 if (slp_node && !nunits.is_constant ())
7480 {
7481 /* The current SLP code creates the initial value element-by-element. */
7482 if (dump_enabled_p ())
7483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7484 "SLP induction not supported for variable-length"
7485 " vectors.\n");
7486 return false;
7487 }
7488
7489 if (!vec_stmt) /* transformation not required. */
7490 {
7491 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7492 DUMP_VECT_SCOPE ("vectorizable_induction");
7493 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7494 return true;
7495 }
7496
7497 /* Transform. */
7498
7499 /* Compute a vector variable, initialized with the first VF values of
7500 the induction variable. E.g., for an iv with IV_PHI='X' and
7501 evolution S, for a vector of 4 units, we want to compute:
7502 [X, X + S, X + 2*S, X + 3*S]. */
7503
7504 if (dump_enabled_p ())
7505 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7506
7507 latch_e = loop_latch_edge (iv_loop);
7508 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7509
7510 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7511 gcc_assert (step_expr != NULL_TREE);
7512 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7513
7514 pe = loop_preheader_edge (iv_loop);
7515 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7516 loop_preheader_edge (iv_loop));
7517
7518 stmts = NULL;
7519 if (!nested_in_vect_loop)
7520 {
7521 /* Convert the initial value to the IV update type. */
7522 tree new_type = TREE_TYPE (step_expr);
7523 init_expr = gimple_convert (&stmts, new_type, init_expr);
7524
7525 /* If we are using the loop mask to "peel" for alignment then we need
7526 to adjust the start value here. */
7527 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7528 if (skip_niters != NULL_TREE)
7529 {
7530 if (FLOAT_TYPE_P (vectype))
7531 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7532 skip_niters);
7533 else
7534 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7535 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7536 skip_niters, step_expr);
7537 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7538 init_expr, skip_step);
7539 }
7540 }
7541
7542 if (stmts)
7543 {
7544 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7545 gcc_assert (!new_bb);
7546 }
7547
7548 /* Find the first insertion point in the BB. */
7549 basic_block bb = gimple_bb (phi);
7550 si = gsi_after_labels (bb);
7551
7552 /* For SLP induction we have to generate several IVs as for example
7553 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7554 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7555 [VF*S, VF*S, VF*S, VF*S] for all. */
7556 if (slp_node)
7557 {
7558 /* Enforced above. */
7559 unsigned int const_nunits = nunits.to_constant ();
7560
7561 /* Generate [VF*S, VF*S, ... ]. */
7562 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7563 {
7564 expr = build_int_cst (integer_type_node, vf);
7565 expr = fold_convert (TREE_TYPE (step_expr), expr);
7566 }
7567 else
7568 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7569 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7570 expr, step_expr);
7571 if (! CONSTANT_CLASS_P (new_name))
7572 new_name = vect_init_vector (stmt_info, new_name,
7573 TREE_TYPE (step_expr), NULL);
7574 new_vec = build_vector_from_val (step_vectype, new_name);
7575 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7576
7577 /* Now generate the IVs. */
7578 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7579 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7580 unsigned elts = const_nunits * nvects;
7581 unsigned nivs = least_common_multiple (group_size,
7582 const_nunits) / const_nunits;
7583 gcc_assert (elts % group_size == 0);
7584 tree elt = init_expr;
7585 unsigned ivn;
7586 for (ivn = 0; ivn < nivs; ++ivn)
7587 {
7588 tree_vector_builder elts (step_vectype, const_nunits, 1);
7589 stmts = NULL;
7590 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7591 {
7592 if (ivn*const_nunits + eltn >= group_size
7593 && (ivn * const_nunits + eltn) % group_size == 0)
7594 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7595 elt, step_expr);
7596 elts.quick_push (elt);
7597 }
7598 vec_init = gimple_build_vector (&stmts, &elts);
7599 vec_init = gimple_convert (&stmts, vectype, vec_init);
7600 if (stmts)
7601 {
7602 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7603 gcc_assert (!new_bb);
7604 }
7605
7606 /* Create the induction-phi that defines the induction-operand. */
7607 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7608 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7609 stmt_vec_info induction_phi_info
7610 = loop_vinfo->add_stmt (induction_phi);
7611 induc_def = PHI_RESULT (induction_phi);
7612
7613 /* Create the iv update inside the loop */
7614 gimple_seq stmts = NULL;
7615 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7616 vec_def = gimple_build (&stmts,
7617 PLUS_EXPR, step_vectype, vec_def, vec_step);
7618 vec_def = gimple_convert (&stmts, vectype, vec_def);
7619 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7620 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7621
7622 /* Set the arguments of the phi node: */
7623 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7624 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7625 UNKNOWN_LOCATION);
7626
7627 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7628 }
7629
7630 /* Re-use IVs when we can. */
7631 if (ivn < nvects)
7632 {
7633 unsigned vfp
7634 = least_common_multiple (group_size, const_nunits) / group_size;
7635 /* Generate [VF'*S, VF'*S, ... ]. */
7636 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7637 {
7638 expr = build_int_cst (integer_type_node, vfp);
7639 expr = fold_convert (TREE_TYPE (step_expr), expr);
7640 }
7641 else
7642 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7643 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7644 expr, step_expr);
7645 if (! CONSTANT_CLASS_P (new_name))
7646 new_name = vect_init_vector (stmt_info, new_name,
7647 TREE_TYPE (step_expr), NULL);
7648 new_vec = build_vector_from_val (step_vectype, new_name);
7649 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7650 for (; ivn < nvects; ++ivn)
7651 {
7652 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7653 tree def;
7654 if (gimple_code (iv) == GIMPLE_PHI)
7655 def = gimple_phi_result (iv);
7656 else
7657 def = gimple_assign_lhs (iv);
7658 gimple_seq stmts = NULL;
7659 def = gimple_convert (&stmts, step_vectype, def);
7660 def = gimple_build (&stmts,
7661 PLUS_EXPR, step_vectype, def, vec_step);
7662 def = gimple_convert (&stmts, vectype, def);
7663 if (gimple_code (iv) == GIMPLE_PHI)
7664 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7665 else
7666 {
7667 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7668 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7669 }
7670 SLP_TREE_VEC_STMTS (slp_node).quick_push
7671 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7672 }
7673 }
7674
7675 return true;
7676 }
7677
7678 /* Create the vector that holds the initial_value of the induction. */
7679 if (nested_in_vect_loop)
7680 {
7681 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7682 been created during vectorization of previous stmts. We obtain it
7683 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7684 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7685 /* If the initial value is not of proper type, convert it. */
7686 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7687 {
7688 new_stmt
7689 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7690 vect_simple_var,
7691 "vec_iv_"),
7692 VIEW_CONVERT_EXPR,
7693 build1 (VIEW_CONVERT_EXPR, vectype,
7694 vec_init));
7695 vec_init = gimple_assign_lhs (new_stmt);
7696 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7697 new_stmt);
7698 gcc_assert (!new_bb);
7699 loop_vinfo->add_stmt (new_stmt);
7700 }
7701 }
7702 else
7703 {
7704 /* iv_loop is the loop to be vectorized. Create:
7705 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7706 stmts = NULL;
7707 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7708
7709 unsigned HOST_WIDE_INT const_nunits;
7710 if (nunits.is_constant (&const_nunits))
7711 {
7712 tree_vector_builder elts (step_vectype, const_nunits, 1);
7713 elts.quick_push (new_name);
7714 for (i = 1; i < const_nunits; i++)
7715 {
7716 /* Create: new_name_i = new_name + step_expr */
7717 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7718 new_name, step_expr);
7719 elts.quick_push (new_name);
7720 }
7721 /* Create a vector from [new_name_0, new_name_1, ...,
7722 new_name_nunits-1] */
7723 vec_init = gimple_build_vector (&stmts, &elts);
7724 }
7725 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7726 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7727 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7728 new_name, step_expr);
7729 else
7730 {
7731 /* Build:
7732 [base, base, base, ...]
7733 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7734 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7735 gcc_assert (flag_associative_math);
7736 tree index = build_index_vector (step_vectype, 0, 1);
7737 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7738 new_name);
7739 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7740 step_expr);
7741 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7742 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7743 vec_init, step_vec);
7744 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7745 vec_init, base_vec);
7746 }
7747 vec_init = gimple_convert (&stmts, vectype, vec_init);
7748
7749 if (stmts)
7750 {
7751 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7752 gcc_assert (!new_bb);
7753 }
7754 }
7755
7756
7757 /* Create the vector that holds the step of the induction. */
7758 if (nested_in_vect_loop)
7759 /* iv_loop is nested in the loop to be vectorized. Generate:
7760 vec_step = [S, S, S, S] */
7761 new_name = step_expr;
7762 else
7763 {
7764 /* iv_loop is the loop to be vectorized. Generate:
7765 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7766 gimple_seq seq = NULL;
7767 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7768 {
7769 expr = build_int_cst (integer_type_node, vf);
7770 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7771 }
7772 else
7773 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7774 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7775 expr, step_expr);
7776 if (seq)
7777 {
7778 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7779 gcc_assert (!new_bb);
7780 }
7781 }
7782
7783 t = unshare_expr (new_name);
7784 gcc_assert (CONSTANT_CLASS_P (new_name)
7785 || TREE_CODE (new_name) == SSA_NAME);
7786 new_vec = build_vector_from_val (step_vectype, t);
7787 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7788
7789
7790 /* Create the following def-use cycle:
7791 loop prolog:
7792 vec_init = ...
7793 vec_step = ...
7794 loop:
7795 vec_iv = PHI <vec_init, vec_loop>
7796 ...
7797 STMT
7798 ...
7799 vec_loop = vec_iv + vec_step; */
7800
7801 /* Create the induction-phi that defines the induction-operand. */
7802 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7803 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7804 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7805 induc_def = PHI_RESULT (induction_phi);
7806
7807 /* Create the iv update inside the loop */
7808 stmts = NULL;
7809 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7810 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7811 vec_def = gimple_convert (&stmts, vectype, vec_def);
7812 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7813 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7814 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7815
7816 /* Set the arguments of the phi node: */
7817 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7818 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7819 UNKNOWN_LOCATION);
7820
7821 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7822
7823 /* In case the vectorization factor (VF) is bigger than the number
7824 of elements that we can fit in a vectype (nunits), we have to generate
7825 more than one vector stmt - i.e - we need to "unroll" the
7826 vector stmt by a factor VF/nunits. For more details see documentation
7827 in vectorizable_operation. */
7828
7829 if (ncopies > 1)
7830 {
7831 gimple_seq seq = NULL;
7832 stmt_vec_info prev_stmt_vinfo;
7833 /* FORNOW. This restriction should be relaxed. */
7834 gcc_assert (!nested_in_vect_loop);
7835
7836 /* Create the vector that holds the step of the induction. */
7837 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7838 {
7839 expr = build_int_cst (integer_type_node, nunits);
7840 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7841 }
7842 else
7843 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7844 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7845 expr, step_expr);
7846 if (seq)
7847 {
7848 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7849 gcc_assert (!new_bb);
7850 }
7851
7852 t = unshare_expr (new_name);
7853 gcc_assert (CONSTANT_CLASS_P (new_name)
7854 || TREE_CODE (new_name) == SSA_NAME);
7855 new_vec = build_vector_from_val (step_vectype, t);
7856 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7857
7858 vec_def = induc_def;
7859 prev_stmt_vinfo = induction_phi_info;
7860 for (i = 1; i < ncopies; i++)
7861 {
7862 /* vec_i = vec_prev + vec_step */
7863 gimple_seq stmts = NULL;
7864 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7865 vec_def = gimple_build (&stmts,
7866 PLUS_EXPR, step_vectype, vec_def, vec_step);
7867 vec_def = gimple_convert (&stmts, vectype, vec_def);
7868
7869 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7870 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7871 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7872 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7873 prev_stmt_vinfo = new_stmt_info;
7874 }
7875 }
7876
7877 if (nested_in_vect_loop)
7878 {
7879 /* Find the loop-closed exit-phi of the induction, and record
7880 the final vector of induction results: */
7881 exit_phi = NULL;
7882 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7883 {
7884 gimple *use_stmt = USE_STMT (use_p);
7885 if (is_gimple_debug (use_stmt))
7886 continue;
7887
7888 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7889 {
7890 exit_phi = use_stmt;
7891 break;
7892 }
7893 }
7894 if (exit_phi)
7895 {
7896 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7897 /* FORNOW. Currently not supporting the case that an inner-loop induction
7898 is not used in the outer-loop (i.e. only outside the outer-loop). */
7899 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7900 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7901
7902 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7903 if (dump_enabled_p ())
7904 dump_printf_loc (MSG_NOTE, vect_location,
7905 "vector of inductions after inner-loop:%G",
7906 new_stmt);
7907 }
7908 }
7909
7910
7911 if (dump_enabled_p ())
7912 dump_printf_loc (MSG_NOTE, vect_location,
7913 "transform induction: created def-use cycle: %G%G",
7914 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7915
7916 return true;
7917 }
7918
7919 /* Function vectorizable_live_operation.
7920
7921 STMT_INFO computes a value that is used outside the loop. Check if
7922 it can be supported. */
7923
7924 bool
7925 vectorizable_live_operation (stmt_vec_info stmt_info,
7926 gimple_stmt_iterator *gsi,
7927 slp_tree slp_node, slp_instance slp_node_instance,
7928 int slp_index, bool vec_stmt_p,
7929 stmt_vector_for_cost *)
7930 {
7931 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7932 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7933 imm_use_iterator imm_iter;
7934 tree lhs, lhs_type, bitsize, vec_bitsize;
7935 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7936 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7937 int ncopies;
7938 gimple *use_stmt;
7939 auto_vec<tree> vec_oprnds;
7940 int vec_entry = 0;
7941 poly_uint64 vec_index = 0;
7942
7943 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7944
7945 /* Due to how we generate code for SLP_TREE_TWO_OPERATORS we cannot
7946 vectorize live operations out of it. */
7947 if (slp_node && SLP_TREE_TWO_OPERATORS (slp_node))
7948 return false;
7949
7950 /* If a stmt of a reduction is live, vectorize it via
7951 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7952 validity so just trigger the transform here. */
7953 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7954 {
7955 if (!vec_stmt_p)
7956 return true;
7957 if (slp_node)
7958 {
7959 /* For reduction chains the meta-info is attached to
7960 the group leader. */
7961 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7962 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7963 /* For SLP reductions we vectorize the epilogue for
7964 all involved stmts together. */
7965 else if (slp_index != 0)
7966 return true;
7967 }
7968 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7969 gcc_assert (reduc_info->is_reduc_info);
7970 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7971 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7972 return true;
7973 vect_create_epilog_for_reduction (stmt_info, slp_node,
7974 slp_node_instance);
7975 return true;
7976 }
7977
7978 /* FORNOW. CHECKME. */
7979 if (nested_in_vect_loop_p (loop, stmt_info))
7980 return false;
7981
7982 /* If STMT is not relevant and it is a simple assignment and its inputs are
7983 invariant then it can remain in place, unvectorized. The original last
7984 scalar value that it computes will be used. */
7985 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7986 {
7987 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7988 if (dump_enabled_p ())
7989 dump_printf_loc (MSG_NOTE, vect_location,
7990 "statement is simple and uses invariant. Leaving in "
7991 "place.\n");
7992 return true;
7993 }
7994
7995 if (slp_node)
7996 ncopies = 1;
7997 else
7998 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7999
8000 if (slp_node)
8001 {
8002 gcc_assert (slp_index >= 0);
8003
8004 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8005 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8006
8007 /* Get the last occurrence of the scalar index from the concatenation of
8008 all the slp vectors. Calculate which slp vector it is and the index
8009 within. */
8010 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
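/* A worked example with hypothetical numbers: for num_scalar == 3,
   num_vec == 2 and nunits == 4, the concatenation of the SLP vectors has
   8 lanes and the last occurrences of the scalars sit in lanes 5, 6 and 7.
   For slp_index == 1, pos = 2 * 4 - 3 + 1 = 6, and can_div_trunc_p below
   yields vec_entry == 1 and vec_index == 2, i.e. lane 2 of the second SLP
   vector.  */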
8011
8012 /* Calculate which vector contains the result, and which lane of
8013 that vector we need. */
8014 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8015 {
8016 if (dump_enabled_p ())
8017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8018 "Cannot determine which vector holds the"
8019 " final result.\n");
8020 return false;
8021 }
8022 }
8023
8024 if (!vec_stmt_p)
8025 {
8026 /* No transformation required. */
8027 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8028 {
8029 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8030 OPTIMIZE_FOR_SPEED))
8031 {
8032 if (dump_enabled_p ())
8033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8034 "can't use a fully-masked loop because "
8035 "the target doesn't support extract last "
8036 "reduction.\n");
8037 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8038 }
8039 else if (slp_node)
8040 {
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8043 "can't use a fully-masked loop because an "
8044 "SLP statement is live after the loop.\n");
8045 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8046 }
8047 else if (ncopies > 1)
8048 {
8049 if (dump_enabled_p ())
8050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051 "can't use a fully-masked loop because"
8052 " ncopies is greater than 1.\n");
8053 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8054 }
8055 else
8056 {
8057 gcc_assert (ncopies == 1 && !slp_node);
8058 vect_record_loop_mask (loop_vinfo,
8059 &LOOP_VINFO_MASKS (loop_vinfo),
8060 1, vectype, NULL);
8061 }
8062 }
8063 return true;
8064 }
8065
8066 /* Use the lhs of the original scalar statement. */
8067 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8068
8069 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8070 : gimple_get_lhs (stmt);
8071 lhs_type = TREE_TYPE (lhs);
8072
8073 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8074 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8075 : TYPE_SIZE (TREE_TYPE (vectype)));
8076 vec_bitsize = TYPE_SIZE (vectype);
8077
8078 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8079 tree vec_lhs, bitstart;
8080 if (slp_node)
8081 {
8082 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8083
8084 /* Get the correct slp vectorized stmt. */
8085 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8086 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8087 vec_lhs = gimple_phi_result (phi);
8088 else
8089 vec_lhs = gimple_get_lhs (vec_stmt);
8090
8091 /* Get entry to use. */
8092 bitstart = bitsize_int (vec_index);
8093 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8094 }
8095 else
8096 {
8097 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8098 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8099 gcc_checking_assert (ncopies == 1
8100 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8101
8102 /* For multiple copies, get the last copy. */
8103 for (int i = 1; i < ncopies; ++i)
8104 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8105
8106 /* Get the last lane in the vector. */
8107 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8108 }
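/* For instance, for an assumed V4SI vector of 32-bit elements, vec_bitsize
   is 128 and bitsize is 32, so in the non-SLP case bitstart is
   128 - 32 = 96 and the BIT_FIELD_REF built below selects the lane at bit
   offset 96, i.e. the last lane of the last copy.  */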
8109
8110 /* To ensure that the VEC_LHS used by the lane-extraction stmts satisfies the
8111 loop-closed PHI requirement, insert a PHI node for it. It looks like:
8112 loop;
8113 BB:
8114 # lhs' = PHI <lhs>
8115 ==>
8116 loop;
8117 BB:
8118 # vec_lhs' = PHI <vec_lhs>
8119 new_tree = lane_extract <vec_lhs', ...>;
8120 lhs' = new_tree; */
8121
8122 basic_block exit_bb = single_exit (loop)->dest;
8123 gcc_assert (single_pred_p (exit_bb));
8124
8125 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8126 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8127 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8128
8129 gimple_seq stmts = NULL;
8130 tree new_tree;
8131 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8132 {
8133 /* Emit:
8134
8135 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8136
8137 where VEC_LHS is the vectorized live-out result and MASK is
8138 the loop mask for the final iteration. */
8139 gcc_assert (ncopies == 1 && !slp_node);
8140 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8141 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8142 vectype, 0);
8143 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8144 mask, vec_lhs_phi);
8145
8146 /* Convert the extracted vector element to the required scalar type. */
8147 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8148 }
8149 else
8150 {
8151 tree bftype = TREE_TYPE (vectype);
8152 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8153 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8154 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8155 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8156 &stmts, true, NULL_TREE);
8157 }
8158
8159 if (stmts)
8160 {
8161 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8162 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8163
8164 /* Remove the existing PHI that uses LHS and replace it with a copy of NEW_TREE into the PHI result. */
8165 tree lhs_phi = NULL_TREE;
8166 gimple_stmt_iterator gsi;
8167 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8168 {
8169 gimple *phi = gsi_stmt (gsi);
8170 if (gimple_phi_arg_def (phi, 0) == lhs)
8171 {
8172 remove_phi_node (&gsi, false);
8173 lhs_phi = gimple_phi_result (phi);
8174 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8175 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8176 break;
8177 }
8178 }
8179 }
8180
8181 /* Replace uses of LHS with the newly computed result. If the use stmt is a
8182 single-argument PHI, just replace all uses of the PHI result; this is
8183 necessary because the LCSSA PHI defining LHS may appear before the newly inserted stmt. */
8184 use_operand_p use_p;
8185 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8186 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8187 && !is_gimple_debug (use_stmt))
8188 {
8189 if (gimple_code (use_stmt) == GIMPLE_PHI
8190 && gimple_phi_num_args (use_stmt) == 1)
8191 {
8192 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8193 }
8194 else
8195 {
8196 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8197 SET_USE (use_p, new_tree);
8198 }
8199 update_stmt (use_stmt);
8200 }
8201
8202 return true;
8203 }
8204
8205 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8206
8207 static void
8208 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8209 {
8210 ssa_op_iter op_iter;
8211 imm_use_iterator imm_iter;
8212 def_operand_p def_p;
8213 gimple *ustmt;
8214
8215 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8216 {
8217 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8218 {
8219 basic_block bb;
8220
8221 if (!is_gimple_debug (ustmt))
8222 continue;
8223
8224 bb = gimple_bb (ustmt);
8225
8226 if (!flow_bb_inside_loop_p (loop, bb))
8227 {
8228 if (gimple_debug_bind_p (ustmt))
8229 {
8230 if (dump_enabled_p ())
8231 dump_printf_loc (MSG_NOTE, vect_location,
8232 "killing debug use\n");
8233
8234 gimple_debug_bind_reset_value (ustmt);
8235 update_stmt (ustmt);
8236 }
8237 else
8238 gcc_unreachable ();
8239 }
8240 }
8241 }
8242 }
8243
8244 /* Given loop represented by LOOP_VINFO, return true if computation of
8245 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8246 otherwise. */
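/* For example, if the loop iterates over the full range of a 32-bit
   unsigned IV, LOOP_VINFO_NITERSM1 is 0xffffffff and LOOP_VINFO_NITERS
   wraps around to 0, so neither check below succeeds and the function
   conservatively returns false.  */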
8247
8248 static bool
8249 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8250 {
8251 /* Constant case. */
8252 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8253 {
8254 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8255 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8256
8257 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8258 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8259 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8260 return true;
8261 }
8262
8263 widest_int max;
8264 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8265 /* Check the upper bound of loop niters. */
8266 if (get_max_loop_iterations (loop, &max))
8267 {
8268 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8269 signop sgn = TYPE_SIGN (type);
8270 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8271 if (max < type_max)
8272 return true;
8273 }
8274 return false;
8275 }
8276
8277 /* Return a mask type with half the number of elements as OLD_TYPE,
8278 given that it should have mode NEW_MODE. */
8279
8280 tree
8281 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8282 {
8283 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8284 return build_truth_vector_type_for_mode (nunits, new_mode);
8285 }
8286
8287 /* Return a mask type with twice as many elements as OLD_TYPE,
8288 given that it should have mode NEW_MODE. */
8289
8290 tree
8291 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8292 {
8293 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8294 return build_truth_vector_type_for_mode (nunits, new_mode);
8295 }
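/* For example, starting from a 16-element mask type, vect_halve_mask_nunits
   builds an 8-element mask type with mode NEW_MODE, while
   vect_double_mask_nunits builds a 32-element one; both go through
   build_truth_vector_type_for_mode.  */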
8296
8297 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8298 contain a sequence of NVECTORS masks that each control a vector of type
8299 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8300 these vector masks with the vector version of SCALAR_MASK. */
8301
8302 void
8303 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8304 unsigned int nvectors, tree vectype, tree scalar_mask)
8305 {
8306 gcc_assert (nvectors != 0);
8307 if (masks->length () < nvectors)
8308 masks->safe_grow_cleared (nvectors);
8309 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8310 /* The number of scalars per iteration and the number of vectors are
8311 both compile-time constants. */
8312 unsigned int nscalars_per_iter
8313 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8314 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
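/* A worked example with assumed numbers: for a loop with VF == 8 that needs
   two V8HI masks per iteration (nvectors == 2, each vector having 8
   elements), nscalars_per_iter = exact_div (2 * 8, 8) = 2, so the rgroup at
   index nvectors - 1 records up to 2 scalars per iteration controlled by
   each mask.  */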
8315
8316 if (scalar_mask)
8317 {
8318 scalar_cond_masked_key cond (scalar_mask, nvectors);
8319 loop_vinfo->scalar_cond_masked_set.add (cond);
8320 }
8321
8322 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8323 {
8324 rgm->max_nscalars_per_iter = nscalars_per_iter;
8325 rgm->mask_type = truth_type_for (vectype);
8326 }
8327 }
8328
8329 /* Given a complete set of masks MASKS, extract mask number INDEX
8330 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8331 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8332
8333 See the comment above vec_loop_masks for more details about the mask
8334 arrangement. */
8335
8336 tree
8337 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8338 unsigned int nvectors, tree vectype, unsigned int index)
8339 {
8340 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8341 tree mask_type = rgm->mask_type;
8342
8343 /* Populate the rgroup's mask array, if this is the first time we've
8344 used it. */
8345 if (rgm->masks.is_empty ())
8346 {
8347 rgm->masks.safe_grow_cleared (nvectors);
8348 for (unsigned int i = 0; i < nvectors; ++i)
8349 {
8350 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8351 /* Provide a dummy definition until the real one is available. */
8352 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8353 rgm->masks[i] = mask;
8354 }
8355 }
8356
8357 tree mask = rgm->masks[index];
8358 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8359 TYPE_VECTOR_SUBPARTS (vectype)))
8360 {
8361 /* A loop mask for data type X can be reused for data type Y
8362 if X has N times more elements than Y and if Y's elements
8363 are N times bigger than X's. In this case each sequence
8364 of N elements in the loop mask will be all-zero or all-one.
8365 We can then view-convert the mask so that each sequence of
8366 N elements is replaced by a single element. */
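/* For example (assumed types): a mask computed for V16QI data (16 elements)
   can be reused for V4SI data (4 elements) because each group of 4 mask
   elements is known to be all-zero or all-one; the VIEW_CONVERT_EXPR built
   below reinterprets it as a 4-element mask of type truth_type_for (V4SI).  */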
8367 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8368 TYPE_VECTOR_SUBPARTS (vectype)));
8369 gimple_seq seq = NULL;
8370 mask_type = truth_type_for (vectype);
8371 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8372 if (seq)
8373 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8374 }
8375 return mask;
8376 }
8377
8378 /* Scale profiling counters by estimation for LOOP which is vectorized
8379 by factor VF. */
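/* As a rough worked example (assumed numbers): for VF == 4 and an original
   estimated iteration count of about 99, niter_for_unrolled_loop returns
   roughly 24, so the body frequencies are scaled by roughly
   freq_e * 25 / freq_h and the exit edge is given probability 1/25,
   matching one exit per ~25 vector iterations.  */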
8380
8381 static void
8382 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8383 {
8384 edge preheader = loop_preheader_edge (loop);
8385 /* Reduce loop iterations by the vectorization factor. */
8386 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8387 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8388
8389 if (freq_h.nonzero_p ())
8390 {
8391 profile_probability p;
8392
8393 /* Avoid dropping loop body profile counter to 0 because of zero count
8394 in loop's preheader. */
8395 if (!(freq_e == profile_count::zero ()))
8396 freq_e = freq_e.force_nonzero ();
8397 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8398 scale_loop_frequencies (loop, p);
8399 }
8400
8401 edge exit_e = single_exit (loop);
8402 exit_e->probability = profile_probability::always ()
8403 .apply_scale (1, new_est_niter + 1);
8404
8405 edge exit_l = single_pred_edge (loop->latch);
8406 profile_probability prob = exit_l->probability;
8407 exit_l->probability = exit_e->probability.invert ();
8408 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8409 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8410 }
8411
8412 /* For a vectorized stmt DEF_STMT_INFO, adjust all vectorized PHIs whose
8413 latch-edge value was originally defined by it. */
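/* A rough GIMPLE-level sketch (names invented): if the scalar loop contains

     x_1 = PHI <x_0(preheader), x_2(latch)>
     ...
     x_2 = x_1 + a[i_3];

   then, once the statement defining x_2 has been vectorized, each vectorized
   copy of the PHI gets the lhs of the corresponding vectorized definition of
   x_2 added as its latch argument; the do/while loop below walks the two
   RELATED_STMT chains in lockstep for that purpose.  */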
8414
8415 static void
8416 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8417 stmt_vec_info def_stmt_info)
8418 {
8419 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8420 if (!def || TREE_CODE (def) != SSA_NAME)
8421 return;
8422 stmt_vec_info phi_info;
8423 imm_use_iterator iter;
8424 use_operand_p use_p;
8425 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8426 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8427 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8428 && (phi_info = loop_vinfo->lookup_stmt (phi))
8429 && STMT_VINFO_RELEVANT_P (phi_info)
8430 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8431 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8432 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8433 {
8434 loop_p loop = gimple_bb (phi)->loop_father;
8435 edge e = loop_latch_edge (loop);
8436 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8437 {
8438 stmt_vec_info phi_vec_info = STMT_VINFO_VEC_STMT (phi_info);
8439 stmt_vec_info def_vec_info = STMT_VINFO_VEC_STMT (def_stmt_info);
8440 do
8441 {
8442 add_phi_arg (as_a <gphi *> (phi_vec_info->stmt),
8443 gimple_get_lhs (def_vec_info->stmt), e,
8444 gimple_phi_arg_location (phi, e->dest_idx));
8445 phi_vec_info = STMT_VINFO_RELATED_STMT (phi_vec_info);
8446 def_vec_info = STMT_VINFO_RELATED_STMT (def_vec_info);
8447 }
8448 while (phi_vec_info);
8449 gcc_assert (!def_vec_info);
8450 }
8451 }
8452 }
8453
8454 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8455 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8456 stmt_vec_info. */
8457
8458 static bool
8459 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8460 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8461 {
8462 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8463 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8464
8465 if (dump_enabled_p ())
8466 dump_printf_loc (MSG_NOTE, vect_location,
8467 "------>vectorizing statement: %G", stmt_info->stmt);
8468
8469 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8470 vect_loop_kill_debug_uses (loop, stmt_info);
8471
8472 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8473 && !STMT_VINFO_LIVE_P (stmt_info))
8474 return false;
8475
8476 if (STMT_VINFO_VECTYPE (stmt_info))
8477 {
8478 poly_uint64 nunits
8479 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8480 if (!STMT_SLP_TYPE (stmt_info)
8481 && maybe_ne (nunits, vf)
8482 && dump_enabled_p ())
8483 /* For SLP, VF is set according to the unrolling factor and not
8484 to the vector size, hence for SLP this print is not valid. */
8485 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8486 }
8487
8488 /* Pure SLP statements have already been vectorized. We still need
8489 to apply loop vectorization to hybrid SLP statements. */
8490 if (PURE_SLP_STMT (stmt_info))
8491 return false;
8492
8493 if (dump_enabled_p ())
8494 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8495
8496 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8497 *seen_store = stmt_info;
8498
8499 return true;
8500 }
8501
8502 /* Helper function to pass to simplify_replace_tree so that trees found in
8503 the hash_map are replaced with their corresponding values. */
8504
8505 static tree
8506 find_in_mapping (tree t, void *context)
8507 {
8508 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8509
8510 tree *value = mapping->get (t);
8511 return value ? *value : t;
8512 }
8513
8514 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8515 original loop that has now been vectorized.
8516
8517 The inits of the data_references need to be advanced with the number of
8518 iterations of the main loop. This has been computed in vect_do_peeling and
8519 is stored in parameter ADVANCE. We first restore the data_references
8520 initial offsets with the values recorded in ORIG_DRS_INIT.
8521
8522 Since the loop_vec_info of this EPILOGUE was constructed for the original
8523 loop, its stmt_vec_infos all point to the original statements. These need
8524 to be updated to point to their corresponding copies as well as the SSA_NAMES
8525 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8526
8527 The data_references' connections also need to be updated: their
8528 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8529 stmt_vec_infos and their statements need to point to their corresponding
8530 copies. If they are gather loads or scatter stores, their references need
8531 to be updated to point to the corresponding copies as well. Finally, we
8532 set 'base_misaligned' to false, as we have already peeled for alignment
8533 in the prologue of the main loop. */
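/* As an illustrative sketch (SSA names invented): if the main loop contains
   "_3 = a[i_1];" and the epilogue copy of that statement is
   "_103 = a[i_101];", the mapping records _3 -> _103 (and likewise for the
   PHI results), so the operands of PATTERN_DEF_SEQ and RELATED_STMT
   statements can later be rewritten through find_in_mapping /
   simplify_replace_tree.  */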
8534
8535 static void
8536 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8537 {
8538 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8539 auto_vec<gimple *> stmt_worklist;
8540 hash_map<tree,tree> mapping;
8541 gimple *orig_stmt, *new_stmt;
8542 gimple_stmt_iterator epilogue_gsi;
8543 gphi_iterator epilogue_phi_gsi;
8544 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8545 basic_block *epilogue_bbs = get_loop_body (epilogue);
8546 unsigned i;
8547
8548 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8549
8550 /* Advance the data_references with the number of iterations of the
8551 previous loop and its prologue. */
8552 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8553
8554
8555 /* The EPILOGUE loop is a copy of the original loop so they share the same
8556 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8557 point to the copied statements. We also create a mapping of all LHSs in
8558 the original loop to all the LHSs in the EPILOGUE and create worklists to
8559 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8560 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8561 {
8562 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8563 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8564 {
8565 new_stmt = epilogue_phi_gsi.phi ();
8566
8567 gcc_assert (gimple_uid (new_stmt) > 0);
8568 stmt_vinfo
8569 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8570
8571 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8572 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8573
8574 mapping.put (gimple_phi_result (orig_stmt),
8575 gimple_phi_result (new_stmt));
8576 /* PHI nodes cannot have patterns or related statements. */
8577 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8578 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8579 }
8580
8581 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8582 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8583 {
8584 new_stmt = gsi_stmt (epilogue_gsi);
8585
8586 gcc_assert (gimple_uid (new_stmt) > 0);
8587 stmt_vinfo
8588 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8589
8590 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8591 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8592
8593 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8594 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8595
8596 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8597 {
8598 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8599 for (gimple_stmt_iterator gsi = gsi_start (seq);
8600 !gsi_end_p (gsi); gsi_next (&gsi))
8601 stmt_worklist.safe_push (gsi_stmt (gsi));
8602 }
8603
8604 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8605 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8606 {
8607 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8608 stmt_worklist.safe_push (stmt);
8609 /* Set BB such that the assert in
8610 'get_initial_def_for_reduction' is able to determine that
8611 the BB of the related stmt is inside this loop. */
8612 gimple_set_bb (stmt,
8613 gimple_bb (new_stmt));
8614 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8615 gcc_assert (related_vinfo == NULL
8616 || related_vinfo == stmt_vinfo);
8617 }
8618 }
8619 }
8620
8621 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8622 using the original main loop and thus need to be updated to refer to the
8623 cloned variables used in the epilogue. */
8624 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8625 {
8626 gimple *stmt = stmt_worklist[i];
8627 tree *new_op;
8628
8629 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8630 {
8631 tree op = gimple_op (stmt, j);
8632 if ((new_op = mapping.get(op)))
8633 gimple_set_op (stmt, j, *new_op);
8634 else
8635 {
8636 /* PR92429: The last argument of simplify_replace_tree disables
8637 folding when replacing arguments. This is required as
8638 otherwise you might end up with different statements than the
8639 ones analyzed in vect_loop_analyze, leading to different
8640 vectorization. */
8641 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8642 &find_in_mapping, &mapping, false);
8643 gimple_set_op (stmt, j, op);
8644 }
8645 }
8646 }
8647
8648 struct data_reference *dr;
8649 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8650 FOR_EACH_VEC_ELT (datarefs, i, dr)
8651 {
8652 orig_stmt = DR_STMT (dr);
8653 gcc_assert (gimple_uid (orig_stmt) > 0);
8654 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8655 /* Data references for gather loads and scatter stores do not use the
8656 updated offset we set using ADVANCE. Instead we have to make sure the
8657 references in the data references point to the corresponding copies of
8658 the originals in the epilogue. */
8659 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8660 == VMAT_GATHER_SCATTER)
8661 {
8662 DR_REF (dr)
8663 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8664 &find_in_mapping, &mapping);
8665 DR_BASE_ADDRESS (dr)
8666 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8667 &find_in_mapping, &mapping);
8668 }
8669 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8670 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8671 /* The vector size of the epilogue is smaller than that of the main loop,
8672 so the alignment requirement is either the same or lower. This means
8673 the dr is by definition aligned. */
8674 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8675 }
8676
8677 epilogue_vinfo->shared->datarefs_copy.release ();
8678 epilogue_vinfo->shared->save_datarefs ();
8679 }
8680
8681 /* Function vect_transform_loop.
8682
8683 The analysis phase has determined that the loop is vectorizable.
8684 Vectorize the loop: create vectorized stmts to replace the scalar
8685 stmts in the loop, and update the loop exit condition.
8686 Returns the scalar epilogue loop, if any. */
8687
8688 class loop *
8689 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8690 {
8691 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8692 class loop *epilogue = NULL;
8693 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8694 int nbbs = loop->num_nodes;
8695 int i;
8696 tree niters_vector = NULL_TREE;
8697 tree step_vector = NULL_TREE;
8698 tree niters_vector_mult_vf = NULL_TREE;
8699 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8700 unsigned int lowest_vf = constant_lower_bound (vf);
8701 gimple *stmt;
8702 bool check_profitability = false;
8703 unsigned int th;
8704
8705 DUMP_VECT_SCOPE ("vec_transform_loop");
8706
8707 loop_vinfo->shared->check_datarefs ();
8708
8709 /* Use the more conservative vectorization threshold. If the number
8710 of iterations is constant, assume the cost check has been performed
8711 by our caller. If the threshold makes all loops profitable that
8712 run at least the (estimated) vectorization factor number of times,
8713 checking is pointless, too. */
8714 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8715 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8716 {
8717 if (dump_enabled_p ())
8718 dump_printf_loc (MSG_NOTE, vect_location,
8719 "Profitability threshold is %d loop iterations.\n",
8720 th);
8721 check_profitability = true;
8722 }
8723
8724 /* Make sure there exists a single-predecessor exit bb. Do this before
8725 versioning. */
8726 edge e = single_exit (loop);
8727 if (! single_pred_p (e->dest))
8728 {
8729 split_loop_exit_edge (e, true);
8730 if (dump_enabled_p ())
8731 dump_printf (MSG_NOTE, "split exit edge\n");
8732 }
8733
8734 /* Version the loop first, if required, so the profitability check
8735 comes first. */
8736
8737 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8738 {
8739 class loop *sloop
8740 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8741 sloop->force_vectorize = false;
8742 check_profitability = false;
8743 }
8744
8745 /* Make sure there exists a single-predecessor exit bb also on the
8746 scalar loop copy. Do this after versioning but before peeling
8747 so CFG structure is fine for both scalar and if-converted loop
8748 to make slpeel_duplicate_current_defs_from_edges face matched
8749 loop closed PHI nodes on the exit. */
8750 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8751 {
8752 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8753 if (! single_pred_p (e->dest))
8754 {
8755 split_loop_exit_edge (e, true);
8756 if (dump_enabled_p ())
8757 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8758 }
8759 }
8760
8761 tree niters = vect_build_loop_niters (loop_vinfo);
8762 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8763 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8764 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8765 tree advance;
8766 drs_init_vec orig_drs_init;
8767
8768 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8769 &step_vector, &niters_vector_mult_vf, th,
8770 check_profitability, niters_no_overflow,
8771 &advance);
8772
8773 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8774 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8775 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8776 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8777
8778 if (niters_vector == NULL_TREE)
8779 {
8780 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8781 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8782 && known_eq (lowest_vf, vf))
8783 {
8784 niters_vector
8785 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8786 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8787 step_vector = build_one_cst (TREE_TYPE (niters));
8788 }
8789 else
8790 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8791 &step_vector, niters_no_overflow);
8792 }
8793
8794 /* 1) Make sure the loop header has exactly two entries
8795 2) Make sure we have a preheader basic block. */
8796
8797 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8798
8799 split_edge (loop_preheader_edge (loop));
8800
8801 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8802 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8803 /* This will deal with any possible peeling. */
8804 vect_prepare_for_masked_peels (loop_vinfo);
8805
8806 /* Schedule the SLP instances first, then handle loop vectorization
8807 below. */
8808 if (!loop_vinfo->slp_instances.is_empty ())
8809 {
8810 DUMP_VECT_SCOPE ("scheduling SLP instances");
8811 vect_schedule_slp (loop_vinfo);
8812 }
8813
8814 /* FORNOW: the vectorizer supports only loops whose body consists
8815 of one basic block (header + empty latch). When the vectorizer
8816 supports more involved loop forms, the order in which the BBs are
8817 traversed will need to be reconsidered. */
8818
8819 for (i = 0; i < nbbs; i++)
8820 {
8821 basic_block bb = bbs[i];
8822 stmt_vec_info stmt_info;
8823
8824 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8825 gsi_next (&si))
8826 {
8827 gphi *phi = si.phi ();
8828 if (dump_enabled_p ())
8829 dump_printf_loc (MSG_NOTE, vect_location,
8830 "------>vectorizing phi: %G", phi);
8831 stmt_info = loop_vinfo->lookup_stmt (phi);
8832 if (!stmt_info)
8833 continue;
8834
8835 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8836 vect_loop_kill_debug_uses (loop, stmt_info);
8837
8838 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8839 && !STMT_VINFO_LIVE_P (stmt_info))
8840 continue;
8841
8842 if (STMT_VINFO_VECTYPE (stmt_info)
8843 && (maybe_ne
8844 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8845 && dump_enabled_p ())
8846 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8847
8848 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8849 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8850 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8851 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8852 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8853 && ! PURE_SLP_STMT (stmt_info))
8854 {
8855 if (dump_enabled_p ())
8856 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8857 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8858 }
8859 }
8860
8861 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8862 gsi_next (&si))
8863 {
8864 gphi *phi = si.phi ();
8865 stmt_info = loop_vinfo->lookup_stmt (phi);
8866 if (!stmt_info)
8867 continue;
8868
8869 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8870 && !STMT_VINFO_LIVE_P (stmt_info))
8871 continue;
8872
8873 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8874 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8875 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8876 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8877 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8878 && ! PURE_SLP_STMT (stmt_info))
8879 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
8880 }
8881
8882 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8883 !gsi_end_p (si);)
8884 {
8885 stmt = gsi_stmt (si);
8886 /* During vectorization remove existing clobber stmts. */
8887 if (gimple_clobber_p (stmt))
8888 {
8889 unlink_stmt_vdef (stmt);
8890 gsi_remove (&si, true);
8891 release_defs (stmt);
8892 }
8893 else
8894 {
8895 stmt_info = loop_vinfo->lookup_stmt (stmt);
8896
8897 /* vector stmts created in the outer-loop during vectorization of
8898 stmts in an inner-loop may not have a stmt_info, and do not
8899 need to be vectorized. */
8900 stmt_vec_info seen_store = NULL;
8901 if (stmt_info)
8902 {
8903 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8904 {
8905 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8906 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8907 !gsi_end_p (subsi); gsi_next (&subsi))
8908 {
8909 stmt_vec_info pat_stmt_info
8910 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8911 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8912 &si, &seen_store);
8913 }
8914 stmt_vec_info pat_stmt_info
8915 = STMT_VINFO_RELATED_STMT (stmt_info);
8916 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8917 &si, &seen_store))
8918 maybe_set_vectorized_backedge_value (loop_vinfo,
8919 pat_stmt_info);
8920 }
8921 else
8922 {
8923 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8924 &seen_store))
8925 maybe_set_vectorized_backedge_value (loop_vinfo,
8926 stmt_info);
8927 }
8928 }
8929 gsi_next (&si);
8930 if (seen_store)
8931 {
8932 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8933 /* Interleaving. The vectorization of the
8934 interleaving chain was completed; free
8935 all the stores in the chain. */
8936 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8937 else
8938 /* Free the attached stmt_vec_info and remove the stmt. */
8939 loop_vinfo->remove_stmt (stmt_info);
8940 }
8941 }
8942 }
8943
8944 /* Stub out scalar statements that must not survive vectorization.
8945 Doing this here helps with grouped statements, or statements that
8946 are involved in patterns. */
8947 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8948 !gsi_end_p (gsi); gsi_next (&gsi))
8949 {
8950 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8951 if (!call || !gimple_call_internal_p (call))
8952 continue;
8953 internal_fn ifn = gimple_call_internal_fn (call);
8954 if (ifn == IFN_MASK_LOAD)
8955 {
8956 tree lhs = gimple_get_lhs (call);
8957 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8958 {
8959 tree zero = build_zero_cst (TREE_TYPE (lhs));
8960 gimple *new_stmt = gimple_build_assign (lhs, zero);
8961 gsi_replace (&gsi, new_stmt, true);
8962 }
8963 }
8964 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
8965 {
8966 tree lhs = gimple_get_lhs (call);
8967 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8968 {
8969 tree else_arg
8970 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
8971 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
8972 gsi_replace (&gsi, new_stmt, true);
8973 }
8974 }
8975 }
8976 } /* BBs in loop */
8977
8978 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8979 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8980 if (integer_onep (step_vector))
8981 niters_no_overflow = true;
8982 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8983 niters_vector_mult_vf, !niters_no_overflow);
8984
8985 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8986 scale_profile_for_vect_loop (loop, assumed_vf);
8987
8988 /* True if the final iteration might not handle a full vector's
8989 worth of scalar iterations. */
8990 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8991 /* The minimum number of iterations performed by the epilogue. This
8992 is 1 when peeling for gaps because we always need a final scalar
8993 iteration. */
8994 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8995 /* +1 to convert latch counts to loop iteration counts,
8996 -min_epilogue_iters to remove iterations that cannot be performed
8997 by the vector code. */
8998 int bias_for_lowest = 1 - min_epilogue_iters;
8999 int bias_for_assumed = bias_for_lowest;
9000 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9001 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9002 {
9003 /* When the amount of peeling is known at compile time, the first
9004 iteration will have exactly alignment_npeels active elements.
9005 In the worst case it will have at least one. */
9006 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9007 bias_for_lowest += lowest_vf - min_first_active;
9008 bias_for_assumed += assumed_vf - min_first_active;
9009 }
9010 /* In these calculations the "- 1" converts loop iteration counts
9011 back to latch counts. */
9012 if (loop->any_upper_bound)
9013 loop->nb_iterations_upper_bound
9014 = (final_iter_may_be_partial
9015 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9016 lowest_vf) - 1
9017 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9018 lowest_vf) - 1);
9019 if (loop->any_likely_upper_bound)
9020 loop->nb_iterations_likely_upper_bound
9021 = (final_iter_may_be_partial
9022 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9023 + bias_for_lowest, lowest_vf) - 1
9024 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9025 + bias_for_lowest, lowest_vf) - 1);
9026 if (loop->any_estimate)
9027 loop->nb_iterations_estimate
9028 = (final_iter_may_be_partial
9029 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9030 assumed_vf) - 1
9031 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9032 assumed_vf) - 1);
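/* A worked example with assumed numbers: with lowest_vf == 4, no peeling
   for gaps and no full masking, a scalar latch bound of 17 (at most 18
   iterations) becomes

     udiv_floor (17 + 1, 4) - 1 = 3

   vector latch iterations, whereas a fully-masked loop would use
   udiv_ceil (17 + 1, 4) - 1 = 4, because its final iteration may be
   partial.  */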
9033
9034 if (dump_enabled_p ())
9035 {
9036 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9037 {
9038 dump_printf_loc (MSG_NOTE, vect_location,
9039 "LOOP VECTORIZED\n");
9040 if (loop->inner)
9041 dump_printf_loc (MSG_NOTE, vect_location,
9042 "OUTER LOOP VECTORIZED\n");
9043 dump_printf (MSG_NOTE, "\n");
9044 }
9045 else
9046 dump_printf_loc (MSG_NOTE, vect_location,
9047 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9048 GET_MODE_NAME (loop_vinfo->vector_mode));
9049 }
9050
9051 /* Loops vectorized with a variable factor won't benefit from
9052 unrolling/peeling. */
9053 if (!vf.is_constant ())
9054 {
9055 loop->unroll = 1;
9056 if (dump_enabled_p ())
9057 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9058 " variable-length vectorization factor\n");
9059 }
9060 /* Free SLP instances here because otherwise stmt reference counting
9061 won't work. */
9062 slp_instance instance;
9063 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9064 vect_free_slp_instance (instance, true);
9065 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9066 /* Clear the safelen field since its value is invalid after vectorization:
9067 the vectorized loop can have loop-carried dependencies. */
9068 loop->safelen = 0;
9069
9070 if (epilogue)
9071 {
9072 update_epilogue_loop_vinfo (epilogue, advance);
9073
9074 epilogue->simduid = loop->simduid;
9075 epilogue->force_vectorize = loop->force_vectorize;
9076 epilogue->dont_vectorize = false;
9077 }
9078
9079 return epilogue;
9080 }
9081
9082 /* The code below tries to perform a simple optimization: revert
9083 if-conversion for masked stores, i.e. if the mask of a store is zero,
9084 skip the store and, if possible, the statements producing the stored values too.
9085 For example,
9086 for (i=0; i<n; i++)
9087 if (c[i])
9088 {
9089 p1[i] += 1;
9090 p2[i] = p3[i] +2;
9091 }
9092 this transformation will produce the following semi-hammock:
9093
9094 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9095 {
9096 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9097 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9098 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9099 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9100 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9101 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9102 }
9103 */
9104
9105 void
9106 optimize_mask_stores (class loop *loop)
9107 {
9108 basic_block *bbs = get_loop_body (loop);
9109 unsigned nbbs = loop->num_nodes;
9110 unsigned i;
9111 basic_block bb;
9112 class loop *bb_loop;
9113 gimple_stmt_iterator gsi;
9114 gimple *stmt;
9115 auto_vec<gimple *> worklist;
9116 auto_purge_vect_location sentinel;
9117
9118 vect_location = find_loop_location (loop);
9119 /* Pick up all masked stores in loop if any. */
9120 for (i = 0; i < nbbs; i++)
9121 {
9122 bb = bbs[i];
9123 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9124 gsi_next (&gsi))
9125 {
9126 stmt = gsi_stmt (gsi);
9127 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9128 worklist.safe_push (stmt);
9129 }
9130 }
9131
9132 free (bbs);
9133 if (worklist.is_empty ())
9134 return;
9135
9136 /* Loop has masked stores. */
9137 while (!worklist.is_empty ())
9138 {
9139 gimple *last, *last_store;
9140 edge e, efalse;
9141 tree mask;
9142 basic_block store_bb, join_bb;
9143 gimple_stmt_iterator gsi_to;
9144 tree vdef, new_vdef;
9145 gphi *phi;
9146 tree vectype;
9147 tree zero;
9148
9149 last = worklist.pop ();
9150 mask = gimple_call_arg (last, 2);
9151 bb = gimple_bb (last);
9152 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
9153 to the same loop as if_bb. That loop can be different from LOOP when
9154 a two-level loop nest is vectorized and the mask_store belongs to the
9155 inner one. */
9156 e = split_block (bb, last);
9157 bb_loop = bb->loop_father;
9158 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9159 join_bb = e->dest;
9160 store_bb = create_empty_bb (bb);
9161 add_bb_to_loop (store_bb, bb_loop);
9162 e->flags = EDGE_TRUE_VALUE;
9163 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9164 /* STORE_BB is reached via the false edge, which is marked as unlikely. */
9165 efalse->probability = profile_probability::unlikely ();
9166 store_bb->count = efalse->count ();
9167 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9168 if (dom_info_available_p (CDI_DOMINATORS))
9169 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9170 if (dump_enabled_p ())
9171 dump_printf_loc (MSG_NOTE, vect_location,
9172 "Create new block %d to sink mask stores.",
9173 store_bb->index);
9174 /* Create vector comparison with boolean result. */
9175 vectype = TREE_TYPE (mask);
9176 zero = build_zero_cst (vectype);
9177 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9178 gsi = gsi_last_bb (bb);
9179 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9180 /* Create new PHI node for vdef of the last masked store:
9181 .MEM_2 = VDEF <.MEM_1>
9182 will be converted to
9183 .MEM.3 = VDEF <.MEM_1>
9184 and new PHI node will be created in join bb
9185 .MEM_2 = PHI <.MEM_1, .MEM_3>
9186 */
9187 vdef = gimple_vdef (last);
9188 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9189 gimple_set_vdef (last, new_vdef);
9190 phi = create_phi_node (vdef, join_bb);
9191 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9192
9193 /* Put all masked stores with the same mask to STORE_BB if possible. */
9194 while (true)
9195 {
9196 gimple_stmt_iterator gsi_from;
9197 gimple *stmt1 = NULL;
9198
9199 /* Move masked store to STORE_BB. */
9200 last_store = last;
9201 gsi = gsi_for_stmt (last);
9202 gsi_from = gsi;
9203 /* Shift GSI to the previous stmt for further traversal. */
9204 gsi_prev (&gsi);
9205 gsi_to = gsi_start_bb (store_bb);
9206 gsi_move_before (&gsi_from, &gsi_to);
9207 /* Set up GSI_TO at the start of the now non-empty block. */
9208 gsi_to = gsi_start_bb (store_bb);
9209 if (dump_enabled_p ())
9210 dump_printf_loc (MSG_NOTE, vect_location,
9211 "Move stmt to created bb\n%G", last);
9212 /* Move all stored value producers if possible. */
9213 while (!gsi_end_p (gsi))
9214 {
9215 tree lhs;
9216 imm_use_iterator imm_iter;
9217 use_operand_p use_p;
9218 bool res;
9219
9220 /* Skip debug statements. */
9221 if (is_gimple_debug (gsi_stmt (gsi)))
9222 {
9223 gsi_prev (&gsi);
9224 continue;
9225 }
9226 stmt1 = gsi_stmt (gsi);
9227 /* Do not consider statements writing to memory or having
9228 a volatile operand. */
9229 if (gimple_vdef (stmt1)
9230 || gimple_has_volatile_ops (stmt1))
9231 break;
9232 gsi_from = gsi;
9233 gsi_prev (&gsi);
9234 lhs = gimple_get_lhs (stmt1);
9235 if (!lhs)
9236 break;
9237
9238 /* LHS of vectorized stmt must be SSA_NAME. */
9239 if (TREE_CODE (lhs) != SSA_NAME)
9240 break;
9241
9242 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9243 {
9244 /* Remove dead scalar statement. */
9245 if (has_zero_uses (lhs))
9246 {
9247 gsi_remove (&gsi_from, true);
9248 continue;
9249 }
9250 }
9251
9252 /* Check that LHS does not have uses outside of STORE_BB. */
9253 res = true;
9254 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9255 {
9256 gimple *use_stmt;
9257 use_stmt = USE_STMT (use_p);
9258 if (is_gimple_debug (use_stmt))
9259 continue;
9260 if (gimple_bb (use_stmt) != store_bb)
9261 {
9262 res = false;
9263 break;
9264 }
9265 }
9266 if (!res)
9267 break;
9268
9269 if (gimple_vuse (stmt1)
9270 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9271 break;
9272
9273 /* Can move STMT1 to STORE_BB. */
9274 if (dump_enabled_p ())
9275 dump_printf_loc (MSG_NOTE, vect_location,
9276 "Move stmt to created bb\n%G", stmt1);
9277 gsi_move_before (&gsi_from, &gsi_to);
9278 /* Shift GSI_TO for further insertion. */
9279 gsi_prev (&gsi_to);
9280 }
9281 /* Put other masked stores with the same mask to STORE_BB. */
9282 if (worklist.is_empty ()
9283 || gimple_call_arg (worklist.last (), 2) != mask
9284 || worklist.last () != stmt1)
9285 break;
9286 last = worklist.pop ();
9287 }
9288 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9289 }
9290 }
9291
9292 /* Decide whether it is possible to use a zero-based induction variable
9293 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9294 return the value that the induction variable must be able to hold
9295 in order to ensure that the loop ends with an all-false mask.
9296 Return -1 otherwise. */
9297 widest_int
9298 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9299 {
9300 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9301 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9302 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9303
9304 /* Calculate the value that the induction variable must be able
9305 to hit in order to ensure that we end the loop with an all-false mask.
9306 This involves adding the maximum number of inactive trailing scalar
9307 iterations. */
9308 widest_int iv_limit = -1;
9309 if (max_loop_iterations (loop, &iv_limit))
9310 {
9311 if (niters_skip)
9312 {
9313 /* Add the maximum number of skipped iterations to the
9314 maximum iteration count. */
9315 if (TREE_CODE (niters_skip) == INTEGER_CST)
9316 iv_limit += wi::to_widest (niters_skip);
9317 else
9318 iv_limit += max_vf - 1;
9319 }
9320 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9321 /* Make a conservatively-correct assumption. */
9322 iv_limit += max_vf - 1;
9323
9324 /* IV_LIMIT is the maximum number of latch iterations, which is also
9325 the maximum in-range IV value. Round this value down to the previous
9326 vector alignment boundary and then add an extra full iteration. */
9327 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9328 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9329 }
9330 return iv_limit;
9331 }
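/* For example (assumed numbers): with a maximum latch count of 17,
   VF == 4 (so known_alignment (vf) == 4 and max_vf == 4) and neither
   skipped nor peeled iterations, the returned limit is
   (17 & -4) + 4 = 16 + 4 = 20, i.e. the IV must be able to count one
   full vector iteration past the last aligned boundary.  */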
9332
9333