1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it had been manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
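   As a rough illustration of that check (a hedged sketch, not a quote of
   the code used later in this pass), vectorizing the addition in the
   example above boils down to asking:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       -> no V8HI addition in the target, so the stmt is not vectorizable.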
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
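/* As a worked instance of the above (an illustrative note, not a change in
   behaviour): for the 4-byte elements and 16-byte vectors in the example,
   every statement gets a vectype with 4 units, so the running VF - kept,
   roughly speaking, as the common multiple of all unit counts seen so far
   via vect_update_max_nunits - ends up as 4, and the loop is strip-mined
   by 4.  */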
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
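/* For instance (an illustrative example in the usual scev notation), the
   access function {0, +, 4}_1 describes an IV in loop 1 with initial value
   0 and constant step 4; its evolution part is the INTEGER_CST 4 and it is
   accepted as "simple" below.  */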
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner-loop, if one exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 {
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
672 {
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
683 {
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
689 }
690 }
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
694 else
695 {
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
699 {
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
705 }
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
711 }
712 }
713 }
714
715 /* Function vect_get_loop_niters.
716
717 Determine how many iterations the loop is executed and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
721
722 Return the loop exit condition. */
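/* As an illustrative reading (assuming a simple counted loop such as
   "for (i = 0; i < n; i++)" with n > 0 and no wrap-around): the latch runs
   n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS,
   the number of header executions, is n; the "+ 1" adjustment at the end of
   the function below is where the latter is derived from the former.  */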
723
724
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
728 {
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
733
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
738
739 if (!exit)
740 return cond;
741
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
746
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
750
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
753
754 if (may_be_zero)
755 {
756 if (COMPARISON_CLASS_P (may_be_zero))
757 {
758 /* Try to combine may_be_zero with assumptions, this can simplify
759 computation of niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
770
771 may_be_zero = NULL_TREE;
772 }
773 else if (integer_nonzerop (may_be_zero))
774 {
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
778 }
779 else
780 return cond;
781 }
782
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
785
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
794
795 return cond;
796 }
797
798 /* Function bb_in_loop_p
799
800 Used as predicate for dfs order traversal of the loop bbs. */
801
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
804 {
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
809 }
810
811
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
814
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
850 {
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
855
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
859
860 for (unsigned int i = 0; i < nbbs; i++)
861 {
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
864
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
866 {
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
870 }
871
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
873 {
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition: when it is 0, the
881 loop shouldn't be vectorized; when it is a non-zero constant, it should
882 be vectorized normally; otherwise the loop is versioned, with the vectorized
883 copy taken if the condition is non-zero at runtime. */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
892 {
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
898 }
899 }
900 }
901
902 epilogue_vinfos.create (6);
903 }
904
905 /* Free all levels of rgroup CONTROLS. */
906
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
909 {
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
915 }
916
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
919
920 _loop_vec_info::~_loop_vec_info ()
921 {
922 free (bbs);
923
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
929
930 /* When we release an epilogue vinfo that we do not intend to use,
931 avoid clearing AUX of the main loop, which should continue to
932 point to the main loop vinfo since otherwise we'd leak it. */
933 if (loop->aux == this)
934 loop->aux = NULL;
935 }
936
937 /* Return an invariant or register for EXPR and emit necessary
938 computations in the LOOP_VINFO loop preheader. */
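/* A typical (hypothetical) use is to materialize some invariant bound
   expression once in the preheader, e.g.

     tree bound = cse_and_gimplify_to_preheader (loop_vinfo, bound_expr);

   where BOUND_EXPR is an invented name for the caller's expression; asking
   for the same expression again returns the cached SSA name rather than
   emitting the computation a second time.  */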
939
940 tree
941 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
942 {
943 if (is_gimple_reg (expr)
944 || is_gimple_min_invariant (expr))
945 return expr;
946
947 if (! loop_vinfo->ivexpr_map)
948 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
949 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
950 if (! cached)
951 {
952 gimple_seq stmts = NULL;
953 cached = force_gimple_operand (unshare_expr (expr),
954 &stmts, true, NULL_TREE);
955 if (stmts)
956 {
957 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
958 gsi_insert_seq_on_edge_immediate (e, stmts);
959 }
960 }
961 return cached;
962 }
963
964 /* Return true if we can use CMP_TYPE as the comparison type to produce
965 all masks required to mask LOOP_VINFO. */
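/* Informally (a descriptive note; the internal function documentation is
   authoritative), IFN_WHILE_ULT (START, END) produces a mask whose lane I
   is true iff START + I < END, so the loop below simply asks, for every
   mask type recorded in LOOP_VINFO_MASKS, whether the target can compute
   such a mask from a CMP_TYPE counter.  */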
966
967 static bool
968 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
969 {
970 rgroup_controls *rgm;
971 unsigned int i;
972 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
973 if (rgm->type != NULL_TREE
974 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
975 cmp_type, rgm->type,
976 OPTIMIZE_FOR_SPEED))
977 return false;
978 return true;
979 }
980
981 /* Calculate the maximum number of scalars per iteration for every
982 rgroup in LOOP_VINFO. */
983
984 static unsigned int
985 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
986 {
987 unsigned int res = 1;
988 unsigned int i;
989 rgroup_controls *rgm;
990 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
991 res = MAX (res, rgm->max_nscalars_per_iter);
992 return res;
993 }
994
995 /* Calculate the minimum precision necessary to represent:
996
997 MAX_NITERS * FACTOR
998
999 as an unsigned integer, where MAX_NITERS is the maximum number of
1000 loop header iterations for the original scalar form of LOOP_VINFO. */
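/* For example (a worked instance of the rule above): if the loop is known
   to run at most 1000 header iterations and FACTOR is 2, the product 2000
   needs 11 bits as an unsigned integer, since 2000 < 2^11 = 2048.  */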
1001
1002 static unsigned
1003 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1004 {
1005 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1006
1007 /* Get the maximum number of iterations that is representable
1008 in the counter type. */
1009 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1010 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1011
1012 /* Get a more refined estimate for the number of iterations. */
1013 widest_int max_back_edges;
1014 if (max_loop_iterations (loop, &max_back_edges))
1015 max_ni = wi::smin (max_ni, max_back_edges + 1);
1016
1017 /* Work out how many bits we need to represent the limit. */
1018 return wi::min_precision (max_ni * factor, UNSIGNED);
1019 }
1020
1021 /* True if the loop needs peeling or partial vectors when vectorized. */
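/* For example (an illustrative note): a loop with 10 known iterations and a
   vectorization factor of 4 leaves 2 scalar iterations over, so it needs
   either an epilogue or partial vectors; with 12 iterations and no other
   peeling requirements it would not.  */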
1022
1023 static bool
1024 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1025 {
1026 unsigned HOST_WIDE_INT const_vf;
1027 HOST_WIDE_INT max_niter
1028 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1029
1030 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1031 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1032 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1033 (loop_vinfo));
1034
1035 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1036 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1037 {
1038 /* Work out the (constant) number of iterations that need to be
1039 peeled for reasons other than niters. */
1040 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1041 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1042 peel_niter += 1;
1043 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1044 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1045 return true;
1046 }
1047 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1048 /* ??? When peeling for gaps but not alignment, we could
1049 try to check whether the (variable) niters is known to be
1050 VF * N + 1. That's something of a niche case though. */
1051 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1052 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1053 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1054 < (unsigned) exact_log2 (const_vf))
1055 /* In case of versioning, check if the maximum number of
1056 iterations is greater than th. If they are identical,
1057 the epilogue is unnecessary. */
1058 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1059 || ((unsigned HOST_WIDE_INT) max_niter
1060 > (th / const_vf) * const_vf))))
1061 return true;
1062
1063 return false;
1064 }
1065
1066 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1067 whether we can actually generate the masks required. Return true if so,
1068 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1069
1070 static bool
1071 vect_verify_full_masking (loop_vec_info loop_vinfo)
1072 {
1073 unsigned int min_ni_width;
1074 unsigned int max_nscalars_per_iter
1075 = vect_get_max_nscalars_per_iter (loop_vinfo);
1076
1077 /* Use a normal loop if there are no statements that need masking.
1078 This only happens in rare degenerate cases: it means that the loop
1079 has no loads, no stores, and no live-out values. */
1080 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1081 return false;
1082
1083 /* Work out how many bits we need to represent the limit. */
1084 min_ni_width
1085 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1086
1087 /* Find a scalar mode for which WHILE_ULT is supported. */
1088 opt_scalar_int_mode cmp_mode_iter;
1089 tree cmp_type = NULL_TREE;
1090 tree iv_type = NULL_TREE;
1091 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1092 unsigned int iv_precision = UINT_MAX;
1093
1094 if (iv_limit != -1)
1095 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1096 UNSIGNED);
1097
1098 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1099 {
1100 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1101 if (cmp_bits >= min_ni_width
1102 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1103 {
1104 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1105 if (this_type
1106 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1107 {
1108 /* Although we could stop as soon as we find a valid mode,
1109 there are at least two reasons why that's not always the
1110 best choice:
1111
1112 - An IV that's Pmode or wider is more likely to be reusable
1113 in address calculations than an IV that's narrower than
1114 Pmode.
1115
1116 - Doing the comparison in IV_PRECISION or wider allows
1117 a natural 0-based IV, whereas using a narrower comparison
1118 type requires mitigations against wrap-around.
1119
1120 Conversely, if the IV limit is variable, doing the comparison
1121 in a wider type than the original type can introduce
1122 unnecessary extensions, so picking the widest valid mode
1123 is not always a good choice either.
1124
1125 Here we prefer the first IV type that's Pmode or wider,
1126 and the first comparison type that's IV_PRECISION or wider.
1127 (The comparison type must be no wider than the IV type,
1128 to avoid extensions in the vector loop.)
1129
1130 ??? We might want to try continuing beyond Pmode for ILP32
1131 targets if CMP_BITS < IV_PRECISION. */
1132 iv_type = this_type;
1133 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1134 cmp_type = this_type;
1135 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1136 break;
1137 }
1138 }
1139 }
1140
1141 if (!cmp_type)
1142 return false;
1143
1144 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1145 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1146 return true;
1147 }
1148
1149 /* Check whether we can use vector accesses with length, based on a precision
1150 comparison. So far, to keep it simple, we only allow the case in which the
1151 precision of the target-supported length is larger than the precision
1152 required by the loop niters. */
1153
1154 static bool
1155 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1156 {
1157 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1158 return false;
1159
1160 unsigned int max_nitems_per_iter = 1;
1161 unsigned int i;
1162 rgroup_controls *rgl;
1163 /* Find the maximum number of items per iteration for every rgroup. */
1164 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1165 {
1166 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1167 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1168 }
1169
1170 /* Work out how many bits we need to represent the length limit. */
1171 unsigned int min_ni_prec
1172 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1173
1174 /* Now use the maximum of the precisions below for one suitable IV type:
1175 - the IV's natural precision
1176 - the precision needed to hold: the maximum number of scalar
1177 iterations multiplied by the scale factor (min_ni_prec above)
1178 - the Pmode precision
1179
1180 If min_ni_prec is less than the precision of the current niters,
1181 we prefer to still use the niters type. Prefer to use a Pmode or
1182 wider IV to avoid narrow conversions. */
1183
1184 unsigned int ni_prec
1185 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1186 min_ni_prec = MAX (min_ni_prec, ni_prec);
1187 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1188
1189 tree iv_type = NULL_TREE;
1190 opt_scalar_int_mode tmode_iter;
1191 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1192 {
1193 scalar_mode tmode = tmode_iter.require ();
1194 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1195
1196 /* ??? Do we really want to construct one IV whose precision exceeds
1197 BITS_PER_WORD? */
1198 if (tbits > BITS_PER_WORD)
1199 break;
1200
1201 /* Find the first available standard integral type. */
1202 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1203 {
1204 iv_type = build_nonstandard_integer_type (tbits, true);
1205 break;
1206 }
1207 }
1208
1209 if (!iv_type)
1210 {
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "can't vectorize with length-based partial vectors"
1214 " because there is no suitable iv type.\n");
1215 return false;
1216 }
1217
1218 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1219 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1220
1221 return true;
1222 }
1223
1224 /* Calculate the cost of one scalar iteration of the loop. */
1225 static void
1226 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1227 {
1228 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1229 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1230 int nbbs = loop->num_nodes, factor;
1231 int innerloop_iters, i;
1232
1233 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1234
1235 /* Gather costs for statements in the scalar loop. */
1236
1237 /* FORNOW. */
1238 innerloop_iters = 1;
1239 if (loop->inner)
1240 innerloop_iters = 50; /* FIXME */
1241
1242 for (i = 0; i < nbbs; i++)
1243 {
1244 gimple_stmt_iterator si;
1245 basic_block bb = bbs[i];
1246
1247 if (bb->loop_father == loop->inner)
1248 factor = innerloop_iters;
1249 else
1250 factor = 1;
1251
1252 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1253 {
1254 gimple *stmt = gsi_stmt (si);
1255 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1256
1257 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1258 continue;
1259
1260 /* Skip stmts that are not vectorized inside the loop. */
1261 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1262 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1263 && (!STMT_VINFO_LIVE_P (vstmt_info)
1264 || !VECTORIZABLE_CYCLE_DEF
1265 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1266 continue;
1267
1268 vect_cost_for_stmt kind;
1269 if (STMT_VINFO_DATA_REF (stmt_info))
1270 {
1271 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1272 kind = scalar_load;
1273 else
1274 kind = scalar_store;
1275 }
1276 else if (vect_nop_conversion_p (stmt_info))
1277 continue;
1278 else
1279 kind = scalar_stmt;
1280
1281 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1282 factor, kind, stmt_info, 0, vect_prologue);
1283 }
1284 }
1285
1286 /* Now accumulate cost. */
1287 void *target_cost_data = init_cost (loop);
1288 stmt_info_for_cost *si;
1289 int j;
1290 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291 j, si)
1292 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1293 si->kind, si->stmt_info, si->vectype,
1294 si->misalign, vect_body);
1295 unsigned dummy, body_cost = 0;
1296 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1297 destroy_cost_data (target_cost_data);
1298 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1299 }
1300
1301
1302 /* Function vect_analyze_loop_form_1.
1303
1304 Verify that certain CFG restrictions hold, including:
1305 - the loop has a pre-header
1306 - the loop has a single entry and exit
1307 - the loop exit condition is simple enough
1308 - the number of iterations can be analyzed, i.e., a countable loop. The
1309 niter could be analyzed under some assumptions. */
1310
1311 opt_result
1312 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1313 tree *assumptions, tree *number_of_iterationsm1,
1314 tree *number_of_iterations, gcond **inner_loop_cond)
1315 {
1316 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1317
1318 /* Different restrictions apply when we are considering an inner-most loop,
1319 vs. an outer (nested) loop.
1320 (FORNOW. May want to relax some of these restrictions in the future). */
1321
1322 if (!loop->inner)
1323 {
1324 /* Inner-most loop. We currently require that the number of BBs is
1325 exactly 2 (the header and latch). Vectorizable inner-most loops
1326 look like this:
1327
1328 (pre-header)
1329 |
1330 header <--------+
1331 | | |
1332 | +--> latch --+
1333 |
1334 (exit-bb) */
1335
1336 if (loop->num_nodes != 2)
1337 return opt_result::failure_at (vect_location,
1338 "not vectorized:"
1339 " control flow in loop.\n");
1340
1341 if (empty_block_p (loop->header))
1342 return opt_result::failure_at (vect_location,
1343 "not vectorized: empty loop.\n");
1344 }
1345 else
1346 {
1347 class loop *innerloop = loop->inner;
1348 edge entryedge;
1349
1350 /* Nested loop. We currently require that the loop is doubly-nested,
1351 contains a single inner loop, and the number of BBs is exactly 5.
1352 Vectorizable outer-loops look like this:
1353
1354 (pre-header)
1355 |
1356 header <---+
1357 | |
1358 inner-loop |
1359 | |
1360 tail ------+
1361 |
1362 (exit-bb)
1363
1364 The inner-loop has the properties expected of inner-most loops
1365 as described above. */
1366
1367 if ((loop->inner)->inner || (loop->inner)->next)
1368 return opt_result::failure_at (vect_location,
1369 "not vectorized:"
1370 " multiple nested loops.\n");
1371
1372 if (loop->num_nodes != 5)
1373 return opt_result::failure_at (vect_location,
1374 "not vectorized:"
1375 " control flow in loop.\n");
1376
1377 entryedge = loop_preheader_edge (innerloop);
1378 if (entryedge->src != loop->header
1379 || !single_exit (innerloop)
1380 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1381 return opt_result::failure_at (vect_location,
1382 "not vectorized:"
1383 " unsupported outerloop form.\n");
1384
1385 /* Analyze the inner-loop. */
1386 tree inner_niterm1, inner_niter, inner_assumptions;
1387 opt_result res
1388 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1389 &inner_assumptions, &inner_niterm1,
1390 &inner_niter, NULL);
1391 if (!res)
1392 {
1393 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1395 "not vectorized: Bad inner loop.\n");
1396 return res;
1397 }
1398
1399 /* Don't support analyzing niter under assumptions for inner
1400 loop. */
1401 if (!integer_onep (inner_assumptions))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: Bad inner loop.\n");
1404
1405 if (!expr_invariant_in_loop_p (loop, inner_niter))
1406 return opt_result::failure_at (vect_location,
1407 "not vectorized: inner-loop count not"
1408 " invariant.\n");
1409
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_NOTE, vect_location,
1412 "Considering outer-loop vectorization.\n");
1413 }
1414
1415 if (!single_exit (loop))
1416 return opt_result::failure_at (vect_location,
1417 "not vectorized: multiple exits.\n");
1418 if (EDGE_COUNT (loop->header->preds) != 2)
1419 return opt_result::failure_at (vect_location,
1420 "not vectorized:"
1421 " too many incoming edges.\n");
1422
1423 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1424 that the loop is represented as a do-while (with a proper if-guard
1425 before the loop if needed), where the loop header contains all the
1426 executable statements, and the latch is empty. */
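  /* Schematically (a rough sketch of the accepted shape, not generated
     code):

       header:
         ... all loop statements ...
         if (cond) goto latch; else goto exit;
       latch:
         goto header;  */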
1427 if (!empty_block_p (loop->latch)
1428 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1429 return opt_result::failure_at (vect_location,
1430 "not vectorized: latch block not empty.\n");
1431
1432 /* Make sure the exit is not abnormal. */
1433 edge e = single_exit (loop);
1434 if (e->flags & EDGE_ABNORMAL)
1435 return opt_result::failure_at (vect_location,
1436 "not vectorized:"
1437 " abnormal loop exit edge.\n");
1438
1439 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1440 number_of_iterationsm1);
1441 if (!*loop_cond)
1442 return opt_result::failure_at
1443 (vect_location,
1444 "not vectorized: complicated exit condition.\n");
1445
1446 if (integer_zerop (*assumptions)
1447 || !*number_of_iterations
1448 || chrec_contains_undetermined (*number_of_iterations))
1449 return opt_result::failure_at
1450 (*loop_cond,
1451 "not vectorized: number of iterations cannot be computed.\n");
1452
1453 if (integer_zerop (*number_of_iterations))
1454 return opt_result::failure_at
1455 (*loop_cond,
1456 "not vectorized: number of iterations = 0.\n");
1457
1458 return opt_result::success ();
1459 }
1460
1461 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1462
1463 opt_loop_vec_info
1464 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1465 {
1466 tree assumptions, number_of_iterations, number_of_iterationsm1;
1467 gcond *loop_cond, *inner_loop_cond = NULL;
1468
1469 opt_result res
1470 = vect_analyze_loop_form_1 (loop, &loop_cond,
1471 &assumptions, &number_of_iterationsm1,
1472 &number_of_iterations, &inner_loop_cond);
1473 if (!res)
1474 return opt_loop_vec_info::propagate_failure (res);
1475
1476 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1477 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1478 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1479 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1480 if (!integer_onep (assumptions))
1481 {
1482 /* We consider vectorizing this loop by versioning it under
1483 some assumptions. In order to do this, we need to clear
1484 existing information computed by scev and niter analyzer. */
1485 scev_reset_htab ();
1486 free_numbers_of_iterations_estimates (loop);
1487 /* Also set a flag for this loop so that subsequent scev and niter
1488 analyses are done under the assumptions. */
1489 loop_constraint_set (loop, LOOP_C_FINITE);
1490 /* Also record the assumptions for versioning. */
1491 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1492 }
1493
1494 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1495 {
1496 if (dump_enabled_p ())
1497 {
1498 dump_printf_loc (MSG_NOTE, vect_location,
1499 "Symbolic number of iterations is ");
1500 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1501 dump_printf (MSG_NOTE, "\n");
1502 }
1503 }
1504
1505 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1506 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1507 if (inner_loop_cond)
1508 {
1509 stmt_vec_info inner_loop_cond_info
1510 = loop_vinfo->lookup_stmt (inner_loop_cond);
1511 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1512 }
1513
1514 gcc_assert (!loop->aux);
1515 loop->aux = loop_vinfo;
1516 return opt_loop_vec_info::success (loop_vinfo);
1517 }
1518
1519
1520
1521 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1522 statements update the vectorization factor. */
1523
1524 static void
1525 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1526 {
1527 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1528 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1529 int nbbs = loop->num_nodes;
1530 poly_uint64 vectorization_factor;
1531 int i;
1532
1533 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1534
1535 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1536 gcc_assert (known_ne (vectorization_factor, 0U));
1537
1538 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1539 vectorization factor of the loop is the unrolling factor required by
1540 the SLP instances. If that unrolling factor is 1, we say that we
1541 perform pure SLP on the loop - cross-iteration parallelism is not
1542 exploited. */
1543 bool only_slp_in_loop = true;
1544 for (i = 0; i < nbbs; i++)
1545 {
1546 basic_block bb = bbs[i];
1547 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1548 gsi_next (&si))
1549 {
1550 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1551 if (!stmt_info)
1552 continue;
1553 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1554 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1555 && !PURE_SLP_STMT (stmt_info))
1556 /* STMT needs both SLP and loop-based vectorization. */
1557 only_slp_in_loop = false;
1558 }
1559 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1560 gsi_next (&si))
1561 {
1562 if (is_gimple_debug (gsi_stmt (si)))
1563 continue;
1564 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1565 stmt_info = vect_stmt_to_vectorize (stmt_info);
1566 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1567 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1568 && !PURE_SLP_STMT (stmt_info))
1569 /* STMT needs both SLP and loop-based vectorization. */
1570 only_slp_in_loop = false;
1571 }
1572 }
1573
1574 if (only_slp_in_loop)
1575 {
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_NOTE, vect_location,
1578 "Loop contains only SLP stmts\n");
1579 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1580 }
1581 else
1582 {
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE, vect_location,
1585 "Loop contains SLP and non-SLP stmts\n");
1586 /* Both the vectorization factor and unroll factor have the form
1587 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1588 so they must have a common multiple. */
1589 vectorization_factor
1590 = force_common_multiple (vectorization_factor,
1591 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1592 }
1593
1594 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1595 if (dump_enabled_p ())
1596 {
1597 dump_printf_loc (MSG_NOTE, vect_location,
1598 "Updating vectorization factor to ");
1599 dump_dec (MSG_NOTE, vectorization_factor);
1600 dump_printf (MSG_NOTE, ".\n");
1601 }
1602 }
1603
1604 /* Return true if STMT_INFO describes a double reduction phi and if
1605 the other phi in the reduction is also relevant for vectorization.
1606 This rejects cases such as:
1607
1608 outer1:
1609 x_1 = PHI <x_3(outer2), ...>;
1610 ...
1611
1612 inner:
1613 x_2 = ...;
1614 ...
1615
1616 outer2:
1617 x_3 = PHI <x_2(inner)>;
1618
1619 if nothing in x_2 or elsewhere makes x_1 relevant. */
1620
1621 static bool
1622 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1623 {
1624 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1625 return false;
1626
1627 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1628 }
1629
1630 /* Function vect_analyze_loop_operations.
1631
1632 Scan the loop stmts and make sure they are all vectorizable. */
1633
1634 static opt_result
1635 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1636 {
1637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1638 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1639 int nbbs = loop->num_nodes;
1640 int i;
1641 stmt_vec_info stmt_info;
1642 bool need_to_vectorize = false;
1643 bool ok;
1644
1645 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1646
1647 auto_vec<stmt_info_for_cost> cost_vec;
1648
1649 for (i = 0; i < nbbs; i++)
1650 {
1651 basic_block bb = bbs[i];
1652
1653 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1654 gsi_next (&si))
1655 {
1656 gphi *phi = si.phi ();
1657 ok = true;
1658
1659 stmt_info = loop_vinfo->lookup_stmt (phi);
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1662 if (virtual_operand_p (gimple_phi_result (phi)))
1663 continue;
1664
1665 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1666 (i.e., a phi in the tail of the outer-loop). */
1667 if (! is_loop_header_bb_p (bb))
1668 {
1669 /* FORNOW: we currently don't support the case that these phis
1670 are not used in the outer loop (unless it is a double reduction,
1671 i.e., this phi is vect_reduction_def), because this case
1672 would require us to actually do something here. */
1673 if (STMT_VINFO_LIVE_P (stmt_info)
1674 && !vect_active_double_reduction_p (stmt_info))
1675 return opt_result::failure_at (phi,
1676 "Unsupported loop-closed phi"
1677 " in outer-loop.\n");
1678
1679 /* If PHI is used in the outer loop, we check that its operand
1680 is defined in the inner loop. */
1681 if (STMT_VINFO_RELEVANT_P (stmt_info))
1682 {
1683 tree phi_op;
1684
1685 if (gimple_phi_num_args (phi) != 1)
1686 return opt_result::failure_at (phi, "unsupported phi");
1687
1688 phi_op = PHI_ARG_DEF (phi, 0);
1689 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1690 if (!op_def_info)
1691 return opt_result::failure_at (phi, "unsupported phi\n");
1692
1693 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1694 && (STMT_VINFO_RELEVANT (op_def_info)
1695 != vect_used_in_outer_by_reduction))
1696 return opt_result::failure_at (phi, "unsupported phi\n");
1697
1698 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1699 || (STMT_VINFO_DEF_TYPE (stmt_info)
1700 == vect_double_reduction_def))
1701 && !vectorizable_lc_phi (loop_vinfo,
1702 stmt_info, NULL, NULL))
1703 return opt_result::failure_at (phi, "unsupported phi\n");
1704 }
1705
1706 continue;
1707 }
1708
1709 gcc_assert (stmt_info);
1710
1711 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1712 || STMT_VINFO_LIVE_P (stmt_info))
1713 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1714 /* A scalar-dependence cycle that we don't support. */
1715 return opt_result::failure_at (phi,
1716 "not vectorized:"
1717 " scalar dependence cycle.\n");
1718
1719 if (STMT_VINFO_RELEVANT_P (stmt_info))
1720 {
1721 need_to_vectorize = true;
1722 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1723 && ! PURE_SLP_STMT (stmt_info))
1724 ok = vectorizable_induction (loop_vinfo,
1725 stmt_info, NULL, NULL,
1726 &cost_vec);
1727 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1728 || (STMT_VINFO_DEF_TYPE (stmt_info)
1729 == vect_double_reduction_def)
1730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731 && ! PURE_SLP_STMT (stmt_info))
1732 ok = vectorizable_reduction (loop_vinfo,
1733 stmt_info, NULL, NULL, &cost_vec);
1734 }
1735
1736 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1737 if (ok
1738 && STMT_VINFO_LIVE_P (stmt_info)
1739 && !PURE_SLP_STMT (stmt_info))
1740 ok = vectorizable_live_operation (loop_vinfo,
1741 stmt_info, NULL, NULL, NULL,
1742 -1, false, &cost_vec);
1743
1744 if (!ok)
1745 return opt_result::failure_at (phi,
1746 "not vectorized: relevant phi not "
1747 "supported: %G",
1748 static_cast <gimple *> (phi));
1749 }
1750
1751 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752 gsi_next (&si))
1753 {
1754 gimple *stmt = gsi_stmt (si);
1755 if (!gimple_clobber_p (stmt)
1756 && !is_gimple_debug (stmt))
1757 {
1758 opt_result res
1759 = vect_analyze_stmt (loop_vinfo,
1760 loop_vinfo->lookup_stmt (stmt),
1761 &need_to_vectorize,
1762 NULL, NULL, &cost_vec);
1763 if (!res)
1764 return res;
1765 }
1766 }
1767 } /* bbs */
1768
1769 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1770
1771 /* All operations in the loop are either irrelevant (they deal with loop
1772 control, or are dead), or only used outside the loop and can be moved
1773 out of the loop (e.g. invariants, inductions). The loop can be
1774 optimized away by scalar optimizations. We're better off not
1775 touching this loop. */
1776 if (!need_to_vectorize)
1777 {
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_NOTE, vect_location,
1780 "All the computation can be taken out of the loop.\n");
1781 return opt_result::failure_at
1782 (vect_location,
1783 "not vectorized: redundant loop. no profit to vectorize.\n");
1784 }
1785
1786 return opt_result::success ();
1787 }
1788
1789 /* Return true if we know that the iteration count is smaller than the
1790 vectorization factor. Return false if it isn't, or if we can't be sure
1791 either way. */
1792
1793 static bool
1794 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1795 {
1796 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1797
1798 HOST_WIDE_INT max_niter;
1799 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1800 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1801 else
1802 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1803
1804 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1805 return true;
1806
1807 return false;
1808 }
1809
1810 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1811 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1812 definitely no, or -1 if it's worth retrying. */
1813
1814 static int
1815 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1816 {
1817 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1818 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1819
1820 /* Only loops that can handle partially-populated vectors can have iteration
1821 counts less than the vectorization factor. */
1822 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1823 {
1824 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1825 {
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "not vectorized: iteration count smaller than "
1829 "vectorization factor.\n");
1830 return 0;
1831 }
1832 }
1833
1834 /* If using the "very cheap" model, reject cases in which we'd keep
1835 a copy of the scalar code (even if we might be able to vectorize it). */
1836 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1837 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1838 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1839 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1840 {
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 "some scalar iterations would need to be peeled\n");
1844 return 0;
1845 }
1846
1847 int min_profitable_iters, min_profitable_estimate;
1848 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1849 &min_profitable_estimate);
1850
1851 if (min_profitable_iters < 0)
1852 {
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855 "not vectorized: vectorization not profitable.\n");
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 "not vectorized: vector version will never be "
1859 "profitable.\n");
1860 return -1;
1861 }
1862
1863 int min_scalar_loop_bound = (param_min_vect_loop_bound
1864 * assumed_vf);
1865
1866 /* Use the cost model only if it is more conservative than the
1867 user-specified threshold. */
1868 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1869 min_profitable_iters);
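/* For example (illustrative numbers): with --param min-vect-loop-bound=2,
   an assumed VF of 4 and min_profitable_iters of 12, the threshold is
   MAX (2 * 4, 12) = 12 scalar iterations.  */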
1870
1871 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1872
1873 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1874 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1875 {
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878 "not vectorized: vectorization not profitable.\n");
1879 if (dump_enabled_p ())
1880 dump_printf_loc (MSG_NOTE, vect_location,
1881 "not vectorized: iteration count smaller than user "
1882 "specified loop bound parameter or minimum profitable "
1883 "iterations (whichever is more conservative).\n");
1884 return 0;
1885 }
1886
1887 /* The static profitability threshold min_profitable_estimate includes
1888 the cost of having to check at runtime whether the scalar loop
1889 should be used instead. If it turns out that we don't need or want
1890 such a check, the threshold we should use for the static estimate
1891 is simply the point at which the vector loop becomes more profitable
1892 than the scalar loop. */
1893 if (min_profitable_estimate > min_profitable_iters
1894 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1896 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1897 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1898 {
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1901 " choice between the scalar and vector loops\n");
1902 min_profitable_estimate = min_profitable_iters;
1903 }
1904
1905 /* If the vector loop needs multiple iterations to be beneficial then
1906 things are probably too close to call, and the conservative thing
1907 would be to stick with the scalar code. */
1908 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1909 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1910 {
1911 if (dump_enabled_p ())
1912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1913 "one iteration of the vector loop would be"
1914 " more expensive than the equivalent number of"
1915 " iterations of the scalar loop\n");
1916 return 0;
1917 }
1918
1919 HOST_WIDE_INT estimated_niter;
1920
1921 /* If we are vectorizing an epilogue then we know the maximum number of
1922 scalar iterations it will cover is at least one lower than the
1923 vectorization factor of the main loop. */
1924 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1925 estimated_niter
1926 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1927 else
1928 {
1929 estimated_niter = estimated_stmt_executions_int (loop);
1930 if (estimated_niter == -1)
1931 estimated_niter = likely_max_stmt_executions_int (loop);
1932 }
1933 if (estimated_niter != -1
1934 && ((unsigned HOST_WIDE_INT) estimated_niter
1935 < MAX (th, (unsigned) min_profitable_estimate)))
1936 {
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 "not vectorized: estimated iteration count too "
1940 "small.\n");
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE, vect_location,
1943 "not vectorized: estimated iteration count smaller "
1944 "than specified loop bound parameter or minimum "
1945 "profitable iterations (whichever is more "
1946 "conservative).\n");
1947 return -1;
1948 }
1949
1950 return 1;
1951 }
1952
1953 static opt_result
1954 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1955 vec<data_reference_p> *datarefs,
1956 unsigned int *n_stmts)
1957 {
1958 *n_stmts = 0;
1959 for (unsigned i = 0; i < loop->num_nodes; i++)
1960 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1961 !gsi_end_p (gsi); gsi_next (&gsi))
1962 {
1963 gimple *stmt = gsi_stmt (gsi);
1964 if (is_gimple_debug (stmt))
1965 continue;
1966 ++(*n_stmts);
1967 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1968 NULL, 0);
1969 if (!res)
1970 {
1971 if (is_gimple_call (stmt) && loop->safelen)
1972 {
1973 tree fndecl = gimple_call_fndecl (stmt), op;
1974 if (fndecl != NULL_TREE)
1975 {
1976 cgraph_node *node = cgraph_node::get (fndecl);
1977 if (node != NULL && node->simd_clones != NULL)
1978 {
1979 unsigned int j, n = gimple_call_num_args (stmt);
1980 for (j = 0; j < n; j++)
1981 {
1982 op = gimple_call_arg (stmt, j);
1983 if (DECL_P (op)
1984 || (REFERENCE_CLASS_P (op)
1985 && get_base_address (op)))
1986 break;
1987 }
1988 op = gimple_call_lhs (stmt);
1989 /* Ignore #pragma omp declare simd functions
1990 if they don't have data references in the
1991 call stmt itself. */
1992 if (j == n
1993 && !(op
1994 && (DECL_P (op)
1995 || (REFERENCE_CLASS_P (op)
1996 && get_base_address (op)))))
1997 continue;
1998 }
1999 }
2000 }
2001 return res;
2002 }
2003 /* If dependence analysis will give up due to the limit on the
2004 number of datarefs, stop here and fail fatally. */
2005 if (datarefs->length ()
2006 > (unsigned)param_loop_max_datarefs_for_datadeps)
2007 return opt_result::failure_at (stmt, "exceeded param "
2008 "loop-max-datarefs-for-datadeps\n");
2009 }
2010 return opt_result::success ();
2011 }
2012
2013 /* Look for SLP-only access groups and turn each individual access into its own
2014 group. */
2015 static void
2016 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2017 {
2018 unsigned int i;
2019 struct data_reference *dr;
2020
2021 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2022
2023 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2024 FOR_EACH_VEC_ELT (datarefs, i, dr)
2025 {
2026 gcc_assert (DR_REF (dr));
2027 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2028
2029 /* Check if the load is a part of an interleaving chain. */
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 {
2032 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2033 unsigned int group_size = DR_GROUP_SIZE (first_element);
2034
2035 /* Check if SLP-only groups. */
2036 if (!STMT_SLP_TYPE (stmt_info)
2037 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2038 {
2039 /* Dissolve the group. */
2040 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2041
2042 stmt_vec_info vinfo = first_element;
2043 while (vinfo)
2044 {
2045 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2046 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2047 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2048 DR_GROUP_SIZE (vinfo) = 1;
2049 if (STMT_VINFO_STRIDED_P (first_element))
2050 DR_GROUP_GAP (vinfo) = 0;
2051 else
2052 DR_GROUP_GAP (vinfo) = group_size - 1;
2053 vinfo = next;
2054 }
2055 }
2056 }
2057 }
2058 }
2059
2060 /* Determine if operating on full vectors for LOOP_VINFO might leave
2061 some scalar iterations still to do. If so, decide how we should
2062 handle those scalar iterations. The possibilities are:
2063
2064 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2065 In this case:
2066
2067 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2068 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2069 LOOP_VINFO_PEELING_FOR_NITER == false
2070
2071 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2072 to handle the remaining scalar iterations. In this case:
2073
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2075 LOOP_VINFO_PEELING_FOR_NITER == true
2076
2077 There are two choices:
2078
2079 (2a) Consider vectorizing the epilogue loop at the same VF as the
2080 main loop, but using partial vectors instead of full vectors.
2081 In this case:
2082
2083 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2084
2085 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2086 In this case:
2087
2088 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2089
2090 When FOR_EPILOGUE_P is true, make this determination based on the
2091 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2092 based on the assumption that LOOP_VINFO is the main loop. The caller
2093 has made sure that the number of iterations is set appropriately for
2094 this value of FOR_EPILOGUE_P. */
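/* For example (illustrative numbers): with a vectorization factor of 4
   and 10 scalar iterations, option (1) executes 3 vector iterations,
   the last operating on a partial vector of 2 elements, whereas option
   (2) executes 2 full vector iterations and leaves 2 scalar iterations
   for the epilogue loop.  */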
2095
2096 opt_result
2097 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2098 bool for_epilogue_p)
2099 {
2100 /* Determine whether there would be any scalar iterations left over. */
2101 bool need_peeling_or_partial_vectors_p
2102 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2103
2104 /* Decide whether to vectorize the loop with partial vectors. */
2105 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2106 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2107 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2108 && need_peeling_or_partial_vectors_p)
2109 {
2110 /* For partial-vector-usage=1, try to push the handling of partial
2111 vectors to the epilogue, with the main loop continuing to operate
2112 on full vectors.
2113
2114 ??? We could then end up failing to use partial vectors if we
2115 decide to peel iterations into a prologue, and if the main loop
2116 then ends up processing fewer than VF iterations. */
2117 if (param_vect_partial_vector_usage == 1
2118 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2119 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2120 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2121 else
2122 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2123 }
2124
2125 if (dump_enabled_p ())
2126 {
2127 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "operating on partial vectors%s.\n",
2130 for_epilogue_p ? " for epilogue loop" : "");
2131 else
2132 dump_printf_loc (MSG_NOTE, vect_location,
2133 "operating only on full vectors%s.\n",
2134 for_epilogue_p ? " for epilogue loop" : "");
2135 }
2136
2137 if (for_epilogue_p)
2138 {
2139 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2140 gcc_assert (orig_loop_vinfo);
2141 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2142 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2143 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2144 }
2145
2146 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2147 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2148 {
2149 /* Check that the loop processes at least one full vector. */
2150 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2151 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2152 if (known_lt (wi::to_widest (scalar_niters), vf))
2153 return opt_result::failure_at (vect_location,
2154 "loop does not have enough iterations"
2155 " to support vectorization.\n");
2156
2157 /* If we need to peel an extra epilogue iteration to handle data
2158 accesses with gaps, check that there are enough scalar iterations
2159 available.
2160
2161 The check above is redundant with this one when peeling for gaps,
2162 but the distinction is useful for diagnostics. */
2163 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2166 return opt_result::failure_at (vect_location,
2167 "loop does not have enough iterations"
2168 " to support peeling for gaps.\n");
2169 }
2170
2171 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2172 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2173 && need_peeling_or_partial_vectors_p);
2174
2175 return opt_result::success ();
2176 }
2177
2178 /* Function vect_analyze_loop_2.
2179
2180 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2181 for it. The different analyses will record information in the
2182 loop_vec_info struct. */
2183 static opt_result
2184 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2185 {
2186 opt_result ok = opt_result::success ();
2187 int res;
2188 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2189 poly_uint64 min_vf = 2;
2190 loop_vec_info orig_loop_vinfo = NULL;
2191
2192 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2193 loop_vec_info of the first vectorized loop. */
2194 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2195 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2196 else
2197 orig_loop_vinfo = loop_vinfo;
2198 gcc_assert (orig_loop_vinfo);
2199
2200 /* The first group of checks is independent of the vector size. */
2201 fatal = true;
2202
2203 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2204 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2205 return opt_result::failure_at (vect_location,
2206 "not vectorized: simd if(0)\n");
2207
2208 /* Find all data references in the loop (which correspond to vdefs/vuses)
2209 and analyze their evolution in the loop. */
2210
2211 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2212
2213 /* Gather the data references and count stmts in the loop. */
2214 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2215 {
2216 opt_result res
2217 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2218 &LOOP_VINFO_DATAREFS (loop_vinfo),
2219 n_stmts);
2220 if (!res)
2221 {
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "not vectorized: loop contains function "
2225 "calls or data references that cannot "
2226 "be analyzed\n");
2227 return res;
2228 }
2229 loop_vinfo->shared->save_datarefs ();
2230 }
2231 else
2232 loop_vinfo->shared->check_datarefs ();
2233
2234 /* Analyze the data references and also adjust the minimal
2235 vectorization factor according to the loads and stores. */
2236
2237 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2238 if (!ok)
2239 {
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "bad data references.\n");
2243 return ok;
2244 }
2245
2246 /* Classify all cross-iteration scalar data-flow cycles.
2247 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2248 vect_analyze_scalar_cycles (loop_vinfo);
2249
2250 vect_pattern_recog (loop_vinfo);
2251
2252 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2253
2254 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2255 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2256
2257 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2258 if (!ok)
2259 {
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "bad data access.\n");
2263 return ok;
2264 }
2265
2266 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2267
2268 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2269 if (!ok)
2270 {
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "unexpected pattern.\n");
2274 return ok;
2275 }
2276
2277 /* The rest of the analysis below depends on the vector size in some way, so failures from here on need not be fatal. */
2278 fatal = false;
2279
2280 /* Analyze data dependences between the data-refs in the loop
2281 and adjust the maximum vectorization factor according to
2282 the dependences.
2283 FORNOW: fail at the first data dependence that we encounter. */
2284
2285 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2286 if (!ok)
2287 {
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2290 "bad data dependence.\n");
2291 return ok;
2292 }
2293 if (max_vf != MAX_VECTORIZATION_FACTOR
2294 && maybe_lt (max_vf, min_vf))
2295 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2296 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2297
2298 ok = vect_determine_vectorization_factor (loop_vinfo);
2299 if (!ok)
2300 {
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "can't determine vectorization factor.\n");
2304 return ok;
2305 }
2306 if (max_vf != MAX_VECTORIZATION_FACTOR
2307 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2308 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2309
2310 /* Compute the scalar iteration cost. */
2311 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2312
2313 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2314
2315 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2316 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2317 if (!ok)
2318 return ok;
2319
2320 /* If there are any SLP instances mark them as pure_slp. */
2321 bool slp = vect_make_slp_decision (loop_vinfo);
2322 if (slp)
2323 {
2324 /* Find stmts that need to be both vectorized and SLPed. */
2325 vect_detect_hybrid_slp (loop_vinfo);
2326
2327 /* Update the vectorization factor based on the SLP decision. */
2328 vect_update_vf_for_slp (loop_vinfo);
2329
2330 /* Optimize the SLP graph with the vectorization factor fixed. */
2331 vect_optimize_slp (loop_vinfo);
2332
2333 /* Gather the loads reachable from the SLP graph entries. */
2334 vect_gather_slp_loads (loop_vinfo);
2335 }
2336
2337 bool saved_can_use_partial_vectors_p
2338 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2339
2340 /* We don't expect to have to roll back to anything other than an empty
2341 set of rgroups. */
2342 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2343
2344 /* This is the point where we can re-start analysis with SLP forced off. */
2345 start_over:
2346
2347 /* Now the vectorization factor is final. */
2348 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2349 gcc_assert (known_ne (vectorization_factor, 0U));
2350
2351 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2352 {
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "vectorization_factor = ");
2355 dump_dec (MSG_NOTE, vectorization_factor);
2356 dump_printf (MSG_NOTE, ", niters = %wd\n",
2357 LOOP_VINFO_INT_NITERS (loop_vinfo));
2358 }
2359
2360 /* Analyze the alignment of the data-refs in the loop.
2361 Fail if a data reference is found that cannot be vectorized. */
2362
2363 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2364 if (!ok)
2365 {
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 "bad data alignment.\n");
2369 return ok;
2370 }
2371
2372 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2373 It is important to call pruning after vect_analyze_data_ref_accesses,
2374 since we use grouping information gathered by interleaving analysis. */
2375 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2376 if (!ok)
2377 return ok;
2378
2379 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2380 vectorization, since we do not want to add extra peeling or
2381 add versioning for alignment. */
2382 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2383 /* This pass will decide on using loop versioning and/or loop peeling in
2384 order to enhance the alignment of data references in the loop. */
2385 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2386 if (!ok)
2387 return ok;
2388
2389 if (slp)
2390 {
2391 /* Analyze operations in the SLP instances. Note this may
2392 remove unsupported SLP instances, which makes the above
2393 SLP kind detection invalid. */
2394 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2395 vect_slp_analyze_operations (loop_vinfo);
2396 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2397 {
2398 ok = opt_result::failure_at (vect_location,
2399 "unsupported SLP instances\n");
2400 goto again;
2401 }
2402
2403 /* Check whether any load in ALL SLP instances is possibly permuted. */
2404 slp_tree load_node, slp_root;
2405 unsigned i, x;
2406 slp_instance instance;
2407 bool can_use_lanes = true;
2408 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2409 {
2410 slp_root = SLP_INSTANCE_TREE (instance);
2411 int group_size = SLP_TREE_LANES (slp_root);
2412 tree vectype = SLP_TREE_VECTYPE (slp_root);
2413 bool loads_permuted = false;
2414 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2415 {
2416 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2417 continue;
2418 unsigned j;
2419 stmt_vec_info load_info;
2420 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2421 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2422 {
2423 loads_permuted = true;
2424 break;
2425 }
2426 }
2427
2428 /* If the loads and stores can be handled with load/store-lane
2429 instructions record it and move on to the next instance. */
2430 if (loads_permuted
2431 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2432 && vect_store_lanes_supported (vectype, group_size, false))
2433 {
2434 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2435 {
2436 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2437 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2438 /* Use SLP for strided accesses (or if we can't use
2439 load-lanes). */
2440 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2441 || ! vect_load_lanes_supported
2442 (STMT_VINFO_VECTYPE (stmt_vinfo),
2443 DR_GROUP_SIZE (stmt_vinfo), false))
2444 break;
2445 }
2446
2447 can_use_lanes
2448 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2449
2450 if (can_use_lanes && dump_enabled_p ())
2451 dump_printf_loc (MSG_NOTE, vect_location,
2452 "SLP instance %p can use load/store-lanes\n",
2453 instance);
2454 }
2455 else
2456 {
2457 can_use_lanes = false;
2458 break;
2459 }
2460 }
2461
2462 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2463 with SLP disabled. */
2464 if (can_use_lanes)
2465 {
2466 ok = opt_result::failure_at (vect_location,
2467 "Built SLP cancelled: can use "
2468 "load/store-lanes\n");
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 "Built SLP cancelled: all SLP instances support "
2472 "load/store-lanes\n");
2473 goto again;
2474 }
2475 }
2476
2477 /* Dissolve SLP-only groups. */
2478 vect_dissolve_slp_only_groups (loop_vinfo);
2479
2480 /* Scan all the remaining operations in the loop that are not subject
2481 to SLP and make sure they are vectorizable. */
2482 ok = vect_analyze_loop_operations (loop_vinfo);
2483 if (!ok)
2484 {
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2487 "bad operation or unsupported loop bound.\n");
2488 return ok;
2489 }
2490
2491 /* For now, we don't expect to mix both masking and length approaches for
2492 one loop, so disable partial vectors if both are recorded. */
2493 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2494 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2495 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2496 {
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499 "can't vectorize a loop with partial vectors"
2500 " because we don't expect to mix different"
2501 " approaches with partial vectors for the"
2502 " same loop.\n");
2503 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2504 }
2505
2506 /* If we still have the option of using partial vectors,
2507 check whether we can generate the necessary loop controls. */
2508 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2509 && !vect_verify_full_masking (loop_vinfo)
2510 && !vect_verify_loop_lens (loop_vinfo))
2511 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2512
2513 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2514 to be able to handle fewer than VF scalars, or needs to have a lower VF
2515 than the main loop. */
2516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2517 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2518 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2519 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2520 return opt_result::failure_at (vect_location,
2521 "Vectorization factor too high for"
2522 " epilogue loop.\n");
2523
2524 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2525 assuming that the loop will be used as a main loop. We will redo
2526 this analysis later if we instead decide to use the loop as an
2527 epilogue loop. */
2528 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2529 if (!ok)
2530 return ok;
2531
2532 /* Check the costings of the loop make vectorizing worthwhile. */
2533 res = vect_analyze_loop_costing (loop_vinfo);
2534 if (res < 0)
2535 {
2536 ok = opt_result::failure_at (vect_location,
2537 "Loop costings may not be worthwhile.\n");
2538 goto again;
2539 }
2540 if (!res)
2541 return opt_result::failure_at (vect_location,
2542 "Loop costings not worthwhile.\n");
2543
2544 /* If an epilogue loop is required make sure we can create one. */
2545 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2546 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2547 {
2548 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2550 if (!vect_can_advance_ivs_p (loop_vinfo)
2551 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2552 single_exit (LOOP_VINFO_LOOP
2553 (loop_vinfo))))
2554 {
2555 ok = opt_result::failure_at (vect_location,
2556 "not vectorized: can't create required "
2557 "epilog loop\n");
2558 goto again;
2559 }
2560 }
2561
2562 /* During peeling, we need to check whether the number of loop iterations
2563 is enough for both the peeled prolog loop and the vector loop. This check
2564 can be merged along with threshold check of loop versioning, so
2565 increase threshold for this case if necessary.
2566
2567 If we are analyzing an epilogue we still want to check what its
2568 versioning threshold would be. If we decide to vectorize the epilogues we
2569 will want to use the lowest versioning threshold of all epilogues and main
2570 loop. This will enable us to enter a vectorized epilogue even when
2571 versioning the loop. We can't simply check whether the epilogue requires
2572 versioning though since we may have skipped some versioning checks when
2573 analyzing the epilogue. For instance, checks for alias versioning will be
2574 skipped when dealing with epilogues as we assume we already checked them
2575 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2576 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2577 {
2578 poly_uint64 niters_th = 0;
2579 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2580
2581 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2582 {
2583 /* Niters for peeled prolog loop. */
2584 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2585 {
2586 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2587 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2588 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2589 }
2590 else
2591 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2592 }
2593
2594 /* Niters for at least one iteration of vectorized loop. */
2595 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2596 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2597 /* One additional iteration because of peeling for gap. */
2598 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2599 niters_th += 1;
2600
2601 /* Use the same condition as vect_transform_loop to decide when to use
2602 the cost to determine a versioning threshold. */
2603 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2604 && ordered_p (th, niters_th))
2605 niters_th = ordered_max (poly_uint64 (th), niters_th);
2606
2607 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2608 }
2609
2610 gcc_assert (known_eq (vectorization_factor,
2611 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2612
2613 /* Ok to vectorize! */
2614 return opt_result::success ();
2615
2616 again:
2617 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2618 gcc_assert (!ok);
2619
2620 /* Try again with SLP forced off but if we didn't do any SLP there is
2621 no point in re-trying. */
2622 if (!slp)
2623 return ok;
2624
2625 /* If there are reduction chains re-trying will fail anyway. */
2626 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2627 return ok;
2628
2629 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2630 via interleaving or lane instructions. */
2631 slp_instance instance;
2632 slp_tree node;
2633 unsigned i, j;
2634 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2635 {
2636 stmt_vec_info vinfo;
2637 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2638 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2639 continue;
2640 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2641 unsigned int size = DR_GROUP_SIZE (vinfo);
2642 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2643 if (! vect_store_lanes_supported (vectype, size, false)
2644 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2645 && ! vect_grouped_store_supported (vectype, size))
2646 return opt_result::failure_at (vinfo->stmt,
2647 "unsupported grouped store\n");
2648 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2649 {
2650 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2651 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2652 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2653 size = DR_GROUP_SIZE (vinfo);
2654 vectype = STMT_VINFO_VECTYPE (vinfo);
2655 if (! vect_load_lanes_supported (vectype, size, false)
2656 && ! vect_grouped_load_supported (vectype, single_element_p,
2657 size))
2658 return opt_result::failure_at (vinfo->stmt,
2659 "unsupported grouped load\n");
2660 }
2661 }
2662
2663 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_NOTE, vect_location,
2665 "re-trying with SLP disabled\n");
2666
2667 /* Roll back state appropriately. No SLP this time. */
2668 slp = false;
2669 /* Restore the vectorization factor as it was without SLP. */
2670 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2671 /* Free the SLP instances. */
2672 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2673 vect_free_slp_instance (instance);
2674 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2675 /* Reset SLP type to loop_vect on all stmts. */
2676 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2677 {
2678 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2679 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2680 !gsi_end_p (si); gsi_next (&si))
2681 {
2682 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2683 STMT_SLP_TYPE (stmt_info) = loop_vect;
2684 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2685 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2686 {
2687 /* vectorizable_reduction adjusts reduction stmt def-types,
2688 restore them to that of the PHI. */
2689 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2690 = STMT_VINFO_DEF_TYPE (stmt_info);
2691 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2692 (STMT_VINFO_REDUC_DEF (stmt_info)))
2693 = STMT_VINFO_DEF_TYPE (stmt_info);
2694 }
2695 }
2696 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2697 !gsi_end_p (si); gsi_next (&si))
2698 {
2699 if (is_gimple_debug (gsi_stmt (si)))
2700 continue;
2701 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2702 STMT_SLP_TYPE (stmt_info) = loop_vect;
2703 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2704 {
2705 stmt_vec_info pattern_stmt_info
2706 = STMT_VINFO_RELATED_STMT (stmt_info);
2707 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2708 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2709
2710 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2711 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2712 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2713 !gsi_end_p (pi); gsi_next (&pi))
2714 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2715 = loop_vect;
2716 }
2717 }
2718 }
2719 /* Free optimized alias test DDRS. */
2720 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2721 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2722 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2723 /* Reset target cost data. */
2724 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2725 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2726 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2727 /* Reset accumulated rgroup information. */
2728 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2729 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2730 /* Reset assorted flags. */
2731 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2732 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2733 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2734 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2735 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2736 = saved_can_use_partial_vectors_p;
2737
2738 goto start_over;
2739 }
2740
2741 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2742 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2743 OLD_LOOP_VINFO is better unless something specifically indicates
2744 otherwise.
2745
2746 Note that this deliberately isn't a partial order. */
2747
2748 static bool
2749 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2750 loop_vec_info old_loop_vinfo)
2751 {
2752 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2753 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2754
2755 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2756 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2757
2758 /* Always prefer a VF of loop->simdlen over any other VF. */
2759 if (loop->simdlen)
2760 {
2761 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2762 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2763 if (new_simdlen_p != old_simdlen_p)
2764 return new_simdlen_p;
2765 }
2766
2767 /* Limit the VFs to what is likely to be the maximum number of iterations,
2768 to handle cases in which at least one loop_vinfo is fully-masked. */
2769 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2770 if (estimated_max_niter != -1)
2771 {
2772 if (known_le (estimated_max_niter, new_vf))
2773 new_vf = estimated_max_niter;
2774 if (known_le (estimated_max_niter, old_vf))
2775 old_vf = estimated_max_niter;
2776 }
2777
2778 /* Check whether the (fractional) cost per scalar iteration is lower
2779 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
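/* Cross-multiplying (new_inside_cost * old_vf vs. old_inside_cost * new_vf)
   compares the two fractions without having to divide poly_int values.  */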
2780 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2781 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2782
2783 HOST_WIDE_INT est_rel_new_min
2784 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2785 HOST_WIDE_INT est_rel_new_max
2786 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2787
2788 HOST_WIDE_INT est_rel_old_min
2789 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2790 HOST_WIDE_INT est_rel_old_max
2791 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2792
2793 /* Check first if we can make out an unambiguous total order from the minimum
2794 and maximum estimates. */
2795 if (est_rel_new_min < est_rel_old_min
2796 && est_rel_new_max < est_rel_old_max)
2797 return true;
2798 else if (est_rel_old_min < est_rel_new_min
2799 && est_rel_old_max < est_rel_new_max)
2800 return false;
2801 /* When old_loop_vinfo uses a variable vectorization factor,
2802 we know that it has a lower cost for at least one runtime VF.
2803 However, we don't know how likely that VF is.
2804
2805 One option would be to compare the costs for the estimated VFs.
2806 The problem is that that can put too much pressure on the cost
2807 model. E.g. if the estimated VF is also the lowest possible VF,
2808 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2809 for the estimated VF, we'd then choose new_loop_vinfo even
2810 though (a) new_loop_vinfo might not actually be better than
2811 old_loop_vinfo for that VF and (b) it would be significantly
2812 worse at larger VFs.
2813
2814 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2815 no more expensive than old_loop_vinfo even after doubling the
2816 estimated old_loop_vinfo VF. For all but trivial loops, this
2817 ensures that we only pick new_loop_vinfo if it is significantly
2818 better than old_loop_vinfo at the estimated VF. */
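/* For example (illustrative numbers): with new_loop_vinfo at a fixed VF
   of 4 and an inside cost of 8, and old_loop_vinfo at a variable VF whose
   likely value is 8 and an inside cost of 24, rel_new_likely = 8 * 8 = 64
   and rel_old_likely = 24 * 4 = 96.  Since 64 * 2 > 96 we keep
   old_loop_vinfo, even though new_loop_vinfo is cheaper per scalar
   iteration at the likely VF.  */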
2819
2820 if (est_rel_old_min != est_rel_new_min
2821 || est_rel_old_max != est_rel_new_max)
2822 {
2823 HOST_WIDE_INT est_rel_new_likely
2824 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2825 HOST_WIDE_INT est_rel_old_likely
2826 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2827
2828 return est_rel_new_likely * 2 <= est_rel_old_likely;
2829 }
2830
2831 /* If there's nothing to choose between the loop bodies, see whether
2832 there's a difference in the prologue and epilogue costs. */
2833 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2834 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2835
2836 return false;
2837 }
2838
2839 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2840 true if we should. */
2841
2842 static bool
2843 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2844 loop_vec_info old_loop_vinfo)
2845 {
2846 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2847 return false;
2848
2849 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_NOTE, vect_location,
2851 "***** Preferring vector mode %s to vector mode %s\n",
2852 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2853 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2854 return true;
2855 }
2856
2857 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2858 try to reanalyze it as a main loop. Return the loop_vinfo on success
2859 and null on failure. */
2860
2861 static loop_vec_info
2862 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2863 {
2864 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2865 return loop_vinfo;
2866
2867 if (dump_enabled_p ())
2868 dump_printf_loc (MSG_NOTE, vect_location,
2869 "***** Reanalyzing as a main loop with vector mode %s\n",
2870 GET_MODE_NAME (loop_vinfo->vector_mode));
2871
2872 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2873 vec_info_shared *shared = loop_vinfo->shared;
2874 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2875 gcc_assert (main_loop_vinfo);
2876
2877 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2878
2879 bool fatal = false;
2880 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2881 loop->aux = NULL;
2882 if (!res)
2883 {
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE, vect_location,
2886 "***** Failed to analyze main loop with vector"
2887 " mode %s\n",
2888 GET_MODE_NAME (loop_vinfo->vector_mode));
2889 delete main_loop_vinfo;
2890 return NULL;
2891 }
2892 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2893 return main_loop_vinfo;
2894 }
2895
2896 /* Function vect_analyze_loop.
2897
2898 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2899 for it. The different analyses will record information in the
2900 loop_vec_info struct. */
2901 opt_loop_vec_info
2902 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2903 {
2904 auto_vector_modes vector_modes;
2905
2906 /* Autodetect the first vector size we try. */
2907 unsigned int autovec_flags
2908 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2909 loop->simdlen != 0);
2910 unsigned int mode_i = 0;
2911
2912 DUMP_VECT_SCOPE ("analyze_loop_nest");
2913
2914 if (loop_outer (loop)
2915 && loop_vec_info_for_loop (loop_outer (loop))
2916 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2917 return opt_loop_vec_info::failure_at (vect_location,
2918 "outer-loop already vectorized.\n");
2919
2920 if (!find_loop_nest (loop, &shared->loop_nest))
2921 return opt_loop_vec_info::failure_at
2922 (vect_location,
2923 "not vectorized: loop nest containing two or more consecutive inner"
2924 " loops cannot be vectorized\n");
2925
2926 unsigned n_stmts = 0;
2927 machine_mode autodetected_vector_mode = VOIDmode;
2928 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2929 machine_mode next_vector_mode = VOIDmode;
2930 poly_uint64 lowest_th = 0;
2931 unsigned vectorized_loops = 0;
2932 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2933 && !unlimited_cost_model (loop));
2934
2935 bool vect_epilogues = false;
2936 opt_result res = opt_result::success ();
2937 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2938 while (1)
2939 {
2940 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2941 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2942 if (!loop_vinfo)
2943 {
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2946 "bad loop form.\n");
2947 gcc_checking_assert (first_loop_vinfo == NULL);
2948 return loop_vinfo;
2949 }
2950 loop_vinfo->vector_mode = next_vector_mode;
2951
2952 bool fatal = false;
2953
2954 /* When pick_lowest_cost_p is true, we should in principle iterate
2955 over all the loop_vec_infos that LOOP_VINFO could replace and
2956 try to vectorize LOOP_VINFO under the same conditions.
2957 E.g. when trying to replace an epilogue loop, we should vectorize
2958 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2959 to replace the main loop, we should vectorize LOOP_VINFO as a main
2960 loop too.
2961
2962 However, autovectorize_vector_modes is usually sorted as follows:
2963
2964 - Modes that naturally produce lower VFs usually follow modes that
2965 naturally produce higher VFs.
2966
2967 - When modes naturally produce the same VF, maskable modes
2968 usually follow unmaskable ones, so that the maskable mode
2969 can be used to vectorize the epilogue of the unmaskable mode.
2970
2971 This order is preferred because it leads to the maximum
2972 epilogue vectorization opportunities. Targets should only use
2973 a different order if they want to make wide modes available while
2974 disparaging them relative to earlier, smaller modes. The assumption
2975 in that case is that the wider modes are more expensive in some
2976 way that isn't reflected directly in the costs.
2977
2978 There should therefore be few interesting cases in which
2979 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2980 treated as a standalone loop, and ends up being genuinely cheaper
2981 than FIRST_LOOP_VINFO. */
2982 if (vect_epilogues)
2983 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2984
2985 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2986 if (mode_i == 0)
2987 autodetected_vector_mode = loop_vinfo->vector_mode;
2988 if (dump_enabled_p ())
2989 {
2990 if (res)
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "***** Analysis succeeded with vector mode %s\n",
2993 GET_MODE_NAME (loop_vinfo->vector_mode));
2994 else
2995 dump_printf_loc (MSG_NOTE, vect_location,
2996 "***** Analysis failed with vector mode %s\n",
2997 GET_MODE_NAME (loop_vinfo->vector_mode));
2998 }
2999
3000 loop->aux = NULL;
3001
3002 if (!fatal)
3003 while (mode_i < vector_modes.length ()
3004 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3005 {
3006 if (dump_enabled_p ())
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "***** The result for vector mode %s would"
3009 " be the same\n",
3010 GET_MODE_NAME (vector_modes[mode_i]));
3011 mode_i += 1;
3012 }
3013
3014 if (res)
3015 {
3016 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3017 vectorized_loops++;
3018
3019 /* Once we hit the desired simdlen for the first time,
3020 discard any previous attempts. */
3021 if (simdlen
3022 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3023 {
3024 delete first_loop_vinfo;
3025 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3026 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3027 simdlen = 0;
3028 }
3029 else if (pick_lowest_cost_p && first_loop_vinfo)
3030 {
3031 /* Keep trying to roll back vectorization attempts while the
3032 loop_vec_infos they produced were worse than this one. */
3033 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3034 while (!vinfos.is_empty ()
3035 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3036 {
3037 gcc_assert (vect_epilogues);
3038 delete vinfos.pop ();
3039 }
3040 if (vinfos.is_empty ()
3041 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3042 {
3043 loop_vec_info main_loop_vinfo
3044 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3045 if (main_loop_vinfo == loop_vinfo)
3046 {
3047 delete first_loop_vinfo;
3048 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3049 }
3050 else if (main_loop_vinfo
3051 && vect_joust_loop_vinfos (main_loop_vinfo,
3052 first_loop_vinfo))
3053 {
3054 delete first_loop_vinfo;
3055 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3056 delete loop_vinfo;
3057 loop_vinfo
3058 = opt_loop_vec_info::success (main_loop_vinfo);
3059 }
3060 else
3061 delete main_loop_vinfo;
3062 }
3063 }
3064
3065 if (first_loop_vinfo == NULL)
3066 {
3067 first_loop_vinfo = loop_vinfo;
3068 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3069 }
3070 else if (vect_epilogues
3071 /* For now only allow one epilogue loop. */
3072 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3073 {
3074 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3075 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3076 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3077 || maybe_ne (lowest_th, 0U));
3078 /* Keep track of the known smallest versioning
3079 threshold. */
3080 if (ordered_p (lowest_th, th))
3081 lowest_th = ordered_min (lowest_th, th);
3082 }
3083 else
3084 {
3085 delete loop_vinfo;
3086 loop_vinfo = opt_loop_vec_info::success (NULL);
3087 }
3088
3089 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3090 enabled, SIMDUID is not set, it is the innermost loop and we have
3091 either already found the loop's SIMDLEN or there was no SIMDLEN to
3092 begin with.
3093 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3094 vect_epilogues = (!simdlen
3095 && loop->inner == NULL
3096 && param_vect_epilogues_nomask
3097 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3098 && !loop->simduid
3099 /* For now only allow one epilogue loop, but allow
3100 pick_lowest_cost_p to replace it. */
3101 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3102 || pick_lowest_cost_p));
3103
3104 /* Commit to first_loop_vinfo if we have no reason to try
3105 alternatives. */
3106 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3107 break;
3108 }
3109 else
3110 {
3111 delete loop_vinfo;
3112 loop_vinfo = opt_loop_vec_info::success (NULL);
3113 if (fatal)
3114 {
3115 gcc_checking_assert (first_loop_vinfo == NULL);
3116 break;
3117 }
3118 }
3119
3120 /* Handle the case in which the original loop can use partial
3121 vectorization, but we only want to adopt it for the epilogue.
3122 The retry should use the same vector mode as the original. */
3123 if (vect_epilogues
3124 && loop_vinfo
3125 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3126 {
3127 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3128 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3129 if (dump_enabled_p ())
3130 dump_printf_loc (MSG_NOTE, vect_location,
3131 "***** Re-trying analysis with same vector mode"
3132 " %s for epilogue with partial vectors.\n",
3133 GET_MODE_NAME (loop_vinfo->vector_mode));
3134 continue;
3135 }
3136
3137 if (mode_i < vector_modes.length ()
3138 && VECTOR_MODE_P (autodetected_vector_mode)
3139 && (related_vector_mode (vector_modes[mode_i],
3140 GET_MODE_INNER (autodetected_vector_mode))
3141 == autodetected_vector_mode)
3142 && (related_vector_mode (autodetected_vector_mode,
3143 GET_MODE_INNER (vector_modes[mode_i]))
3144 == vector_modes[mode_i]))
3145 {
3146 if (dump_enabled_p ())
3147 dump_printf_loc (MSG_NOTE, vect_location,
3148 "***** Skipping vector mode %s, which would"
3149 " repeat the analysis for %s\n",
3150 GET_MODE_NAME (vector_modes[mode_i]),
3151 GET_MODE_NAME (autodetected_vector_mode));
3152 mode_i += 1;
3153 }
3154
3155 if (mode_i == vector_modes.length ()
3156 || autodetected_vector_mode == VOIDmode)
3157 break;
3158
3159 /* Try the next biggest vector size. */
3160 next_vector_mode = vector_modes[mode_i++];
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_NOTE, vect_location,
3163 "***** Re-trying analysis with vector mode %s\n",
3164 GET_MODE_NAME (next_vector_mode));
3165 }
3166
3167 if (first_loop_vinfo)
3168 {
3169 loop->aux = (loop_vec_info) first_loop_vinfo;
3170 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_NOTE, vect_location,
3172 "***** Choosing vector mode %s\n",
3173 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3174 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3175 return first_loop_vinfo;
3176 }
3177
3178 return opt_loop_vec_info::propagate_failure (res);
3179 }
3180
3181 /* Return true if there is an in-order reduction function for CODE, storing
3182 it in *REDUC_FN if so. */
3183
3184 static bool
3185 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3186 {
3187 switch (code)
3188 {
3189 case PLUS_EXPR:
3190 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3191 return true;
3192
3193 default:
3194 return false;
3195 }
3196 }
3197
3198 /* Function reduction_fn_for_scalar_code
3199
3200 Input:
3201 CODE - tree_code of a reduction operation.
3202
3203 Output:
3204 REDUC_FN - the corresponding internal function to be used to reduce the
3205 vector of partial results into a single scalar result, or IFN_LAST
3206 if the operation is a supported reduction operation, but does not have
3207 such an internal function.
3208
3209 Return FALSE if CODE currently cannot be vectorized as reduction. */
3210
3211 static bool
3212 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3213 {
3214 switch (code)
3215 {
3216 case MAX_EXPR:
3217 *reduc_fn = IFN_REDUC_MAX;
3218 return true;
3219
3220 case MIN_EXPR:
3221 *reduc_fn = IFN_REDUC_MIN;
3222 return true;
3223
3224 case PLUS_EXPR:
3225 *reduc_fn = IFN_REDUC_PLUS;
3226 return true;
3227
3228 case BIT_AND_EXPR:
3229 *reduc_fn = IFN_REDUC_AND;
3230 return true;
3231
3232 case BIT_IOR_EXPR:
3233 *reduc_fn = IFN_REDUC_IOR;
3234 return true;
3235
3236 case BIT_XOR_EXPR:
3237 *reduc_fn = IFN_REDUC_XOR;
3238 return true;
3239
3240 case MULT_EXPR:
3241 case MINUS_EXPR:
3242 *reduc_fn = IFN_LAST;
3243 return true;
3244
3245 default:
3246 return false;
3247 }
3248 }
3249
3250 /* If there is a neutral value X such that SLP reduction NODE would not
3251 be affected by the introduction of additional X elements, return that X,
3252 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3253 is the vector type that would hold element X. REDUC_CHAIN is true if
3254 the SLP statements perform a single reduction, false if each statement
3255 performs an independent reduction. */
3256
3257 static tree
3258 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3259 tree_code code, bool reduc_chain)
3260 {
3261 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3262 stmt_vec_info stmt_vinfo = stmts[0];
3263 tree scalar_type = TREE_TYPE (vector_type);
3264 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3265 gcc_assert (loop);
3266
3267 switch (code)
3268 {
3269 case WIDEN_SUM_EXPR:
3270 case DOT_PROD_EXPR:
3271 case SAD_EXPR:
3272 case PLUS_EXPR:
3273 case MINUS_EXPR:
3274 case BIT_IOR_EXPR:
3275 case BIT_XOR_EXPR:
3276 return build_zero_cst (scalar_type);
3277
3278 case MULT_EXPR:
3279 return build_one_cst (scalar_type);
3280
3281 case BIT_AND_EXPR:
3282 return build_all_ones_cst (scalar_type);
3283
3284 case MAX_EXPR:
3285 case MIN_EXPR:
3286 /* For MIN/MAX the initial values are neutral. A reduction chain
3287 has only a single initial value, so that value is neutral for
3288 all statements. */
3289 if (reduc_chain)
3290 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3291 loop_preheader_edge (loop));
3292 return NULL_TREE;
3293
3294 default:
3295 return NULL_TREE;
3296 }
3297 }
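/* For illustration (hypothetical scalar source, the names below are ours):
   in

       int s = 5;
       for (int i = 0; i < n; ++i)
	 s += a[i];

   extra 0 elements mixed into the vector of partial sums leave the result
   unchanged, so 0 is the neutral value for PLUS_EXPR; for a MIN/MAX
   reduction only the loop's own initial value (5 here) is known to be
   neutral, which is why the MIN/MAX case above uses the preheader PHI
   argument for reduction chains and gives up otherwise.  */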
3298
3299 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3300 STMT is printed with a message MSG. */
3301
3302 static void
3303 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3304 {
3305 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3306 }
3307
3308 /* Return true if we need an in-order reduction for operation CODE
3309 on type TYPE. */
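/* For example (illustrative source, not taken from any testcase):

       double s = 0.0;
       for (int i = 0; i < n; ++i)
	 s += a[i];

   needs a fold-left (in-order) reduction unless -fassociative-math is in
   effect, because reassociating the additions can change the rounded
   result; the same loop over a wrapping unsigned type can be reduced in
   any order.  */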
3311
3312 bool
3313 needs_fold_left_reduction_p (tree type, tree_code code)
3314 {
3315 /* CHECKME: check for !flag_finite_math_only too? */
3316 if (SCALAR_FLOAT_TYPE_P (type))
3317 switch (code)
3318 {
3319 case MIN_EXPR:
3320 case MAX_EXPR:
3321 return false;
3322
3323 default:
3324 return !flag_associative_math;
3325 }
3326
3327 if (INTEGRAL_TYPE_P (type))
3328 {
3329 if (!operation_no_trapping_overflow (type, code))
3330 return true;
3331 return false;
3332 }
3333
3334 if (SAT_FIXED_POINT_TYPE_P (type))
3335 return true;
3336
3337 return false;
3338 }
3339
3340 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3341 has a handled computation expression. Store the main reduction
3342 operation in *CODE. */
3343
3344 static bool
3345 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3346 tree loop_arg, enum tree_code *code,
3347 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3348 {
3349 auto_bitmap visited;
3350 tree lookfor = PHI_RESULT (phi);
3351 ssa_op_iter curri;
3352 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3353 while (USE_FROM_PTR (curr) != loop_arg)
3354 curr = op_iter_next_use (&curri);
3355 curri.i = curri.numops;
3356 do
3357 {
3358 path.safe_push (std::make_pair (curri, curr));
3359 tree use = USE_FROM_PTR (curr);
3360 if (use == lookfor)
3361 break;
3362 gimple *def = SSA_NAME_DEF_STMT (use);
3363 if (gimple_nop_p (def)
3364 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3365 {
3366 pop:
3367 do
3368 {
3369 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3370 curri = x.first;
3371 curr = x.second;
3372 do
3373 curr = op_iter_next_use (&curri);
3374 /* Skip already visited or non-SSA operands (from iterating
3375 over PHI args). */
3376 while (curr != NULL_USE_OPERAND_P
3377 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3378 || ! bitmap_set_bit (visited,
3379 SSA_NAME_VERSION
3380 (USE_FROM_PTR (curr)))));
3381 }
3382 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3383 if (curr == NULL_USE_OPERAND_P)
3384 break;
3385 }
3386 else
3387 {
3388 if (gimple_code (def) == GIMPLE_PHI)
3389 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3390 else
3391 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3392 while (curr != NULL_USE_OPERAND_P
3393 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3394 || ! bitmap_set_bit (visited,
3395 SSA_NAME_VERSION
3396 (USE_FROM_PTR (curr)))))
3397 curr = op_iter_next_use (&curri);
3398 if (curr == NULL_USE_OPERAND_P)
3399 goto pop;
3400 }
3401 }
3402 while (1);
3403 if (dump_file && (dump_flags & TDF_DETAILS))
3404 {
3405 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3406 unsigned i;
3407 std::pair<ssa_op_iter, use_operand_p> *x;
3408 FOR_EACH_VEC_ELT (path, i, x)
3409 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3410 dump_printf (MSG_NOTE, "\n");
3411 }
3412
3413 /* Check whether the reduction path detected is valid. */
3414 bool fail = path.length () == 0;
3415 bool neg = false;
3416 int sign = -1;
3417 *code = ERROR_MARK;
3418 for (unsigned i = 1; i < path.length (); ++i)
3419 {
3420 gimple *use_stmt = USE_STMT (path[i].second);
3421 tree op = USE_FROM_PTR (path[i].second);
3422 if (! is_gimple_assign (use_stmt)
3423 /* The following makes sure we can compute the operand index
3424 easily, plus it mostly disallows chaining via COND_EXPR condition
3425 operands. */
3426 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3427 && (gimple_num_ops (use_stmt) <= 2
3428 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3429 && (gimple_num_ops (use_stmt) <= 3
3430 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3431 {
3432 fail = true;
3433 break;
3434 }
3435 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3436 if (use_code == MINUS_EXPR)
3437 {
3438 use_code = PLUS_EXPR;
3439 /* Track whether we negate the reduction value each iteration. */
3440 if (gimple_assign_rhs2 (use_stmt) == op)
3441 neg = ! neg;
3442 }
3443 if (CONVERT_EXPR_CODE_P (use_code)
3444 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3445 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3446 ;
3447 else if (*code == ERROR_MARK)
3448 {
3449 *code = use_code;
3450 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3451 }
3452 else if (use_code != *code)
3453 {
3454 fail = true;
3455 break;
3456 }
3457 else if ((use_code == MIN_EXPR
3458 || use_code == MAX_EXPR)
3459 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3460 {
3461 fail = true;
3462 break;
3463 }
3464 /* Check that the op is used on only a single stmt. For the
3465 non-value-changing tail and the last stmt, allow out-of-loop uses.
3466 ??? We could relax this and handle arbitrary live stmts by
3467 forcing a scalar epilogue for example. */
3468 imm_use_iterator imm_iter;
3469 gimple *op_use_stmt;
3470 unsigned cnt = 0;
3471 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3472 if (!is_gimple_debug (op_use_stmt)
3473 && (*code != ERROR_MARK
3474 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3475 {
3476 /* We want to allow x + x but not x < 1 ? x : 2. */
3477 if (is_gimple_assign (op_use_stmt)
3478 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3479 {
3480 use_operand_p use_p;
3481 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3482 cnt++;
3483 }
3484 else
3485 cnt++;
3486 }
3487 if (cnt != 1)
3488 {
3489 fail = true;
3490 break;
3491 }
3492 }
3493 return ! fail && ! neg && *code != ERROR_MARK;
3494 }
3495
3496 bool
3497 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3498 tree loop_arg, enum tree_code code)
3499 {
3500 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3501 enum tree_code code_;
3502 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3503 && code_ == code);
3504 }
3505
3506
3507
3508 /* Function vect_is_simple_reduction
3509
3510 (1) Detect a cross-iteration def-use cycle that represents a simple
3511 reduction computation. We look for the following pattern:
3512
3513 loop_header:
3514 a1 = phi < a0, a2 >
3515 a3 = ...
3516 a2 = operation (a3, a1)
3517
3518 or
3519
3520 a3 = ...
3521 loop_header:
3522 a1 = phi < a0, a2 >
3523 a2 = operation (a3, a1)
3524
3525 such that:
3526 1. operation is commutative and associative and it is safe to
3527 change the order of the computation
3528 2. no uses for a2 in the loop (a2 is used out of the loop)
3529 3. no uses of a1 in the loop besides the reduction operation
3530 4. no uses of a1 outside the loop.
3531
3532 Conditions 1,4 are tested here.
3533 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3534
3535 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3536 nested cycles.
3537
3538 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3539 reductions:
3540
3541 a1 = phi < a0, a2 >
3542 inner loop (def of a3)
3543 a2 = phi < a3 >
3544
3545 (4) Detect condition expressions, ie:
3546 for (int i = 0; i < N; i++)
3547 if (a[i] < val)
3548 ret_val = a[i];
3549
3550 */
3551
3552 static stmt_vec_info
3553 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3554 bool *double_reduc, bool *reduc_chain_p)
3555 {
3556 gphi *phi = as_a <gphi *> (phi_info->stmt);
3557 gimple *phi_use_stmt = NULL;
3558 imm_use_iterator imm_iter;
3559 use_operand_p use_p;
3560
3561 *double_reduc = false;
3562 *reduc_chain_p = false;
3563 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3564
3565 tree phi_name = PHI_RESULT (phi);
3566 /* ??? If there are no uses of the PHI result the inner loop reduction
3567 won't be detected as possibly double-reduction by vectorizable_reduction
3568 because that tries to walk the PHI arg from the preheader edge which
3569 can be constant. See PR60382. */
3570 if (has_zero_uses (phi_name))
3571 return NULL;
3572 class loop *loop = (gimple_bb (phi))->loop_father;
3573 unsigned nphi_def_loop_uses = 0;
3574 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3575 {
3576 gimple *use_stmt = USE_STMT (use_p);
3577 if (is_gimple_debug (use_stmt))
3578 continue;
3579
3580 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3581 {
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "intermediate value used outside loop.\n");
3585
3586 return NULL;
3587 }
3588
3589 nphi_def_loop_uses++;
3590 phi_use_stmt = use_stmt;
3591 }
3592
3593 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3594 if (TREE_CODE (latch_def) != SSA_NAME)
3595 {
3596 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 "reduction: not ssa_name: %T\n", latch_def);
3599 return NULL;
3600 }
3601
3602 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3603 if (!def_stmt_info
3604 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3605 return NULL;
3606
3607 bool nested_in_vect_loop
3608 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3609 unsigned nlatch_def_loop_uses = 0;
3610 auto_vec<gphi *, 3> lcphis;
3611 bool inner_loop_of_double_reduc = false;
3612 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3613 {
3614 gimple *use_stmt = USE_STMT (use_p);
3615 if (is_gimple_debug (use_stmt))
3616 continue;
3617 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3618 nlatch_def_loop_uses++;
3619 else
3620 {
3621 /* We can have more than one loop-closed PHI. */
3622 lcphis.safe_push (as_a <gphi *> (use_stmt));
3623 if (nested_in_vect_loop
3624 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3625 == vect_double_reduction_def))
3626 inner_loop_of_double_reduc = true;
3627 }
3628 }
3629
3630 /* If we are vectorizing an inner reduction, we execute it in the
3631 original order only when we are not dealing with a double
3632 reduction. */
3633 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3634 {
3635 if (dump_enabled_p ())
3636 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3637 "detected nested cycle: ");
3638 return def_stmt_info;
3639 }
3640
3641 /* If this isn't a nested cycle or if the nested cycle reduction value
3642 is used outside of the inner loop we cannot handle uses of the reduction
3643 value. */
3644 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3645 {
3646 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3648 "reduction used in loop.\n");
3649 return NULL;
3650 }
3651
3652 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3653 defined in the inner loop. */
3654 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3655 {
3656 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3657 if (gimple_phi_num_args (def_stmt) != 1
3658 || TREE_CODE (op1) != SSA_NAME)
3659 {
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3662 "unsupported phi node definition.\n");
3663
3664 return NULL;
3665 }
3666
3667 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3668 if (gimple_bb (def1)
3669 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3670 && loop->inner
3671 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3672 && is_gimple_assign (def1)
3673 && is_a <gphi *> (phi_use_stmt)
3674 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3675 {
3676 if (dump_enabled_p ())
3677 report_vect_op (MSG_NOTE, def_stmt,
3678 "detected double reduction: ");
3679
3680 *double_reduc = true;
3681 return def_stmt_info;
3682 }
3683
3684 return NULL;
3685 }
3686
3687 /* Look for the expression computing latch_def from the loop PHI result. */
3688 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3689 enum tree_code code;
3690 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3691 path))
3692 {
3693 STMT_VINFO_REDUC_CODE (phi_info) = code;
3694 if (code == COND_EXPR && !nested_in_vect_loop)
3695 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3696
3697 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3698 reduction chain for which the additional restriction is that
3699 all operations in the chain are the same. */
3700 auto_vec<stmt_vec_info, 8> reduc_chain;
3701 unsigned i;
3702 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3703 for (i = path.length () - 1; i >= 1; --i)
3704 {
3705 gimple *stmt = USE_STMT (path[i].second);
3706 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3707 STMT_VINFO_REDUC_IDX (stmt_info)
3708 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3709 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3710 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3711 && (i == 1 || i == path.length () - 1));
3712 if ((stmt_code != code && !leading_conversion)
3713 /* We can only handle the final value in epilogue
3714 generation for reduction chains. */
3715 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3716 is_slp_reduc = false;
3717 /* For reduction chains we support trailing/leading
3718 conversions. We do not store those in the actual chain. */
3719 if (leading_conversion)
3720 continue;
3721 reduc_chain.safe_push (stmt_info);
3722 }
3723 if (is_slp_reduc && reduc_chain.length () > 1)
3724 {
3725 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3726 {
3727 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3728 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3729 }
3730 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3731 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3732
3733 /* Save the chain for further analysis in SLP detection. */
3734 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3735 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3736
3737 *reduc_chain_p = true;
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "reduction: detected reduction chain\n");
3741 }
3742 else if (dump_enabled_p ())
3743 dump_printf_loc (MSG_NOTE, vect_location,
3744 "reduction: detected reduction\n");
3745
3746 return def_stmt_info;
3747 }
3748
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "reduction: unknown pattern\n");
3752
3753 return NULL;
3754 }
3755
3756 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3757 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3758 or -1 if not known. */
3759
3760 static int
3761 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3762 {
3763 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3764 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3765 {
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_NOTE, vect_location,
3768 "cost model: epilogue peel iters set to vf/2 "
3769 "because loop iterations are unknown .\n");
3770 return assumed_vf / 2;
3771 }
3772 else
3773 {
3774 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3775 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3776 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3777 /* If we need to peel for gaps but the epilogue would otherwise need
3778 no iterations at all, we have to peel VF iterations. */
3779 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3780 peel_iters_epilogue = assumed_vf;
3781 return peel_iters_epilogue;
3782 }
3783 }
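/* A small worked example of the computation above (the numbers are made
   up): with NITERS = 23, PEEL_ITERS_PROLOGUE = 3 and an assumed VF of 8
   the epilogue peels (23 - 3) % 8 = 4 iterations; if the loop also needs
   peeling for gaps and that remainder had been 0, a full VF (here 8)
   iterations would be peeled instead.  */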
3784
3785 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3786 int
3787 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3788 int *peel_iters_epilogue,
3789 stmt_vector_for_cost *scalar_cost_vec,
3790 stmt_vector_for_cost *prologue_cost_vec,
3791 stmt_vector_for_cost *epilogue_cost_vec)
3792 {
3793 int retval = 0;
3794
3795 *peel_iters_epilogue
3796 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3797
3798 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3799 {
3800 /* If peeled iterations are known but the number of scalar loop
3801 iterations is unknown, count a taken branch per peeled loop. */
3802 if (peel_iters_prologue > 0)
3803 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3804 NULL, NULL_TREE, 0, vect_prologue);
3805 if (*peel_iters_epilogue > 0)
3806 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3807 NULL, NULL_TREE, 0, vect_epilogue);
3808 }
3809
3810 stmt_info_for_cost *si;
3811 int j;
3812 if (peel_iters_prologue)
3813 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3814 retval += record_stmt_cost (prologue_cost_vec,
3815 si->count * peel_iters_prologue,
3816 si->kind, si->stmt_info, si->misalign,
3817 vect_prologue);
3818 if (*peel_iters_epilogue)
3819 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3820 retval += record_stmt_cost (epilogue_cost_vec,
3821 si->count * *peel_iters_epilogue,
3822 si->kind, si->stmt_info, si->misalign,
3823 vect_epilogue);
3824
3825 return retval;
3826 }
3827
3828 /* Function vect_estimate_min_profitable_iters
3829
3830 Return the number of iterations required for the vector version of the
3831 loop to be profitable relative to the cost of the scalar version of the
3832 loop.
3833
3834 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3835 of iterations for vectorization. -1 value means loop vectorization
3836 is not profitable. This returned value may be used for dynamic
3837 profitability check.
3838
3839 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3840 for static check against estimated number of iterations. */
3841
3842 static void
3843 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3844 int *ret_min_profitable_niters,
3845 int *ret_min_profitable_estimate)
3846 {
3847 int min_profitable_iters;
3848 int min_profitable_estimate;
3849 int peel_iters_prologue;
3850 int peel_iters_epilogue;
3851 unsigned vec_inside_cost = 0;
3852 int vec_outside_cost = 0;
3853 unsigned vec_prologue_cost = 0;
3854 unsigned vec_epilogue_cost = 0;
3855 int scalar_single_iter_cost = 0;
3856 int scalar_outside_cost = 0;
3857 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3858 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3859 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3860
3861 /* Cost model disabled. */
3862 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3863 {
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3866 *ret_min_profitable_niters = 0;
3867 *ret_min_profitable_estimate = 0;
3868 return;
3869 }
3870
3871 /* Requires loop versioning tests to handle misalignment. */
3872 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3873 {
3874 /* FIXME: Make cost depend on complexity of individual check. */
3875 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3876 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3877 NULL, NULL_TREE, 0, vect_prologue);
3878 if (dump_enabled_p ())
3879 dump_printf (MSG_NOTE,
3880 "cost model: Adding cost of checks for loop "
3881 "versioning to treat misalignment.\n");
3882 }
3883
3884 /* Requires loop versioning with alias checks. */
3885 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3886 {
3887 /* FIXME: Make cost depend on complexity of individual check. */
3888 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3889 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3890 NULL, NULL_TREE, 0, vect_prologue);
3891 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3892 if (len)
3893 /* Count LEN - 1 ANDs and LEN comparisons. */
3894 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3895 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3896 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3897 if (len)
3898 {
3899 /* Count LEN - 1 ANDs and LEN comparisons. */
3900 unsigned int nstmts = len * 2 - 1;
3901 /* +1 for each bias that needs adding. */
3902 for (unsigned int i = 0; i < len; ++i)
3903 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3904 nstmts += 1;
3905 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3906 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3907 }
3908 if (dump_enabled_p ())
3909 dump_printf (MSG_NOTE,
3910 "cost model: Adding cost of checks for loop "
3911 "versioning aliasing.\n");
3912 }
3913
3914 /* Requires loop versioning with niter checks. */
3915 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3916 {
3917 /* FIXME: Make cost depend on complexity of individual check. */
3918 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3919 NULL, NULL_TREE, 0, vect_prologue);
3920 if (dump_enabled_p ())
3921 dump_printf (MSG_NOTE,
3922 "cost model: Adding cost of checks for loop "
3923 "versioning niters.\n");
3924 }
3925
3926 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3927 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3928 NULL, NULL_TREE, 0, vect_prologue);
3929
3930 /* Count statements in the scalar loop. This is used as the scalar cost
3931 of a single iteration for now.
3932
3933 TODO: Add outer loop support.
3934
3935 TODO: Consider assigning different costs to different scalar
3936 statements. */
3937
3938 scalar_single_iter_cost
3939 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3940
3941 /* Add additional cost for the peeled instructions in prologue and epilogue
3942 loop. (For fully-masked loops there will be no peeling.)
3943
3944 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3945 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3946
3947 TODO: Build an expression that represents peel_iters for prologue and
3948 epilogue to be used in a run-time test. */
3949
3950 bool prologue_need_br_taken_cost = false;
3951 bool prologue_need_br_not_taken_cost = false;
3952
3953 /* Calculate peel_iters_prologue. */
3954 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3955 peel_iters_prologue = 0;
3956 else if (npeel < 0)
3957 {
3958 peel_iters_prologue = assumed_vf / 2;
3959 if (dump_enabled_p ())
3960 dump_printf (MSG_NOTE, "cost model: "
3961 "prologue peel iters set to vf/2.\n");
3962
3963 /* If peeled iterations are unknown, count a taken branch and a not taken
3964 branch per peeled loop. Even if scalar loop iterations are known,
3965 vector iterations are not known since peeled prologue iterations are
3966 not known. Hence guards remain the same. */
3967 prologue_need_br_taken_cost = true;
3968 prologue_need_br_not_taken_cost = true;
3969 }
3970 else
3971 {
3972 peel_iters_prologue = npeel;
3973 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3974 /* If peeled iterations are known but the number of scalar loop
3975 iterations is unknown, count a taken branch per peeled loop. */
3976 prologue_need_br_taken_cost = true;
3977 }
3978
3979 bool epilogue_need_br_taken_cost = false;
3980 bool epilogue_need_br_not_taken_cost = false;
3981
3982 /* Calculate peel_iters_epilogue. */
3983 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3984 /* We need to peel exactly one iteration for gaps. */
3985 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3986 else if (npeel < 0)
3987 {
3988 /* If peeling for alignment is unknown, the loop bound of the main
3989 loop becomes unknown. */
3990 peel_iters_epilogue = assumed_vf / 2;
3991 if (dump_enabled_p ())
3992 dump_printf (MSG_NOTE, "cost model: "
3993 "epilogue peel iters set to vf/2 because "
3994 "peeling for alignment is unknown.\n");
3995
3996 /* See the same reason above in peel_iters_prologue calculation. */
3997 epilogue_need_br_taken_cost = true;
3998 epilogue_need_br_not_taken_cost = true;
3999 }
4000 else
4001 {
4002 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4003 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4004 /* If peeled iterations are known but the number of scalar loop
4005 iterations is unknown, count a taken branch per peeled loop. */
4006 epilogue_need_br_taken_cost = true;
4007 }
4008
4009 stmt_info_for_cost *si;
4010 int j;
4011 /* Add costs associated with peel_iters_prologue. */
4012 if (peel_iters_prologue)
4013 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4014 {
4015 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4016 si->count * peel_iters_prologue, si->kind,
4017 si->stmt_info, si->vectype, si->misalign,
4018 vect_prologue);
4019 }
4020
4021 /* Add costs associated with peel_iters_epilogue. */
4022 if (peel_iters_epilogue)
4023 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4024 {
4025 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4026 si->count * peel_iters_epilogue, si->kind,
4027 si->stmt_info, si->vectype, si->misalign,
4028 vect_epilogue);
4029 }
4030
4031 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4032
4033 if (prologue_need_br_taken_cost)
4034 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4035 NULL, NULL_TREE, 0, vect_prologue);
4036
4037 if (prologue_need_br_not_taken_cost)
4038 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4039 cond_branch_not_taken, NULL, NULL_TREE, 0,
4040 vect_prologue);
4041
4042 if (epilogue_need_br_taken_cost)
4043 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4044 NULL, NULL_TREE, 0, vect_epilogue);
4045
4046 if (epilogue_need_br_not_taken_cost)
4047 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4048 cond_branch_not_taken, NULL, NULL_TREE, 0,
4049 vect_epilogue);
4050
4051 /* Take care of special costs for rgroup controls of partial vectors. */
4052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4053 {
4054 /* Calculate how many masks we need to generate. */
4055 unsigned int num_masks = 0;
4056 rgroup_controls *rgm;
4057 unsigned int num_vectors_m1;
4058 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4059 if (rgm->type)
4060 num_masks += num_vectors_m1 + 1;
4061 gcc_assert (num_masks > 0);
4062
4063 /* In the worst case, we need to generate each mask in the prologue
4064 and in the loop body. One of the loop body mask instructions
4065 replaces the comparison in the scalar loop, and since we don't
4066 count the scalar comparison against the scalar body, we shouldn't
4067 count that vector instruction against the vector body either.
4068
4069 Sometimes we can use unpacks instead of generating prologue
4070 masks and sometimes the prologue mask will fold to a constant,
4071 so the actual prologue cost might be smaller. However, it's
4072 simpler and safer to use the worst-case cost; if this ends up
4073 being the tie-breaker between vectorizing or not, then it's
4074 probably better not to vectorize. */
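      /* As a made-up example: two mask rgroups needing 1 and 2 vectors
	 respectively give num_masks = 3, so in this worst case we cost
	 three mask-generating statements in the prologue and two in the
	 loop body.  */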
4075 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4076 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4077 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4078 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4079 }
4080 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4081 {
4082 /* Referring to the functions vect_set_loop_condition_partial_vectors
4083 and vect_set_loop_controls_directly, we need to generate each
4084 length in the prologue and in the loop body if required. Although
4085 there are some possible optimizations, we consider the worst case
4086 here. */
4087
4088 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4089 bool need_iterate_p
4090 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4091 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4092
4093 /* Calculate how many statements to be added. */
4094 unsigned int prologue_stmts = 0;
4095 unsigned int body_stmts = 0;
4096
4097 rgroup_controls *rgc;
4098 unsigned int num_vectors_m1;
4099 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4100 if (rgc->type)
4101 {
4102 /* May need one SHIFT for nitems_total computation. */
4103 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4104 if (nitems != 1 && !niters_known_p)
4105 prologue_stmts += 1;
4106
4107 /* May need one MAX and one MINUS for wrap around. */
4108 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4109 prologue_stmts += 2;
4110
4111 /* Need one MAX and one MINUS for each batch limit except for
4112 the 1st one. */
4113 prologue_stmts += num_vectors_m1 * 2;
4114
4115 unsigned int num_vectors = num_vectors_m1 + 1;
4116
4117 /* Need to set up lengths in prologue, only one MIN required
4118 for each since start index is zero. */
4119 prologue_stmts += num_vectors;
4120
4121 /* Each may need two MINs and one MINUS to update lengths in body
4122 for next iteration. */
4123 if (need_iterate_p)
4124 body_stmts += 3 * num_vectors;
4125 }
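    /* For instance (hypothetical values): a single length rgroup with
       num_vectors_m1 = 0, nitems = 4, unknown niters, a potentially
       wrapping IV and need_iterate_p set would count 1 + 2 + 0 + 1 = 4
       prologue statements and 3 body statements with the scheme above.  */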
4126
4127 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4128 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4129 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4130 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4131 }
4132
4133 /* FORNOW: The scalar outside cost is incremented in one of the
4134 following ways:
4135
4136 1. The vectorizer checks for alignment and aliasing and generates
4137 a condition that allows dynamic vectorization. A cost model
4138 check is ANDED with the versioning condition. Hence scalar code
4139 path now has the added cost of the versioning check.
4140
4141 if (cost > th & versioning_check)
4142 jmp to vector code
4143
4144 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4145
4146 2. The vectorizer then checks if a prologue is required. If the
4147 cost model check was not done before during versioning, it has to
4148 be done before the prologue check.
4149
4150 if (cost <= th)
4151 prologue = scalar_iters
4152 if (prologue == 0)
4153 jmp to vector code
4154 else
4155 execute prologue
4156 if (prologue == num_iters)
4157 go to exit
4158
4159 Hence the run-time scalar cost is incremented by a taken branch,
4160 plus a not-taken branch, plus a taken branch cost.
4161
4162 3. The vectorizer then checks if an epilogue is required. If the
4163 cost model check was not done before during prologue check, it
4164 has to be done with the epilogue check.
4165
4166 if (prologue == 0)
4167 jmp to vector code
4168 else
4169 execute prologue
4170 if (prologue == num_iters)
4171 go to exit
4172 vector code:
4173 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4174 jmp to epilogue
4175
4176 Hence the run-time scalar cost should be incremented by 2 taken
4177 branches.
4178
4179 TODO: The back end may reorder the BBS's differently and reverse
4180 conditions/branch directions. Change the estimates below to
4181 something more reasonable. */
4182
4183 /* If the number of iterations is known and we do not do versioning, we can
4184 decide whether to vectorize at compile time. Hence the scalar version
4185 does not carry cost model guard costs. */
4186 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4187 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4188 {
4189 /* Cost model check occurs at versioning. */
4190 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4191 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4192 else
4193 {
4194 /* Cost model check occurs at prologue generation. */
4195 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4196 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4197 + vect_get_stmt_cost (cond_branch_not_taken);
4198 /* Cost model check occurs at epilogue generation. */
4199 else
4200 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4201 }
4202 }
4203
4204 /* Complete the target-specific cost calculations. */
4205 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4206 &vec_inside_cost, &vec_epilogue_cost);
4207
4208 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4209
4210 /* Stash the costs so that we can compare two loop_vec_infos. */
4211 loop_vinfo->vec_inside_cost = vec_inside_cost;
4212 loop_vinfo->vec_outside_cost = vec_outside_cost;
4213
4214 if (dump_enabled_p ())
4215 {
4216 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4217 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4218 vec_inside_cost);
4219 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4220 vec_prologue_cost);
4221 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4222 vec_epilogue_cost);
4223 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4224 scalar_single_iter_cost);
4225 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4226 scalar_outside_cost);
4227 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4228 vec_outside_cost);
4229 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4230 peel_iters_prologue);
4231 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4232 peel_iters_epilogue);
4233 }
4234
4235 /* Calculate number of iterations required to make the vector version
4236 profitable, relative to the loop bodies only. The following condition
4237 must hold true:
4238 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4239 where
4240 SIC = scalar iteration cost, VIC = vector iteration cost,
4241 VOC = vector outside cost, VF = vectorization factor,
4242 NPEEL = prologue iterations + epilogue iterations,
4243 SOC = scalar outside cost for run time cost model check. */
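/* For instance, with made-up costs SIC = 4, VIC = 6, VOC = 20, SOC = 0,
   VF = 8 and NPEEL = 0 the condition 4 * niters > 6 * (niters / 8) + 20
   first holds at niters = 6; the code further below then raises the
   threshold so that the vector loop executes at least once.  */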
4244
4245 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4246 - vec_inside_cost);
4247 if (saving_per_viter <= 0)
4248 {
4249 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4250 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4251 "vectorization did not happen for a simd loop");
4252
4253 if (dump_enabled_p ())
4254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4255 "cost model: the vector iteration cost = %d "
4256 "divided by the scalar iteration cost = %d "
4257 "is greater or equal to the vectorization factor = %d"
4258 ".\n",
4259 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4260 *ret_min_profitable_niters = -1;
4261 *ret_min_profitable_estimate = -1;
4262 return;
4263 }
4264
4265 /* ??? The "if" arm is written to handle all cases; see below for what
4266 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4267 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4268 {
4269 /* Rewriting the condition above in terms of the number of
4270 vector iterations (vniters) rather than the number of
4271 scalar iterations (niters) gives:
4272
4273 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4274
4275 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4276
4277 For integer N, X and Y when X > 0:
4278
4279 N * X > Y <==> N >= (Y /[floor] X) + 1. */
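      /* E.g. Y = 5 and X = 2 give N >= 5/2 + 1 = 3: indeed 3 * 2 = 6 > 5
	 while 2 * 2 = 4 is not.  */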
4280 int outside_overhead = (vec_outside_cost
4281 - scalar_single_iter_cost * peel_iters_prologue
4282 - scalar_single_iter_cost * peel_iters_epilogue
4283 - scalar_outside_cost);
4284 /* We're only interested in cases that require at least one
4285 vector iteration. */
4286 int min_vec_niters = 1;
4287 if (outside_overhead > 0)
4288 min_vec_niters = outside_overhead / saving_per_viter + 1;
4289
4290 if (dump_enabled_p ())
4291 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4292 min_vec_niters);
4293
4294 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4295 {
4296 /* Now that we know the minimum number of vector iterations,
4297 find the minimum niters for which the scalar cost is larger:
4298
4299 SIC * niters > VIC * vniters + VOC - SOC
4300
4301 We know that the minimum niters is no more than
4302 vniters * VF + NPEEL, but it might be (and often is) less
4303 than that if a partial vector iteration is cheaper than the
4304 equivalent scalar code. */
4305 int threshold = (vec_inside_cost * min_vec_niters
4306 + vec_outside_cost
4307 - scalar_outside_cost);
4308 if (threshold <= 0)
4309 min_profitable_iters = 1;
4310 else
4311 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4312 }
4313 else
4314 /* Convert the number of vector iterations into a number of
4315 scalar iterations. */
4316 min_profitable_iters = (min_vec_niters * assumed_vf
4317 + peel_iters_prologue
4318 + peel_iters_epilogue);
4319 }
4320 else
4321 {
4322 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4323 * assumed_vf
4324 - vec_inside_cost * peel_iters_prologue
4325 - vec_inside_cost * peel_iters_epilogue);
4326 if (min_profitable_iters <= 0)
4327 min_profitable_iters = 0;
4328 else
4329 {
4330 min_profitable_iters /= saving_per_viter;
4331
4332 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4333 <= (((int) vec_inside_cost * min_profitable_iters)
4334 + (((int) vec_outside_cost - scalar_outside_cost)
4335 * assumed_vf)))
4336 min_profitable_iters++;
4337 }
4338 }
4339
4340 if (dump_enabled_p ())
4341 dump_printf (MSG_NOTE,
4342 " Calculated minimum iters for profitability: %d\n",
4343 min_profitable_iters);
4344
4345 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4346 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4347 /* We want the vectorized loop to execute at least once. */
4348 min_profitable_iters = assumed_vf + peel_iters_prologue;
4349 else if (min_profitable_iters < peel_iters_prologue)
4350 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4351 vectorized loop executes at least once. */
4352 min_profitable_iters = peel_iters_prologue;
4353
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_NOTE, vect_location,
4356 " Runtime profitability threshold = %d\n",
4357 min_profitable_iters);
4358
4359 *ret_min_profitable_niters = min_profitable_iters;
4360
4361 /* Calculate number of iterations required to make the vector version
4362 profitable, relative to the loop bodies only.
4363
4364 The non-vectorized variant costs SIC * niters and must win over the
4365 vector variant for the expected loop trip count; the following must hold:
4366 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4367
4368 if (vec_outside_cost <= 0)
4369 min_profitable_estimate = 0;
4370 /* ??? This "else if" arm is written to handle all cases; see below for
4371 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4372 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4373 {
4374 /* This is a repeat of the code above, but with + SOC rather
4375 than - SOC. */
4376 int outside_overhead = (vec_outside_cost
4377 - scalar_single_iter_cost * peel_iters_prologue
4378 - scalar_single_iter_cost * peel_iters_epilogue
4379 + scalar_outside_cost);
4380 int min_vec_niters = 1;
4381 if (outside_overhead > 0)
4382 min_vec_niters = outside_overhead / saving_per_viter + 1;
4383
4384 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4385 {
4386 int threshold = (vec_inside_cost * min_vec_niters
4387 + vec_outside_cost
4388 + scalar_outside_cost);
4389 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4390 }
4391 else
4392 min_profitable_estimate = (min_vec_niters * assumed_vf
4393 + peel_iters_prologue
4394 + peel_iters_epilogue);
4395 }
4396 else
4397 {
4398 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4399 * assumed_vf
4400 - vec_inside_cost * peel_iters_prologue
4401 - vec_inside_cost * peel_iters_epilogue)
4402 / ((scalar_single_iter_cost * assumed_vf)
4403 - vec_inside_cost);
4404 }
4405 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4406 if (dump_enabled_p ())
4407 dump_printf_loc (MSG_NOTE, vect_location,
4408 " Static estimate profitability threshold = %d\n",
4409 min_profitable_estimate);
4410
4411 *ret_min_profitable_estimate = min_profitable_estimate;
4412 }
4413
4414 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4415 vector elements (not bits) for a vector with NELT elements. */
4416 static void
4417 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4418 vec_perm_builder *sel)
4419 {
4420 /* The encoding is a single stepped pattern. Any wrap-around is handled
4421 by vec_perm_indices. */
4422 sel->new_vector (nelt, 1, 3);
4423 for (unsigned int i = 0; i < 3; i++)
4424 sel->quick_push (i + offset);
4425 }
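/* For example, OFFSET = 2 with NELT = 8 encodes the pattern {2, 3, 4},
   which vec_perm_indices extrapolates to the full selector
   {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select elements from the
   second input vector of the permutation.  */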
4426
4427 /* Checks whether the target supports whole-vector shifts for vectors of mode
4428 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4429 it supports vec_perm_const with masks for all necessary shift amounts. */
4430 static bool
4431 have_whole_vector_shift (machine_mode mode)
4432 {
4433 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4434 return true;
4435
4436 /* Variable-length vectors should be handled via the optab. */
4437 unsigned int nelt;
4438 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4439 return false;
4440
4441 vec_perm_builder sel;
4442 vec_perm_indices indices;
4443 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4444 {
4445 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4446 indices.new_vector (sel, 2, nelt);
4447 if (!can_vec_perm_const_p (mode, indices, false))
4448 return false;
4449 }
4450 return true;
4451 }
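/* E.g. for a constant 8-element vector mode without vec_shr support the
   loop above requires vec_perm_const to handle shifts by 4, 2 and 1
   elements, the shift amounts used by the log2-style final reduction
   costed in vect_model_reduction_cost below.  */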
4452
4453 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4454 functions. Design better to avoid maintenance issues. */
4455
4456 /* Function vect_model_reduction_cost.
4457
4458 Models cost for a reduction operation, including the vector ops
4459 generated within the strip-mine loop in some cases, the initial
4460 definition before the loop, and the epilogue code that must be generated. */
4461
4462 static void
4463 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4464 stmt_vec_info stmt_info, internal_fn reduc_fn,
4465 vect_reduction_type reduction_type,
4466 int ncopies, stmt_vector_for_cost *cost_vec)
4467 {
4468 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4469 enum tree_code code;
4470 optab optab;
4471 tree vectype;
4472 machine_mode mode;
4473 class loop *loop = NULL;
4474
4475 if (loop_vinfo)
4476 loop = LOOP_VINFO_LOOP (loop_vinfo);
4477
4478 /* Condition reductions generate two reductions in the loop. */
4479 if (reduction_type == COND_REDUCTION)
4480 ncopies *= 2;
4481
4482 vectype = STMT_VINFO_VECTYPE (stmt_info);
4483 mode = TYPE_MODE (vectype);
4484 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4485
4486 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4487
4488 if (reduction_type == EXTRACT_LAST_REDUCTION)
4489 /* No extra instructions are needed in the prologue. The loop body
4490 operations are costed in vectorizable_condition. */
4491 inside_cost = 0;
4492 else if (reduction_type == FOLD_LEFT_REDUCTION)
4493 {
4494 /* No extra instructions needed in the prologue. */
4495 prologue_cost = 0;
4496
4497 if (reduc_fn != IFN_LAST)
4498 /* Count one reduction-like operation per vector. */
4499 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4500 stmt_info, 0, vect_body);
4501 else
4502 {
4503 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4504 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4505 inside_cost = record_stmt_cost (cost_vec, nelements,
4506 vec_to_scalar, stmt_info, 0,
4507 vect_body);
4508 inside_cost += record_stmt_cost (cost_vec, nelements,
4509 scalar_stmt, stmt_info, 0,
4510 vect_body);
4511 }
4512 }
4513 else
4514 {
4515 /* Add in cost for initial definition.
4516 For cond reduction we have four vectors: initial index, step,
4517 initial result of the data reduction, initial value of the index
4518 reduction. */
4519 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4520 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4521 scalar_to_vec, stmt_info, 0,
4522 vect_prologue);
4523 }
4524
4525 /* Determine cost of epilogue code.
4526
4527 We have a reduction operator that will reduce the vector in one statement.
4528 Also requires scalar extract. */
4529
4530 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4531 {
4532 if (reduc_fn != IFN_LAST)
4533 {
4534 if (reduction_type == COND_REDUCTION)
4535 {
4536 /* An EQ stmt and a COND_EXPR stmt. */
4537 epilogue_cost += record_stmt_cost (cost_vec, 2,
4538 vector_stmt, stmt_info, 0,
4539 vect_epilogue);
4540 /* Reduction of the max index and a reduction of the found
4541 values. */
4542 epilogue_cost += record_stmt_cost (cost_vec, 2,
4543 vec_to_scalar, stmt_info, 0,
4544 vect_epilogue);
4545 /* A broadcast of the max value. */
4546 epilogue_cost += record_stmt_cost (cost_vec, 1,
4547 scalar_to_vec, stmt_info, 0,
4548 vect_epilogue);
4549 }
4550 else
4551 {
4552 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4553 stmt_info, 0, vect_epilogue);
4554 epilogue_cost += record_stmt_cost (cost_vec, 1,
4555 vec_to_scalar, stmt_info, 0,
4556 vect_epilogue);
4557 }
4558 }
4559 else if (reduction_type == COND_REDUCTION)
4560 {
4561 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4562 /* Extraction of scalar elements. */
4563 epilogue_cost += record_stmt_cost (cost_vec,
4564 2 * estimated_nunits,
4565 vec_to_scalar, stmt_info, 0,
4566 vect_epilogue);
4567 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4568 epilogue_cost += record_stmt_cost (cost_vec,
4569 2 * estimated_nunits - 3,
4570 scalar_stmt, stmt_info, 0,
4571 vect_epilogue);
4572 }
4573 else if (reduction_type == EXTRACT_LAST_REDUCTION
4574 || reduction_type == FOLD_LEFT_REDUCTION)
4575 /* No extra instructions are needed in the epilogue. */
4576 ;
4577 else
4578 {
4579 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4580 tree bitsize =
4581 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4582 int element_bitsize = tree_to_uhwi (bitsize);
4583 int nelements = vec_size_in_bits / element_bitsize;
4584
4585 if (code == COND_EXPR)
4586 code = MAX_EXPR;
4587
4588 optab = optab_for_tree_code (code, vectype, optab_default);
4589
4590 /* We have a whole vector shift available. */
4591 if (optab != unknown_optab
4592 && VECTOR_MODE_P (mode)
4593 && optab_handler (optab, mode) != CODE_FOR_nothing
4594 && have_whole_vector_shift (mode))
4595 {
4596 /* Final reduction via vector shifts and the reduction operator.
4597 Also requires scalar extract. */
4598 epilogue_cost += record_stmt_cost (cost_vec,
4599 exact_log2 (nelements) * 2,
4600 vector_stmt, stmt_info, 0,
4601 vect_epilogue);
4602 epilogue_cost += record_stmt_cost (cost_vec, 1,
4603 vec_to_scalar, stmt_info, 0,
4604 vect_epilogue);
4605 }
4606 else
4607 /* Use extracts and reduction op for final reduction. For N
4608 elements, we have N extracts and N-1 reduction ops. */
4609 epilogue_cost += record_stmt_cost (cost_vec,
4610 nelements + nelements - 1,
4611 vector_stmt, stmt_info, 0,
4612 vect_epilogue);
4613 }
4614 }
4615
4616 if (dump_enabled_p ())
4617 dump_printf (MSG_NOTE,
4618 "vect_model_reduction_cost: inside_cost = %d, "
4619 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4620 prologue_cost, epilogue_cost);
4621 }
4622
4623
4624
4625 /* Function get_initial_def_for_reduction
4626
4627 Input:
4628 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4629 INIT_VAL - the initial value of the reduction variable
4630
4631 Output:
4632 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4633 of the reduction (used for adjusting the epilog - see below).
4634 Return a vector variable, initialized according to the operation that
4635 STMT_VINFO performs. This vector will be used as the initial value
4636 of the vector of partial results.
4637
4638 Option1 (adjust in epilog): Initialize the vector as follows:
4639 add/bit or/xor: [0,0,...,0,0]
4640 mult/bit and: [1,1,...,1,1]
4641 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4642 and when necessary (e.g. add/mult case) let the caller know
4643 that it needs to adjust the result by init_val.
4644
4645 Option2: Initialize the vector as follows:
4646 add/bit or/xor: [init_val,0,0,...,0]
4647 mult/bit and: [init_val,1,1,...,1]
4648 min/max/cond_expr: [init_val,init_val,...,init_val]
4649 and no adjustments are needed.
4650
4651 For example, for the following code:
4652
4653 s = init_val;
4654 for (i=0;i<n;i++)
4655 s = s + a[i];
4656
4657 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4658 For a vector of 4 units, we want to return either [0,0,0,init_val],
4659 or [0,0,0,0] and let the caller know that it needs to adjust
4660 the result at the end by 'init_val'.
4661
4662 FORNOW, we use Option1 (the 'adjust in epilog' scheme) if ADJUSTMENT_DEF
4663 is not NULL, because this way the initialization vector is simpler
4664 (same element in all entries), and Option2 otherwise.
4665
4666 A cost model should help decide between these two schemes. */
4667
4668 static tree
4669 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4670 stmt_vec_info stmt_vinfo,
4671 enum tree_code code, tree init_val,
4672 tree *adjustment_def)
4673 {
4674 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4675 tree scalar_type = TREE_TYPE (init_val);
4676 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4677 tree def_for_init;
4678 tree init_def;
4679 REAL_VALUE_TYPE real_init_val = dconst0;
4680 int int_init_val = 0;
4681 gimple_seq stmts = NULL;
4682
4683 gcc_assert (vectype);
4684
4685 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4686 || SCALAR_FLOAT_TYPE_P (scalar_type));
4687
4688 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4689 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4690
4691 /* ADJUSTMENT_DEF is NULL when called from
4692 vect_create_epilog_for_reduction to vectorize double reduction. */
4693 if (adjustment_def)
4694 *adjustment_def = NULL;
4695
4696 switch (code)
4697 {
4698 case WIDEN_SUM_EXPR:
4699 case DOT_PROD_EXPR:
4700 case SAD_EXPR:
4701 case PLUS_EXPR:
4702 case MINUS_EXPR:
4703 case BIT_IOR_EXPR:
4704 case BIT_XOR_EXPR:
4705 case MULT_EXPR:
4706 case BIT_AND_EXPR:
4707 {
4708 if (code == MULT_EXPR)
4709 {
4710 real_init_val = dconst1;
4711 int_init_val = 1;
4712 }
4713
4714 if (code == BIT_AND_EXPR)
4715 int_init_val = -1;
4716
4717 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4718 def_for_init = build_real (scalar_type, real_init_val);
4719 else
4720 def_for_init = build_int_cst (scalar_type, int_init_val);
4721
4722 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4723 {
4724 /* Option1: the first element is '0' or '1' as well. */
4725 if (!operand_equal_p (def_for_init, init_val, 0))
4726 *adjustment_def = init_val;
4727 init_def = gimple_build_vector_from_val (&stmts, vectype,
4728 def_for_init);
4729 }
4730 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4731 {
4732 /* Option2 (variable length): the first element is INIT_VAL. */
4733 init_def = gimple_build_vector_from_val (&stmts, vectype,
4734 def_for_init);
4735 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4736 vectype, init_def, init_val);
4737 }
4738 else
4739 {
4740 /* Option2: the first element is INIT_VAL. */
4741 tree_vector_builder elts (vectype, 1, 2);
4742 elts.quick_push (init_val);
4743 elts.quick_push (def_for_init);
4744 init_def = gimple_build_vector (&stmts, &elts);
4745 }
4746 }
4747 break;
4748
4749 case MIN_EXPR:
4750 case MAX_EXPR:
4751 case COND_EXPR:
4752 {
4753 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4754 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4755 }
4756 break;
4757
4758 default:
4759 gcc_unreachable ();
4760 }
4761
4762 if (stmts)
4763 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4764 return init_def;
4765 }
4766
4767 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4768 NUMBER_OF_VECTORS is the number of vector defs to create.
4769 If NEUTRAL_OP is nonnull, introducing extra elements of that
4770 value will not change the result. */
4771
4772 static void
4773 get_initial_defs_for_reduction (vec_info *vinfo,
4774 slp_tree slp_node,
4775 vec<tree> *vec_oprnds,
4776 unsigned int number_of_vectors,
4777 bool reduc_chain, tree neutral_op)
4778 {
4779 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4780 stmt_vec_info stmt_vinfo = stmts[0];
4781 unsigned HOST_WIDE_INT nunits;
4782 unsigned j, number_of_places_left_in_vector;
4783 tree vector_type;
4784 unsigned int group_size = stmts.length ();
4785 unsigned int i;
4786 class loop *loop;
4787
4788 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4789
4790 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4791
4792 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4793 gcc_assert (loop);
4794 edge pe = loop_preheader_edge (loop);
4795
4796 gcc_assert (!reduc_chain || neutral_op);
4797
4798 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4799 created vectors. It is greater than 1 if unrolling is performed.
4800
4801 For example, we have two scalar operands, s1 and s2 (e.g., group of
4802 strided accesses of size two), while NUNITS is four (i.e., four scalars
4803 of this type can be packed in a vector). The output vector will contain
4804 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4805 will be 2).
4806
4807 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4808 vectors containing the operands.
4809
4810 For example, NUNITS is four as before, and the group size is 8
4811 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4812 {s5, s6, s7, s8}. */
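  /* When NEUTRAL_OP is nonnull the extra elements do not change the result;
     e.g. for a non-chained group of two PLUS reductions with NUNITS == 4 the
     first vector built below is {init_a, init_b, 0, 0} (init_a and init_b
     being the scalar initial values from the PHIs) and every further vector
     is entirely the neutral value.  */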
4813
4814 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4815 nunits = group_size;
4816
4817 number_of_places_left_in_vector = nunits;
4818 bool constant_p = true;
4819 tree_vector_builder elts (vector_type, nunits, 1);
4820 elts.quick_grow (nunits);
4821 gimple_seq ctor_seq = NULL;
4822 for (j = 0; j < nunits * number_of_vectors; ++j)
4823 {
4824 tree op;
4825 i = j % group_size;
4826 stmt_vinfo = stmts[i];
4827
4828 /* Get the def before the loop.  In a reduction chain we have only
4829 one initial value; otherwise we have as many as there are PHIs in the group. */
4830 if (reduc_chain)
4831 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4832 else if (((vec_oprnds->length () + 1) * nunits
4833 - number_of_places_left_in_vector >= group_size)
4834 && neutral_op)
4835 op = neutral_op;
4836 else
4837 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4838
4839 /* Create 'vect_ = {op0,op1,...,opn}'. */
4840 number_of_places_left_in_vector--;
4841 elts[nunits - number_of_places_left_in_vector - 1] = op;
4842 if (!CONSTANT_CLASS_P (op))
4843 constant_p = false;
4844
4845 if (number_of_places_left_in_vector == 0)
4846 {
4847 tree init;
4848 if (constant_p && !neutral_op
4849 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4850 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4851 /* Build the vector directly from ELTS. */
4852 init = gimple_build_vector (&ctor_seq, &elts);
4853 else if (neutral_op)
4854 {
4855 /* Build a vector of the neutral value and shift the
4856 other elements into place. */
4857 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4858 neutral_op);
4859 int k = nunits;
4860 while (k > 0 && elts[k - 1] == neutral_op)
4861 k -= 1;
4862 while (k > 0)
4863 {
4864 k -= 1;
4865 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4866 vector_type, init, elts[k]);
4867 }
4868 }
4869 else
4870 {
4871 /* First time round, duplicate ELTS to fill the
4872 required number of vectors. */
4873 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4874 number_of_vectors, *vec_oprnds);
4875 break;
4876 }
4877 vec_oprnds->quick_push (init);
4878
4879 number_of_places_left_in_vector = nunits;
4880 elts.new_vector (vector_type, nunits, 1);
4881 elts.quick_grow (nunits);
4882 constant_p = true;
4883 }
4884 }
4885 if (ctor_seq != NULL)
4886 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4887 }
4888
4889 /* For a statement STMT_INFO taking part in a reduction operation return
4890 the stmt_vec_info the meta information is stored on. */
4891
4892 stmt_vec_info
4893 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4894 {
4895 stmt_info = vect_orig_stmt (stmt_info);
4896 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4897 if (!is_a <gphi *> (stmt_info->stmt)
4898 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4899 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4900 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4901 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4902 {
4903 if (gimple_phi_num_args (phi) == 1)
4904 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4905 }
4906 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4907 {
4908 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4909 stmt_vec_info info
4910 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4911 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4912 stmt_info = info;
4913 }
4914 return stmt_info;
4915 }
4916
4917 /* Function vect_create_epilog_for_reduction
4918
4919 Create code at the loop-epilog to finalize the result of a reduction
4920 computation.
4921
4922 STMT_INFO is the scalar reduction stmt that is being vectorized.
4923 SLP_NODE is an SLP node containing a group of reduction statements. The
4924 first one in this group is STMT_INFO.
4925 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4926 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4927 (counting from 0)
4928
4929 This function:
4930 1. Completes the reduction def-use cycles.
4931 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4932 by calling the function specified by REDUC_FN if available, or by
4933 other means (whole-vector shifts or a scalar loop).
4934 The function also creates a new phi node at the loop exit to preserve
4935 loop-closed form, as illustrated below.
4936
4937 The flow at the entry to this function:
4938
4939 loop:
4940 vec_def = phi <vec_init, null> # REDUCTION_PHI
4941 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4942 s_loop = scalar_stmt # (scalar) STMT_INFO
4943 loop_exit:
4944 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4945 use <s_out0>
4946 use <s_out0>
4947
4948 The above is transformed by this function into:
4949
4950 loop:
4951 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4952 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4953 s_loop = scalar_stmt # (scalar) STMT_INFO
4954 loop_exit:
4955 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4956 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4957 v_out2 = reduce <v_out1>
4958 s_out3 = extract_field <v_out2, 0>
4959 s_out4 = adjust_result <s_out3>
4960 use <s_out4>
4961 use <s_out4>
4962 */
4963
4964 static void
4965 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4966 stmt_vec_info stmt_info,
4967 slp_tree slp_node,
4968 slp_instance slp_node_instance)
4969 {
4970 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4971 gcc_assert (reduc_info->is_reduc_info);
4972 /* For double reductions we need to get at the inner loop reduction
4973 stmt which has the meta info attached. Our stmt_info is that of the
4974 loop-closed PHI of the inner loop which we remember as
4975 def for the reduction PHI generation. */
4976 bool double_reduc = false;
4977 stmt_vec_info rdef_info = stmt_info;
4978 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4979 {
4980 gcc_assert (!slp_node);
4981 double_reduc = true;
4982 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4983 (stmt_info->stmt, 0));
4984 stmt_info = vect_stmt_to_vectorize (stmt_info);
4985 }
4986 gphi *reduc_def_stmt
4987 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4988 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4989 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4990 tree vectype;
4991 machine_mode mode;
4992 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4993 basic_block exit_bb;
4994 tree scalar_dest;
4995 tree scalar_type;
4996 gimple *new_phi = NULL, *phi;
4997 gimple_stmt_iterator exit_gsi;
4998 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4999 gimple *epilog_stmt = NULL;
5000 gimple *exit_phi;
5001 tree bitsize;
5002 tree def;
5003 tree orig_name, scalar_result;
5004 imm_use_iterator imm_iter, phi_imm_iter;
5005 use_operand_p use_p, phi_use_p;
5006 gimple *use_stmt;
5007 bool nested_in_vect_loop = false;
5008 auto_vec<gimple *> new_phis;
5009 int j, i;
5010 auto_vec<tree> scalar_results;
5011 unsigned int group_size = 1, k;
5012 auto_vec<gimple *> phis;
5013 bool slp_reduc = false;
5014 bool direct_slp_reduc;
5015 tree new_phi_result;
5016 tree induction_index = NULL_TREE;
5017
5018 if (slp_node)
5019 group_size = SLP_TREE_LANES (slp_node);
5020
5021 if (nested_in_vect_loop_p (loop, stmt_info))
5022 {
5023 outer_loop = loop;
5024 loop = loop->inner;
5025 nested_in_vect_loop = true;
5026 gcc_assert (!slp_node);
5027 }
5028 gcc_assert (!nested_in_vect_loop || double_reduc);
5029
5030 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5031 gcc_assert (vectype);
5032 mode = TYPE_MODE (vectype);
5033
5034 tree initial_def = NULL;
5035 tree induc_val = NULL_TREE;
5036 tree adjustment_def = NULL;
5037 if (slp_node)
5038 ;
5039 else
5040 {
5041 /* Get at the scalar def before the loop, that defines the initial value
5042 of the reduction variable. */
5043 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5044 loop_preheader_edge (loop));
5045 /* Optimize: for induction condition reduction, if we can't use zero
5046 for induc_val, use initial_def. */
5047 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5048 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5049 else if (double_reduc)
5050 ;
5051 else if (nested_in_vect_loop)
5052 ;
5053 else
5054 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5055 }
5056
5057 unsigned vec_num;
5058 int ncopies;
5059 if (slp_node)
5060 {
5061 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5062 ncopies = 1;
5063 }
5064 else
5065 {
5066 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5067 vec_num = 1;
5068 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5069 }
5070
5071 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5072 which is updated with the current index of the loop for every match of
5073 the original loop's cond_expr (VEC_STMT). This results in a vector
5074 containing the last time the condition passed for that vector lane.
5075 The first match will be a 1 to allow 0 to be used for non-matching
5076 indexes. If there are no matches at all then the vector will be all
5077 zeroes.
5078
5079 PR92772: This algorithm is broken for architectures that support
5080 masked vectors, but do not provide fold_extract_last. */
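  /* For example, with four lanes, if the condition holds only in scalar
     iterations 2 and 7 (counting from 1), the index vector evolves as
     {0,0,0,0} -> {0,2,0,0} -> {0,2,7,0}: each lane records the last
     iteration in which its condition matched, or 0 if it never did.  */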
5081 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5082 {
5083 auto_vec<std::pair<tree, bool>, 2> ccompares;
5084 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5085 cond_info = vect_stmt_to_vectorize (cond_info);
5086 while (cond_info != reduc_info)
5087 {
5088 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5089 {
5090 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5091 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5092 ccompares.safe_push
5093 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5094 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5095 }
5096 cond_info
5097 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5098 1 + STMT_VINFO_REDUC_IDX
5099 (cond_info)));
5100 cond_info = vect_stmt_to_vectorize (cond_info);
5101 }
5102 gcc_assert (ccompares.length () != 0);
5103
5104 tree indx_before_incr, indx_after_incr;
5105 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5106 int scalar_precision
5107 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5108 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5109 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5110 (TYPE_MODE (vectype), cr_index_scalar_type,
5111 TYPE_VECTOR_SUBPARTS (vectype));
5112
5113 /* First we create a simple vector induction variable which starts
5114 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5115 vector size (STEP). */
5116
5117 /* Create a {1,2,3,...} vector. */
5118 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5119
5120 /* Create a vector of the step value. */
5121 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5122 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5123
5124 /* Create an induction variable. */
5125 gimple_stmt_iterator incr_gsi;
5126 bool insert_after;
5127 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5128 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5129 insert_after, &indx_before_incr, &indx_after_incr);
5130
5131 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5132 filled with zeros (VEC_ZERO). */
5133
5134 /* Create a vector of 0s. */
5135 tree zero = build_zero_cst (cr_index_scalar_type);
5136 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5137
5138 /* Create a vector phi node. */
5139 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5140 new_phi = create_phi_node (new_phi_tree, loop->header);
5141 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5142 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5143
5144 /* Now take the condition from the loop's original cond_exprs
5145 and produce new cond_exprs (INDEX_COND_EXPR) which for
5146 every match uses values from the induction variable
5147 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5148 (NEW_PHI_TREE).
5149 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5150 the new cond_expr (INDEX_COND_EXPR). */
5151 gimple_seq stmts = NULL;
5152 for (int i = ccompares.length () - 1; i != -1; --i)
5153 {
5154 tree ccompare = ccompares[i].first;
5155 if (ccompares[i].second)
5156 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5157 cr_index_vector_type,
5158 ccompare,
5159 indx_before_incr, new_phi_tree);
5160 else
5161 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5162 cr_index_vector_type,
5163 ccompare,
5164 new_phi_tree, indx_before_incr);
5165 }
5166 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5167
5168 /* Update the phi with the vec cond. */
5169 induction_index = new_phi_tree;
5170 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5171 loop_latch_edge (loop), UNKNOWN_LOCATION);
5172 }
5173
5174 /* 2. Create epilog code.
5175 The reduction epilog code operates across the elements of the vector
5176 of partial results computed by the vectorized loop.
5177 The reduction epilog code consists of:
5178
5179 step 1: compute the scalar result in a vector (v_out2)
5180 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5181 step 3: adjust the scalar result (s_out3) if needed.
5182
5183 Step 1 can be accomplished using one of the following three schemes:
5184 (scheme 1) using reduc_fn, if available.
5185 (scheme 2) using whole-vector shifts, if available.
5186 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5187 combined.
5188
5189 The overall epilog code looks like this:
5190
5191 s_out0 = phi <s_loop> # original EXIT_PHI
5192 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5193 v_out2 = reduce <v_out1> # step 1
5194 s_out3 = extract_field <v_out2, 0> # step 2
5195 s_out4 = adjust_result <s_out3> # step 3
5196
5197 (step 3 is optional, and steps 1 and 2 may be combined).
5198 Lastly, the uses of s_out0 are replaced by s_out4. */
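  /* For instance, for a four-lane integer sum with v_out1 = {1, 2, 3, 4},
     scheme 1 emits a single REDUC_FN call producing 10, scheme 2 performs
     two shift-and-add steps before extracting element zero, and scheme 3
     extracts and adds the four elements with scalar code.  */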
5199
5200
5201 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5202 v_out1 = phi <VECT_DEF>
5203 Store them in NEW_PHIS. */
5204 if (double_reduc)
5205 loop = outer_loop;
5206 exit_bb = single_exit (loop)->dest;
5207 new_phis.create (slp_node ? vec_num : ncopies);
5208 for (unsigned i = 0; i < vec_num; i++)
5209 {
5210 if (slp_node)
5211 def = vect_get_slp_vect_def (slp_node, i);
5212 else
5213 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5214 for (j = 0; j < ncopies; j++)
5215 {
5216 tree new_def = copy_ssa_name (def);
5217 phi = create_phi_node (new_def, exit_bb);
5218 if (j == 0)
5219 new_phis.quick_push (phi);
5220 else
5221 {
5222 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5223 new_phis.quick_push (phi);
5224 }
5225
5226 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5227 }
5228 }
5229
5230 exit_gsi = gsi_after_labels (exit_bb);
5231
5232 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5233 (i.e. when reduc_fn is not available) and in the final adjustment
5234 code (if needed). Also get the original scalar reduction variable as
5235 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5236 represents a reduction pattern), the tree-code and scalar-def are
5237 taken from the original stmt that the pattern-stmt (STMT) replaces.
5238 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5239 are taken from STMT. */
5240
5241 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5242 if (orig_stmt_info != stmt_info)
5243 {
5244 /* Reduction pattern */
5245 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5246 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5247 }
5248
5249 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5250 scalar_type = TREE_TYPE (scalar_dest);
5251 scalar_results.create (group_size);
5252 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5253 bitsize = TYPE_SIZE (scalar_type);
5254
5255 /* SLP reduction without reduction chain, e.g.,
5256 # a1 = phi <a2, a0>
5257 # b1 = phi <b2, b0>
5258 a2 = operation (a1)
5259 b2 = operation (b1) */
5260 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5261
5262 /* True if we should implement SLP_REDUC using native reduction operations
5263 instead of scalar operations. */
5264 direct_slp_reduc = (reduc_fn != IFN_LAST
5265 && slp_reduc
5266 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5267
5268 /* In case of reduction chain, e.g.,
5269 # a1 = phi <a3, a0>
5270 a2 = operation (a1)
5271 a3 = operation (a2),
5272
5273 we may end up with more than one vector result. Here we reduce them to
5274 one vector. */
5275 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5276 {
5277 gimple_seq stmts = NULL;
5278 tree first_vect = PHI_RESULT (new_phis[0]);
5279 first_vect = gimple_convert (&stmts, vectype, first_vect);
5280 for (k = 1; k < new_phis.length (); k++)
5281 {
5282 gimple *next_phi = new_phis[k];
5283 tree second_vect = PHI_RESULT (next_phi);
5284 second_vect = gimple_convert (&stmts, vectype, second_vect);
5285 first_vect = gimple_build (&stmts, code, vectype,
5286 first_vect, second_vect);
5287 }
5288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5289
5290 new_phi_result = first_vect;
5291 new_phis.truncate (0);
5292 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5293 }
5294 /* Likewise if we couldn't use a single def-use cycle. */
5295 else if (ncopies > 1)
5296 {
5297 gimple_seq stmts = NULL;
5298 tree first_vect = PHI_RESULT (new_phis[0]);
5299 first_vect = gimple_convert (&stmts, vectype, first_vect);
5300 for (int k = 1; k < ncopies; ++k)
5301 {
5302 tree second_vect = PHI_RESULT (new_phis[k]);
5303 second_vect = gimple_convert (&stmts, vectype, second_vect);
5304 first_vect = gimple_build (&stmts, code, vectype,
5305 first_vect, second_vect);
5306 }
5307 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5308 new_phi_result = first_vect;
5309 new_phis.truncate (0);
5310 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5311 }
5312 else
5313 new_phi_result = PHI_RESULT (new_phis[0]);
5314
5315 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5316 && reduc_fn != IFN_LAST)
5317 {
5318 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5319 various data values where the condition matched and another vector
5320 (INDUCTION_INDEX) containing all the indexes of those matches. We
5321 need to extract the last matching index (which will be the index with
5322 highest value) and use this to index into the data vector.
5323 For the case where there were no matches, the data vector will contain
5324 all default values and the index vector will be all zeros. */
5325
5326 /* Get various versions of the type of the vector of indexes. */
5327 tree index_vec_type = TREE_TYPE (induction_index);
5328 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5329 tree index_scalar_type = TREE_TYPE (index_vec_type);
5330 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5331
5332 /* Get an unsigned integer version of the type of the data vector. */
5333 int scalar_precision
5334 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5335 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5336 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5337 vectype);
5338
5339 /* First we need to create a vector (ZERO_VEC) of zeros and another
5340 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5341 can create using a MAX reduction and then expanding.
5342 In the case where the loop never made any matches, the max index will
5343 be zero. */
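      /* Continuing the earlier example with INDUCTION_INDEX = {0,2,7,0}:
	 the IFN_REDUC_MAX below yields 7, which is broadcast to {7,7,7,7};
	 the EQ_EXPR compare then selects lane 2, the VEC_COND_EXPR keeps
	 only that lane of NEW_PHI_RESULT (all other lanes become 0), and
	 the final IFN_REDUC_MAX over the unsigned view extracts the data
	 value of the last match.  */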
5344
5345 /* Vector of {0, 0, 0,...}. */
5346 tree zero_vec = build_zero_cst (vectype);
5347
5348 gimple_seq stmts = NULL;
5349 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5350 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5351
5352 /* Find maximum value from the vector of found indexes. */
5353 tree max_index = make_ssa_name (index_scalar_type);
5354 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5355 1, induction_index);
5356 gimple_call_set_lhs (max_index_stmt, max_index);
5357 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5358
5359 /* Vector of {max_index, max_index, max_index,...}. */
5360 tree max_index_vec = make_ssa_name (index_vec_type);
5361 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5362 max_index);
5363 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5364 max_index_vec_rhs);
5365 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5366
5367 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5368 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5369 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5370 otherwise. Only one value should match, resulting in a vector
5371 (VEC_COND) with one data value and the rest zeros.
5372 In the case where the loop never made any matches, every index will
5373 match, resulting in a vector with all data values (which will all be
5374 the default value). */
5375
5376 /* Compare the max index vector to the vector of found indexes to find
5377 the position of the max value. */
5378 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5379 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5380 induction_index,
5381 max_index_vec);
5382 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5383
5384 /* Use the compare to choose either values from the data vector or
5385 zero. */
5386 tree vec_cond = make_ssa_name (vectype);
5387 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5388 vec_compare, new_phi_result,
5389 zero_vec);
5390 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5391
5392 /* Finally we need to extract the data value from the vector (VEC_COND)
5393 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5394 reduction, but because this doesn't exist, we can use a MAX reduction
5395 instead. The data value might be signed or a float so we need to cast
5396 it first.
5397 In the case where the loop never made any matches, the data values are
5398 all identical, and so will reduce down correctly. */
5399
5400 /* Make the matched data values unsigned. */
5401 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5402 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5403 vec_cond);
5404 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5405 VIEW_CONVERT_EXPR,
5406 vec_cond_cast_rhs);
5407 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5408
5409 /* Reduce down to a scalar value. */
5410 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5411 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5412 1, vec_cond_cast);
5413 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5414 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5415
5416 /* Convert the reduced value back to the result type and set as the
5417 result. */
5418 stmts = NULL;
5419 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5420 data_reduc);
5421 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5422 scalar_results.safe_push (new_temp);
5423 }
5424 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5425 && reduc_fn == IFN_LAST)
5426 {
5427 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5428 idx = 0;
5429 idx_val = induction_index[0];
5430 val = data_reduc[0];
5431 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5432 if (induction_index[i] > idx_val)
5433 val = data_reduc[i], idx_val = induction_index[i];
5434 return val; */
5435
5436 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5437 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5438 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5439 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5440 /* Enforced by vectorizable_reduction, which ensures we have target
5441 support before allowing a conditional reduction on variable-length
5442 vectors. */
5443 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5444 tree idx_val = NULL_TREE, val = NULL_TREE;
5445 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5446 {
5447 tree old_idx_val = idx_val;
5448 tree old_val = val;
5449 idx_val = make_ssa_name (idx_eltype);
5450 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5451 build3 (BIT_FIELD_REF, idx_eltype,
5452 induction_index,
5453 bitsize_int (el_size),
5454 bitsize_int (off)));
5455 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5456 val = make_ssa_name (data_eltype);
5457 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5458 build3 (BIT_FIELD_REF,
5459 data_eltype,
5460 new_phi_result,
5461 bitsize_int (el_size),
5462 bitsize_int (off)));
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464 if (off != 0)
5465 {
5466 tree new_idx_val = idx_val;
5467 if (off != v_size - el_size)
5468 {
5469 new_idx_val = make_ssa_name (idx_eltype);
5470 epilog_stmt = gimple_build_assign (new_idx_val,
5471 MAX_EXPR, idx_val,
5472 old_idx_val);
5473 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5474 }
5475 tree new_val = make_ssa_name (data_eltype);
5476 epilog_stmt = gimple_build_assign (new_val,
5477 COND_EXPR,
5478 build2 (GT_EXPR,
5479 boolean_type_node,
5480 idx_val,
5481 old_idx_val),
5482 val, old_val);
5483 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5484 idx_val = new_idx_val;
5485 val = new_val;
5486 }
5487 }
5488 /* Convert the reduced value back to the result type and set as the
5489 result. */
5490 gimple_seq stmts = NULL;
5491 val = gimple_convert (&stmts, scalar_type, val);
5492 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5493 scalar_results.safe_push (val);
5494 }
5495
5496 /* 2.3 Create the reduction code, using one of the three schemes described
5497 above. In SLP we simply need to extract all the elements from the
5498 vector (without reducing them), so we use scalar shifts. */
5499 else if (reduc_fn != IFN_LAST && !slp_reduc)
5500 {
5501 tree tmp;
5502 tree vec_elem_type;
5503
5504 /* Case 1: Create:
5505 v_out2 = reduc_expr <v_out1> */
5506
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_NOTE, vect_location,
5509 "Reduce using direct vector reduction.\n");
5510
5511 gimple_seq stmts = NULL;
5512 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5513 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5514 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5515 vec_elem_type, new_phi_result);
5516 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5517 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5518
5519 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5520 && induc_val)
5521 {
5522 /* Earlier we set the initial value to be a vector of induc_val
5523 values. Check the result and if it is induc_val then replace
5524 with the original initial value, unless induc_val is
5525 the same as initial_def already. */
5526 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5527 induc_val);
5528
5529 tmp = make_ssa_name (new_scalar_dest);
5530 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5531 initial_def, new_temp);
5532 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5533 new_temp = tmp;
5534 }
5535
5536 scalar_results.safe_push (new_temp);
5537 }
5538 else if (direct_slp_reduc)
5539 {
5540 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5541 with the elements for other SLP statements replaced with the
5542 neutral value. We can then do a normal reduction on each vector. */
5543
5544 /* Enforced by vectorizable_reduction. */
5545 gcc_assert (new_phis.length () == 1);
5546 gcc_assert (pow2p_hwi (group_size));
5547
5548 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5549 vec<stmt_vec_info> orig_phis
5550 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5551 gimple_seq seq = NULL;
5552
5553 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5554 and the same element size as VECTYPE. */
5555 tree index = build_index_vector (vectype, 0, 1);
5556 tree index_type = TREE_TYPE (index);
5557 tree index_elt_type = TREE_TYPE (index_type);
5558 tree mask_type = truth_type_for (index_type);
5559
5560 /* Create a vector that, for each element, identifies which of
5561 the REDUC_GROUP_SIZE results should use it. */
5562 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5563 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5564 build_vector_from_val (index_type, index_mask));
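	  /* For instance, with REDUC_GROUP_SIZE == 2 and an eight-lane
	     vector the masked index is {0,1,0,1,0,1,0,1}, so even lanes feed
	     the first SLP result and odd lanes the second.  */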
5565
5566 /* Get a neutral vector value. This is simply a splat of the neutral
5567 scalar value if we have one, otherwise the initial scalar value
5568 is itself a neutral value. */
5569 tree vector_identity = NULL_TREE;
5570 tree neutral_op = NULL_TREE;
5571 if (slp_node)
5572 {
5573 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5574 neutral_op
5575 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5576 vectype, code, first != NULL);
5577 }
5578 if (neutral_op)
5579 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5580 neutral_op);
5581 for (unsigned int i = 0; i < group_size; ++i)
5582 {
5583 /* If there's no universal neutral value, we can use the
5584 initial scalar value from the original PHI. This is used
5585 for MIN and MAX reduction, for example. */
5586 if (!neutral_op)
5587 {
5588 tree scalar_value
5589 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5590 loop_preheader_edge (loop));
5591 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5592 scalar_value);
5593 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5594 scalar_value);
5595 }
5596
5597 /* Calculate the equivalent of:
5598
5599 sel[j] = (index[j] == i);
5600
5601 which selects the elements of NEW_PHI_RESULT that should
5602 be included in the result. */
5603 tree compare_val = build_int_cst (index_elt_type, i);
5604 compare_val = build_vector_from_val (index_type, compare_val);
5605 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5606 index, compare_val);
5607
5608 /* Calculate the equivalent of:
5609
5610 vec = sel ? new_phi_result : vector_identity;
5611
5612 VEC is now suitable for a full vector reduction. */
5613 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5614 sel, new_phi_result, vector_identity);
5615
5616 /* Do the reduction and convert it to the appropriate type. */
5617 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5618 TREE_TYPE (vectype), vec);
5619 scalar = gimple_convert (&seq, scalar_type, scalar);
5620 scalar_results.safe_push (scalar);
5621 }
5622 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5623 }
5624 else
5625 {
5626 bool reduce_with_shift;
5627 tree vec_temp;
5628
5629 gcc_assert (slp_reduc || new_phis.length () == 1);
5630
5631 /* See if the target wants to do the final (shift) reduction
5632 in a vector mode of smaller size and first reduce upper/lower
5633 halves against each other. */
5634 enum machine_mode mode1 = mode;
5635 tree stype = TREE_TYPE (vectype);
5636 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5637 unsigned nunits1 = nunits;
5638 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5639 && new_phis.length () == 1)
5640 {
5641 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5642 /* For SLP reductions we have to make sure lanes match up, but
5643 since we're doing an individual-element final reduction, reducing the
5644 vector width here is even more important.
5645 ??? We can also separate lanes with permutes, for the common
5646 case of power-of-two group-size odd/even extracts would work. */
5647 if (slp_reduc && nunits != nunits1)
5648 {
5649 nunits1 = least_common_multiple (nunits1, group_size);
5650 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5651 }
5652 }
5653 if (!slp_reduc
5654 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5655 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5656
5657 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5658 stype, nunits1);
5659 reduce_with_shift = have_whole_vector_shift (mode1);
5660 if (!VECTOR_MODE_P (mode1))
5661 reduce_with_shift = false;
5662 else
5663 {
5664 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5665 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5666 reduce_with_shift = false;
5667 }
5668
5669 /* First reduce the vector to the desired vector size on which we
5670 should do the shift reduction, by combining upper and lower halves. */
5671 new_temp = new_phi_result;
5672 while (nunits > nunits1)
5673 {
5674 nunits /= 2;
5675 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5676 stype, nunits);
5677 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5678
5679 /* The target has to make sure we support lowpart/highpart
5680 extraction, either via direct vector extract or through
5681 integer-mode punning. */
5682 tree dst1, dst2;
5683 if (convert_optab_handler (vec_extract_optab,
5684 TYPE_MODE (TREE_TYPE (new_temp)),
5685 TYPE_MODE (vectype1))
5686 != CODE_FOR_nothing)
5687 {
5688 /* Extract sub-vectors directly once vec_extract becomes
5689 a conversion optab. */
5690 dst1 = make_ssa_name (vectype1);
5691 epilog_stmt
5692 = gimple_build_assign (dst1, BIT_FIELD_REF,
5693 build3 (BIT_FIELD_REF, vectype1,
5694 new_temp, TYPE_SIZE (vectype1),
5695 bitsize_int (0)));
5696 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5697 dst2 = make_ssa_name (vectype1);
5698 epilog_stmt
5699 = gimple_build_assign (dst2, BIT_FIELD_REF,
5700 build3 (BIT_FIELD_REF, vectype1,
5701 new_temp, TYPE_SIZE (vectype1),
5702 bitsize_int (bitsize)));
5703 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5704 }
5705 else
5706 {
5707 /* Extract via punning to appropriately sized integer mode
5708 vector. */
5709 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5710 tree etype = build_vector_type (eltype, 2);
5711 gcc_assert (convert_optab_handler (vec_extract_optab,
5712 TYPE_MODE (etype),
5713 TYPE_MODE (eltype))
5714 != CODE_FOR_nothing);
5715 tree tem = make_ssa_name (etype);
5716 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5717 build1 (VIEW_CONVERT_EXPR,
5718 etype, new_temp));
5719 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5720 new_temp = tem;
5721 tem = make_ssa_name (eltype);
5722 epilog_stmt
5723 = gimple_build_assign (tem, BIT_FIELD_REF,
5724 build3 (BIT_FIELD_REF, eltype,
5725 new_temp, TYPE_SIZE (eltype),
5726 bitsize_int (0)));
5727 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5728 dst1 = make_ssa_name (vectype1);
5729 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5730 build1 (VIEW_CONVERT_EXPR,
5731 vectype1, tem));
5732 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5733 tem = make_ssa_name (eltype);
5734 epilog_stmt
5735 = gimple_build_assign (tem, BIT_FIELD_REF,
5736 build3 (BIT_FIELD_REF, eltype,
5737 new_temp, TYPE_SIZE (eltype),
5738 bitsize_int (bitsize)));
5739 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5740 dst2 = make_ssa_name (vectype1);
5741 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5742 build1 (VIEW_CONVERT_EXPR,
5743 vectype1, tem));
5744 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5745 }
5746
5747 new_temp = make_ssa_name (vectype1);
5748 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5749 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750 new_phis[0] = epilog_stmt;
5751 }
5752
5753 if (reduce_with_shift && !slp_reduc)
5754 {
5755 int element_bitsize = tree_to_uhwi (bitsize);
5756 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5757 for variable-length vectors and also requires direct target support
5758 for loop reductions. */
5759 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5760 int nelements = vec_size_in_bits / element_bitsize;
5761 vec_perm_builder sel;
5762 vec_perm_indices indices;
5763
5764 int elt_offset;
5765
5766 tree zero_vec = build_zero_cst (vectype1);
5767 /* Case 2: Create:
5768 for (offset = nelements/2; offset >= 1; offset/=2)
5769 {
5770 Create: va' = vec_shift <va, offset>
5771 Create: va = vop <va, va'>
5772 } */
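	      /* E.g. for a four-element vector {a,b,c,d} and PLUS: shifting
		 by two and combining leaves a+c and b+d in the low elements;
		 shifting by one and combining again leaves a+b+c+d in
		 element zero, which is what step 2.4 below extracts.  */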
5773
5774 tree rhs;
5775
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE, vect_location,
5778 "Reduce using vector shifts\n");
5779
5780 gimple_seq stmts = NULL;
5781 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5782 for (elt_offset = nelements / 2;
5783 elt_offset >= 1;
5784 elt_offset /= 2)
5785 {
5786 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5787 indices.new_vector (sel, 2, nelements);
5788 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5789 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5790 new_temp, zero_vec, mask);
5791 new_temp = gimple_build (&stmts, code,
5792 vectype1, new_name, new_temp);
5793 }
5794 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5795
5796 /* 2.4 Extract the final scalar result. Create:
5797 s_out3 = extract_field <v_out2, bitpos> */
5798
5799 if (dump_enabled_p ())
5800 dump_printf_loc (MSG_NOTE, vect_location,
5801 "extract scalar result\n");
5802
5803 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5804 bitsize, bitsize_zero_node);
5805 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5806 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5807 gimple_assign_set_lhs (epilog_stmt, new_temp);
5808 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5809 scalar_results.safe_push (new_temp);
5810 }
5811 else
5812 {
5813 /* Case 3: Create:
5814 s = extract_field <v_out2, 0>
5815 for (offset = element_size;
5816 offset < vector_size;
5817 offset += element_size;)
5818 {
5819 Create: s' = extract_field <v_out2, offset>
5820 Create: s = op <s, s'> // For non-SLP cases
5821 } */
5822
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_NOTE, vect_location,
5825 "Reduce using scalar code.\n");
5826
5827 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5828 int element_bitsize = tree_to_uhwi (bitsize);
5829 tree compute_type = TREE_TYPE (vectype);
5830 gimple_seq stmts = NULL;
5831 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5832 {
5833 int bit_offset;
5834 if (gimple_code (new_phi) == GIMPLE_PHI)
5835 vec_temp = PHI_RESULT (new_phi);
5836 else
5837 vec_temp = gimple_assign_lhs (new_phi);
5838 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5839 vec_temp, bitsize, bitsize_zero_node);
5840
5841 /* In SLP we don't need to apply the reduction operation, so we just
5842 collect s' values in SCALAR_RESULTS. */
5843 if (slp_reduc)
5844 scalar_results.safe_push (new_temp);
5845
5846 for (bit_offset = element_bitsize;
5847 bit_offset < vec_size_in_bits;
5848 bit_offset += element_bitsize)
5849 {
5850 tree bitpos = bitsize_int (bit_offset);
5851 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5852 compute_type, vec_temp,
5853 bitsize, bitpos);
5854 if (slp_reduc)
5855 {
5856 /* In SLP we don't need to apply the reduction operation, so
5857 we just collect s' values in SCALAR_RESULTS. */
5858 new_temp = new_name;
5859 scalar_results.safe_push (new_name);
5860 }
5861 else
5862 new_temp = gimple_build (&stmts, code, compute_type,
5863 new_name, new_temp);
5864 }
5865 }
5866
5867 /* The only case where we need to reduce scalar results in SLP is
5868 unrolling. If the size of SCALAR_RESULTS is greater than
5869 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5870 REDUC_GROUP_SIZE. */
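	      /* E.g. with REDUC_GROUP_SIZE == 2 and extracted scalars
		 {a1, b1, a2, b2} the loop below folds a2 into a1 and b2 into
		 b1, leaving one result per group member.  */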
5871 if (slp_reduc)
5872 {
5873 tree res, first_res, new_res;
5874
5875 /* Reduce multiple scalar results in case of SLP unrolling. */
5876 for (j = group_size; scalar_results.iterate (j, &res);
5877 j++)
5878 {
5879 first_res = scalar_results[j % group_size];
5880 new_res = gimple_build (&stmts, code, compute_type,
5881 first_res, res);
5882 scalar_results[j % group_size] = new_res;
5883 }
5884 for (k = 0; k < group_size; k++)
5885 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5886 scalar_results[k]);
5887 }
5888 else
5889 {
5890 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5891 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5892 scalar_results.safe_push (new_temp);
5893 }
5894
5895 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5896 }
5897
5898 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5899 && induc_val)
5900 {
5901 /* Earlier we set the initial value to be a vector of induc_val
5902 values. Check the result and if it is induc_val then replace
5903 with the original initial value, unless induc_val is
5904 the same as initial_def already. */
5905 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5906 induc_val);
5907
5908 tree tmp = make_ssa_name (new_scalar_dest);
5909 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5910 initial_def, new_temp);
5911 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5912 scalar_results[0] = tmp;
5913 }
5914 }
5915
5916 /* 2.5 Adjust the final result by the initial value of the reduction
5917 variable. (When such adjustment is not needed, then
5918 'adjustment_def' is zero). For example, if code is PLUS we create:
5919 new_temp = loop_exit_def + adjustment_def */
5920
5921 if (adjustment_def)
5922 {
5923 gcc_assert (!slp_reduc);
5924 gimple_seq stmts = NULL;
5925 if (nested_in_vect_loop)
5926 {
5927 new_phi = new_phis[0];
5928 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5929 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5930 new_temp = gimple_build (&stmts, code, vectype,
5931 PHI_RESULT (new_phi), adjustment_def);
5932 }
5933 else
5934 {
5935 new_temp = scalar_results[0];
5936 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5937 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5938 new_temp = gimple_build (&stmts, code, scalar_type,
5939 new_temp, adjustment_def);
5940 }
5941
5942 epilog_stmt = gimple_seq_last_stmt (stmts);
5943 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5944 if (nested_in_vect_loop)
5945 {
5946 if (!double_reduc)
5947 scalar_results.quick_push (new_temp);
5948 else
5949 scalar_results[0] = new_temp;
5950 }
5951 else
5952 scalar_results[0] = new_temp;
5953
5954 new_phis[0] = epilog_stmt;
5955 }
5956
5957 if (double_reduc)
5958 loop = loop->inner;
5959
5960 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5961 phis with new adjusted scalar results, i.e., replace use <s_out0>
5962 with use <s_out4>.
5963
5964 Transform:
5965 loop_exit:
5966 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5967 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5968 v_out2 = reduce <v_out1>
5969 s_out3 = extract_field <v_out2, 0>
5970 s_out4 = adjust_result <s_out3>
5971 use <s_out0>
5972 use <s_out0>
5973
5974 into:
5975
5976 loop_exit:
5977 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5978 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5979 v_out2 = reduce <v_out1>
5980 s_out3 = extract_field <v_out2, 0>
5981 s_out4 = adjust_result <s_out3>
5982 use <s_out4>
5983 use <s_out4> */
5984
5985
5986 /* In an SLP reduction chain we reduce the vector results into one vector
5987 if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5988 LHS of the last stmt in the reduction chain, since we are looking for
5989 the loop exit phi node. */
5990 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5991 {
5992 stmt_vec_info dest_stmt_info
5993 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5994 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5995 group_size = 1;
5996 }
5997
5998 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5999 case REDUC_GROUP_SIZE is greater than the vectorization factor).
6000 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
6001 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
6002 correspond to the first vector stmt, etc.
6003 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
6004 if (group_size > new_phis.length ())
6005 gcc_assert (!(group_size % new_phis.length ()));
6006
6007 for (k = 0; k < group_size; k++)
6008 {
6009 if (slp_reduc)
6010 {
6011 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6012
6013 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
6014 /* SLP statements can't participate in patterns. */
6015 gcc_assert (!orig_stmt_info);
6016 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6017 }
6018
6019 if (nested_in_vect_loop)
6020 {
6021 if (double_reduc)
6022 loop = outer_loop;
6023 else
6024 gcc_unreachable ();
6025 }
6026
6027 phis.create (3);
6028 /* Find the loop-closed-use at the loop exit of the original scalar
6029 result. (The reduction result is expected to have two immediate uses,
6030 one at the latch block, and one at the loop exit). For double
6031 reductions we are looking for exit phis of the outer loop. */
6032 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6033 {
6034 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6035 {
6036 if (!is_gimple_debug (USE_STMT (use_p)))
6037 phis.safe_push (USE_STMT (use_p));
6038 }
6039 else
6040 {
6041 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6042 {
6043 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6044
6045 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6046 {
6047 if (!flow_bb_inside_loop_p (loop,
6048 gimple_bb (USE_STMT (phi_use_p)))
6049 && !is_gimple_debug (USE_STMT (phi_use_p)))
6050 phis.safe_push (USE_STMT (phi_use_p));
6051 }
6052 }
6053 }
6054 }
6055
6056 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6057 {
6058 /* Replace the uses: */
6059 orig_name = PHI_RESULT (exit_phi);
6060 scalar_result = scalar_results[k];
6061 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6062 {
6063 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6064 SET_USE (use_p, scalar_result);
6065 update_stmt (use_stmt);
6066 }
6067 }
6068
6069 phis.release ();
6070 }
6071 }
6072
6073 /* Return a vector of type VECTYPE that is equal to the vector select
6074 operation "MASK ? VEC : IDENTITY". Insert the select statements
6075 before GSI. */
6076
6077 static tree
6078 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6079 tree vec, tree identity)
6080 {
6081 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6082 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6083 mask, vec, identity);
6084 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6085 return cond;
6086 }
6087
6088 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6089 order, starting with LHS. Insert the extraction statements before GSI and
6090 associate the new scalar SSA names with variable SCALAR_DEST.
6091 Return the SSA name for the result. */
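/* For example, for a four-element VECTOR_RHS {v0, v1, v2, v3} this emits
   (((LHS CODE v0) CODE v1) CODE v2) CODE v3, preserving the left-to-right
   order that an in-order reduction requires.  */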
6092
6093 static tree
6094 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6095 tree_code code, tree lhs, tree vector_rhs)
6096 {
6097 tree vectype = TREE_TYPE (vector_rhs);
6098 tree scalar_type = TREE_TYPE (vectype);
6099 tree bitsize = TYPE_SIZE (scalar_type);
6100 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6101 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6102
6103 for (unsigned HOST_WIDE_INT bit_offset = 0;
6104 bit_offset < vec_size_in_bits;
6105 bit_offset += element_bitsize)
6106 {
6107 tree bitpos = bitsize_int (bit_offset);
6108 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6109 bitsize, bitpos);
6110
6111 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6112 rhs = make_ssa_name (scalar_dest, stmt);
6113 gimple_assign_set_lhs (stmt, rhs);
6114 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6115
6116 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6117 tree new_name = make_ssa_name (scalar_dest, stmt);
6118 gimple_assign_set_lhs (stmt, new_name);
6119 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6120 lhs = new_name;
6121 }
6122 return lhs;
6123 }
6124
6125 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6126 type of the vector input. */
6127
6128 static internal_fn
6129 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6130 {
6131 internal_fn mask_reduc_fn;
6132
6133 switch (reduc_fn)
6134 {
6135 case IFN_FOLD_LEFT_PLUS:
6136 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6137 break;
6138
6139 default:
6140 return IFN_LAST;
6141 }
6142
6143 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6144 OPTIMIZE_FOR_SPEED))
6145 return mask_reduc_fn;
6146 return IFN_LAST;
6147 }
6148
6149 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6150 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6151 statement. CODE is the operation performed by STMT_INFO and OPS are
6152 its scalar operands. REDUC_INDEX is the index of the operand in
6153 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6154 implements in-order reduction, or IFN_LAST if we should open-code it.
6155 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6156 that should be used to control the operation in a fully-masked loop. */
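/* For example (illustrative GIMPLE, names are only placeholders), a
   fully-masked in-order "sum += a[i]" with IFN_FOLD_LEFT_PLUS available
   becomes, per vector iteration,
     sum_1 = .MASK_FOLD_LEFT_PLUS (sum_0, vec_a, loop_mask);
   and, if only the unmasked variant exists,
     masked_a = VEC_COND_EXPR <loop_mask, vec_a, { 0, ... }>;
     sum_1 = .FOLD_LEFT_PLUS (sum_0, masked_a);
   either way keeping the strict left-to-right association of the scalar
   loop.  */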
6157
6158 static bool
6159 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6160 stmt_vec_info stmt_info,
6161 gimple_stmt_iterator *gsi,
6162 gimple **vec_stmt, slp_tree slp_node,
6163 gimple *reduc_def_stmt,
6164 tree_code code, internal_fn reduc_fn,
6165 tree ops[3], tree vectype_in,
6166 int reduc_index, vec_loop_masks *masks)
6167 {
6168 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6169 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6170 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6171
6172 int ncopies;
6173 if (slp_node)
6174 ncopies = 1;
6175 else
6176 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6177
6178 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6179 gcc_assert (ncopies == 1);
6180 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6181
6182 if (slp_node)
6183 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6184 TYPE_VECTOR_SUBPARTS (vectype_in)));
6185
6186 tree op0 = ops[1 - reduc_index];
6187
6188 int group_size = 1;
6189 stmt_vec_info scalar_dest_def_info;
6190 auto_vec<tree> vec_oprnds0;
6191 if (slp_node)
6192 {
6193 auto_vec<vec<tree> > vec_defs (2);
6194 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6195 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6196 vec_defs[0].release ();
6197 vec_defs[1].release ();
6198 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6199 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6200 }
6201 else
6202 {
6203 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6204 op0, &vec_oprnds0);
6205 scalar_dest_def_info = stmt_info;
6206 }
6207
6208 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6209 tree scalar_type = TREE_TYPE (scalar_dest);
6210 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6211
6212 int vec_num = vec_oprnds0.length ();
6213 gcc_assert (vec_num == 1 || slp_node);
6214 tree vec_elem_type = TREE_TYPE (vectype_out);
6215 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6216
6217 tree vector_identity = NULL_TREE;
6218 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6219 vector_identity = build_zero_cst (vectype_out);
6220
6221 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6222 int i;
6223 tree def0;
6224 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6225 {
6226 gimple *new_stmt;
6227 tree mask = NULL_TREE;
6228 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6229 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6230
6231 /* Handle MINUS by adding the negative. */
6232 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6233 {
6234 tree negated = make_ssa_name (vectype_out);
6235 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6236 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6237 def0 = negated;
6238 }
6239
6240 if (mask && mask_reduc_fn == IFN_LAST)
6241 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6242 vector_identity);
6243
6244 /* On the first iteration the input is simply the scalar phi
6245 result, and for subsequent iterations it is the output of
6246 the preceding operation. */
6247 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6248 {
6249 if (mask && mask_reduc_fn != IFN_LAST)
6250 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6251 def0, mask);
6252 else
6253 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6254 def0);
6255 /* For chained SLP reductions the output of the previous reduction
6256 operation serves as the input of the next. For the final statement
6257 the output cannot be a temporary - we reuse the original
6258 scalar destination of the last statement. */
6259 if (i != vec_num - 1)
6260 {
6261 gimple_set_lhs (new_stmt, scalar_dest_var);
6262 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6263 gimple_set_lhs (new_stmt, reduc_var);
6264 }
6265 }
6266 else
6267 {
6268 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6269 reduc_var, def0);
6270 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6271 /* Remove the statement, so that we can use the same code paths
6272 as for statements that we've just created. */
6273 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6274 gsi_remove (&tmp_gsi, true);
6275 }
6276
6277 if (i == vec_num - 1)
6278 {
6279 gimple_set_lhs (new_stmt, scalar_dest);
6280 vect_finish_replace_stmt (loop_vinfo,
6281 scalar_dest_def_info,
6282 new_stmt);
6283 }
6284 else
6285 vect_finish_stmt_generation (loop_vinfo,
6286 scalar_dest_def_info,
6287 new_stmt, gsi);
6288
6289 if (slp_node)
6290 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6291 else
6292 {
6293 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6294 *vec_stmt = new_stmt;
6295 }
6296 }
6297
6298 return true;
6299 }
6300
6301 /* Function is_nonwrapping_integer_induction.
6302
6303 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6304 does not cause overflow. */
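/* For example, an unsigned char IV starting at 0 with step 1 in a loop that
   executes at most 100 times reaches at most 100, which fits in 8 bits, so
   it does not wrap; with up to 300 iterations it could wrap and we return
   false.  */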
6305
6306 static bool
6307 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6308 {
6309 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6310 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6311 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6312 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6313 widest_int ni, max_loop_value, lhs_max;
6314 wi::overflow_type overflow = wi::OVF_NONE;
6315
6316 /* Make sure the loop is integer based. */
6317 if (TREE_CODE (base) != INTEGER_CST
6318 || TREE_CODE (step) != INTEGER_CST)
6319 return false;
6320
6321 /* Check that the max size of the loop will not wrap. */
6322
6323 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6324 return true;
6325
6326 if (! max_stmt_executions (loop, &ni))
6327 return false;
6328
6329 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6330 &overflow);
6331 if (overflow)
6332 return false;
6333
6334 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6335 TYPE_SIGN (lhs_type), &overflow);
6336 if (overflow)
6337 return false;
6338
6339 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6340 <= TYPE_PRECISION (lhs_type));
6341 }
6342
6343 /* Check if masking can be supported by inserting a conditional expression.
6344 CODE is the code for the operation. COND_FN is the conditional internal
6345 function, if it exists. VECTYPE_IN is the type of the vector input. */
6346 static bool
6347 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6348 tree vectype_in)
6349 {
6350 if (cond_fn != IFN_LAST
6351 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6352 OPTIMIZE_FOR_SPEED))
6353 return false;
6354
6355 switch (code)
6356 {
6357 case DOT_PROD_EXPR:
6358 case SAD_EXPR:
6359 return true;
6360
6361 default:
6362 return false;
6363 }
6364 }
6365
6366 /* Insert a conditional expression to enable masked vectorization. CODE is the
6367 code for the operation. VOP is the array of operands. MASK is the loop
6368 mask. GSI is a statement iterator used to place the new conditional
6369 expression. */
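/* For example, for a masked DOT_PROD_EXPR reduction this effectively turns
   (an illustrative GIMPLE sketch, not output copied from the vectorizer)

     acc_1 = DOT_PROD_EXPR <a_v, b_v, acc_0>;

   into

     masked_b = VEC_COND_EXPR <loop_mask, b_v, { 0, ... }>;
     acc_1 = DOT_PROD_EXPR <a_v, masked_b, acc_0>;

   so inactive lanes contribute a zero product.  For SAD_EXPR the "else"
   value is the other operand, so the masked-out absolute difference is
   zero.  */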
6370 static void
6371 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6372 gimple_stmt_iterator *gsi)
6373 {
6374 switch (code)
6375 {
6376 case DOT_PROD_EXPR:
6377 {
6378 tree vectype = TREE_TYPE (vop[1]);
6379 tree zero = build_zero_cst (vectype);
6380 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6381 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6382 mask, vop[1], zero);
6383 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6384 vop[1] = masked_op1;
6385 break;
6386 }
6387
6388 case SAD_EXPR:
6389 {
6390 tree vectype = TREE_TYPE (vop[1]);
6391 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6392 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6393 mask, vop[1], vop[0]);
6394 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6395 vop[1] = masked_op1;
6396 break;
6397 }
6398
6399 default:
6400 gcc_unreachable ();
6401 }
6402 }
6403
6404 /* Function vectorizable_reduction.
6405
6406 Check if STMT_INFO performs a reduction operation that can be vectorized.
6407 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6408 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6409 Return true if STMT_INFO is vectorizable in this way.
6410
6411 This function also handles reduction idioms (patterns) that have been
6412 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6413 may be of this form:
6414 X = pattern_expr (arg0, arg1, ..., X)
6415 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6416 sequence that had been detected and replaced by the pattern-stmt
6417 (STMT_INFO).
6418
6419 This function also handles reduction of condition expressions, for example:
6420 for (int i = 0; i < N; i++)
6421 if (a[i] < value)
6422 last = a[i];
6423 This is handled by vectorising the loop and creating an additional vector
6424 containing the loop indexes for which "a[i] < value" was true. In the
6425 function epilogue this is reduced to a single max value and then used to
6426 index into the vector of results.
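
   As an illustration of the idea (a sketch, not code generated by this
   pass), alongside the vector of candidate values the loop also keeps

     idx = cond ? { i+1, i+2, ... } : idx;

   i.e. the 1-based iteration numbers of the lanes where the condition
   last held (0 meaning "never matched").  The epilogue then computes
   k = REDUC_MAX (idx) and, if k is non-zero, extracts the result from
   the lane whose index equals k.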
6427
6428 In some cases of reduction patterns, the type of the reduction variable X is
6429 different than the type of the other arguments of STMT_INFO.
6430 In such cases, the vectype that is used when transforming STMT_INFO into
6431 a vector stmt is different than the vectype that is used to determine the
6432 vectorization factor, because it consists of a different number of elements
6433 than the actual number of elements that are being operated upon in parallel.
6434
6435 For example, consider an accumulation of shorts into an int accumulator.
6436 On some targets it's possible to vectorize this pattern operating on 8
6437 shorts at a time (hence, the vectype for purposes of determining the
6438 vectorization factor should be V8HI); on the other hand, the vectype that
6439 is used to create the vector form is actually V4SI (the type of the result).
6440
6441 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6442 indicates what is the actual level of parallelism (V8HI in the example), so
6443 that the right vectorization factor would be derived. This vectype
6444 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6445 be used to create the vectorized stmt. The right vectype for the vectorized
6446 stmt is obtained from the type of the result X:
6447 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6448
6449 This means that, contrary to "regular" reductions (or "regular" stmts in
6450 general), the following equation:
6451 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6452 does *NOT* necessarily hold for reduction patterns. */
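/* For instance (illustrative only), for

     short s[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += s[i];

   the pattern recognizer may produce sum_1 = WIDEN_SUM_EXPR <s_i, sum_0>,
   whose STMT_VINFO_VECTYPE is V8HI (eight shorts are consumed per vector
   iteration) while the vectorized statement itself produces a V4SI
   accumulator, obtained via get_vectype_for_scalar_type on the int
   result.  */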
6453
6454 bool
6455 vectorizable_reduction (loop_vec_info loop_vinfo,
6456 stmt_vec_info stmt_info, slp_tree slp_node,
6457 slp_instance slp_node_instance,
6458 stmt_vector_for_cost *cost_vec)
6459 {
6460 tree scalar_dest;
6461 tree vectype_in = NULL_TREE;
6462 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6463 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6464 stmt_vec_info cond_stmt_vinfo = NULL;
6465 tree scalar_type;
6466 int i;
6467 int ncopies;
6468 bool single_defuse_cycle = false;
6469 bool nested_cycle = false;
6470 bool double_reduc = false;
6471 int vec_num;
6472 tree tem;
6473 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6474 tree cond_reduc_val = NULL_TREE;
6475
6476 /* Make sure it was already recognized as a reduction computation. */
6477 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6478 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6479 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6480 return false;
6481
6482 /* The stmt we store reduction analysis meta on. */
6483 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6484 reduc_info->is_reduc_info = true;
6485
6486 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6487 {
6488 if (is_a <gphi *> (stmt_info->stmt))
6489 {
6490 if (slp_node)
6491 {
6492 /* We eventually need to set a vector type on invariant
6493 arguments. */
6494 unsigned j;
6495 slp_tree child;
6496 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6497 if (!vect_maybe_update_slp_op_vectype
6498 (child, SLP_TREE_VECTYPE (slp_node)))
6499 {
6500 if (dump_enabled_p ())
6501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502 "incompatible vector types for "
6503 "invariants\n");
6504 return false;
6505 }
6506 }
6507 /* Analysis for double-reduction is done on the outer
6508 loop PHI; nested cycles have no further restrictions. */
6509 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6510 }
6511 else
6512 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6513 return true;
6514 }
6515
6516 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6517 stmt_vec_info phi_info = stmt_info;
6518 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6519 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6520 {
6521 if (!is_a <gphi *> (stmt_info->stmt))
6522 {
6523 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6524 return true;
6525 }
6526 if (slp_node)
6527 {
6528 slp_node_instance->reduc_phis = slp_node;
6529 /* ??? We're leaving slp_node to point to the PHIs, we only
6530 need it to get at the number of vector stmts which wasn't
6531 yet initialized for the instance root. */
6532 }
6533 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6534 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6535 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6536 {
6537 use_operand_p use_p;
6538 gimple *use_stmt;
6539 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6540 &use_p, &use_stmt);
6541 gcc_assert (res);
6542 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6543 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6544 }
6545 }
6546
6547 /* PHIs should not participate in patterns. */
6548 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6549 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6550
6551 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6552 and compute the reduction chain length. Discover the real
6553 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6554 tree reduc_def
6555 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6556 loop_latch_edge
6557 (gimple_bb (reduc_def_phi)->loop_father));
6558 unsigned reduc_chain_length = 0;
6559 bool only_slp_reduc_chain = true;
6560 stmt_info = NULL;
6561 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6562 while (reduc_def != PHI_RESULT (reduc_def_phi))
6563 {
6564 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6565 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6566 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6567 {
6568 if (dump_enabled_p ())
6569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6570 "reduction chain broken by patterns.\n");
6571 return false;
6572 }
6573 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6574 only_slp_reduc_chain = false;
6575 /* ??? For epilogue generation live members of the chain need
6576 to point back to the PHI via their original stmt for
6577 info_for_reduction to work. */
6578 if (STMT_VINFO_LIVE_P (vdef))
6579 STMT_VINFO_REDUC_DEF (def) = phi_info;
6580 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6581 if (!assign)
6582 {
6583 if (dump_enabled_p ())
6584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6585 "reduction chain includes calls.\n");
6586 return false;
6587 }
6588 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6589 {
6590 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6591 TREE_TYPE (gimple_assign_rhs1 (assign))))
6592 {
6593 if (dump_enabled_p ())
6594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595 "conversion in the reduction chain.\n");
6596 return false;
6597 }
6598 }
6599 else if (!stmt_info)
6600 /* First non-conversion stmt. */
6601 stmt_info = vdef;
6602 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6603 reduc_chain_length++;
6604 if (!stmt_info && slp_node)
6605 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6606 }
6607 /* PHIs should not participate in patterns. */
6608 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6609
6610 if (nested_in_vect_loop_p (loop, stmt_info))
6611 {
6612 loop = loop->inner;
6613 nested_cycle = true;
6614 }
6615
6616 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6617 element. */
6618 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6619 {
6620 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6621 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6622 }
6623 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6624 gcc_assert (slp_node
6625 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6626
6627 /* 1. Is vectorizable reduction? */
6628 /* Not supportable if the reduction variable is used in the loop, unless
6629 it's a reduction chain. */
6630 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6631 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6632 return false;
6633
6634 /* Reductions that are not used even in an enclosing outer-loop
6635 are expected to be "live" (used out of the loop). */
6636 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6637 && !STMT_VINFO_LIVE_P (stmt_info))
6638 return false;
6639
6640 /* 2. Has this been recognized as a reduction pattern?
6641
6642 Check if STMT represents a pattern that has been recognized
6643 in earlier analysis stages. For stmts that represent a pattern,
6644 the STMT_VINFO_RELATED_STMT field records the last stmt in
6645 the original sequence that constitutes the pattern. */
6646
6647 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6648 if (orig_stmt_info)
6649 {
6650 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6651 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6652 }
6653
6654 /* 3. Check the operands of the operation. The first operands are defined
6655 inside the loop body. The last operand is the reduction variable,
6656 which is defined by the loop-header-phi. */
6657
6658 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6659 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6660 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6661 enum tree_code code = gimple_assign_rhs_code (stmt);
6662 bool lane_reduc_code_p
6663 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6664 int op_type = TREE_CODE_LENGTH (code);
6665
6666 scalar_dest = gimple_assign_lhs (stmt);
6667 scalar_type = TREE_TYPE (scalar_dest);
6668 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6669 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6670 return false;
6671
6672 /* Do not try to vectorize bit-precision reductions. */
6673 if (!type_has_mode_precision_p (scalar_type))
6674 return false;
6675
6676 /* For lane-reducing ops we're reducing the number of reduction PHIs
6677 which means the only use of that may be in the lane-reducing operation. */
6678 if (lane_reduc_code_p
6679 && reduc_chain_length != 1
6680 && !only_slp_reduc_chain)
6681 {
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "lane-reducing reduction with extra stmts.\n");
6685 return false;
6686 }
6687
6688 /* All uses but the last are expected to be defined in the loop.
6689 The last use is the reduction variable. In case of nested cycle this
6690 assumption is not true: we use reduc_index to record the index of the
6691 reduction variable. */
6692 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6693 /* We need to skip an extra operand for COND_EXPRs with embedded
6694 comparison. */
6695 unsigned opno_adjust = 0;
6696 if (code == COND_EXPR
6697 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6698 opno_adjust = 1;
6699 for (i = 0; i < op_type; i++)
6700 {
6701 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6702 if (i == 0 && code == COND_EXPR)
6703 continue;
6704
6705 stmt_vec_info def_stmt_info;
6706 enum vect_def_type dt;
6707 tree op;
6708 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6709 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6710 &def_stmt_info))
6711 {
6712 if (dump_enabled_p ())
6713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714 "use not simple.\n");
6715 return false;
6716 }
6717 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6718 continue;
6719
6720 /* There should be only one cycle def in the stmt, the one
6721 leading to reduc_def. */
6722 if (VECTORIZABLE_CYCLE_DEF (dt))
6723 return false;
6724
6725 /* To properly compute ncopies we are interested in the widest
6726 non-reduction input type in case we're looking at a widening
6727 accumulation that we later handle in vect_transform_reduction. */
6728 if (lane_reduc_code_p
6729 && tem
6730 && (!vectype_in
6731 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6732 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6733 vectype_in = tem;
6734
6735 if (code == COND_EXPR)
6736 {
6737 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6738 if (dt == vect_constant_def)
6739 {
6740 cond_reduc_dt = dt;
6741 cond_reduc_val = op;
6742 }
6743 if (dt == vect_induction_def
6744 && def_stmt_info
6745 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6746 {
6747 cond_reduc_dt = dt;
6748 cond_stmt_vinfo = def_stmt_info;
6749 }
6750 }
6751 }
6752 if (!vectype_in)
6753 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6754 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6755
6756 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6757 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6758 /* If we have a condition reduction, see if we can simplify it further. */
6759 if (v_reduc_type == COND_REDUCTION)
6760 {
6761 if (slp_node)
6762 return false;
6763
6764 /* When the condition uses the reduction value in the condition, fail. */
6765 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6766 {
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "condition depends on previous iteration\n");
6770 return false;
6771 }
6772
6773 if (reduc_chain_length == 1
6774 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6775 vectype_in, OPTIMIZE_FOR_SPEED))
6776 {
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "optimizing condition reduction with"
6780 " FOLD_EXTRACT_LAST.\n");
6781 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6782 }
6783 else if (cond_reduc_dt == vect_induction_def)
6784 {
6785 tree base
6786 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6787 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6788
6789 gcc_assert (TREE_CODE (base) == INTEGER_CST
6790 && TREE_CODE (step) == INTEGER_CST);
6791 cond_reduc_val = NULL_TREE;
6792 enum tree_code cond_reduc_op_code = ERROR_MARK;
6793 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6794 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6795 ;
6796 /* Find a suitable value: one below base for MAX_EXPR, or one above
6797 base for MIN_EXPR; for now punt if base is the minimum value of the
6798 type for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6799 else if (tree_int_cst_sgn (step) == -1)
6800 {
6801 cond_reduc_op_code = MIN_EXPR;
6802 if (tree_int_cst_sgn (base) == -1)
6803 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6804 else if (tree_int_cst_lt (base,
6805 TYPE_MAX_VALUE (TREE_TYPE (base))))
6806 cond_reduc_val
6807 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6808 }
6809 else
6810 {
6811 cond_reduc_op_code = MAX_EXPR;
6812 if (tree_int_cst_sgn (base) == 1)
6813 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6814 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6815 base))
6816 cond_reduc_val
6817 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6818 }
6819 if (cond_reduc_val)
6820 {
6821 if (dump_enabled_p ())
6822 dump_printf_loc (MSG_NOTE, vect_location,
6823 "condition expression based on "
6824 "integer induction.\n");
6825 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6826 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6827 = cond_reduc_val;
6828 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6829 }
6830 }
6831 else if (cond_reduc_dt == vect_constant_def)
6832 {
6833 enum vect_def_type cond_initial_dt;
6834 tree cond_initial_val
6835 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6836
6837 gcc_assert (cond_reduc_val != NULL_TREE);
6838 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6839 if (cond_initial_dt == vect_constant_def
6840 && types_compatible_p (TREE_TYPE (cond_initial_val),
6841 TREE_TYPE (cond_reduc_val)))
6842 {
6843 tree e = fold_binary (LE_EXPR, boolean_type_node,
6844 cond_initial_val, cond_reduc_val);
6845 if (e && (integer_onep (e) || integer_zerop (e)))
6846 {
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_NOTE, vect_location,
6849 "condition expression based on "
6850 "compile time constant.\n");
6851 /* Record reduction code at analysis stage. */
6852 STMT_VINFO_REDUC_CODE (reduc_info)
6853 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6854 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6855 }
6856 }
6857 }
6858 }
6859
6860 if (STMT_VINFO_LIVE_P (phi_info))
6861 return false;
6862
6863 if (slp_node)
6864 ncopies = 1;
6865 else
6866 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6867
6868 gcc_assert (ncopies >= 1);
6869
6870 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6871
6872 if (nested_cycle)
6873 {
6874 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6875 == vect_double_reduction_def);
6876 double_reduc = true;
6877 }
6878
6879 /* 4.2. Check support for the epilog operation.
6880
6881 If STMT represents a reduction pattern, then the type of the
6882 reduction variable may be different than the type of the rest
6883 of the arguments. For example, consider the case of accumulation
6884 of shorts into an int accumulator. The original code:
6885 S1: int_a = (int) short_a;
6886 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6887
6888 was replaced with:
6889 STMT: int_acc = widen_sum <short_a, int_acc>
6890
6891 This means that:
6892 1. The tree-code that is used to create the vector operation in the
6893 epilog code (that reduces the partial results) is not the
6894 tree-code of STMT, but is rather the tree-code of the original
6895 stmt from the pattern that STMT is replacing. I.e, in the example
6896 above we want to use 'widen_sum' in the loop, but 'plus' in the
6897 epilog.
6898 2. The type (mode) we use to check available target support
6899 for the vector operation to be created in the *epilog*, is
6900 determined by the type of the reduction variable (in the example
6901 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6902 However the type (mode) we use to check available target support
6903 for the vector operation to be created *inside the loop*, is
6904 determined by the type of the other arguments to STMT (in the
6905 example we'd check this: optab_handler (widen_sum_optab,
6906 vect_short_mode)).
6907
6908 This is contrary to "regular" reductions, in which the types of all
6909 the arguments are the same as the type of the reduction variable.
6910 For "regular" reductions we can therefore use the same vector type
6911 (and also the same tree-code) when generating the epilog code and
6912 when generating the code inside the loop. */
6913
6914 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6915 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6916
6917 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6918 if (reduction_type == TREE_CODE_REDUCTION)
6919 {
6920 /* Check whether it's ok to change the order of the computation.
6921 Generally, when vectorizing a reduction we change the order of the
6922 computation. This may change the behavior of the program in some
6923 cases, so we need to check that this is ok. One exception is when
6924 vectorizing an outer-loop: the inner-loop is executed sequentially,
6925 and therefore vectorizing reductions in the inner-loop during
6926 outer-loop vectorization is safe. Likewise when we are vectorizing
6927 a series of reductions using SLP and the VF is one the reductions
6928 are performed in scalar order. */
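/* For example (illustrative), without -ffast-math style reassociation a
   float accumulation

     float s = 0.0f;
     for (i = 0; i < N; i++)
       s += a[i];

   cannot be split into partial sums, so needs_fold_left_reduction_p
   returns true for it and the reduction is handled as a
   FOLD_LEFT_REDUCTION below.  */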
6929 if (slp_node
6930 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6931 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6932 ;
6933 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6934 {
6935 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6936 is not directly used in stmt. */
6937 if (!only_slp_reduc_chain
6938 && reduc_chain_length != 1)
6939 {
6940 if (dump_enabled_p ())
6941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6942 "in-order reduction chain without SLP.\n");
6943 return false;
6944 }
6945 STMT_VINFO_REDUC_TYPE (reduc_info)
6946 = reduction_type = FOLD_LEFT_REDUCTION;
6947 }
6948 else if (!commutative_tree_code (orig_code)
6949 || !associative_tree_code (orig_code))
6950 {
6951 if (dump_enabled_p ())
6952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6953 "reduction: not commutative/associative");
6954 return false;
6955 }
6956 }
6957
6958 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6959 && ncopies > 1)
6960 {
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6963 "multiple types in double reduction or condition "
6964 "reduction or fold-left reduction.\n");
6965 return false;
6966 }
6967
6968 internal_fn reduc_fn = IFN_LAST;
6969 if (reduction_type == TREE_CODE_REDUCTION
6970 || reduction_type == FOLD_LEFT_REDUCTION
6971 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6972 || reduction_type == CONST_COND_REDUCTION)
6973 {
6974 if (reduction_type == FOLD_LEFT_REDUCTION
6975 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6976 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6977 {
6978 if (reduc_fn != IFN_LAST
6979 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6980 OPTIMIZE_FOR_SPEED))
6981 {
6982 if (dump_enabled_p ())
6983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6984 "reduc op not supported by target.\n");
6985
6986 reduc_fn = IFN_LAST;
6987 }
6988 }
6989 else
6990 {
6991 if (!nested_cycle || double_reduc)
6992 {
6993 if (dump_enabled_p ())
6994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6995 "no reduc code for scalar code.\n");
6996
6997 return false;
6998 }
6999 }
7000 }
7001 else if (reduction_type == COND_REDUCTION)
7002 {
7003 int scalar_precision
7004 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7005 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7006 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7007 vectype_out);
7008
7009 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7010 OPTIMIZE_FOR_SPEED))
7011 reduc_fn = IFN_REDUC_MAX;
7012 }
7013 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7014
7015 if (reduction_type != EXTRACT_LAST_REDUCTION
7016 && (!nested_cycle || double_reduc)
7017 && reduc_fn == IFN_LAST
7018 && !nunits_out.is_constant ())
7019 {
7020 if (dump_enabled_p ())
7021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7022 "missing target support for reduction on"
7023 " variable-length vectors.\n");
7024 return false;
7025 }
7026
7027 /* For SLP reductions, see if there is a neutral value we can use. */
7028 tree neutral_op = NULL_TREE;
7029 if (slp_node)
7030 neutral_op = neutral_op_for_slp_reduction
7031 (slp_node_instance->reduc_phis, vectype_out, orig_code,
7032 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7033
7034 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7035 {
7036 /* We can't support in-order reductions of code such as this:
7037
7038 for (int i = 0; i < n1; ++i)
7039 for (int j = 0; j < n2; ++j)
7040 l += a[j];
7041
7042 since GCC effectively transforms the loop when vectorizing:
7043
7044 for (int i = 0; i < n1 / VF; ++i)
7045 for (int j = 0; j < n2; ++j)
7046 for (int k = 0; k < VF; ++k)
7047 l += a[j];
7048
7049 which is a reassociation of the original operation. */
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "in-order double reduction not supported.\n");
7053
7054 return false;
7055 }
7056
7057 if (reduction_type == FOLD_LEFT_REDUCTION
7058 && slp_node
7059 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7060 {
7061 /* We cannot use in-order reductions in this case because there is
7062 an implicit reassociation of the operations involved. */
7063 if (dump_enabled_p ())
7064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065 "in-order unchained SLP reductions not supported.\n");
7066 return false;
7067 }
7068
7069 /* For double reductions, and for SLP reductions with a neutral value,
7070 we construct a variable-length initial vector by loading a vector
7071 full of the neutral value and then shift-and-inserting the start
7072 values into the low-numbered elements. */
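/* For instance (an illustrative sketch): with neutral value 0 for a PLUS
   reduction and SLP start values s0..s3, repeatedly applying
   IFN_VEC_SHL_INSERT to a zero vector with the scalars s3, s2, s1, s0
   leaves { s0, s1, s2, s3, 0, ..., 0 } without the number of vector
   elements having to be known at compile time.  */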
7073 if ((double_reduc || neutral_op)
7074 && !nunits_out.is_constant ()
7075 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7076 vectype_out, OPTIMIZE_FOR_SPEED))
7077 {
7078 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 "reduction on variable-length vectors requires"
7081 " target support for a vector-shift-and-insert"
7082 " operation.\n");
7083 return false;
7084 }
7085
7086 /* Check extra constraints for variable-length unchained SLP reductions. */
7087 if (STMT_SLP_TYPE (stmt_info)
7088 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7089 && !nunits_out.is_constant ())
7090 {
7091 /* We checked above that we could build the initial vector when
7092 there's a neutral element value. Check here for the case in
7093 which each SLP statement has its own initial value and in which
7094 that value needs to be repeated for every instance of the
7095 statement within the initial vector. */
7096 unsigned int group_size = SLP_TREE_LANES (slp_node);
7097 if (!neutral_op
7098 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7099 TREE_TYPE (vectype_out)))
7100 {
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 "unsupported form of SLP reduction for"
7104 " variable-length vectors: cannot build"
7105 " initial vector.\n");
7106 return false;
7107 }
7108 /* The epilogue code relies on the number of elements being a multiple
7109 of the group size. The duplicate-and-interleave approach to setting
7110 up the initial vector does too. */
7111 if (!multiple_p (nunits_out, group_size))
7112 {
7113 if (dump_enabled_p ())
7114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7115 "unsupported form of SLP reduction for"
7116 " variable-length vectors: the vector size"
7117 " is not a multiple of the number of results.\n");
7118 return false;
7119 }
7120 }
7121
7122 if (reduction_type == COND_REDUCTION)
7123 {
7124 widest_int ni;
7125
7126 if (! max_loop_iterations (loop, &ni))
7127 {
7128 if (dump_enabled_p ())
7129 dump_printf_loc (MSG_NOTE, vect_location,
7130 "loop count not known, cannot create cond "
7131 "reduction.\n");
7132 return false;
7133 }
7134 /* Convert backedges to iterations. */
7135 ni += 1;
7136
7137 /* The additional index will be the same type as the condition. Check
7138 that the loop can fit into this less one (because we'll use up the
7139 zero slot for when there are no matches). */
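/* E.g. (illustrative) with an 8-bit unsigned index type max_index is 255,
   so at most 254 iterations can be described, index 0 being reserved for
   "no iteration matched".  */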
7140 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7141 if (wi::geu_p (ni, wi::to_widest (max_index)))
7142 {
7143 if (dump_enabled_p ())
7144 dump_printf_loc (MSG_NOTE, vect_location,
7145 "loop size is greater than data size.\n");
7146 return false;
7147 }
7148 }
7149
7150 /* In case the vectorization factor (VF) is bigger than the number
7151 of elements that we can fit in a vectype (nunits), we have to generate
7152 more than one vector stmt - i.e - we need to "unroll" the
7153 vector stmt by a factor VF/nunits. For more details see documentation
7154 in vectorizable_operation. */
7155
7156 /* If the reduction is used in an outer loop we need to generate
7157 VF intermediate results, like so (e.g. for ncopies=2):
7158 r0 = phi (init, r0)
7159 r1 = phi (init, r1)
7160 r0 = x0 + r0;
7161 r1 = x1 + r1;
7162 (i.e. we generate VF results in 2 registers).
7163 In this case we have a separate def-use cycle for each copy, and therefore
7164 for each copy we get the vector def for the reduction variable from the
7165 respective phi node created for this copy.
7166
7167 Otherwise (the reduction is unused in the loop nest), we can combine
7168 together intermediate results, like so (e.g. for ncopies=2):
7169 r = phi (init, r)
7170 r = x0 + r;
7171 r = x1 + r;
7172 (i.e. we generate VF/2 results in a single register).
7173 In this case for each copy we get the vector def for the reduction variable
7174 from the vectorized reduction operation generated in the previous iteration.
7175
7176 This only works when we see both the reduction PHI and its only consumer
7177 in vectorizable_reduction and there are no intermediate stmts
7178 participating. */
7179 if (ncopies > 1
7180 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7181 && reduc_chain_length == 1)
7182 single_defuse_cycle = true;
7183
7184 if (single_defuse_cycle || lane_reduc_code_p)
7185 {
7186 gcc_assert (code != COND_EXPR);
7187
7188 /* 4. Supportable by target? */
7189 bool ok = true;
7190
7191 /* 4.1. check support for the operation in the loop */
7192 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7193 if (!optab)
7194 {
7195 if (dump_enabled_p ())
7196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7197 "no optab.\n");
7198 ok = false;
7199 }
7200
7201 machine_mode vec_mode = TYPE_MODE (vectype_in);
7202 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7203 {
7204 if (dump_enabled_p ())
7205 dump_printf (MSG_NOTE, "op not supported by target.\n");
7206 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7207 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7208 ok = false;
7209 else
7210 if (dump_enabled_p ())
7211 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7212 }
7213
7214 /* Worthwhile without SIMD support? */
7215 if (ok
7216 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7217 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7218 {
7219 if (dump_enabled_p ())
7220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7221 "not worthwhile without SIMD support.\n");
7222 ok = false;
7223 }
7224
7225 /* lane-reducing operations have to go through vect_transform_reduction.
7226 For the other cases try without the single cycle optimization. */
7227 if (!ok)
7228 {
7229 if (lane_reduc_code_p)
7230 return false;
7231 else
7232 single_defuse_cycle = false;
7233 }
7234 }
7235 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7236
7237 /* If the reduction stmt is one of the patterns that have lane
7238 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7239 if ((ncopies > 1 && ! single_defuse_cycle)
7240 && lane_reduc_code_p)
7241 {
7242 if (dump_enabled_p ())
7243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7244 "multi def-use cycle not possible for lane-reducing "
7245 "reduction operation\n");
7246 return false;
7247 }
7248
7249 if (slp_node
7250 && !(!single_defuse_cycle
7251 && code != DOT_PROD_EXPR
7252 && code != WIDEN_SUM_EXPR
7253 && code != SAD_EXPR
7254 && reduction_type != FOLD_LEFT_REDUCTION))
7255 for (i = 0; i < op_type; i++)
7256 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7257 {
7258 if (dump_enabled_p ())
7259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7260 "incompatible vector types for invariants\n");
7261 return false;
7262 }
7263
7264 if (slp_node)
7265 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7266 else
7267 vec_num = 1;
7268
7269 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7270 reduction_type, ncopies, cost_vec);
7271 /* Cost the reduction op inside the loop if transformed via
7272 vect_transform_reduction. Otherwise this is costed by the
7273 separate vectorizable_* routines. */
7274 if (single_defuse_cycle
7275 || code == DOT_PROD_EXPR
7276 || code == WIDEN_SUM_EXPR
7277 || code == SAD_EXPR)
7278 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7279
7280 if (dump_enabled_p ()
7281 && reduction_type == FOLD_LEFT_REDUCTION)
7282 dump_printf_loc (MSG_NOTE, vect_location,
7283 "using an in-order (fold-left) reduction.\n");
7284 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7285 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7286 reductions go through their own vectorizable_* routines. */
7287 if (!single_defuse_cycle
7288 && code != DOT_PROD_EXPR
7289 && code != WIDEN_SUM_EXPR
7290 && code != SAD_EXPR
7291 && reduction_type != FOLD_LEFT_REDUCTION)
7292 {
7293 stmt_vec_info tem
7294 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7295 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7296 {
7297 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7298 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7299 }
7300 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7301 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7302 }
7303 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7304 {
7305 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7306 internal_fn cond_fn = get_conditional_internal_fn (code);
7307
7308 if (reduction_type != FOLD_LEFT_REDUCTION
7309 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7310 && (cond_fn == IFN_LAST
7311 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7312 OPTIMIZE_FOR_SPEED)))
7313 {
7314 if (dump_enabled_p ())
7315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316 "can't operate on partial vectors because"
7317 " no conditional operation is available.\n");
7318 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7319 }
7320 else if (reduction_type == FOLD_LEFT_REDUCTION
7321 && reduc_fn == IFN_LAST
7322 && !expand_vec_cond_expr_p (vectype_in,
7323 truth_type_for (vectype_in),
7324 SSA_NAME))
7325 {
7326 if (dump_enabled_p ())
7327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7328 "can't operate on partial vectors because"
7329 " no conditional operation is available.\n");
7330 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7331 }
7332 else
7333 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7334 vectype_in, NULL);
7335 }
7336 return true;
7337 }
7338
7339 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7340 value. */
7341
7342 bool
7343 vect_transform_reduction (loop_vec_info loop_vinfo,
7344 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7345 gimple **vec_stmt, slp_tree slp_node)
7346 {
7347 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7348 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7349 int i;
7350 int ncopies;
7351 int vec_num;
7352
7353 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7354 gcc_assert (reduc_info->is_reduc_info);
7355
7356 if (nested_in_vect_loop_p (loop, stmt_info))
7357 {
7358 loop = loop->inner;
7359 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7360 }
7361
7362 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7363 enum tree_code code = gimple_assign_rhs_code (stmt);
7364 int op_type = TREE_CODE_LENGTH (code);
7365
7366 /* Flatten RHS. */
7367 tree ops[3];
7368 switch (get_gimple_rhs_class (code))
7369 {
7370 case GIMPLE_TERNARY_RHS:
7371 ops[2] = gimple_assign_rhs3 (stmt);
7372 /* Fall thru. */
7373 case GIMPLE_BINARY_RHS:
7374 ops[0] = gimple_assign_rhs1 (stmt);
7375 ops[1] = gimple_assign_rhs2 (stmt);
7376 break;
7377 default:
7378 gcc_unreachable ();
7379 }
7380
7381 /* All uses but the last are expected to be defined in the loop.
7382 The last use is the reduction variable. In case of nested cycle this
7383 assumption is not true: we use reduc_index to record the index of the
7384 reduction variable. */
7385 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7386 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7387 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7388 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7389
7390 if (slp_node)
7391 {
7392 ncopies = 1;
7393 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7394 }
7395 else
7396 {
7397 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7398 vec_num = 1;
7399 }
7400
7401 internal_fn cond_fn = get_conditional_internal_fn (code);
7402 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7403 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7404
7405 /* Transform. */
7406 tree new_temp = NULL_TREE;
7407 auto_vec<tree> vec_oprnds0;
7408 auto_vec<tree> vec_oprnds1;
7409 auto_vec<tree> vec_oprnds2;
7410 tree def0;
7411
7412 if (dump_enabled_p ())
7413 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7414
7415 /* FORNOW: Multiple types are not supported for condition. */
7416 if (code == COND_EXPR)
7417 gcc_assert (ncopies == 1);
7418
7419 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7420
7421 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7422 if (reduction_type == FOLD_LEFT_REDUCTION)
7423 {
7424 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7425 return vectorize_fold_left_reduction
7426 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7427 reduc_fn, ops, vectype_in, reduc_index, masks);
7428 }
7429
7430 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7431 gcc_assert (single_defuse_cycle
7432 || code == DOT_PROD_EXPR
7433 || code == WIDEN_SUM_EXPR
7434 || code == SAD_EXPR);
7435
7436 /* Create the destination vector */
7437 tree scalar_dest = gimple_assign_lhs (stmt);
7438 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7439
7440 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7441 single_defuse_cycle && reduc_index == 0
7442 ? NULL_TREE : ops[0], &vec_oprnds0,
7443 single_defuse_cycle && reduc_index == 1
7444 ? NULL_TREE : ops[1], &vec_oprnds1,
7445 op_type == ternary_op
7446 && !(single_defuse_cycle && reduc_index == 2)
7447 ? ops[2] : NULL_TREE, &vec_oprnds2);
7448 if (single_defuse_cycle)
7449 {
7450 gcc_assert (!slp_node);
7451 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7452 ops[reduc_index],
7453 reduc_index == 0 ? &vec_oprnds0
7454 : (reduc_index == 1 ? &vec_oprnds1
7455 : &vec_oprnds2));
7456 }
7457
7458 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7459 {
7460 gimple *new_stmt;
7461 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7462 if (masked_loop_p && !mask_by_cond_expr)
7463 {
7464 /* Make sure that the reduction accumulator is vop[0]. */
7465 if (reduc_index == 1)
7466 {
7467 gcc_assert (commutative_tree_code (code));
7468 std::swap (vop[0], vop[1]);
7469 }
7470 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7471 vectype_in, i);
7472 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7473 vop[0], vop[1], vop[0]);
7474 new_temp = make_ssa_name (vec_dest, call);
7475 gimple_call_set_lhs (call, new_temp);
7476 gimple_call_set_nothrow (call, true);
7477 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7478 new_stmt = call;
7479 }
7480 else
7481 {
7482 if (op_type == ternary_op)
7483 vop[2] = vec_oprnds2[i];
7484
7485 if (masked_loop_p && mask_by_cond_expr)
7486 {
7487 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7488 vectype_in, i);
7489 build_vect_cond_expr (code, vop, mask, gsi);
7490 }
7491
7492 new_stmt = gimple_build_assign (vec_dest, code,
7493 vop[0], vop[1], vop[2]);
7494 new_temp = make_ssa_name (vec_dest, new_stmt);
7495 gimple_assign_set_lhs (new_stmt, new_temp);
7496 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7497 }
7498
7499 if (slp_node)
7500 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7501 else if (single_defuse_cycle
7502 && i < ncopies - 1)
7503 {
7504 if (reduc_index == 0)
7505 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7506 else if (reduc_index == 1)
7507 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7508 else if (reduc_index == 2)
7509 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7510 }
7511 else
7512 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7513 }
7514
7515 if (!slp_node)
7516 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7517
7518 return true;
7519 }
7520
7521 /* Transform phase of a cycle PHI. */
7522
7523 bool
7524 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7525 stmt_vec_info stmt_info, gimple **vec_stmt,
7526 slp_tree slp_node, slp_instance slp_node_instance)
7527 {
7528 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7529 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7530 int i;
7531 int ncopies;
7532 int j;
7533 bool nested_cycle = false;
7534 int vec_num;
7535
7536 if (nested_in_vect_loop_p (loop, stmt_info))
7537 {
7538 loop = loop->inner;
7539 nested_cycle = true;
7540 }
7541
7542 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7543 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7544 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7545 gcc_assert (reduc_info->is_reduc_info);
7546
7547 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7548 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7549 /* Leave the scalar phi in place. */
7550 return true;
7551
7552 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7553 /* For a nested cycle we do not fill the above. */
7554 if (!vectype_in)
7555 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7556 gcc_assert (vectype_in);
7557
7558 if (slp_node)
7559 {
7560 /* The size vect_schedule_slp_instance computes is off for us. */
7561 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7562 * SLP_TREE_LANES (slp_node), vectype_in);
7563 ncopies = 1;
7564 }
7565 else
7566 {
7567 vec_num = 1;
7568 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7569 }
7570
7571 /* Check whether we should use a single PHI node and accumulate
7572 vectors to one before the backedge. */
7573 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7574 ncopies = 1;
7575
7576 /* Create the destination vector */
7577 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7578 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7579 vectype_out);
7580
7581 /* Get the loop-entry arguments. */
7582 tree vec_initial_def;
7583 auto_vec<tree> vec_initial_defs;
7584 if (slp_node)
7585 {
7586 vec_initial_defs.reserve (vec_num);
7587 if (nested_cycle)
7588 {
7589 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7590 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7591 &vec_initial_defs);
7592 }
7593 else
7594 {
7595 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7596 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7597 tree neutral_op
7598 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7599 STMT_VINFO_REDUC_CODE (reduc_info),
7600 first != NULL);
7601 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7602 &vec_initial_defs, vec_num,
7603 first != NULL, neutral_op);
7604 }
7605 }
7606 else
7607 {
7608 /* Get at the scalar def before the loop, that defines the initial
7609 value of the reduction variable. */
7610 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7611 loop_preheader_edge (loop));
7612 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7613 and we can't use zero for induc_val, use initial_def. Similarly
7614 for REDUC_MIN and initial_def larger than the base. */
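/* E.g. (an illustrative sketch): for a MAX_EXPR condition reduction with
   induc_val 10 and a constant initial_def of 5, 5 < 10 holds, so the
   initial vector is filled with 5 instead and clearing the field below
   tells epilogue generation that initial_def has already been used.  */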
7615 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7616 {
7617 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7618 if (TREE_CODE (initial_def) == INTEGER_CST
7619 && !integer_zerop (induc_val)
7620 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7621 && tree_int_cst_lt (initial_def, induc_val))
7622 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7623 && tree_int_cst_lt (induc_val, initial_def))))
7624 {
7625 induc_val = initial_def;
7626 /* Communicate we used the initial_def to epilogue
7627 generation. */
7628 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7629 }
7630 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7631 vec_initial_defs.create (ncopies);
7632 for (i = 0; i < ncopies; ++i)
7633 vec_initial_defs.quick_push (vec_initial_def);
7634 }
7635 else if (nested_cycle)
7636 {
7637 /* Do not use an adjustment def as that case is not supported
7638 correctly if ncopies is not one. */
7639 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7640 ncopies, initial_def,
7641 &vec_initial_defs);
7642 }
7643 else
7644 {
7645 tree adjustment_def = NULL_TREE;
7646 tree *adjustment_defp = &adjustment_def;
7647 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7648 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7649 adjustment_defp = NULL;
7650 vec_initial_def
7651 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7652 initial_def, adjustment_defp);
7653 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7654 vec_initial_defs.create (ncopies);
7655 for (i = 0; i < ncopies; ++i)
7656 vec_initial_defs.quick_push (vec_initial_def);
7657 }
7658 }
7659
7660 /* Generate the reduction PHIs upfront. */
7661 for (i = 0; i < vec_num; i++)
7662 {
7663 tree vec_init_def = vec_initial_defs[i];
7664 for (j = 0; j < ncopies; j++)
7665 {
7666 /* Create the reduction-phi that defines the reduction
7667 operand. */
7668 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7669
7670 /* Set the loop-entry arg of the reduction-phi. */
7671 if (j != 0 && nested_cycle)
7672 vec_init_def = vec_initial_defs[j];
7673 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7674 UNKNOWN_LOCATION);
7675
7676 /* The loop-latch arg is set in epilogue processing. */
7677
7678 if (slp_node)
7679 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7680 else
7681 {
7682 if (j == 0)
7683 *vec_stmt = new_phi;
7684 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7685 }
7686 }
7687 }
7688
7689 return true;
7690 }
7691
7692 /* Vectorizes LC PHIs. */
7693
7694 bool
7695 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7696 stmt_vec_info stmt_info, gimple **vec_stmt,
7697 slp_tree slp_node)
7698 {
7699 if (!loop_vinfo
7700 || !is_a <gphi *> (stmt_info->stmt)
7701 || gimple_phi_num_args (stmt_info->stmt) != 1)
7702 return false;
7703
7704 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7705 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7706 return false;
7707
7708 if (!vec_stmt) /* transformation not required. */
7709 {
7710 /* Deal with copies from externs or constants that masquerade
7711 loop-closed PHI nodes (PR97886). */
7712 if (slp_node
7713 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7714 SLP_TREE_VECTYPE (slp_node)))
7715 {
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7718 "incompatible vector types for invariants\n");
7719 return false;
7720 }
7721 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7722 return true;
7723 }
7724
7725 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7726 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7727 basic_block bb = gimple_bb (stmt_info->stmt);
7728 edge e = single_pred_edge (bb);
7729 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7730 auto_vec<tree> vec_oprnds;
7731 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7732 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7733 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7734 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7735 {
7736 /* Create the vectorized LC PHI node. */
7737 gphi *new_phi = create_phi_node (vec_dest, bb);
7738 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7739 if (slp_node)
7740 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7741 else
7742 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7743 }
7744 if (!slp_node)
7745 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7746
7747 return true;
7748 }
7749
7750 /* Vectorizes PHIs. */
7751
7752 bool
7753 vectorizable_phi (vec_info *,
7754 stmt_vec_info stmt_info, gimple **vec_stmt,
7755 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7756 {
7757 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7758 return false;
7759
7760 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7761 return false;
7762
7763 tree vectype = SLP_TREE_VECTYPE (slp_node);
7764
7765 if (!vec_stmt) /* transformation not required. */
7766 {
7767 slp_tree child;
7768 unsigned i;
7769 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7770 if (!child)
7771 {
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 "PHI node with unvectorized backedge def\n");
7775 return false;
7776 }
7777 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7778 {
7779 if (dump_enabled_p ())
7780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7781 "incompatible vector types for invariants\n");
7782 return false;
7783 }
7784 /* For single-argument PHIs assume coalescing which means zero cost
7785 for the scalar and the vector PHIs. This avoids artificially
7786 favoring the vector path (but may pessimize it in some cases). */
7787 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7788 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7789 vector_stmt, stmt_info, vectype, 0, vect_body);
7790 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7791 return true;
7792 }
7793
7794 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7795 basic_block bb = gimple_bb (stmt_info->stmt);
7796 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7797 auto_vec<gphi *> new_phis;
7798 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7799 {
7800 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7801
7802 /* Skip not yet vectorized defs. */
7803 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7804 && SLP_TREE_VEC_STMTS (child).is_empty ())
7805 continue;
7806
7807 auto_vec<tree> vec_oprnds;
7808 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7809 if (!new_phis.exists ())
7810 {
7811 new_phis.create (vec_oprnds.length ());
7812 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7813 {
7814 /* Create the vectorized PHI node. */
7815 new_phis.quick_push (create_phi_node (vec_dest, bb));
7816 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7817 }
7818 }
7819 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7820 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7821 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7822 }
7823 /* We should have at least one already vectorized child. */
7824 gcc_assert (new_phis.exists ());
7825
7826 return true;
7827 }
7828
7829
7830 /* Function vect_min_worthwhile_factor.
7831
7832 For a loop where we could vectorize the operation indicated by CODE,
7833 return the minimum vectorization factor that makes it worthwhile
7834 to use generic vectors. */
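/* E.g. (per the table below, illustrative) decomposing a vector PLUS_EXPR
   into scalar operations is only considered worthwhile for a
   vectorization factor of at least 4, while bitwise operations such as
   BIT_AND_EXPR already pay off at a factor of 2.  */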
7835 static unsigned int
7836 vect_min_worthwhile_factor (enum tree_code code)
7837 {
7838 switch (code)
7839 {
7840 case PLUS_EXPR:
7841 case MINUS_EXPR:
7842 case NEGATE_EXPR:
7843 return 4;
7844
7845 case BIT_AND_EXPR:
7846 case BIT_IOR_EXPR:
7847 case BIT_XOR_EXPR:
7848 case BIT_NOT_EXPR:
7849 return 2;
7850
7851 default:
7852 return INT_MAX;
7853 }
7854 }
7855
7856 /* Return true if VINFO indicates we are doing loop vectorization and if
7857 it is worth decomposing CODE operations into scalar operations for
7858 that loop's vectorization factor. */
7859
7860 bool
7861 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7862 {
7863 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7864 unsigned HOST_WIDE_INT value;
7865 return (loop_vinfo
7866 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7867 && value >= vect_min_worthwhile_factor (code));
7868 }
7869
7870 /* Function vectorizable_induction.
7871
7872 Check if STMT_INFO performs an induction computation that can be vectorized.
7873 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7874 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7875 Return true if STMT_INFO is vectorizable in this way. */
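/* As an illustration (a sketch, not code from this function), for an
   induction "i = init; ...; i += step" and VF == 4 the vectorized PHI is
   seeded on the preheader edge with

     vec_init = { init, init + step, init + 2*step, init + 3*step }

   and each vector copy is advanced on the backedge by

     vec_step = { 4*step, 4*step, 4*step, 4*step }.  */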
7876
7877 bool
7878 vectorizable_induction (loop_vec_info loop_vinfo,
7879 stmt_vec_info stmt_info,
7880 gimple **vec_stmt, slp_tree slp_node,
7881 stmt_vector_for_cost *cost_vec)
7882 {
7883 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7884 unsigned ncopies;
7885 bool nested_in_vect_loop = false;
7886 class loop *iv_loop;
7887 tree vec_def;
7888 edge pe = loop_preheader_edge (loop);
7889 basic_block new_bb;
7890 tree new_vec, vec_init, vec_step, t;
7891 tree new_name;
7892 gimple *new_stmt;
7893 gphi *induction_phi;
7894 tree induc_def, vec_dest;
7895 tree init_expr, step_expr;
7896 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7897 unsigned i;
7898 tree expr;
7899 gimple_stmt_iterator si;
7900
7901 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7902 if (!phi)
7903 return false;
7904
7905 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7906 return false;
7907
7908 /* Make sure it was recognized as induction computation. */
7909 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7910 return false;
7911
7912 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7913 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7914
7915 if (slp_node)
7916 ncopies = 1;
7917 else
7918 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7919 gcc_assert (ncopies >= 1);
7920
7921 /* FORNOW. These restrictions should be relaxed. */
7922 if (nested_in_vect_loop_p (loop, stmt_info))
7923 {
7924 imm_use_iterator imm_iter;
7925 use_operand_p use_p;
7926 gimple *exit_phi;
7927 edge latch_e;
7928 tree loop_arg;
7929
7930 if (ncopies > 1)
7931 {
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 "multiple types in nested loop.\n");
7935 return false;
7936 }
7937
7938 exit_phi = NULL;
7939 latch_e = loop_latch_edge (loop->inner);
7940 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7941 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7942 {
7943 gimple *use_stmt = USE_STMT (use_p);
7944 if (is_gimple_debug (use_stmt))
7945 continue;
7946
7947 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7948 {
7949 exit_phi = use_stmt;
7950 break;
7951 }
7952 }
7953 if (exit_phi)
7954 {
7955 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7956 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7957 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7958 {
7959 if (dump_enabled_p ())
7960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7961 "inner-loop induction only used outside "
7962 "of the outer vectorized loop.\n");
7963 return false;
7964 }
7965 }
7966
7967 nested_in_vect_loop = true;
7968 iv_loop = loop->inner;
7969 }
7970 else
7971 iv_loop = loop;
7972 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7973
7974 if (slp_node && !nunits.is_constant ())
7975 {
7976 /* The current SLP code creates the step value element-by-element. */
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "SLP induction not supported for variable-length"
7980 " vectors.\n");
7981 return false;
7982 }
7983
7984 if (!vec_stmt) /* transformation not required. */
7985 {
7986 unsigned inside_cost = 0, prologue_cost = 0;
7987 if (slp_node)
7988 {
7989 /* We eventually need to set a vector type on invariant
7990 arguments. */
7991 unsigned j;
7992 slp_tree child;
7993 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7994 if (!vect_maybe_update_slp_op_vectype
7995 (child, SLP_TREE_VECTYPE (slp_node)))
7996 {
7997 if (dump_enabled_p ())
7998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999 "incompatible vector types for "
8000 "invariants\n");
8001 return false;
8002 }
8003 /* loop cost for vec_loop. */
8004 inside_cost
8005 = record_stmt_cost (cost_vec,
8006 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8007 vector_stmt, stmt_info, 0, vect_body);
8008 /* prologue cost for vec_init (if not nested) and step. */
8009 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8010 scalar_to_vec,
8011 stmt_info, 0, vect_prologue);
8012 }
8013 else /* if (!slp_node) */
8014 {
8015 /* loop cost for vec_loop. */
8016 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8017 stmt_info, 0, vect_body);
8018 /* prologue cost for vec_init and vec_step. */
8019 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8020 stmt_info, 0, vect_prologue);
8021 }
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_NOTE, vect_location,
8024 "vect_model_induction_cost: inside_cost = %d, "
8025 "prologue_cost = %d .\n", inside_cost,
8026 prologue_cost);
8027
8028 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8029 DUMP_VECT_SCOPE ("vectorizable_induction");
8030 return true;
8031 }
8032
8033 /* Transform. */
8034
8035 /* Compute a vector variable, initialized with the first VF values of
8036 the induction variable. E.g., for an iv with IV_PHI='X' and
8037 evolution S, for a vector of 4 units, we want to compute:
8038 [X, X + S, X + 2*S, X + 3*S]. */
8039
8040 if (dump_enabled_p ())
8041 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8042
8043 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8044 gcc_assert (step_expr != NULL_TREE);
8045 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8046
8047 pe = loop_preheader_edge (iv_loop);
8048 /* Find the first insertion point in the BB. */
8049 basic_block bb = gimple_bb (phi);
8050 si = gsi_after_labels (bb);
8051
8052 /* For SLP induction we have to generate several IVs as for example
8053 with group size 3 we need
8054 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8055 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8056 if (slp_node)
8057 {
8058 /* Enforced above. */
8059 unsigned int const_nunits = nunits.to_constant ();
8060
8061 /* The initial values are vectorized, but any lanes > group_size
8062 need adjustment. */
8063 slp_tree init_node
8064 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8065
8066 /* Gather steps. Since we do not vectorize inductions as
8067 cycles we have to reconstruct the step from SCEV data. */
8068 unsigned group_size = SLP_TREE_LANES (slp_node);
8069 tree *steps = XALLOCAVEC (tree, group_size);
8070 tree *inits = XALLOCAVEC (tree, group_size);
8071 stmt_vec_info phi_info;
8072 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8073 {
8074 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8075 if (!init_node)
8076 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8077 pe->dest_idx);
8078 }
8079
8080 /* Now generate the IVs. */
8081 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8082 gcc_assert ((const_nunits * nvects) % group_size == 0);
8083 unsigned nivs;
8084 if (nested_in_vect_loop)
8085 nivs = nvects;
8086 else
8087 {
8088 /* Compute the number of distinct IVs we need. First reduce
8089 group_size if it is a multiple of const_nunits so we get
8090 one IV for a group_size of 4 but const_nunits 2. */
8091 unsigned group_sizep = group_size;
8092 if (group_sizep % const_nunits == 0)
8093 group_sizep = group_sizep / const_nunits;
8094 nivs = least_common_multiple (group_sizep,
8095 const_nunits) / const_nunits;
8096 }
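      /* Worked examples of the computation above (illustration only):
	 group_size 3 with const_nunits 4 gives nivs == 3, matching the
	 three vectors in the example before this block; group_size 8
	 with const_nunits 2 reduces group_sizep to 4 and gives nivs == 2.  */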
8097 tree stept = TREE_TYPE (step_vectype);
8098 tree lupdate_mul = NULL_TREE;
8099 if (!nested_in_vect_loop)
8100 {
8101 /* The number of iterations covered in one vector iteration. */
8102 unsigned lup_mul = (nvects * const_nunits) / group_size;
8103 lupdate_mul
8104 = build_vector_from_val (step_vectype,
8105 SCALAR_FLOAT_TYPE_P (stept)
8106 ? build_real_from_wide (stept, lup_mul,
8107 UNSIGNED)
8108 : build_int_cstu (stept, lup_mul));
8109 }
8110 tree peel_mul = NULL_TREE;
8111 gimple_seq init_stmts = NULL;
8112 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8113 {
8114 if (SCALAR_FLOAT_TYPE_P (stept))
8115 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8116 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8117 else
8118 peel_mul = gimple_convert (&init_stmts, stept,
8119 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8120 peel_mul = gimple_build_vector_from_val (&init_stmts,
8121 step_vectype, peel_mul);
8122 }
8123 unsigned ivn;
8124 auto_vec<tree> vec_steps;
8125 for (ivn = 0; ivn < nivs; ++ivn)
8126 {
8127 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8128 tree_vector_builder init_elts (vectype, const_nunits, 1);
8129 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8130 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8131 {
8132 /* The scalar steps of the IVs. */
8133 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8134 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8135 step_elts.quick_push (elt);
8136 if (!init_node)
8137 {
8138 /* The scalar inits of the IVs if not vectorized. */
8139 elt = inits[(ivn*const_nunits + eltn) % group_size];
8140 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8141 TREE_TYPE (elt)))
8142 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8143 TREE_TYPE (vectype), elt);
8144 init_elts.quick_push (elt);
8145 }
8146 /* The number of steps to add to the initial values. */
8147 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8148 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8149 ? build_real_from_wide (stept,
8150 mul_elt, UNSIGNED)
8151 : build_int_cstu (stept, mul_elt));
8152 }
8153 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8154 vec_steps.safe_push (vec_step);
8155 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8156 if (peel_mul)
8157 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8158 step_mul, peel_mul);
8159 if (!init_node)
8160 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8161
8162 /* Create the induction-phi that defines the induction-operand. */
8163 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8164 "vec_iv_");
8165 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8166 induc_def = PHI_RESULT (induction_phi);
8167
8168 /* Create the iv update inside the loop */
8169 tree up = vec_step;
8170 if (lupdate_mul)
8171 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8172 vec_step, lupdate_mul);
8173 gimple_seq stmts = NULL;
8174 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8175 vec_def = gimple_build (&stmts,
8176 PLUS_EXPR, step_vectype, vec_def, up);
8177 vec_def = gimple_convert (&stmts, vectype, vec_def);
8178 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8179 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8180 UNKNOWN_LOCATION);
8181
8182 if (init_node)
8183 vec_init = vect_get_slp_vect_def (init_node, ivn);
8184 if (!nested_in_vect_loop
8185 && !integer_zerop (step_mul))
8186 {
8187 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8188 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8189 vec_step, step_mul);
8190 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8191 vec_def, up);
8192 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8193 }
8194
8195 /* Set the arguments of the phi node: */
8196 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8197
8198 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8199 }
8200 if (!nested_in_vect_loop)
8201 {
8202 /* Fill up to the number of vectors we need for the whole group. */
8203 nivs = least_common_multiple (group_size,
8204 const_nunits) / const_nunits;
8205 vec_steps.reserve (nivs-ivn);
8206 for (; ivn < nivs; ++ivn)
8207 {
8208 SLP_TREE_VEC_STMTS (slp_node)
8209 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8210 vec_steps.quick_push (vec_steps[0]);
8211 }
8212 }
8213
8214 /* Re-use IVs when we can. We are generating further vector
8215 stmts by adding VF' * stride to the IVs generated above. */
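      /* For the group-size-3, four-lane example near the top of this function
	 this means (illustration only): nivs == 3 and VF' == 4, so vector
	 number 3 re-uses vector 0 with each lane advanced by four of its own
	 scalar steps, i.e. [i0 + 4*S0, i1 + 4*S1, i2 + 4*S2, i0 + 5*S0].  */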
8216 if (ivn < nvects)
8217 {
8218 unsigned vfp
8219 = least_common_multiple (group_size, const_nunits) / group_size;
8220 tree lupdate_mul
8221 = build_vector_from_val (step_vectype,
8222 SCALAR_FLOAT_TYPE_P (stept)
8223 ? build_real_from_wide (stept,
8224 vfp, UNSIGNED)
8225 : build_int_cstu (stept, vfp));
8226 for (; ivn < nvects; ++ivn)
8227 {
8228 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8229 tree def = gimple_get_lhs (iv);
8230 if (ivn < 2*nivs)
8231 vec_steps[ivn - nivs]
8232 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8233 vec_steps[ivn - nivs], lupdate_mul);
8234 gimple_seq stmts = NULL;
8235 def = gimple_convert (&stmts, step_vectype, def);
8236 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8237 def, vec_steps[ivn % nivs]);
8238 def = gimple_convert (&stmts, vectype, def);
8239 if (gimple_code (iv) == GIMPLE_PHI)
8240 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8241 else
8242 {
8243 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8244 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8245 }
8246 SLP_TREE_VEC_STMTS (slp_node)
8247 .quick_push (SSA_NAME_DEF_STMT (def));
8248 }
8249 }
8250
8251 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8252 gcc_assert (!new_bb);
8253
8254 return true;
8255 }
8256
8257 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8258 loop_preheader_edge (iv_loop));
8259
8260 gimple_seq stmts = NULL;
8261 if (!nested_in_vect_loop)
8262 {
8263 /* Convert the initial value to the IV update type. */
8264 tree new_type = TREE_TYPE (step_expr);
8265 init_expr = gimple_convert (&stmts, new_type, init_expr);
8266
8267 /* If we are using the loop mask to "peel" for alignment then we need
8268 to adjust the start value here. */
8269 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8270 if (skip_niters != NULL_TREE)
8271 {
8272 if (FLOAT_TYPE_P (vectype))
8273 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8274 skip_niters);
8275 else
8276 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8277 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8278 skip_niters, step_expr);
8279 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8280 init_expr, skip_step);
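	  /* Illustration only: with initial value X, step S and
	     MASK_SKIP_NITERS k the start value becomes X - k*S, so lane k
	     of the first, partially masked vector iteration still sees X.  */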
8281 }
8282 }
8283
8284 if (stmts)
8285 {
8286 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8287 gcc_assert (!new_bb);
8288 }
8289
8290 /* Create the vector that holds the initial_value of the induction. */
8291 if (nested_in_vect_loop)
8292 {
8293 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8294 been created during vectorization of previous stmts. We obtain it
8295 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8296 auto_vec<tree> vec_inits;
8297 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8298 init_expr, &vec_inits);
8299 vec_init = vec_inits[0];
8300 /* If the initial value is not of proper type, convert it. */
8301 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8302 {
8303 new_stmt
8304 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8305 vect_simple_var,
8306 "vec_iv_"),
8307 VIEW_CONVERT_EXPR,
8308 build1 (VIEW_CONVERT_EXPR, vectype,
8309 vec_init));
8310 vec_init = gimple_assign_lhs (new_stmt);
8311 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8312 new_stmt);
8313 gcc_assert (!new_bb);
8314 }
8315 }
8316 else
8317 {
8318 /* iv_loop is the loop to be vectorized. Create:
8319 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8320 stmts = NULL;
8321 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8322
8323 unsigned HOST_WIDE_INT const_nunits;
8324 if (nunits.is_constant (&const_nunits))
8325 {
8326 tree_vector_builder elts (step_vectype, const_nunits, 1);
8327 elts.quick_push (new_name);
8328 for (i = 1; i < const_nunits; i++)
8329 {
8330 /* Create: new_name_i = new_name + step_expr */
8331 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8332 new_name, step_expr);
8333 elts.quick_push (new_name);
8334 }
8335 /* Create a vector from [new_name_0, new_name_1, ...,
8336 new_name_nunits-1] */
8337 vec_init = gimple_build_vector (&stmts, &elts);
8338 }
8339 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8340 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8341 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8342 new_name, step_expr);
8343 else
8344 {
8345 /* Build:
8346 [base, base, base, ...]
8347 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8348 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8349 gcc_assert (flag_associative_math);
8350 tree index = build_index_vector (step_vectype, 0, 1);
8351 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8352 new_name);
8353 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8354 step_expr);
8355 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8356 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8357 vec_init, step_vec);
8358 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8359 vec_init, base_vec);
8360 }
8361 vec_init = gimple_convert (&stmts, vectype, vec_init);
8362
8363 if (stmts)
8364 {
8365 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8366 gcc_assert (!new_bb);
8367 }
8368 }
8369
8370
8371 /* Create the vector that holds the step of the induction. */
8372 if (nested_in_vect_loop)
8373 /* iv_loop is nested in the loop to be vectorized. Generate:
8374 vec_step = [S, S, S, S] */
8375 new_name = step_expr;
8376 else
8377 {
8378 /* iv_loop is the loop to be vectorized. Generate:
8379 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8380 gimple_seq seq = NULL;
8381 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8382 {
8383 expr = build_int_cst (integer_type_node, vf);
8384 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8385 }
8386 else
8387 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8388 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8389 expr, step_expr);
8390 if (seq)
8391 {
8392 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8393 gcc_assert (!new_bb);
8394 }
8395 }
8396
8397 t = unshare_expr (new_name);
8398 gcc_assert (CONSTANT_CLASS_P (new_name)
8399 || TREE_CODE (new_name) == SSA_NAME);
8400 new_vec = build_vector_from_val (step_vectype, t);
8401 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8402 new_vec, step_vectype, NULL);
8403
8404
8405 /* Create the following def-use cycle:
8406 loop prolog:
8407 vec_init = ...
8408 vec_step = ...
8409 loop:
8410 vec_iv = PHI <vec_init, vec_loop>
8411 ...
8412 STMT
8413 ...
8414 vec_loop = vec_iv + vec_step; */
8415
8416 /* Create the induction-phi that defines the induction-operand. */
8417 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8418 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8419 induc_def = PHI_RESULT (induction_phi);
8420
8421 /* Create the iv update inside the loop */
8422 stmts = NULL;
8423 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8424 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8425 vec_def = gimple_convert (&stmts, vectype, vec_def);
8426 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8427 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8428
8429 /* Set the arguments of the phi node: */
8430 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8431 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8432 UNKNOWN_LOCATION);
8433
8434 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8435 *vec_stmt = induction_phi;
8436
8437 /* In case that vectorization factor (VF) is bigger than the number
8438 of elements that we can fit in a vectype (nunits), we have to generate
8439 more than one vector stmt - i.e. we need to "unroll" the
8440 vector stmt by a factor VF/nunits. For more details see documentation
8441 in vectorizable_operation. */
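  /* For example (illustration only): with VF == 8 and nunits == 4 we get
     ncopies == 2; the loop PHI above advances by [8*S, 8*S, 8*S, 8*S] per
     iteration and the single extra copy created below is the PHI result
     plus [4*S, 4*S, 4*S, 4*S].  */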
8442
8443 if (ncopies > 1)
8444 {
8445 gimple_seq seq = NULL;
8446 /* FORNOW. This restriction should be relaxed. */
8447 gcc_assert (!nested_in_vect_loop);
8448
8449 /* Create the vector that holds the step of the induction. */
8450 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8451 {
8452 expr = build_int_cst (integer_type_node, nunits);
8453 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8454 }
8455 else
8456 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8457 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8458 expr, step_expr);
8459 if (seq)
8460 {
8461 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8462 gcc_assert (!new_bb);
8463 }
8464
8465 t = unshare_expr (new_name);
8466 gcc_assert (CONSTANT_CLASS_P (new_name)
8467 || TREE_CODE (new_name) == SSA_NAME);
8468 new_vec = build_vector_from_val (step_vectype, t);
8469 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8470 new_vec, step_vectype, NULL);
8471
8472 vec_def = induc_def;
8473 for (i = 1; i < ncopies; i++)
8474 {
8475 /* vec_i = vec_prev + vec_step */
8476 gimple_seq stmts = NULL;
8477 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8478 vec_def = gimple_build (&stmts,
8479 PLUS_EXPR, step_vectype, vec_def, vec_step);
8480 vec_def = gimple_convert (&stmts, vectype, vec_def);
8481
8482 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8483 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8484 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8485 }
8486 }
8487
8488 if (dump_enabled_p ())
8489 dump_printf_loc (MSG_NOTE, vect_location,
8490 "transform induction: created def-use cycle: %G%G",
8491 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8492
8493 return true;
8494 }
8495
8496 /* Function vectorizable_live_operation.
8497
8498 STMT_INFO computes a value that is used outside the loop. Check if
8499 it can be supported. */
8500
8501 bool
8502 vectorizable_live_operation (vec_info *vinfo,
8503 stmt_vec_info stmt_info,
8504 gimple_stmt_iterator *gsi,
8505 slp_tree slp_node, slp_instance slp_node_instance,
8506 int slp_index, bool vec_stmt_p,
8507 stmt_vector_for_cost *cost_vec)
8508 {
8509 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8510 imm_use_iterator imm_iter;
8511 tree lhs, lhs_type, bitsize;
8512 tree vectype = (slp_node
8513 ? SLP_TREE_VECTYPE (slp_node)
8514 : STMT_VINFO_VECTYPE (stmt_info));
8515 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8516 int ncopies;
8517 gimple *use_stmt;
8518 auto_vec<tree> vec_oprnds;
8519 int vec_entry = 0;
8520 poly_uint64 vec_index = 0;
8521
8522 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8523
8524 /* If a stmt of a reduction is live, vectorize it via
8525 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8526 validity so just trigger the transform here. */
8527 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8528 {
8529 if (!vec_stmt_p)
8530 return true;
8531 if (slp_node)
8532 {
8533 /* For reduction chains the meta-info is attached to
8534 the group leader. */
8535 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8536 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8537 /* For SLP reductions we vectorize the epilogue for
8538 all involved stmts together. */
8539 else if (slp_index != 0)
8540 return true;
8541 else
8542 /* For SLP reductions the meta-info is attached to
8543 the representative. */
8544 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8545 }
8546 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8547 gcc_assert (reduc_info->is_reduc_info);
8548 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8549 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8550 return true;
8551 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8552 slp_node_instance);
8553 return true;
8554 }
8555
8556 /* If STMT is not relevant and it is a simple assignment and its inputs are
8557 invariant then it can remain in place, unvectorized. The original last
8558 scalar value that it computes will be used. */
8559 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8560 {
8561 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8562 if (dump_enabled_p ())
8563 dump_printf_loc (MSG_NOTE, vect_location,
8564 "statement is simple and uses invariant. Leaving in "
8565 "place.\n");
8566 return true;
8567 }
8568
8569 if (slp_node)
8570 ncopies = 1;
8571 else
8572 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8573
8574 if (slp_node)
8575 {
8576 gcc_assert (slp_index >= 0);
8577
8578 /* Get the last occurrence of the scalar index from the concatenation of
8579 all the slp vectors. Calculate which slp vector it is and the index
8580 within. */
8581 int num_scalar = SLP_TREE_LANES (slp_node);
8582 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8583 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
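      /* For instance (illustration only): 2 scalar lanes and 2 vector stmts
	 of 4 lanes each put the last copy of lane 1 at position
	 2*4 - 2 + 1 == 7, i.e. lane 3 of the second vector.  */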
8584
8585 /* Calculate which vector contains the result, and which lane of
8586 that vector we need. */
8587 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8588 {
8589 if (dump_enabled_p ())
8590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8591 "Cannot determine which vector holds the"
8592 " final result.\n");
8593 return false;
8594 }
8595 }
8596
8597 if (!vec_stmt_p)
8598 {
8599 /* No transformation required. */
8600 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8601 {
8602 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8603 OPTIMIZE_FOR_SPEED))
8604 {
8605 if (dump_enabled_p ())
8606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8607 "can't operate on partial vectors "
8608 "because the target doesn't support extract "
8609 "last reduction.\n");
8610 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8611 }
8612 else if (slp_node)
8613 {
8614 if (dump_enabled_p ())
8615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8616 "can't operate on partial vectors "
8617 "because an SLP statement is live after "
8618 "the loop.\n");
8619 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8620 }
8621 else if (ncopies > 1)
8622 {
8623 if (dump_enabled_p ())
8624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8625 "can't operate on partial vectors "
8626 "because ncopies is greater than 1.\n");
8627 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8628 }
8629 else
8630 {
8631 gcc_assert (ncopies == 1 && !slp_node);
8632 vect_record_loop_mask (loop_vinfo,
8633 &LOOP_VINFO_MASKS (loop_vinfo),
8634 1, vectype, NULL);
8635 }
8636 }
8637 /* ??? Enable for loop costing as well. */
8638 if (!loop_vinfo)
8639 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8640 0, vect_epilogue);
8641 return true;
8642 }
8643
8644 /* Use the lhs of the original scalar statement. */
8645 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8646 if (dump_enabled_p ())
8647 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8648 "stmt %G", stmt);
8649
8650 lhs = gimple_get_lhs (stmt);
8651 lhs_type = TREE_TYPE (lhs);
8652
8653 bitsize = vector_element_bits_tree (vectype);
8654
8655 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8656 tree vec_lhs, bitstart;
8657 gimple *vec_stmt;
8658 if (slp_node)
8659 {
8660 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8661
8662 /* Get the correct slp vectorized stmt. */
8663 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8664 vec_lhs = gimple_get_lhs (vec_stmt);
8665
8666 /* Get entry to use. */
8667 bitstart = bitsize_int (vec_index);
8668 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8669 }
8670 else
8671 {
8672 /* For multiple copies, get the last copy. */
8673 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8674 vec_lhs = gimple_get_lhs (vec_stmt);
8675
8676 /* Get the last lane in the vector. */
8677 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8678 }
8679
8680 if (loop_vinfo)
8681 {
8682 /* To ensure the VEC_LHS for lane extraction stmts satisfies the
8683 loop-closed PHI requirement, insert one phi node for it. It looks like:
8684 loop;
8685 BB:
8686 # lhs' = PHI <lhs>
8687 ==>
8688 loop;
8689 BB:
8690 # vec_lhs' = PHI <vec_lhs>
8691 new_tree = lane_extract <vec_lhs', ...>;
8692 lhs' = new_tree; */
8693
8694 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8695 basic_block exit_bb = single_exit (loop)->dest;
8696 gcc_assert (single_pred_p (exit_bb));
8697
8698 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8699 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8700 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8701
8702 gimple_seq stmts = NULL;
8703 tree new_tree;
8704 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8705 {
8706 /* Emit:
8707
8708 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8709
8710 where VEC_LHS is the vectorized live-out result and MASK is
8711 the loop mask for the final iteration. */
8712 gcc_assert (ncopies == 1 && !slp_node);
8713 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8714 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8715 1, vectype, 0);
8716 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8717 mask, vec_lhs_phi);
8718
8719 /* Convert the extracted vector element to the scalar type. */
8720 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8721 }
8722 else
8723 {
8724 tree bftype = TREE_TYPE (vectype);
8725 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8726 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8727 new_tree = build3 (BIT_FIELD_REF, bftype,
8728 vec_lhs_phi, bitsize, bitstart);
8729 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8730 &stmts, true, NULL_TREE);
8731 }
8732
8733 if (stmts)
8734 {
8735 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8736 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8737
8738 /* Remove existing phi from lhs and create one copy from new_tree. */
8739 tree lhs_phi = NULL_TREE;
8740 gimple_stmt_iterator gsi;
8741 for (gsi = gsi_start_phis (exit_bb);
8742 !gsi_end_p (gsi); gsi_next (&gsi))
8743 {
8744 gimple *phi = gsi_stmt (gsi);
8745 if ((gimple_phi_arg_def (phi, 0) == lhs))
8746 {
8747 remove_phi_node (&gsi, false);
8748 lhs_phi = gimple_phi_result (phi);
8749 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8750 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8751 break;
8752 }
8753 }
8754 }
8755
8756 /* Replace use of lhs with newly computed result. If the use stmt is a
8757 single arg PHI, just replace all uses of PHI result. It's necessary
8758 because lcssa PHI defining lhs may be before newly inserted stmt. */
8759 use_operand_p use_p;
8760 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8761 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8762 && !is_gimple_debug (use_stmt))
8763 {
8764 if (gimple_code (use_stmt) == GIMPLE_PHI
8765 && gimple_phi_num_args (use_stmt) == 1)
8766 {
8767 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8768 }
8769 else
8770 {
8771 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8772 SET_USE (use_p, new_tree);
8773 }
8774 update_stmt (use_stmt);
8775 }
8776 }
8777 else
8778 {
8779 /* For basic-block vectorization simply insert the lane-extraction. */
8780 tree bftype = TREE_TYPE (vectype);
8781 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8782 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8783 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8784 vec_lhs, bitsize, bitstart);
8785 gimple_seq stmts = NULL;
8786 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8787 &stmts, true, NULL_TREE);
8788 if (TREE_CODE (new_tree) == SSA_NAME
8789 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8790 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8791 if (is_a <gphi *> (vec_stmt))
8792 {
8793 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8794 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8795 }
8796 else
8797 {
8798 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8799 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8800 }
8801
8802 /* Replace use of lhs with newly computed result. If the use stmt is a
8803 single arg PHI, just replace all uses of PHI result. It's necessary
8804 because lcssa PHI defining lhs may be before newly inserted stmt. */
8805 use_operand_p use_p;
8806 stmt_vec_info use_stmt_info;
8807 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8808 if (!is_gimple_debug (use_stmt)
8809 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8810 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8811 {
8812 /* ??? This can happen when the live lane ends up being
8813 used in a vector construction code-generated by an
8814 external SLP node (and code-generation for that already
8815 happened). See gcc.dg/vect/bb-slp-47.c.
8816 Doing this is what would happen if that vector CTOR
8817 were not code-generated yet so it is not too bad.
8818 ??? In fact we'd likely want to avoid this situation
8819 in the first place. */
8820 if (TREE_CODE (new_tree) == SSA_NAME
8821 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8822 && gimple_code (use_stmt) != GIMPLE_PHI
8823 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8824 use_stmt))
8825 {
8826 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8827 gcc_assert (code == CONSTRUCTOR
8828 || code == VIEW_CONVERT_EXPR
8829 || CONVERT_EXPR_CODE_P (code));
8830 if (dump_enabled_p ())
8831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8832 "Using original scalar computation for "
8833 					     "live lane because use precedes vector "
8834 "def\n");
8835 continue;
8836 }
8837 /* ??? It can also happen that we end up pulling a def into
8838 a loop where replacing out-of-loop uses would require
8839 a new LC SSA PHI node. Retain the original scalar in
8840 those cases as well. PR98064. */
8841 if (TREE_CODE (new_tree) == SSA_NAME
8842 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8843 && (gimple_bb (use_stmt)->loop_father
8844 != gimple_bb (vec_stmt)->loop_father)
8845 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8846 gimple_bb (use_stmt)->loop_father))
8847 {
8848 if (dump_enabled_p ())
8849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8850 "Using original scalar computation for "
8851 "live lane because there is an out-of-loop "
8852 "definition for it\n");
8853 continue;
8854 }
8855 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8856 SET_USE (use_p, new_tree);
8857 update_stmt (use_stmt);
8858 }
8859 }
8860
8861 return true;
8862 }
8863
8864 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8865
8866 static void
8867 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8868 {
8869 ssa_op_iter op_iter;
8870 imm_use_iterator imm_iter;
8871 def_operand_p def_p;
8872 gimple *ustmt;
8873
8874 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8875 {
8876 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8877 {
8878 basic_block bb;
8879
8880 if (!is_gimple_debug (ustmt))
8881 continue;
8882
8883 bb = gimple_bb (ustmt);
8884
8885 if (!flow_bb_inside_loop_p (loop, bb))
8886 {
8887 if (gimple_debug_bind_p (ustmt))
8888 {
8889 if (dump_enabled_p ())
8890 dump_printf_loc (MSG_NOTE, vect_location,
8891 "killing debug use\n");
8892
8893 gimple_debug_bind_reset_value (ustmt);
8894 update_stmt (ustmt);
8895 }
8896 else
8897 gcc_unreachable ();
8898 }
8899 }
8900 }
8901 }
8902
8903 /* Given loop represented by LOOP_VINFO, return true if computation of
8904 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8905 otherwise. */
8906
8907 static bool
8908 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8909 {
8910 /* Constant case. */
8911 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8912 {
8913 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8914 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8915
8916 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8917 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8918 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8919 return true;
8920 }
8921
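  /* Otherwise NITERS wrapped (e.g. NITERSM1 was the maximum value of its
     type) or the iteration count is not a compile-time constant; consult
     the loop's recorded upper bound instead.  */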
8922 widest_int max;
8923 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8924 /* Check the upper bound of loop niters. */
8925 if (get_max_loop_iterations (loop, &max))
8926 {
8927 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8928 signop sgn = TYPE_SIGN (type);
8929 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8930 if (max < type_max)
8931 return true;
8932 }
8933 return false;
8934 }
8935
8936 /* Return a mask type with half the number of elements as OLD_TYPE,
8937 given that it should have mode NEW_MODE. */
8938
8939 tree
8940 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8941 {
8942 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8943 return build_truth_vector_type_for_mode (nunits, new_mode);
8944 }
8945
8946 /* Return a mask type with twice as many elements as OLD_TYPE,
8947 given that it should have mode NEW_MODE. */
8948
8949 tree
8950 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8951 {
8952 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8953 return build_truth_vector_type_for_mode (nunits, new_mode);
8954 }
8955
8956 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8957 contain a sequence of NVECTORS masks that each control a vector of type
8958 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8959 these vector masks with the vector version of SCALAR_MASK. */
8960
8961 void
8962 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8963 unsigned int nvectors, tree vectype, tree scalar_mask)
8964 {
8965 gcc_assert (nvectors != 0);
8966 if (masks->length () < nvectors)
8967 masks->safe_grow_cleared (nvectors, true);
8968 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8969 /* The number of scalars per iteration and the number of vectors are
8970 both compile-time constants. */
8971 unsigned int nscalars_per_iter
8972 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8973 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
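  /* For instance (illustration only): two masks controlling V16QI vectors
     with a vectorization factor of 16 give
     nscalars_per_iter == 2 * 16 / 16 == 2.  */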
8974
8975 if (scalar_mask)
8976 {
8977 scalar_cond_masked_key cond (scalar_mask, nvectors);
8978 loop_vinfo->scalar_cond_masked_set.add (cond);
8979 }
8980
8981 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8982 {
8983 rgm->max_nscalars_per_iter = nscalars_per_iter;
8984 rgm->type = truth_type_for (vectype);
8985 rgm->factor = 1;
8986 }
8987 }
8988
8989 /* Given a complete set of masks MASKS, extract mask number INDEX
8990 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8991 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8992
8993 See the comment above vec_loop_masks for more details about the mask
8994 arrangement. */
8995
8996 tree
8997 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8998 unsigned int nvectors, tree vectype, unsigned int index)
8999 {
9000 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9001 tree mask_type = rgm->type;
9002
9003 /* Populate the rgroup's mask array, if this is the first time we've
9004 used it. */
9005 if (rgm->controls.is_empty ())
9006 {
9007 rgm->controls.safe_grow_cleared (nvectors, true);
9008 for (unsigned int i = 0; i < nvectors; ++i)
9009 {
9010 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9011 /* Provide a dummy definition until the real one is available. */
9012 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9013 rgm->controls[i] = mask;
9014 }
9015 }
9016
9017 tree mask = rgm->controls[index];
9018 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9019 TYPE_VECTOR_SUBPARTS (vectype)))
9020 {
9021 /* A loop mask for data type X can be reused for data type Y
9022 if X has N times more elements than Y and if Y's elements
9023 are N times bigger than X's. In this case each sequence
9024 of N elements in the loop mask will be all-zero or all-one.
9025 We can then view-convert the mask so that each sequence of
9026 N elements is replaced by a single element. */
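      /* For instance (illustration only): a mask computed for 16 QImode
	 elements can be reused for 8 HImode elements; each pair of mask
	 elements is all-zero or all-one and view-converts to a single
	 element of the HImode mask type.  */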
9027 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9028 TYPE_VECTOR_SUBPARTS (vectype)));
9029 gimple_seq seq = NULL;
9030 mask_type = truth_type_for (vectype);
9031 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9032 if (seq)
9033 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9034 }
9035 return mask;
9036 }
9037
9038 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9039 lengths for controlling an operation on VECTYPE. The operation splits
9040 each element of VECTYPE into FACTOR separate subelements, measuring the
9041 length as a number of these subelements. */
9042
9043 void
9044 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9045 unsigned int nvectors, tree vectype, unsigned int factor)
9046 {
9047 gcc_assert (nvectors != 0);
9048 if (lens->length () < nvectors)
9049 lens->safe_grow_cleared (nvectors, true);
9050 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9051
9052 /* The number of scalars per iteration, scalar occupied bytes and
9053 the number of vectors are all compile-time constants. */
9054 unsigned int nscalars_per_iter
9055 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9056 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9057
9058 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9059 {
9060 /* For now, we only support cases in which all loads and stores fall back
9061 to VnQI or none do. */
9062 gcc_assert (!rgl->max_nscalars_per_iter
9063 || (rgl->factor == 1 && factor == 1)
9064 || (rgl->max_nscalars_per_iter * rgl->factor
9065 == nscalars_per_iter * factor));
9066 rgl->max_nscalars_per_iter = nscalars_per_iter;
9067 rgl->type = vectype;
9068 rgl->factor = factor;
9069 }
9070 }
9071
9072 /* Given a complete set of length LENS, extract length number INDEX for an
9073 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9074
9075 tree
9076 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9077 unsigned int nvectors, unsigned int index)
9078 {
9079 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9080
9081 /* Populate the rgroup's len array, if this is the first time we've
9082 used it. */
9083 if (rgl->controls.is_empty ())
9084 {
9085 rgl->controls.safe_grow_cleared (nvectors, true);
9086 for (unsigned int i = 0; i < nvectors; ++i)
9087 {
9088 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9089 gcc_assert (len_type != NULL_TREE);
9090 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9091
9092 /* Provide a dummy definition until the real one is available. */
9093 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9094 rgl->controls[i] = len;
9095 }
9096 }
9097
9098 return rgl->controls[index];
9099 }
9100
9101 /* Scale profiling counters by estimation for LOOP which is vectorized
9102 by factor VF. */
9103
9104 static void
9105 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9106 {
9107 edge preheader = loop_preheader_edge (loop);
9108 /* Reduce loop iterations by the vectorization factor. */
9109 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9110 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9111
9112 if (freq_h.nonzero_p ())
9113 {
9114 profile_probability p;
9115
9116 /* Avoid dropping loop body profile counter to 0 because of zero count
9117 in loop's preheader. */
9118 if (!(freq_e == profile_count::zero ()))
9119 freq_e = freq_e.force_nonzero ();
9120 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9121 scale_loop_frequencies (loop, p);
9122 }
9123
9124 edge exit_e = single_exit (loop);
9125 exit_e->probability = profile_probability::always ()
9126 .apply_scale (1, new_est_niter + 1);
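  /* Illustration only: a new_est_niter of 3 gives the exit edge a
     probability of 1/4.  */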
9127
9128 edge exit_l = single_pred_edge (loop->latch);
9129 profile_probability prob = exit_l->probability;
9130 exit_l->probability = exit_e->probability.invert ();
9131 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9132 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9133 }
9134
9135 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9136 latch edge values originally defined by it. */
9137
9138 static void
9139 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9140 stmt_vec_info def_stmt_info)
9141 {
9142 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9143 if (!def || TREE_CODE (def) != SSA_NAME)
9144 return;
9145 stmt_vec_info phi_info;
9146 imm_use_iterator iter;
9147 use_operand_p use_p;
9148 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9149 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9150 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9151 && (phi_info = loop_vinfo->lookup_stmt (phi))
9152 && STMT_VINFO_RELEVANT_P (phi_info)
9153 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9154 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9155 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9156 {
9157 loop_p loop = gimple_bb (phi)->loop_father;
9158 edge e = loop_latch_edge (loop);
9159 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9160 {
9161 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9162 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9163 gcc_assert (phi_defs.length () == latch_defs.length ());
9164 for (unsigned i = 0; i < phi_defs.length (); ++i)
9165 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9166 gimple_get_lhs (latch_defs[i]), e,
9167 gimple_phi_arg_location (phi, e->dest_idx));
9168 }
9169 }
9170 }
9171
9172 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9173 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9174 stmt_vec_info. */
9175
9176 static bool
9177 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9178 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9179 {
9180 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9181 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9182
9183 if (dump_enabled_p ())
9184 dump_printf_loc (MSG_NOTE, vect_location,
9185 "------>vectorizing statement: %G", stmt_info->stmt);
9186
9187 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9188 vect_loop_kill_debug_uses (loop, stmt_info);
9189
9190 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9191 && !STMT_VINFO_LIVE_P (stmt_info))
9192 return false;
9193
9194 if (STMT_VINFO_VECTYPE (stmt_info))
9195 {
9196 poly_uint64 nunits
9197 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9198 if (!STMT_SLP_TYPE (stmt_info)
9199 && maybe_ne (nunits, vf)
9200 && dump_enabled_p ())
9201 /* For SLP VF is set according to unrolling factor, and not
9202 to vector size, hence for SLP this print is not valid. */
9203 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9204 }
9205
9206 /* Pure SLP statements have already been vectorized. We still need
9207 to apply loop vectorization to hybrid SLP statements. */
9208 if (PURE_SLP_STMT (stmt_info))
9209 return false;
9210
9211 if (dump_enabled_p ())
9212 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9213
9214 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9215 *seen_store = stmt_info;
9216
9217 return true;
9218 }
9219
9220 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9221 in the hash_map with their corresponding values. */
9222
9223 static tree
9224 find_in_mapping (tree t, void *context)
9225 {
9226 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9227
9228 tree *value = mapping->get (t);
9229 return value ? *value : t;
9230 }
9231
9232 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9233 original loop that has now been vectorized.
9234
9235 The inits of the data_references need to be advanced with the number of
9236 iterations of the main loop. This has been computed in vect_do_peeling and
9237 is stored in parameter ADVANCE. We first restore the data_references
9238 initial offset with the values recorded in ORIG_DRS_INIT.
9239
9240 Since the loop_vec_info of this EPILOGUE was constructed for the original
9241 loop, its stmt_vec_infos all point to the original statements. These need
9242 to be updated to point to their corresponding copies as well as the SSA_NAMES
9243 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9244
9245 The data_reference's connections also need to be updated. Their
9246 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9247 stmt_vec_infos, their statements need to point to their corresponding copy,
9248 if they are gather loads or scatter stores then their reference needs to be
9249 updated to point to its corresponding copy and finally we set
9250 'base_misaligned' to false as we have already peeled for alignment in the
9251 prologue of the main loop. */
9252
9253 static void
9254 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9255 {
9256 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9257 auto_vec<gimple *> stmt_worklist;
9258 hash_map<tree,tree> mapping;
9259 gimple *orig_stmt, *new_stmt;
9260 gimple_stmt_iterator epilogue_gsi;
9261 gphi_iterator epilogue_phi_gsi;
9262 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9263 basic_block *epilogue_bbs = get_loop_body (epilogue);
9264 unsigned i;
9265
9266 free (LOOP_VINFO_BBS (epilogue_vinfo));
9267 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9268
9269 /* Advance the data_references with the number of iterations of the previous
9270 loop and its prologue. */
9271 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9272
9273
9274 /* The EPILOGUE loop is a copy of the original loop so they share the same
9275 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9276 point to the copied statements. We also create a mapping of all LHS' in
9277 the original loop and all the LHS' in the EPILOGUE and create worklists to
9278 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9279 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9280 {
9281 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9282 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9283 {
9284 new_stmt = epilogue_phi_gsi.phi ();
9285
9286 gcc_assert (gimple_uid (new_stmt) > 0);
9287 stmt_vinfo
9288 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9289
9290 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9291 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9292
9293 mapping.put (gimple_phi_result (orig_stmt),
9294 gimple_phi_result (new_stmt));
9295 /* PHI nodes can not have patterns or related statements. */
9296 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9297 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9298 }
9299
9300 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9301 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9302 {
9303 new_stmt = gsi_stmt (epilogue_gsi);
9304 if (is_gimple_debug (new_stmt))
9305 continue;
9306
9307 gcc_assert (gimple_uid (new_stmt) > 0);
9308 stmt_vinfo
9309 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9310
9311 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9312 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9313
9314 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9315 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9316
9317 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9318 {
9319 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9320 for (gimple_stmt_iterator gsi = gsi_start (seq);
9321 !gsi_end_p (gsi); gsi_next (&gsi))
9322 stmt_worklist.safe_push (gsi_stmt (gsi));
9323 }
9324
9325 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9326 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9327 {
9328 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9329 stmt_worklist.safe_push (stmt);
9330 /* Set BB such that the assert in
9331 'get_initial_def_for_reduction' is able to determine that
9332 the BB of the related stmt is inside this loop. */
9333 gimple_set_bb (stmt,
9334 gimple_bb (new_stmt));
9335 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9336 gcc_assert (related_vinfo == NULL
9337 || related_vinfo == stmt_vinfo);
9338 }
9339 }
9340 }
9341
9342 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9343 using the original main loop and thus need to be updated to refer to the
9344 cloned variables used in the epilogue. */
9345 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9346 {
9347 gimple *stmt = stmt_worklist[i];
9348 tree *new_op;
9349
9350 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9351 {
9352 tree op = gimple_op (stmt, j);
9353 if ((new_op = mapping.get(op)))
9354 gimple_set_op (stmt, j, *new_op);
9355 else
9356 {
9357 /* PR92429: The last argument of simplify_replace_tree disables
9358 folding when replacing arguments. This is required as
9359 otherwise you might end up with different statements than the
9360 ones analyzed in vect_analyze_loop, leading to different
9361 vectorization. */
9362 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9363 &find_in_mapping, &mapping, false);
9364 gimple_set_op (stmt, j, op);
9365 }
9366 }
9367 }
9368
9369 struct data_reference *dr;
9370 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9371 FOR_EACH_VEC_ELT (datarefs, i, dr)
9372 {
9373 orig_stmt = DR_STMT (dr);
9374 gcc_assert (gimple_uid (orig_stmt) > 0);
9375 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9376 /* Data references for gather loads and scatter stores do not use the
9377 updated offset we set using ADVANCE. Instead we have to make sure the
9378 reference in the data references points to the corresponding copy of
9379 the original in the epilogue. */
9380 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9381 == VMAT_GATHER_SCATTER)
9382 {
9383 DR_REF (dr)
9384 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9385 &find_in_mapping, &mapping);
9386 DR_BASE_ADDRESS (dr)
9387 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9388 &find_in_mapping, &mapping);
9389 }
9390 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9391 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9392 /* The vector size of the epilogue is smaller than that of the main loop
9393 so the alignment is either the same or lower. This means the dr will
9394 thus by definition be aligned. */
9395 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9396 }
9397
9398 epilogue_vinfo->shared->datarefs_copy.release ();
9399 epilogue_vinfo->shared->save_datarefs ();
9400 }
9401
9402 /* Function vect_transform_loop.
9403
9404 The analysis phase has determined that the loop is vectorizable.
9405 Vectorize the loop - create vectorized stmts to replace the scalar
9406 stmts in the loop, and update the loop exit condition.
9407 Returns scalar epilogue loop if any. */
9408
9409 class loop *
9410 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9411 {
9412 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9413 class loop *epilogue = NULL;
9414 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9415 int nbbs = loop->num_nodes;
9416 int i;
9417 tree niters_vector = NULL_TREE;
9418 tree step_vector = NULL_TREE;
9419 tree niters_vector_mult_vf = NULL_TREE;
9420 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9421 unsigned int lowest_vf = constant_lower_bound (vf);
9422 gimple *stmt;
9423 bool check_profitability = false;
9424 unsigned int th;
9425
9426 DUMP_VECT_SCOPE ("vec_transform_loop");
9427
9428 loop_vinfo->shared->check_datarefs ();
9429
9430 /* Use the more conservative vectorization threshold. If the number
9431 of iterations is constant assume the cost check has been performed
9432 by our caller. If the threshold makes all loops profitable that
9433 run at least the (estimated) vectorization factor number of times
9434 checking is pointless, too. */
9435 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9436 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9437 {
9438 if (dump_enabled_p ())
9439 dump_printf_loc (MSG_NOTE, vect_location,
9440 "Profitability threshold is %d loop iterations.\n",
9441 th);
9442 check_profitability = true;
9443 }
9444
9445 /* Make sure there exists a single-predecessor exit bb. Do this before
9446 versioning. */
9447 edge e = single_exit (loop);
9448 if (! single_pred_p (e->dest))
9449 {
9450 split_loop_exit_edge (e, true);
9451 if (dump_enabled_p ())
9452 dump_printf (MSG_NOTE, "split exit edge\n");
9453 }
9454
9455 /* Version the loop first, if required, so the profitability check
9456 comes first. */
9457
9458 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9459 {
9460 class loop *sloop
9461 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9462 sloop->force_vectorize = false;
9463 check_profitability = false;
9464 }
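/* A sketch of the effect of versioning, with hypothetical condition names,
   assuming runtime alias or alignment checks were required:

     if (<alias/alignment checks> && niters >= <cost-model threshold>)
       <loop to be vectorized>
     else
       <scalar copy of the loop>

   Since the cost-model threshold can be folded into the versioning
   condition, the separate runtime profitability check is dropped above.  */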
9465
9466 /* Make sure there exists a single-predecessor exit bb also on the
9467 scalar loop copy. Do this after versioning but before peeling
9468 so the CFG structure is fine for both the scalar and the if-converted
9469 loop, and slpeel_duplicate_current_defs_from_edges sees matched
9470 loop-closed PHI nodes on the exit. */
9471 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9472 {
9473 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9474 if (! single_pred_p (e->dest))
9475 {
9476 split_loop_exit_edge (e, true);
9477 if (dump_enabled_p ())
9478 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9479 }
9480 }
9481
9482 tree niters = vect_build_loop_niters (loop_vinfo);
9483 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9484 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9485 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9486 tree advance;
9487 drs_init_vec orig_drs_init;
9488
9489 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9490 &step_vector, &niters_vector_mult_vf, th,
9491 check_profitability, niters_no_overflow,
9492 &advance);
9493
9494 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9495 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9496 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9497 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9498
9499 if (niters_vector == NULL_TREE)
9500 {
9501 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9502 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9503 && known_eq (lowest_vf, vf))
9504 {
9505 niters_vector
9506 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9507 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9508 step_vector = build_one_cst (TREE_TYPE (niters));
9509 }
9510 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9511 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9512 &step_vector, niters_no_overflow);
9513 else
9514 /* vect_do_peeling subtracted the number of peeled prologue
9515 iterations from LOOP_VINFO_NITERS. */
9516 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9517 &niters_vector, &step_vector,
9518 niters_no_overflow);
9519 }
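/* For example (hypothetical numbers): a loop with 17 known iterations
   vectorized with VF 4, not using partial vectors and with no prologue
   peeling, gets NITERS_VECTOR = 17 / 4 = 4 and STEP_VECTOR = 1; the one
   remaining scalar iteration is left for the epilogue loop.  */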
9520
9521 /* 1) Make sure the loop header has exactly two entries
9522 2) Make sure we have a preheader basic block. */
9523
9524 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9525
9526 split_edge (loop_preheader_edge (loop));
9527
9528 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9529 /* This will deal with any possible peeling. */
9530 vect_prepare_for_masked_peels (loop_vinfo);
9531
9532 /* Schedule the SLP instances first, then handle loop vectorization
9533 below. */
9534 if (!loop_vinfo->slp_instances.is_empty ())
9535 {
9536 DUMP_VECT_SCOPE ("scheduling SLP instances");
9537 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9538 }
9539
9540 /* FORNOW: the vectorizer supports only loops whose body consists
9541 of one basic block (header + empty latch). When the vectorizer
9542 supports more involved loop forms, the order in which the BBs are
9543 traversed will need to be reconsidered. */
9544
9545 for (i = 0; i < nbbs; i++)
9546 {
9547 basic_block bb = bbs[i];
9548 stmt_vec_info stmt_info;
9549
9550 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9551 gsi_next (&si))
9552 {
9553 gphi *phi = si.phi ();
9554 if (dump_enabled_p ())
9555 dump_printf_loc (MSG_NOTE, vect_location,
9556 "------>vectorizing phi: %G", phi);
9557 stmt_info = loop_vinfo->lookup_stmt (phi);
9558 if (!stmt_info)
9559 continue;
9560
9561 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9562 vect_loop_kill_debug_uses (loop, stmt_info);
9563
9564 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9565 && !STMT_VINFO_LIVE_P (stmt_info))
9566 continue;
9567
9568 if (STMT_VINFO_VECTYPE (stmt_info)
9569 && (maybe_ne
9570 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9571 && dump_enabled_p ())
9572 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9573
9574 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9575 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9576 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9577 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9578 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9579 && ! PURE_SLP_STMT (stmt_info))
9580 {
9581 if (dump_enabled_p ())
9582 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9583 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9584 }
9585 }
9586
9587 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9588 gsi_next (&si))
9589 {
9590 gphi *phi = si.phi ();
9591 stmt_info = loop_vinfo->lookup_stmt (phi);
9592 if (!stmt_info)
9593 continue;
9594
9595 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9596 && !STMT_VINFO_LIVE_P (stmt_info))
9597 continue;
9598
9599 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9602 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9603 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9604 && ! PURE_SLP_STMT (stmt_info))
9605 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9606 }
9607
9608 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9609 !gsi_end_p (si);)
9610 {
9611 stmt = gsi_stmt (si);
9612 /* During vectorization remove existing clobber stmts. */
9613 if (gimple_clobber_p (stmt))
9614 {
9615 unlink_stmt_vdef (stmt);
9616 gsi_remove (&si, true);
9617 release_defs (stmt);
9618 }
9619 else
9620 {
9621 /* Ignore vector stmts created in the outer loop. */
9622 stmt_info = loop_vinfo->lookup_stmt (stmt);
9623
9624 /* vector stmts created in the outer-loop during vectorization of
9625 stmts in an inner-loop may not have a stmt_info, and do not
9626 need to be vectorized. */
9627 stmt_vec_info seen_store = NULL;
9628 if (stmt_info)
9629 {
9630 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9631 {
9632 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9633 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9634 !gsi_end_p (subsi); gsi_next (&subsi))
9635 {
9636 stmt_vec_info pat_stmt_info
9637 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9638 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9639 &si, &seen_store);
9640 }
9641 stmt_vec_info pat_stmt_info
9642 = STMT_VINFO_RELATED_STMT (stmt_info);
9643 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9644 &si, &seen_store))
9645 maybe_set_vectorized_backedge_value (loop_vinfo,
9646 pat_stmt_info);
9647 }
9648 else
9649 {
9650 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9651 &seen_store))
9652 maybe_set_vectorized_backedge_value (loop_vinfo,
9653 stmt_info);
9654 }
9655 }
9656 gsi_next (&si);
9657 if (seen_store)
9658 {
9659 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9660 /* Interleaving. The vectorization of the
9661 interleaving chain was completed - free
9662 all the stores in the chain. */
9663 vect_remove_stores (loop_vinfo,
9664 DR_GROUP_FIRST_ELEMENT (seen_store));
9665 else
9666 /* Free the attached stmt_vec_info and remove the stmt. */
9667 loop_vinfo->remove_stmt (stmt_info);
9668 }
9669 }
9670 }
9671
9672 /* Stub out scalar statements that must not survive vectorization.
9673 Doing this here helps with grouped statements, or statements that
9674 are involved in patterns. */
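/* For instance, a scalar remnant such as
     x_1 = .MASK_LOAD (ptr_2, align, mask_3);
   whose result is not a vector is replaced below by
     x_1 = 0;
   and a scalar call to a conditional internal function is replaced by its
   "else" argument (hypothetical SSA names, for illustration only).  */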
9675 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9676 !gsi_end_p (gsi); gsi_next (&gsi))
9677 {
9678 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9679 if (!call || !gimple_call_internal_p (call))
9680 continue;
9681 internal_fn ifn = gimple_call_internal_fn (call);
9682 if (ifn == IFN_MASK_LOAD)
9683 {
9684 tree lhs = gimple_get_lhs (call);
9685 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9686 {
9687 tree zero = build_zero_cst (TREE_TYPE (lhs));
9688 gimple *new_stmt = gimple_build_assign (lhs, zero);
9689 gsi_replace (&gsi, new_stmt, true);
9690 }
9691 }
9692 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9693 {
9694 tree lhs = gimple_get_lhs (call);
9695 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9696 {
9697 tree else_arg
9698 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9699 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9700 gsi_replace (&gsi, new_stmt, true);
9701 }
9702 }
9703 }
9704 } /* BBs in loop */
9705
9706 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9707 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9708 if (integer_onep (step_vector))
9709 niters_no_overflow = true;
9710 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9711 niters_vector_mult_vf, !niters_no_overflow);
9712
9713 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9714 scale_profile_for_vect_loop (loop, assumed_vf);
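/* E.g. if the loop body was previously expected to execute roughly 400
   times per invocation and ASSUMED_VF is 4, the profile is scaled so the
   vector body is expected to execute roughly 100 times (hypothetical
   counts, for illustration only).  */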
9715
9716 /* True if the final iteration might not handle a full vector's
9717 worth of scalar iterations. */
9718 bool final_iter_may_be_partial
9719 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9720 /* The minimum number of iterations performed by the epilogue. This
9721 is 1 when peeling for gaps because we always need a final scalar
9722 iteration. */
9723 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9724 /* +1 to convert latch counts to loop iteration counts,
9725 -min_epilogue_iters to remove iterations that cannot be performed
9726 by the vector code. */
9727 int bias_for_lowest = 1 - min_epilogue_iters;
9728 int bias_for_assumed = bias_for_lowest;
9729 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9730 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9731 {
9732 /* When the amount of peeling is known at compile time, the first
9733 iteration will have exactly alignment_npeels active elements.
9734 In the worst case it will have at least one. */
9735 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9736 bias_for_lowest += lowest_vf - min_first_active;
9737 bias_for_assumed += assumed_vf - min_first_active;
9738 }
9739 /* In these calculations the "- 1" converts loop iteration counts
9740 back to latch counts. */
9741 if (loop->any_upper_bound)
9742 loop->nb_iterations_upper_bound
9743 = (final_iter_may_be_partial
9744 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9745 lowest_vf) - 1
9746 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9747 lowest_vf) - 1);
9748 if (loop->any_likely_upper_bound)
9749 loop->nb_iterations_likely_upper_bound
9750 = (final_iter_may_be_partial
9751 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9752 + bias_for_lowest, lowest_vf) - 1
9753 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9754 + bias_for_lowest, lowest_vf) - 1);
9755 if (loop->any_estimate)
9756 loop->nb_iterations_estimate
9757 = (final_iter_may_be_partial
9758 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9759 assumed_vf) - 1
9760 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9761 assumed_vf) - 1);
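/* Worked example with hypothetical numbers: a scalar latch bound of 10
   (at most 11 iterations), VF 4, no peeling for gaps and no partial
   vectors gives BIAS_FOR_LOWEST = 1 and a new bound of
   udiv_floor (10 + 1, 4) - 1 = 1, i.e. the vector loop executes its
   latch at most once.  */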
9762
9763 if (dump_enabled_p ())
9764 {
9765 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9766 {
9767 dump_printf_loc (MSG_NOTE, vect_location,
9768 "LOOP VECTORIZED\n");
9769 if (loop->inner)
9770 dump_printf_loc (MSG_NOTE, vect_location,
9771 "OUTER LOOP VECTORIZED\n");
9772 dump_printf (MSG_NOTE, "\n");
9773 }
9774 else
9775 dump_printf_loc (MSG_NOTE, vect_location,
9776 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9777 GET_MODE_NAME (loop_vinfo->vector_mode));
9778 }
9779
9780 /* Loops vectorized with a variable factor won't benefit from
9781 unrolling/peeling. */
9782 if (!vf.is_constant ())
9783 {
9784 loop->unroll = 1;
9785 if (dump_enabled_p ())
9786 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9787 " variable-length vectorization factor\n");
9788 }
9789 /* Free SLP instances here because otherwise stmt reference counting
9790 won't work. */
9791 slp_instance instance;
9792 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9793 vect_free_slp_instance (instance);
9794 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9795 /* Clear the safelen field since its value is invalid after vectorization:
9796 the vectorized loop can have loop-carried dependencies. */
9797 loop->safelen = 0;
9798
9799 if (epilogue)
9800 {
9801 update_epilogue_loop_vinfo (epilogue, advance);
9802
9803 epilogue->simduid = loop->simduid;
9804 epilogue->force_vectorize = loop->force_vectorize;
9805 epilogue->dont_vectorize = false;
9806 }
9807
9808 return epilogue;
9809 }
9810
9811 /* The code below performs a simple optimization - it reverts
9812 if-conversion for masked stores: if the mask of a store is all-zero,
9813 do not perform the store and, if possible, skip the producers of the stored values too.
9814 For example,
9815 for (i=0; i<n; i++)
9816 if (c[i])
9817 {
9818 p1[i] += 1;
9819 p2[i] = p3[i] +2;
9820 }
9821 this transformation will produce the following semi-hammock:
9822
9823 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9824 {
9825 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9826 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9827 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9828 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9829 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9830 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9831 }
9832 */
9833
9834 void
9835 optimize_mask_stores (class loop *loop)
9836 {
9837 basic_block *bbs = get_loop_body (loop);
9838 unsigned nbbs = loop->num_nodes;
9839 unsigned i;
9840 basic_block bb;
9841 class loop *bb_loop;
9842 gimple_stmt_iterator gsi;
9843 gimple *stmt;
9844 auto_vec<gimple *> worklist;
9845 auto_purge_vect_location sentinel;
9846
9847 vect_location = find_loop_location (loop);
9848 /* Pick up all masked stores in loop if any. */
9849 for (i = 0; i < nbbs; i++)
9850 {
9851 bb = bbs[i];
9852 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9853 gsi_next (&gsi))
9854 {
9855 stmt = gsi_stmt (gsi);
9856 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9857 worklist.safe_push (stmt);
9858 }
9859 }
9860
9861 free (bbs);
9862 if (worklist.is_empty ())
9863 return;
9864
9865 /* Loop has masked stores. */
9866 while (!worklist.is_empty ())
9867 {
9868 gimple *last, *last_store;
9869 edge e, efalse;
9870 tree mask;
9871 basic_block store_bb, join_bb;
9872 gimple_stmt_iterator gsi_to;
9873 tree vdef, new_vdef;
9874 gphi *phi;
9875 tree vectype;
9876 tree zero;
9877
9878 last = worklist.pop ();
9879 mask = gimple_call_arg (last, 2);
9880 bb = gimple_bb (last);
9881 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
9882 the same loop as if_bb. It can differ from LOOP when a two-level
9883 loop nest is vectorized and the mask_store belongs to the inner
9884 loop. */
9885 e = split_block (bb, last);
9886 bb_loop = bb->loop_father;
9887 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9888 join_bb = e->dest;
9889 store_bb = create_empty_bb (bb);
9890 add_bb_to_loop (store_bb, bb_loop);
9891 e->flags = EDGE_TRUE_VALUE;
9892 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9893 /* Treat the path through STORE_BB as unlikely. */
9894 efalse->probability = profile_probability::unlikely ();
9895 store_bb->count = efalse->count ();
9896 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9897 if (dom_info_available_p (CDI_DOMINATORS))
9898 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9899 if (dump_enabled_p ())
9900 dump_printf_loc (MSG_NOTE, vect_location,
9901 "Create new block %d to sink mask stores.",
9902 store_bb->index);
9903 /* Create vector comparison with boolean result. */
9904 vectype = TREE_TYPE (mask);
9905 zero = build_zero_cst (vectype);
9906 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9907 gsi = gsi_last_bb (bb);
9908 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9909 /* Create new PHI node for vdef of the last masked store:
9910 .MEM_2 = VDEF <.MEM_1>
9911 will be converted to
9912 .MEM.3 = VDEF <.MEM_1>
9913 and new PHI node will be created in join bb
9914 .MEM_2 = PHI <.MEM_1, .MEM_3>
9915 */
9916 vdef = gimple_vdef (last);
9917 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9918 gimple_set_vdef (last, new_vdef);
9919 phi = create_phi_node (vdef, join_bb);
9920 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9921
9922 /* Put all masked stores with the same mask to STORE_BB if possible. */
9923 while (true)
9924 {
9925 gimple_stmt_iterator gsi_from;
9926 gimple *stmt1 = NULL;
9927
9928 /* Move masked store to STORE_BB. */
9929 last_store = last;
9930 gsi = gsi_for_stmt (last);
9931 gsi_from = gsi;
9932 /* Shift GSI to the previous stmt for further traversal. */
9933 gsi_prev (&gsi);
9934 gsi_to = gsi_start_bb (store_bb);
9935 gsi_move_before (&gsi_from, &gsi_to);
9936 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
9937 gsi_to = gsi_start_bb (store_bb);
9938 if (dump_enabled_p ())
9939 dump_printf_loc (MSG_NOTE, vect_location,
9940 "Move stmt to created bb\n%G", last);
9941 /* Move all stored value producers if possible. */
9942 while (!gsi_end_p (gsi))
9943 {
9944 tree lhs;
9945 imm_use_iterator imm_iter;
9946 use_operand_p use_p;
9947 bool res;
9948
9949 /* Skip debug statements. */
9950 if (is_gimple_debug (gsi_stmt (gsi)))
9951 {
9952 gsi_prev (&gsi);
9953 continue;
9954 }
9955 stmt1 = gsi_stmt (gsi);
9956 /* Do not consider statements writing to memory or having a
9957 volatile operand. */
9958 if (gimple_vdef (stmt1)
9959 || gimple_has_volatile_ops (stmt1))
9960 break;
9961 gsi_from = gsi;
9962 gsi_prev (&gsi);
9963 lhs = gimple_get_lhs (stmt1);
9964 if (!lhs)
9965 break;
9966
9967 /* LHS of vectorized stmt must be SSA_NAME. */
9968 if (TREE_CODE (lhs) != SSA_NAME)
9969 break;
9970
9971 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9972 {
9973 /* Remove dead scalar statement. */
9974 if (has_zero_uses (lhs))
9975 {
9976 gsi_remove (&gsi_from, true);
9977 continue;
9978 }
9979 }
9980
9981 /* Check that LHS does not have uses outside of STORE_BB. */
9982 res = true;
9983 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9984 {
9985 gimple *use_stmt;
9986 use_stmt = USE_STMT (use_p);
9987 if (is_gimple_debug (use_stmt))
9988 continue;
9989 if (gimple_bb (use_stmt) != store_bb)
9990 {
9991 res = false;
9992 break;
9993 }
9994 }
9995 if (!res)
9996 break;
9997
9998 if (gimple_vuse (stmt1)
9999 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10000 break;
10001
10002 /* Can move STMT1 to STORE_BB. */
10003 if (dump_enabled_p ())
10004 dump_printf_loc (MSG_NOTE, vect_location,
10005 "Move stmt to created bb\n%G", stmt1);
10006 gsi_move_before (&gsi_from, &gsi_to);
10007 /* Shift GSI_TO for further insertion. */
10008 gsi_prev (&gsi_to);
10009 }
10010 /* Put other masked stores with the same mask to STORE_BB. */
10011 if (worklist.is_empty ()
10012 || gimple_call_arg (worklist.last (), 2) != mask
10013 || worklist.last () != stmt1)
10014 break;
10015 last = worklist.pop ();
10016 }
10017 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10018 }
10019 }
10020
10021 /* Decide whether it is possible to use a zero-based induction variable
10022 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10023 the value that the induction variable must be able to hold in order
10024 to ensure that the rgroups eventually have no active vector elements.
10025 Return -1 otherwise. */
10026
10027 widest_int
10028 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10029 {
10030 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10031 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10032 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10033
10034 /* Calculate the value that the induction variable must be able
10035 to hit in order to ensure that we end the loop with an all-false mask.
10036 This involves adding the maximum number of inactive trailing scalar
10037 iterations. */
10038 widest_int iv_limit = -1;
10039 if (max_loop_iterations (loop, &iv_limit))
10040 {
10041 if (niters_skip)
10042 {
10043 /* Add the maximum number of skipped iterations to the
10044 maximum iteration count. */
10045 if (TREE_CODE (niters_skip) == INTEGER_CST)
10046 iv_limit += wi::to_widest (niters_skip);
10047 else
10048 iv_limit += max_vf - 1;
10049 }
10050 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10051 /* Make a conservatively-correct assumption. */
10052 iv_limit += max_vf - 1;
10053
10054 /* IV_LIMIT is the maximum number of latch iterations, which is also
10055 the maximum in-range IV value. Round this value down to the previous
10056 vector alignment boundary and then add an extra full iteration. */
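/* For example (hypothetical numbers): with a maximum latch count of 10,
   a constant VF of 4 and MAX_VF of 4, the limit becomes
   (10 & -4) + 4 = 12.  */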
10057 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10058 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10059 }
10060 return iv_limit;
10061 }
10062
10063 /* For the given rgroup_controls RGC, check whether an induction variable
10064 would ever hit a value that produces a set of all-false masks or zero
10065 lengths before wrapping around. Return true if it's possible to wrap
10066 around before hitting the desired value, otherwise return false. */
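/* For example (hypothetical numbers): with a 16-bit unsigned compare type,
   4 scalar values controlled per iteration (max_nscalars_per_iter * factor)
   and an IV limit of 20000, the product 80000 needs 17 bits of precision,
   so the IV might wrap and we return true.  */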
10067
10068 bool
10069 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10070 {
10071 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10072
10073 if (iv_limit == -1)
10074 return true;
10075
10076 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10077 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10078 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10079
10080 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10081 return true;
10082
10083 return false;
10084 }
10085