1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 #define vec_step vec_step_
58
59 /* Loop Vectorization Pass.
60
61 This pass tries to vectorize loops.
62
63 For example, the vectorizer transforms the following simple loop:
64
65 short a[N]; short b[N]; short c[N]; int i;
66
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
69 }
70
71 as if it had been manually vectorized by rewriting the source code into:
72
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
77
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
83 }
84
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
96
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
102
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
107
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
118
119 For example, say stmt S1 was vectorized into stmt VS1:
120
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
124
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
129
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134
135 Operands that are not SSA_NAMEs, are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
137
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors will, for now, need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
145
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
152
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
156
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
159 bool *, bool *);
160
161 /* Subroutine of vect_determine_vf_for_stmt that handles only one
162 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
163 may already be set for general statements (not just data refs). */
164
165 static opt_result
166 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
167 bool vectype_maybe_set_p,
168 poly_uint64 *vf)
169 {
170 gimple *stmt = stmt_info->stmt;
171
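  /* Statements that are neither relevant nor live, as well as clobbers,
     do not constrain the vectorization factor and can be skipped.  */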
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
175 {
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
179 }
180
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had already been set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
210
211 static opt_result
212 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 vec_info *vinfo = stmt_info->vinfo;
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
219 if (!res)
220 return res;
221
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
224 {
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
231 {
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
238 if (!res)
239 return res;
240 }
241
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
247 if (!res)
248 return res;
249 }
250
251 return opt_result::success ();
252 }
253
254 /* Function vect_determine_vectorization_factor
255
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
261
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
266
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
271 }
272
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
276 }
277 */
278
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
291
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293
294 for (i = 0; i < nbbs; i++)
295 {
296 basic_block bb = bbs[i];
297
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
300 {
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
306
307 gcc_assert (stmt_info);
308
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
311 {
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
319
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
331
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
337 }
338
339 vect_update_max_nunits (&vectorization_factor, vectype);
340 }
341 }
342
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
345 {
346 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
347 opt_result res
348 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
349 if (!res)
350 return res;
351 }
352 }
353
354 /* TODO: Analyze cost. Decide if worth while to vectorize. */
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
358 dump_dec (MSG_NOTE, vectorization_factor);
359 dump_printf (MSG_NOTE, "\n");
360 }
361
362 if (known_le (vectorization_factor, 1U))
363 return opt_result::failure_at (vect_location,
364 "not vectorized: unsupported data-type\n");
365 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
366 return opt_result::success ();
367 }
368
369
370 /* Function vect_is_simple_iv_evolution.
371
372 FORNOW: A simple evolution of an induction variable in the loop is
373 considered a polynomial evolution. */
374
375 static bool
376 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
377 tree * step)
378 {
379 tree init_expr;
380 tree step_expr;
381 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
382 basic_block bb;
383
384 /* When there is no evolution in this loop, the evolution function
385 is not "simple". */
386 if (evolution_part == NULL_TREE)
387 return false;
388
389 /* When the evolution is a polynomial of degree >= 2
390 the evolution function is not "simple". */
391 if (tree_is_chrec (evolution_part))
392 return false;
393
394 step_expr = evolution_part;
395 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
396
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
399 step_expr, init_expr);
400
401 *init = init_expr;
402 *step = step_expr;
403
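  /* Accept only steps that are invariant in the loop: integer constants,
     SSA names defined outside the loop (of integral type, or of
     floating-point type when -fassociative-math allows reordering),
     and real constants under -fassociative-math.  */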
404 if (TREE_CODE (step_expr) != INTEGER_CST
405 && (TREE_CODE (step_expr) != SSA_NAME
406 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
407 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
408 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
409 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
410 || !flag_associative_math)))
411 && (TREE_CODE (step_expr) != REAL_CST
412 || !flag_associative_math))
413 {
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "step unknown.\n");
417 return false;
418 }
419
420 return true;
421 }
422
423 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
424 what we are assuming is a double reduction. For example, given
425 a structure like this:
426
427 outer1:
428 x_1 = PHI <x_4(outer2), ...>;
429 ...
430
431 inner:
432 x_2 = PHI <x_1(outer1), ...>;
433 ...
434 x_3 = ...;
435 ...
436
437 outer2:
438 x_4 = PHI <x_3(inner)>;
439 ...
440
441 outer loop analysis would treat x_1 as a double reduction phi and
442 this function would then return true for x_2. */
443
444 static bool
445 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
446 {
447 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
448 use_operand_p use_p;
449 ssa_op_iter op_iter;
450 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
451 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
452 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
453 return true;
454 return false;
455 }
456
457 /* Function vect_analyze_scalar_cycles_1.
458
459 Examine the cross iteration def-use cycles of scalar variables
460 in LOOP. LOOP_VINFO represents the loop that is now being
461 considered for vectorization (can be LOOP, or an outer-loop
462 enclosing LOOP). */
463
464 static void
465 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
466 {
467 basic_block bb = loop->header;
468 tree init, step;
469 auto_vec<stmt_vec_info, 64> worklist;
470 gphi_iterator gsi;
471 bool double_reduc, reduc_chain;
472
473 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
474
475 /* First - identify all inductions. Reduction detection assumes that all the
476 inductions have been identified, therefore, this order must not be
477 changed. */
478 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
479 {
480 gphi *phi = gsi.phi ();
481 tree access_fn = NULL;
482 tree def = PHI_RESULT (phi);
483 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
484
485 if (dump_enabled_p ())
486 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
487
488 /* Skip virtual phi's. The data dependences that are associated with
489 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
490 if (virtual_operand_p (def))
491 continue;
492
493 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
494
495 /* Analyze the evolution function. */
496 access_fn = analyze_scalar_evolution (loop, def);
497 if (access_fn)
498 {
499 STRIP_NOPS (access_fn);
500 if (dump_enabled_p ())
501 dump_printf_loc (MSG_NOTE, vect_location,
502 "Access function of PHI: %T\n", access_fn);
503 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
504 = initial_condition_in_loop_num (access_fn, loop->num);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
506 = evolution_part_in_loop_num (access_fn, loop->num);
507 }
508
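  /* PHIs that cannot be recognized as simple inductions here are queued
     and reconsidered below as reductions or nested cycles.  */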
509 if (!access_fn
510 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
511 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
512 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
513 && TREE_CODE (step) != INTEGER_CST))
514 {
515 worklist.safe_push (stmt_vinfo);
516 continue;
517 }
518
519 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
520 != NULL_TREE);
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
522
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
525 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
526 }
527
528
529 /* Second - identify all reductions and nested cycles. */
530 while (worklist.length () > 0)
531 {
532 stmt_vec_info stmt_vinfo = worklist.pop ();
533 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
534 tree def = PHI_RESULT (phi);
535
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
538
539 gcc_assert (!virtual_operand_p (def)
540 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
541
542 stmt_vec_info reduc_stmt_info
543 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
544 &reduc_chain);
545 if (reduc_stmt_info)
546 {
547 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
548 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
549 if (double_reduc)
550 {
551 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location,
553 "Detected double reduction.\n");
554
555 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
556 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
557 }
558 else
559 {
560 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
561 {
562 if (dump_enabled_p ())
563 dump_printf_loc (MSG_NOTE, vect_location,
564 "Detected vectorizable nested cycle.\n");
565
566 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
567 }
568 else
569 {
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location,
572 "Detected reduction.\n");
573
574 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
575 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
576 /* Store the reduction cycles for possible vectorization in
577 loop-aware SLP if it was not detected as reduction
578 chain. */
579 if (! reduc_chain)
580 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
581 (reduc_stmt_info);
582 }
583 }
584 }
585 else
586 if (dump_enabled_p ())
587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
588 "Unknown def-use cycle pattern.\n");
589 }
590 }
591
592
593 /* Function vect_analyze_scalar_cycles.
594
595 Examine the cross iteration def-use cycles of scalar variables, by
596 analyzing the loop-header PHIs of scalar variables. Classify each
597 cycle as one of the following: invariant, induction, reduction, unknown.
598 We do that for the loop represented by LOOP_VINFO, and also for its
599 inner loop, if it exists.
600 Examples for scalar cycles:
601
602 Example1: reduction:
603
604 loop1:
605 for (i=0; i<N; i++)
606 sum += a[i];
607
608 Example2: induction:
609
610 loop2:
611 for (i=0; i<N; i++)
612 a[i] = i; */
613
614 static void
615 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
616 {
617 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
618
619 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
620
621 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
622 Reductions in such an inner loop therefore have different properties than
623 the reductions in the nest that gets vectorized:
624 1. When vectorized, they are executed in the same order as in the original
625 scalar loop, so we can't change the order of computation when
626 vectorizing them.
627 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
628 current checks are too strict. */
629
630 if (loop->inner)
631 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
632 }
633
634 /* Transfer group and reduction information from STMT_INFO to its
635 pattern stmt. */
636
637 static void
638 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
639 {
640 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
641 stmt_vec_info stmtp;
642 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
643 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
644 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
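  /* Walk the chain, redirecting the group links of each element to the
     corresponding pattern statements.  */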
645 do
646 {
647 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
648 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
649 == STMT_VINFO_DEF_TYPE (stmt_info));
650 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
651 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
652 if (stmt_info)
653 REDUC_GROUP_NEXT_ELEMENT (stmtp)
654 = STMT_VINFO_RELATED_STMT (stmt_info);
655 }
656 while (stmt_info);
657 }
658
659 /* Fixup scalar cycles that now have their stmts detected as patterns. */
660
661 static void
662 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
663 {
664 stmt_vec_info first;
665 unsigned i;
666
667 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
668 if (STMT_VINFO_IN_PATTERN_P (first))
669 {
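      /* Check whether every statement in the chain has been replaced by
	 a pattern statement with a valid STMT_VINFO_REDUC_IDX.  */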
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
672 {
673 if (! STMT_VINFO_IN_PATTERN_P (next)
674 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
675 break;
676 next = REDUC_GROUP_NEXT_ELEMENT (next);
677 }
678 /* If not all stmts in the chain are patterns, or if we failed
679 to update STMT_VINFO_REDUC_IDX, try to handle the chain
680 without patterns. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
683 {
684 vect_fixup_reduc_chain (first);
685 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
686 = STMT_VINFO_RELATED_STMT (first);
687 }
688 }
689 }
690
691 /* Function vect_get_loop_niters.
692
693 Determine how many iterations the loop is executed and place it
694 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
695 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
696 niter information holds in ASSUMPTIONS.
697
698 Return the loop exit condition. */
699
700
701 static gcond *
702 vect_get_loop_niters (class loop *loop, tree *assumptions,
703 tree *number_of_iterations, tree *number_of_iterationsm1)
704 {
705 edge exit = single_exit (loop);
706 class tree_niter_desc niter_desc;
707 tree niter_assumptions, niter, may_be_zero;
708 gcond *cond = get_loop_exit_condition (loop);
709
710 *assumptions = boolean_true_node;
711 *number_of_iterationsm1 = chrec_dont_know;
712 *number_of_iterations = chrec_dont_know;
713 DUMP_VECT_SCOPE ("get_loop_niters");
714
715 if (!exit)
716 return cond;
717
718 may_be_zero = NULL_TREE;
719 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
720 || chrec_contains_undetermined (niter_desc.niter))
721 return cond;
722
723 niter_assumptions = niter_desc.assumptions;
724 may_be_zero = niter_desc.may_be_zero;
725 niter = niter_desc.niter;
726
727 if (may_be_zero && integer_zerop (may_be_zero))
728 may_be_zero = NULL_TREE;
729
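  /* The latch may run zero times: fold that condition into the assumptions
     or the niter expression if it is a comparison, use a single header
     iteration if it is known to be true, and give up otherwise.  */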
730 if (may_be_zero)
731 {
732 if (COMPARISON_CLASS_P (may_be_zero))
733 {
734 /* Try to combine may_be_zero with assumptions; this can simplify
735 the computation of the niter expression. */
736 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
737 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
738 niter_assumptions,
739 fold_build1 (TRUTH_NOT_EXPR,
740 boolean_type_node,
741 may_be_zero));
742 else
743 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
744 build_int_cst (TREE_TYPE (niter), 0),
745 rewrite_to_non_trapping_overflow (niter));
746
747 may_be_zero = NULL_TREE;
748 }
749 else if (integer_nonzerop (may_be_zero))
750 {
751 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
752 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
753 return cond;
754 }
755 else
756 return cond;
757 }
758
759 *assumptions = niter_assumptions;
760 *number_of_iterationsm1 = niter;
761
762 /* We want the number of loop header executions which is the number
763 of latch executions plus one.
764 ??? For UINT_MAX latch executions this number overflows to zero
765 for loops like do { n++; } while (n != 0); */
766 if (niter && !chrec_contains_undetermined (niter))
767 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
768 build_int_cst (TREE_TYPE (niter), 1));
769 *number_of_iterations = niter;
770
771 return cond;
772 }
773
774 /* Function bb_in_loop_p
775
776 Used as predicate for dfs order traversal of the loop bbs. */
777
778 static bool
779 bb_in_loop_p (const_basic_block bb, const void *data)
780 {
781 const class loop *const loop = (const class loop *)data;
782 if (flow_bb_inside_loop_p (loop, bb))
783 return true;
784 return false;
785 }
786
787
788 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
789 stmt_vec_info structs for all the stmts in LOOP_IN. */
790
791 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
792 : vec_info (vec_info::loop, init_cost (loop_in), shared),
793 loop (loop_in),
794 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
795 num_itersm1 (NULL_TREE),
796 num_iters (NULL_TREE),
797 num_iters_unchanged (NULL_TREE),
798 num_iters_assumptions (NULL_TREE),
799 th (0),
800 versioning_threshold (0),
801 vectorization_factor (0),
802 max_vectorization_factor (0),
803 mask_skip_niters (NULL_TREE),
804 mask_compare_type (NULL_TREE),
805 simd_if_cond (NULL_TREE),
806 unaligned_dr (NULL),
807 peeling_for_alignment (0),
808 ptr_mask (0),
809 ivexpr_map (NULL),
810 scan_map (NULL),
811 slp_unrolling_factor (1),
812 single_scalar_iteration_cost (0),
813 vec_outside_cost (0),
814 vec_inside_cost (0),
815 vectorizable (false),
816 can_fully_mask_p (true),
817 fully_masked_p (false),
818 peeling_for_gaps (false),
819 peeling_for_niter (false),
820 no_data_dependencies (false),
821 has_mask_store (false),
822 scalar_loop_scaling (profile_probability::uninitialized ()),
823 scalar_loop (NULL),
824 orig_loop_info (NULL)
825 {
826 /* CHECKME: We want to visit all BBs before their successors (except for
827 latch blocks, for which this assertion wouldn't hold). In the simple
828 case of the loop forms we allow, a dfs order of the BBs would be the same
829 as reversed postorder traversal, so we are safe. */
830
831 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
832 bbs, loop->num_nodes, loop);
833 gcc_assert (nbbs == loop->num_nodes);
834
835 for (unsigned int i = 0; i < nbbs; i++)
836 {
837 basic_block bb = bbs[i];
838 gimple_stmt_iterator si;
839
840 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
841 {
842 gimple *phi = gsi_stmt (si);
843 gimple_set_uid (phi, 0);
844 add_stmt (phi);
845 }
846
847 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
848 {
849 gimple *stmt = gsi_stmt (si);
850 gimple_set_uid (stmt, 0);
851 add_stmt (stmt);
852 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
853 third argument is the #pragma omp simd if (x) condition.  When it is 0,
854 the loop shouldn't be vectorized; when it is a non-zero constant, it
855 should be vectorized normally; otherwise the loop is versioned, with the
856 vectorized copy used when the condition is non-zero at runtime.  */
857 if (loop_in->simduid
858 && is_gimple_call (stmt)
859 && gimple_call_internal_p (stmt)
860 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
861 && gimple_call_num_args (stmt) >= 3
862 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
863 && (loop_in->simduid
864 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
865 {
866 tree arg = gimple_call_arg (stmt, 2);
867 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
868 simd_if_cond = arg;
869 else
870 gcc_assert (integer_nonzerop (arg));
871 }
872 }
873 }
874
875 epilogue_vinfos.create (6);
876 }
877
878 /* Free all levels of MASKS. */
879
880 void
881 release_vec_loop_masks (vec_loop_masks *masks)
882 {
883 rgroup_masks *rgm;
884 unsigned int i;
885 FOR_EACH_VEC_ELT (*masks, i, rgm)
886 rgm->masks.release ();
887 masks->release ();
888 }
889
890 /* Free all memory used by the _loop_vec_info, as well as all the
891 stmt_vec_info structs of all the stmts in the loop. */
892
893 _loop_vec_info::~_loop_vec_info ()
894 {
895 free (bbs);
896
897 release_vec_loop_masks (&masks);
898 delete ivexpr_map;
899 delete scan_map;
900 epilogue_vinfos.release ();
901
902 loop->aux = NULL;
903 }
904
905 /* Return an invariant or register for EXPR and emit necessary
906 computations in the LOOP_VINFO loop preheader. */
907
908 tree
909 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
910 {
911 if (is_gimple_reg (expr)
912 || is_gimple_min_invariant (expr))
913 return expr;
914
915 if (! loop_vinfo->ivexpr_map)
916 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
917 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
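  /* Force the expression to a new SSA name the first time it is seen,
     inserting any generated statements on the loop preheader edge, and
     reuse the cached name on subsequent queries.  */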
918 if (! cached)
919 {
920 gimple_seq stmts = NULL;
921 cached = force_gimple_operand (unshare_expr (expr),
922 &stmts, true, NULL_TREE);
923 if (stmts)
924 {
925 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
926 gsi_insert_seq_on_edge_immediate (e, stmts);
927 }
928 }
929 return cached;
930 }
931
932 /* Return true if we can use CMP_TYPE as the comparison type to produce
933 all masks required to mask LOOP_VINFO. */
934
935 static bool
936 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
937 {
938 rgroup_masks *rgm;
939 unsigned int i;
940 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
941 if (rgm->mask_type != NULL_TREE
942 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
943 cmp_type, rgm->mask_type,
944 OPTIMIZE_FOR_SPEED))
945 return false;
946 return true;
947 }
948
949 /* Return the maximum number of scalars per iteration over all the
950 rgroups in LOOP_VINFO. */
951
952 static unsigned int
953 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
954 {
955 unsigned int res = 1;
956 unsigned int i;
957 rgroup_masks *rgm;
958 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
959 res = MAX (res, rgm->max_nscalars_per_iter);
960 return res;
961 }
962
963 /* Each statement in LOOP_VINFO can be masked where necessary. Check
964 whether we can actually generate the masks required, and if so store
965 the chosen comparison type in LOOP_VINFO_MASK_COMPARE_TYPE. */
966
967 static bool
968 vect_verify_full_masking (loop_vec_info loop_vinfo)
969 {
970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
971 unsigned int min_ni_width;
972 unsigned int max_nscalars_per_iter
973 = vect_get_max_nscalars_per_iter (loop_vinfo);
974
975 /* Use a normal loop if there are no statements that need masking.
976 This only happens in rare degenerate cases: it means that the loop
977 has no loads, no stores, and no live-out values. */
978 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
979 return false;
980
981 /* Get the maximum number of iterations that is representable
982 in the counter type. */
983 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
984 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985
986 /* Get a more refined estimate for the number of iterations. */
987 widest_int max_back_edges;
988 if (max_loop_iterations (loop, &max_back_edges))
989 max_ni = wi::smin (max_ni, max_back_edges + 1);
990
991 /* Account for rgroup masks, in which each bit is replicated N times. */
992 max_ni *= max_nscalars_per_iter;
993
994 /* Work out how many bits we need to represent the limit. */
995 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
996
997 /* Find a scalar mode for which WHILE_ULT is supported. */
998 opt_scalar_int_mode cmp_mode_iter;
999 tree cmp_type = NULL_TREE;
1000 tree iv_type = NULL_TREE;
1001 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1002 unsigned int iv_precision = UINT_MAX;
1003
1004 if (iv_limit != -1)
1005 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1006 UNSIGNED);
1007
1008 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1009 {
1010 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1011 if (cmp_bits >= min_ni_width
1012 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1013 {
1014 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1015 if (this_type
1016 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1017 {
1018 /* Although we could stop as soon as we find a valid mode,
1019 there are at least two reasons why that's not always the
1020 best choice:
1021
1022 - An IV that's Pmode or wider is more likely to be reusable
1023 in address calculations than an IV that's narrower than
1024 Pmode.
1025
1026 - Doing the comparison in IV_PRECISION or wider allows
1027 a natural 0-based IV, whereas using a narrower comparison
1028 type requires mitigations against wrap-around.
1029
1030 Conversely, if the IV limit is variable, doing the comparison
1031 in a wider type than the original type can introduce
1032 unnecessary extensions, so picking the widest valid mode
1033 is not always a good choice either.
1034
1035 Here we prefer the first IV type that's Pmode or wider,
1036 and the first comparison type that's IV_PRECISION or wider.
1037 (The comparison type must be no wider than the IV type,
1038 to avoid extensions in the vector loop.)
1039
1040 ??? We might want to try continuing beyond Pmode for ILP32
1041 targets if CMP_BITS < IV_PRECISION. */
1042 iv_type = this_type;
1043 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1044 cmp_type = this_type;
1045 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1046 break;
1047 }
1048 }
1049 }
1050
1051 if (!cmp_type)
1052 return false;
1053
1054 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1055 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1056 return true;
1057 }
1058
1059 /* Calculate the cost of one scalar iteration of the loop. */
1060 static void
1061 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1062 {
1063 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1064 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1065 int nbbs = loop->num_nodes, factor;
1066 int innerloop_iters, i;
1067
1068 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1069
1070 /* Gather costs for statements in the scalar loop. */
1071
1072 /* FORNOW. */
1073 innerloop_iters = 1;
1074 if (loop->inner)
1075 innerloop_iters = 50; /* FIXME */
1076
1077 for (i = 0; i < nbbs; i++)
1078 {
1079 gimple_stmt_iterator si;
1080 basic_block bb = bbs[i];
1081
1082 if (bb->loop_father == loop->inner)
1083 factor = innerloop_iters;
1084 else
1085 factor = 1;
1086
1087 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1088 {
1089 gimple *stmt = gsi_stmt (si);
1090 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1091
1092 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1093 continue;
1094
1095 /* Skip stmts that are not vectorized inside the loop. */
1096 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1097 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1098 && (!STMT_VINFO_LIVE_P (vstmt_info)
1099 || !VECTORIZABLE_CYCLE_DEF
1100 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1101 continue;
1102
1103 vect_cost_for_stmt kind;
1104 if (STMT_VINFO_DATA_REF (stmt_info))
1105 {
1106 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1107 kind = scalar_load;
1108 else
1109 kind = scalar_store;
1110 }
1111 else if (vect_nop_conversion_p (stmt_info))
1112 continue;
1113 else
1114 kind = scalar_stmt;
1115
1116 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1117 factor, kind, stmt_info, 0, vect_prologue);
1118 }
1119 }
1120
1121 /* Now accumulate cost. */
1122 void *target_cost_data = init_cost (loop);
1123 stmt_info_for_cost *si;
1124 int j;
1125 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1126 j, si)
1127 (void) add_stmt_cost (target_cost_data, si->count,
1128 si->kind, si->stmt_info, si->misalign,
1129 vect_body);
1130 unsigned dummy, body_cost = 0;
1131 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1132 destroy_cost_data (target_cost_data);
1133 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1134 }
1135
1136
1137 /* Function vect_analyze_loop_form_1.
1138
1139 Verify that certain CFG restrictions hold, including:
1140 - the loop has a pre-header
1141 - the loop has a single entry and exit
1142 - the loop exit condition is simple enough
1143 - the number of iterations can be analyzed, i.e., a countable loop. The
1144 niter could be analyzed under some assumptions. */
1145
1146 opt_result
1147 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1148 tree *assumptions, tree *number_of_iterationsm1,
1149 tree *number_of_iterations, gcond **inner_loop_cond)
1150 {
1151 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1152
1153 /* Different restrictions apply when we are considering an inner-most loop,
1154 vs. an outer (nested) loop.
1155 (FORNOW. May want to relax some of these restrictions in the future). */
1156
1157 if (!loop->inner)
1158 {
1159 /* Inner-most loop. We currently require that the number of BBs is
1160 exactly 2 (the header and latch). Vectorizable inner-most loops
1161 look like this:
1162
1163 (pre-header)
1164 |
1165 header <--------+
1166 | | |
1167 | +--> latch --+
1168 |
1169 (exit-bb) */
1170
1171 if (loop->num_nodes != 2)
1172 return opt_result::failure_at (vect_location,
1173 "not vectorized:"
1174 " control flow in loop.\n");
1175
1176 if (empty_block_p (loop->header))
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized: empty loop.\n");
1179 }
1180 else
1181 {
1182 class loop *innerloop = loop->inner;
1183 edge entryedge;
1184
1185 /* Nested loop. We currently require that the loop is doubly-nested,
1186 contains a single inner loop, and the number of BBs is exactly 5.
1187 Vectorizable outer-loops look like this:
1188
1189 (pre-header)
1190 |
1191 header <---+
1192 | |
1193 inner-loop |
1194 | |
1195 tail ------+
1196 |
1197 (exit-bb)
1198
1199 The inner-loop has the properties expected of inner-most loops
1200 as described above. */
1201
1202 if ((loop->inner)->inner || (loop->inner)->next)
1203 return opt_result::failure_at (vect_location,
1204 "not vectorized:"
1205 " multiple nested loops.\n");
1206
1207 if (loop->num_nodes != 5)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " control flow in loop.\n");
1211
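      /* The inner loop must be entered from the outer-loop header, have a
	 single exit, and that exit must lead to the predecessor of the
	 outer-loop latch (the "tail" block in the diagram above).  */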
1212 entryedge = loop_preheader_edge (innerloop);
1213 if (entryedge->src != loop->header
1214 || !single_exit (innerloop)
1215 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1216 return opt_result::failure_at (vect_location,
1217 "not vectorized:"
1218 " unsupported outerloop form.\n");
1219
1220 /* Analyze the inner-loop. */
1221 tree inner_niterm1, inner_niter, inner_assumptions;
1222 opt_result res
1223 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1224 &inner_assumptions, &inner_niterm1,
1225 &inner_niter, NULL);
1226 if (!res)
1227 {
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "not vectorized: Bad inner loop.\n");
1231 return res;
1232 }
1233
1234 /* Don't support analyzing niter under assumptions for inner
1235 loop. */
1236 if (!integer_onep (inner_assumptions))
1237 return opt_result::failure_at (vect_location,
1238 "not vectorized: Bad inner loop.\n");
1239
1240 if (!expr_invariant_in_loop_p (loop, inner_niter))
1241 return opt_result::failure_at (vect_location,
1242 "not vectorized: inner-loop count not"
1243 " invariant.\n");
1244
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_NOTE, vect_location,
1247 "Considering outer-loop vectorization.\n");
1248 }
1249
1250 if (!single_exit (loop))
1251 return opt_result::failure_at (vect_location,
1252 "not vectorized: multiple exits.\n");
1253 if (EDGE_COUNT (loop->header->preds) != 2)
1254 return opt_result::failure_at (vect_location,
1255 "not vectorized:"
1256 " too many incoming edges.\n");
1257
1258 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1259 that the loop is represented as a do-while (with a proper if-guard
1260 before the loop if needed), where the loop header contains all the
1261 executable statements, and the latch is empty. */
1262 if (!empty_block_p (loop->latch)
1263 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1264 return opt_result::failure_at (vect_location,
1265 "not vectorized: latch block not empty.\n");
1266
1267 /* Make sure the exit is not abnormal. */
1268 edge e = single_exit (loop);
1269 if (e->flags & EDGE_ABNORMAL)
1270 return opt_result::failure_at (vect_location,
1271 "not vectorized:"
1272 " abnormal loop exit edge.\n");
1273
1274 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1275 number_of_iterationsm1);
1276 if (!*loop_cond)
1277 return opt_result::failure_at
1278 (vect_location,
1279 "not vectorized: complicated exit condition.\n");
1280
1281 if (integer_zerop (*assumptions)
1282 || !*number_of_iterations
1283 || chrec_contains_undetermined (*number_of_iterations))
1284 return opt_result::failure_at
1285 (*loop_cond,
1286 "not vectorized: number of iterations cannot be computed.\n");
1287
1288 if (integer_zerop (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations = 0.\n");
1292
1293 return opt_result::success ();
1294 }
1295
1296 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1297
1298 opt_loop_vec_info
1299 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1300 {
1301 tree assumptions, number_of_iterations, number_of_iterationsm1;
1302 gcond *loop_cond, *inner_loop_cond = NULL;
1303
1304 opt_result res
1305 = vect_analyze_loop_form_1 (loop, &loop_cond,
1306 &assumptions, &number_of_iterationsm1,
1307 &number_of_iterations, &inner_loop_cond);
1308 if (!res)
1309 return opt_loop_vec_info::propagate_failure (res);
1310
1311 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1312 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1313 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1314 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1315 if (!integer_onep (assumptions))
1316 {
1317 /* We consider vectorizing this loop by versioning it under
1318 some assumptions. In order to do this, we need to clear
1319 existing information computed by scev and niter analyzer. */
1320 scev_reset_htab ();
1321 free_numbers_of_iterations_estimates (loop);
1322 /* Also set flag for this loop so that following scev and niter
1323 analysis are done under the assumptions. */
1324 loop_constraint_set (loop, LOOP_C_FINITE);
1325 /* Also record the assumptions for versioning. */
1326 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1327 }
1328
1329 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1330 {
1331 if (dump_enabled_p ())
1332 {
1333 dump_printf_loc (MSG_NOTE, vect_location,
1334 "Symbolic number of iterations is ");
1335 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1336 dump_printf (MSG_NOTE, "\n");
1337 }
1338 }
1339
1340 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1341 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1342 if (inner_loop_cond)
1343 {
1344 stmt_vec_info inner_loop_cond_info
1345 = loop_vinfo->lookup_stmt (inner_loop_cond);
1346 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 }
1348
1349 gcc_assert (!loop->aux);
1350 loop->aux = loop_vinfo;
1351 return opt_loop_vec_info::success (loop_vinfo);
1352 }
1353
1354
1355
1356 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1357 statements, update the vectorization factor. */
1358
1359 static void
1360 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1361 {
1362 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1363 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1364 int nbbs = loop->num_nodes;
1365 poly_uint64 vectorization_factor;
1366 int i;
1367
1368 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1369
1370 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1371 gcc_assert (known_ne (vectorization_factor, 0U));
1372
1373 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1374 vectorization factor of the loop is the unrolling factor required by
1375 the SLP instances. If that unrolling factor is 1, we say that we
1376 perform pure SLP on the loop; cross-iteration parallelism is not
1377 exploited. */
1378 bool only_slp_in_loop = true;
1379 for (i = 0; i < nbbs; i++)
1380 {
1381 basic_block bb = bbs[i];
1382 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1383 gsi_next (&si))
1384 {
1385 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1386 if (!stmt_info)
1387 continue;
1388 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1389 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1390 && !PURE_SLP_STMT (stmt_info))
1391 /* STMT needs both SLP and loop-based vectorization. */
1392 only_slp_in_loop = false;
1393 }
1394 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1395 gsi_next (&si))
1396 {
1397 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1398 stmt_info = vect_stmt_to_vectorize (stmt_info);
1399 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1400 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1401 && !PURE_SLP_STMT (stmt_info))
1402 /* STMT needs both SLP and loop-based vectorization. */
1403 only_slp_in_loop = false;
1404 }
1405 }
1406
1407 if (only_slp_in_loop)
1408 {
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains only SLP stmts\n");
1412 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413 }
1414 else
1415 {
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "Loop contains SLP and non-SLP stmts\n");
1419 /* Both the vectorization factor and unroll factor have the form
1420 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1421 so they must have a common multiple. */
1422 vectorization_factor
1423 = force_common_multiple (vectorization_factor,
1424 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1425 }
1426
1427 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1428 if (dump_enabled_p ())
1429 {
1430 dump_printf_loc (MSG_NOTE, vect_location,
1431 "Updating vectorization factor to ");
1432 dump_dec (MSG_NOTE, vectorization_factor);
1433 dump_printf (MSG_NOTE, ".\n");
1434 }
1435 }
1436
1437 /* Return true if STMT_INFO describes a double reduction phi and if
1438 the other phi in the reduction is also relevant for vectorization.
1439 This rejects cases such as:
1440
1441 outer1:
1442 x_1 = PHI <x_3(outer2), ...>;
1443 ...
1444
1445 inner:
1446 x_2 = ...;
1447 ...
1448
1449 outer2:
1450 x_3 = PHI <x_2(inner)>;
1451
1452 if nothing in x_2 or elsewhere makes x_1 relevant. */
1453
1454 static bool
1455 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1456 {
1457 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1458 return false;
1459
1460 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1461 }
1462
1463 /* Function vect_analyze_loop_operations.
1464
1465 Scan the loop stmts and make sure they are all vectorizable. */
1466
1467 static opt_result
1468 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1469 {
1470 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1471 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1472 int nbbs = loop->num_nodes;
1473 int i;
1474 stmt_vec_info stmt_info;
1475 bool need_to_vectorize = false;
1476 bool ok;
1477
1478 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1479
1480 auto_vec<stmt_info_for_cost> cost_vec;
1481
1482 for (i = 0; i < nbbs; i++)
1483 {
1484 basic_block bb = bbs[i];
1485
1486 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1487 gsi_next (&si))
1488 {
1489 gphi *phi = si.phi ();
1490 ok = true;
1491
1492 stmt_info = loop_vinfo->lookup_stmt (phi);
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1495 if (virtual_operand_p (gimple_phi_result (phi)))
1496 continue;
1497
1498 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1499 (i.e., a phi in the tail of the outer-loop). */
1500 if (! is_loop_header_bb_p (bb))
1501 {
1502 /* FORNOW: we currently don't support the case that these phis
1503 are not used in the outer loop (unless it is a double reduction,
1504 i.e., this phi is vect_reduction_def), because this case
1505 would require us to actually do something here. */
1506 if (STMT_VINFO_LIVE_P (stmt_info)
1507 && !vect_active_double_reduction_p (stmt_info))
1508 return opt_result::failure_at (phi,
1509 "Unsupported loop-closed phi"
1510 " in outer-loop.\n");
1511
1512 /* If PHI is used in the outer loop, we check that its operand
1513 is defined in the inner loop. */
1514 if (STMT_VINFO_RELEVANT_P (stmt_info))
1515 {
1516 tree phi_op;
1517
1518 if (gimple_phi_num_args (phi) != 1)
1519 return opt_result::failure_at (phi, "unsupported phi");
1520
1521 phi_op = PHI_ARG_DEF (phi, 0);
1522 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1523 if (!op_def_info)
1524 return opt_result::failure_at (phi, "unsupported phi\n");
1525
1526 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1527 && (STMT_VINFO_RELEVANT (op_def_info)
1528 != vect_used_in_outer_by_reduction))
1529 return opt_result::failure_at (phi, "unsupported phi\n");
1530
1531 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1532 || (STMT_VINFO_DEF_TYPE (stmt_info)
1533 == vect_double_reduction_def))
1534 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1535 return opt_result::failure_at (phi, "unsupported phi\n");
1536 }
1537
1538 continue;
1539 }
1540
1541 gcc_assert (stmt_info);
1542
1543 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1544 || STMT_VINFO_LIVE_P (stmt_info))
1545 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1546 /* A scalar-dependence cycle that we don't support. */
1547 return opt_result::failure_at (phi,
1548 "not vectorized:"
1549 " scalar dependence cycle.\n");
1550
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1552 {
1553 need_to_vectorize = true;
1554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1555 && ! PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1557 &cost_vec);
1558 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1559 || (STMT_VINFO_DEF_TYPE (stmt_info)
1560 == vect_double_reduction_def)
1561 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1562 && ! PURE_SLP_STMT (stmt_info))
1563 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1564 }
1565
1566 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1567 if (ok
1568 && STMT_VINFO_LIVE_P (stmt_info)
1569 && !PURE_SLP_STMT (stmt_info))
1570 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1571 -1, false, &cost_vec);
1572
1573 if (!ok)
1574 return opt_result::failure_at (phi,
1575 "not vectorized: relevant phi not "
1576 "supported: %G",
1577 static_cast <gimple *> (phi));
1578 }
1579
1580 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1581 gsi_next (&si))
1582 {
1583 gimple *stmt = gsi_stmt (si);
1584 if (!gimple_clobber_p (stmt))
1585 {
1586 opt_result res
1587 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1588 &need_to_vectorize,
1589 NULL, NULL, &cost_vec);
1590 if (!res)
1591 return res;
1592 }
1593 }
1594 } /* bbs */
1595
1596 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1597
1598 /* All operations in the loop are either irrelevant (deal with loop
1599 control, or dead), or only used outside the loop and can be moved
1600 out of the loop (e.g. invariants, inductions). The loop can be
1601 optimized away by scalar optimizations. We're better off not
1602 touching this loop. */
1603 if (!need_to_vectorize)
1604 {
1605 if (dump_enabled_p ())
1606 dump_printf_loc (MSG_NOTE, vect_location,
1607 "All the computation can be taken out of the loop.\n");
1608 return opt_result::failure_at
1609 (vect_location,
1610 "not vectorized: redundant loop. no profit to vectorize.\n");
1611 }
1612
1613 return opt_result::success ();
1614 }
1615
1616 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1617 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1618 definitely no, or -1 if it's worth retrying. */
1619
1620 static int
1621 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1622 {
1623 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1624 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1625
1626 /* Only fully-masked loops can have iteration counts less than the
1627 vectorization factor. */
1628 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1629 {
1630 HOST_WIDE_INT max_niter;
1631
1632 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1633 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1634 else
1635 max_niter = max_stmt_executions_int (loop);
1636
1637 if (max_niter != -1
1638 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1639 {
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: iteration count smaller than "
1643 "vectorization factor.\n");
1644 return 0;
1645 }
1646 }
1647
1648 int min_profitable_iters, min_profitable_estimate;
1649 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1650 &min_profitable_estimate);
1651
1652 if (min_profitable_iters < 0)
1653 {
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656 "not vectorized: vectorization not profitable.\n");
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1659 "not vectorized: vector version will never be "
1660 "profitable.\n");
1661 return -1;
1662 }
1663
1664 int min_scalar_loop_bound = (param_min_vect_loop_bound
1665 * assumed_vf);
1666
1667 /* Use the cost model only if it is more conservative than user specified
1668 threshold. */
1669 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1670 min_profitable_iters);
1671
1672 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1673
1674 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1675 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1676 {
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "not vectorized: vectorization not profitable.\n");
1680 if (dump_enabled_p ())
1681 dump_printf_loc (MSG_NOTE, vect_location,
1682 "not vectorized: iteration count smaller than user "
1683 "specified loop bound parameter or minimum profitable "
1684 "iterations (whichever is more conservative).\n");
1685 return 0;
1686 }
1687
1688   /* The static profitability threshold min_profitable_estimate includes
1689 the cost of having to check at runtime whether the scalar loop
1690 should be used instead. If it turns out that we don't need or want
1691 such a check, the threshold we should use for the static estimate
1692 is simply the point at which the vector loop becomes more profitable
1693 than the scalar loop. */
1694 if (min_profitable_estimate > min_profitable_iters
1695 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1696 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1697 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1698 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1699 {
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1702 " choice between the scalar and vector loops\n");
1703 min_profitable_estimate = min_profitable_iters;
1704 }
1705
1706 HOST_WIDE_INT estimated_niter;
1707
1708 /* If we are vectorizing an epilogue then we know the maximum number of
1709 scalar iterations it will cover is at least one lower than the
1710 vectorization factor of the main loop. */
1711 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1712 estimated_niter
1713 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1714 else
1715 {
1716 estimated_niter = estimated_stmt_executions_int (loop);
1717 if (estimated_niter == -1)
1718 estimated_niter = likely_max_stmt_executions_int (loop);
1719 }
1720 if (estimated_niter != -1
1721 && ((unsigned HOST_WIDE_INT) estimated_niter
1722 < MAX (th, (unsigned) min_profitable_estimate)))
1723 {
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1726 "not vectorized: estimated iteration count too "
1727 "small.\n");
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_NOTE, vect_location,
1730 "not vectorized: estimated iteration count smaller "
1731 "than specified loop bound parameter or minimum "
1732 "profitable iterations (whichever is more "
1733 "conservative).\n");
1734 return -1;
1735 }
1736
1737 return 1;
1738 }
1739
1740 static opt_result
1741 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1742 vec<data_reference_p> *datarefs,
1743 unsigned int *n_stmts)
1744 {
1745 *n_stmts = 0;
1746 for (unsigned i = 0; i < loop->num_nodes; i++)
1747 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1748 !gsi_end_p (gsi); gsi_next (&gsi))
1749 {
1750 gimple *stmt = gsi_stmt (gsi);
1751 if (is_gimple_debug (stmt))
1752 continue;
1753 ++(*n_stmts);
1754 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1755 if (!res)
1756 {
1757 if (is_gimple_call (stmt) && loop->safelen)
1758 {
1759 tree fndecl = gimple_call_fndecl (stmt), op;
1760 if (fndecl != NULL_TREE)
1761 {
1762 cgraph_node *node = cgraph_node::get (fndecl);
1763 if (node != NULL && node->simd_clones != NULL)
1764 {
1765 unsigned int j, n = gimple_call_num_args (stmt);
1766 for (j = 0; j < n; j++)
1767 {
1768 op = gimple_call_arg (stmt, j);
1769 if (DECL_P (op)
1770 || (REFERENCE_CLASS_P (op)
1771 && get_base_address (op)))
1772 break;
1773 }
1774 op = gimple_call_lhs (stmt);
1775 /* Ignore #pragma omp declare simd functions
1776 if they don't have data references in the
1777 call stmt itself. */
1778 if (j == n
1779 && !(op
1780 && (DECL_P (op)
1781 || (REFERENCE_CLASS_P (op)
1782 && get_base_address (op)))))
1783 continue;
1784 }
1785 }
1786 }
1787 return res;
1788 }
1789 /* If dependence analysis will give up due to the limit on the
1790 	   number of datarefs, stop here and fail fatally.  */
1791 if (datarefs->length ()
1792 > (unsigned)param_loop_max_datarefs_for_datadeps)
1793 return opt_result::failure_at (stmt, "exceeded param "
1794 "loop-max-datarefs-for-datadeps\n");
1795 }
1796 return opt_result::success ();
1797 }
1798
1799 /* Look for SLP-only access groups and turn each individual access into its own
1800 group. */
1801 static void
1802 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1803 {
1804 unsigned int i;
1805 struct data_reference *dr;
1806
1807 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1808
1809 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1810 FOR_EACH_VEC_ELT (datarefs, i, dr)
1811 {
1812 gcc_assert (DR_REF (dr));
1813 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1814
1815       /* Check if the access is part of an interleaving chain.  */
1816 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1817 {
1818 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1819 unsigned int group_size = DR_GROUP_SIZE (first_element);
1820
1821 	  /* Check for an SLP-only group.  */
1822 if (!STMT_SLP_TYPE (stmt_info)
1823 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1824 {
1825 /* Dissolve the group. */
1826 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1827
1828 stmt_vec_info vinfo = first_element;
1829 while (vinfo)
1830 {
1831 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1832 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1833 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1834 DR_GROUP_SIZE (vinfo) = 1;
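		  /* The dissolved element now accesses only every
		     GROUP_SIZE-th scalar, so for non-strided accesses the
		     skipped GROUP_SIZE - 1 elements are recorded as a gap.  */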
1835 if (STMT_VINFO_STRIDED_P (first_element))
1836 DR_GROUP_GAP (vinfo) = 0;
1837 else
1838 DR_GROUP_GAP (vinfo) = group_size - 1;
1839 vinfo = next;
1840 }
1841 }
1842 }
1843 }
1844 }
1845
1846
1847 /* Decides whether we need to create an epilogue loop to handle
1848    remaining scalar iterations and sets LOOP_VINFO_PEELING_FOR_NITER accordingly.  */
1849
1850 void
1851 determine_peel_for_niter (loop_vec_info loop_vinfo)
1852 {
1853 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1854
1855 unsigned HOST_WIDE_INT const_vf;
1856 HOST_WIDE_INT max_niter
1857 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1858
1859 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1860 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1861 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1862 (loop_vinfo));
1863
1864 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1865 /* The main loop handles all iterations. */
1866 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1867 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1868 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1869 {
1870 /* Work out the (constant) number of iterations that need to be
1871 peeled for reasons other than niters. */
1872 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1873 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1874 peel_niter += 1;
1875 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1876 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1877 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1878 }
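  /* Otherwise an epilogue is needed unless the (possibly symbolic) niters
     is known to be a multiple of the constant VF: tree_ctz gives the number
     of known trailing zero bits of niters, so comparing it against
     log2 (VF) tests divisibility.  */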
1879 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880 /* ??? When peeling for gaps but not alignment, we could
1881 try to check whether the (variable) niters is known to be
1882 VF * N + 1. That's something of a niche case though. */
1883 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1884 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1885 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1886 < (unsigned) exact_log2 (const_vf))
1887 /* In case of versioning, check if the maximum number of
1888 iterations is greater than th. If they are identical,
1889 the epilogue is unnecessary. */
1890 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1891 || ((unsigned HOST_WIDE_INT) max_niter
1892 > (th / const_vf) * const_vf))))
1893 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1894 }
1895
1896
1897 /* Function vect_analyze_loop_2.
1898
1899 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1900 for it. The different analyses will record information in the
1901 loop_vec_info struct. */
1902 static opt_result
1903 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1904 {
1905 opt_result ok = opt_result::success ();
1906 int res;
1907 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1908 poly_uint64 min_vf = 2;
1909 loop_vec_info orig_loop_vinfo = NULL;
1910
1911 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1912 loop_vec_info of the first vectorized loop. */
1913 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1914 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1915 else
1916 orig_loop_vinfo = loop_vinfo;
1917 gcc_assert (orig_loop_vinfo);
1918
1919 /* The first group of checks is independent of the vector size. */
1920 fatal = true;
1921
1922 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1923 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1924 return opt_result::failure_at (vect_location,
1925 "not vectorized: simd if(0)\n");
1926
1927 /* Find all data references in the loop (which correspond to vdefs/vuses)
1928 and analyze their evolution in the loop. */
1929
1930 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1931
1932 /* Gather the data references and count stmts in the loop. */
1933 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1934 {
1935 opt_result res
1936 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1937 &LOOP_VINFO_DATAREFS (loop_vinfo),
1938 n_stmts);
1939 if (!res)
1940 {
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943 "not vectorized: loop contains function "
1944 "calls or data references that cannot "
1945 "be analyzed\n");
1946 return res;
1947 }
1948 loop_vinfo->shared->save_datarefs ();
1949 }
1950 else
1951 loop_vinfo->shared->check_datarefs ();
1952
1953 /* Analyze the data references and also adjust the minimal
1954 vectorization factor according to the loads and stores. */
1955
1956 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1957 if (!ok)
1958 {
1959 if (dump_enabled_p ())
1960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1961 "bad data references.\n");
1962 return ok;
1963 }
1964
1965 /* Classify all cross-iteration scalar data-flow cycles.
1966 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1967 vect_analyze_scalar_cycles (loop_vinfo);
1968
1969 vect_pattern_recog (loop_vinfo);
1970
1971 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1972
1973 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1974 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1975
1976 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1977 if (!ok)
1978 {
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "bad data access.\n");
1982 return ok;
1983 }
1984
1985 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1986
1987 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1988 if (!ok)
1989 {
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 "unexpected pattern.\n");
1993 return ok;
1994 }
1995
1996   /* From here on the analysis depends on the vector size, so failures are no longer fatal.  */
1997 fatal = false;
1998
1999 /* Analyze data dependences between the data-refs in the loop
2000 and adjust the maximum vectorization factor according to
2001 the dependences.
2002 FORNOW: fail at the first data dependence that we encounter. */
2003
2004 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2005 if (!ok)
2006 {
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "bad data dependence.\n");
2010 return ok;
2011 }
2012 if (max_vf != MAX_VECTORIZATION_FACTOR
2013 && maybe_lt (max_vf, min_vf))
2014 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2015 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2016
2017 ok = vect_determine_vectorization_factor (loop_vinfo);
2018 if (!ok)
2019 {
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2022 "can't determine vectorization factor.\n");
2023 return ok;
2024 }
2025 if (max_vf != MAX_VECTORIZATION_FACTOR
2026 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2027 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2028
2029 /* Compute the scalar iteration cost. */
2030 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2031
2032 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2033
2034 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2035 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2036 if (!ok)
2037 return ok;
2038
2039 /* If there are any SLP instances mark them as pure_slp. */
2040 bool slp = vect_make_slp_decision (loop_vinfo);
2041 if (slp)
2042 {
2043 /* Find stmts that need to be both vectorized and SLPed. */
2044 vect_detect_hybrid_slp (loop_vinfo);
2045
2046 /* Update the vectorization factor based on the SLP decision. */
2047 vect_update_vf_for_slp (loop_vinfo);
2048 }
2049
2050 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2051
2052 /* We don't expect to have to roll back to anything other than an empty
2053 set of rgroups. */
2054 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2055
2056 /* This is the point where we can re-start analysis with SLP forced off. */
2057 start_over:
2058
2059 /* Now the vectorization factor is final. */
2060 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2061 gcc_assert (known_ne (vectorization_factor, 0U));
2062
2063 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2064 {
2065 dump_printf_loc (MSG_NOTE, vect_location,
2066 "vectorization_factor = ");
2067 dump_dec (MSG_NOTE, vectorization_factor);
2068 dump_printf (MSG_NOTE, ", niters = %wd\n",
2069 LOOP_VINFO_INT_NITERS (loop_vinfo));
2070 }
2071
2072 /* Analyze the alignment of the data-refs in the loop.
2073 Fail if a data reference is found that cannot be vectorized. */
2074
2075 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2076 if (!ok)
2077 {
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080 "bad data alignment.\n");
2081 return ok;
2082 }
2083
2084 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2085 It is important to call pruning after vect_analyze_data_ref_accesses,
2086 since we use grouping information gathered by interleaving analysis. */
2087 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2088 if (!ok)
2089 return ok;
2090
2091 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2092 vectorization, since we do not want to add extra peeling or
2093 add versioning for alignment. */
2094 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2095 /* This pass will decide on using loop versioning and/or loop peeling in
2096 order to enhance the alignment of data references in the loop. */
2097 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2098 else
2099 ok = vect_verify_datarefs_alignment (loop_vinfo);
2100 if (!ok)
2101 return ok;
2102
2103 if (slp)
2104 {
2105 /* Analyze operations in the SLP instances. Note this may
2106 remove unsupported SLP instances which makes the above
2107 SLP kind detection invalid. */
2108 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2109 vect_slp_analyze_operations (loop_vinfo);
2110 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2111 {
2112 ok = opt_result::failure_at (vect_location,
2113 "unsupported SLP instances\n");
2114 goto again;
2115 }
2116 }
2117
2118 /* Dissolve SLP-only groups. */
2119 vect_dissolve_slp_only_groups (loop_vinfo);
2120
2121 /* Scan all the remaining operations in the loop that are not subject
2122 to SLP and make sure they are vectorizable. */
2123 ok = vect_analyze_loop_operations (loop_vinfo);
2124 if (!ok)
2125 {
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "bad operation or unsupported loop bound.\n");
2129 return ok;
2130 }
2131
2132 /* Decide whether to use a fully-masked loop for this vectorization
2133 factor. */
2134 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2135 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2136 && vect_verify_full_masking (loop_vinfo));
2137 if (dump_enabled_p ())
2138 {
2139 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2140 dump_printf_loc (MSG_NOTE, vect_location,
2141 "using a fully-masked loop.\n");
2142 else
2143 dump_printf_loc (MSG_NOTE, vect_location,
2144 "not using a fully-masked loop.\n");
2145 }
2146
2147 /* If epilog loop is required because of data accesses with gaps,
2148 one additional iteration needs to be peeled. Check if there is
2149 enough iterations for vectorization. */
2150 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2151 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2152 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2153 {
2154 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2155 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2156
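      /* NITERSM1 is the number of latch executions, i.e. niters - 1;
	 requiring it to be at least VF guarantees that one full vector
	 iteration remains even after peeling the extra iteration for
	 the gap.  */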
2157 if (known_lt (wi::to_widest (scalar_niters), vf))
2158 return opt_result::failure_at (vect_location,
2159 					   "loop does not have enough iterations to"
2160 " support peeling for gaps.\n");
2161 }
2162
2163 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2164 loop or a loop that has a lower VF than the main loop. */
2165 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2166 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2167 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2168 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2169 return opt_result::failure_at (vect_location,
2170 "Vectorization factor too high for"
2171 " epilogue loop.\n");
2172
2173 /* Check the costings of the loop make vectorizing worthwhile. */
2174 res = vect_analyze_loop_costing (loop_vinfo);
2175 if (res < 0)
2176 {
2177 ok = opt_result::failure_at (vect_location,
2178 "Loop costings may not be worthwhile.\n");
2179 goto again;
2180 }
2181 if (!res)
2182 return opt_result::failure_at (vect_location,
2183 "Loop costings not worthwhile.\n");
2184
2185 determine_peel_for_niter (loop_vinfo);
2186 /* If an epilogue loop is required make sure we can create one. */
2187 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2188 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2189 {
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2192 if (!vect_can_advance_ivs_p (loop_vinfo)
2193 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2194 single_exit (LOOP_VINFO_LOOP
2195 (loop_vinfo))))
2196 {
2197 ok = opt_result::failure_at (vect_location,
2198 "not vectorized: can't create required "
2199 "epilog loop\n");
2200 goto again;
2201 }
2202 }
2203
2204 /* During peeling, we need to check if number of loop iterations is
2205 enough for both peeled prolog loop and vector loop. This check
2206 can be merged along with threshold check of loop versioning, so
2207 increase threshold for this case if necessary.
2208
2209 If we are analyzing an epilogue we still want to check what its
2210 versioning threshold would be. If we decide to vectorize the epilogues we
2211 will want to use the lowest versioning threshold of all epilogues and main
2212 loop. This will enable us to enter a vectorized epilogue even when
2213 versioning the loop. We can't simply check whether the epilogue requires
2214 versioning though since we may have skipped some versioning checks when
2215 analyzing the epilogue. For instance, checks for alias versioning will be
2216 skipped when dealing with epilogues as we assume we already checked them
2217 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2218 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2219 {
2220 poly_uint64 niters_th = 0;
2221 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2222
2223 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2224 {
2225 /* Niters for peeled prolog loop. */
2226 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2227 {
2228 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2229 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
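	      /* A runtime misalignment means the prologue may peel anywhere
		 from 0 to nunits - 1 iterations, so budget for the worst
		 case.  */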
2230 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2231 }
2232 else
2233 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2234 }
2235
2236 /* Niters for at least one iteration of vectorized loop. */
2237 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2238 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2239 /* One additional iteration because of peeling for gap. */
2240 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2241 niters_th += 1;
2242
2243 /* Use the same condition as vect_transform_loop to decide when to use
2244 the cost to determine a versioning threshold. */
2245 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2246 && ordered_p (th, niters_th))
2247 niters_th = ordered_max (poly_uint64 (th), niters_th);
2248
2249 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2250 }
2251
2252 gcc_assert (known_eq (vectorization_factor,
2253 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2254
2255 /* Ok to vectorize! */
2256 return opt_result::success ();
2257
2258 again:
2259 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2260 gcc_assert (!ok);
2261
2262 /* Try again with SLP forced off but if we didn't do any SLP there is
2263 no point in re-trying. */
2264 if (!slp)
2265 return ok;
2266
2267 /* If there are reduction chains re-trying will fail anyway. */
2268 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2269 return ok;
2270
2271 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2272 via interleaving or lane instructions. */
2273 slp_instance instance;
2274 slp_tree node;
2275 unsigned i, j;
2276 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2277 {
2278 stmt_vec_info vinfo;
2279 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2280 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2281 continue;
2282 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2283 unsigned int size = DR_GROUP_SIZE (vinfo);
2284 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2285 if (! vect_store_lanes_supported (vectype, size, false)
2286 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2287 && ! vect_grouped_store_supported (vectype, size))
2288 return opt_result::failure_at (vinfo->stmt,
2289 "unsupported grouped store\n");
2290 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2291 {
2292 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2293 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2294 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2295 size = DR_GROUP_SIZE (vinfo);
2296 vectype = STMT_VINFO_VECTYPE (vinfo);
2297 if (! vect_load_lanes_supported (vectype, size, false)
2298 && ! vect_grouped_load_supported (vectype, single_element_p,
2299 size))
2300 return opt_result::failure_at (vinfo->stmt,
2301 "unsupported grouped load\n");
2302 }
2303 }
2304
2305 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_NOTE, vect_location,
2307 "re-trying with SLP disabled\n");
2308
2309 /* Roll back state appropriately. No SLP this time. */
2310 slp = false;
2311   /* Restore the vectorization factor as it was without SLP.  */
2312 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2313 /* Free the SLP instances. */
2314 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2315 vect_free_slp_instance (instance, false);
2316 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2317 /* Reset SLP type to loop_vect on all stmts. */
2318 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2319 {
2320 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2321 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2322 !gsi_end_p (si); gsi_next (&si))
2323 {
2324 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2325 STMT_SLP_TYPE (stmt_info) = loop_vect;
2326 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2327 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2328 {
2329 /* vectorizable_reduction adjusts reduction stmt def-types,
2330 restore them to that of the PHI. */
2331 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2332 = STMT_VINFO_DEF_TYPE (stmt_info);
2333 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2334 (STMT_VINFO_REDUC_DEF (stmt_info)))
2335 = STMT_VINFO_DEF_TYPE (stmt_info);
2336 }
2337 }
2338 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2339 !gsi_end_p (si); gsi_next (&si))
2340 {
2341 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2342 STMT_SLP_TYPE (stmt_info) = loop_vect;
2343 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2344 {
2345 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2346 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2347 STMT_SLP_TYPE (stmt_info) = loop_vect;
2348 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2349 !gsi_end_p (pi); gsi_next (&pi))
2350 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2351 = loop_vect;
2352 }
2353 }
2354 }
2355 /* Free optimized alias test DDRS. */
2356 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2357 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2358 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2359 /* Reset target cost data. */
2360 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2361 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2362 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2363 /* Reset accumulated rgroup information. */
2364 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2365 /* Reset assorted flags. */
2366 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2367 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2368 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2369 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2370 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2371
2372 goto start_over;
2373 }
2374
2375 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2376 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2377 OLD_LOOP_VINFO is better unless something specifically indicates
2378 otherwise.
2379
2380 Note that this deliberately isn't a partial order. */
2381
2382 static bool
2383 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2384 loop_vec_info old_loop_vinfo)
2385 {
2386 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2387 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2388
2389 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2390 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2391
2392 /* Always prefer a VF of loop->simdlen over any other VF. */
2393 if (loop->simdlen)
2394 {
2395 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2396 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2397 if (new_simdlen_p != old_simdlen_p)
2398 return new_simdlen_p;
2399 }
2400
2401 /* Limit the VFs to what is likely to be the maximum number of iterations,
2402 to handle cases in which at least one loop_vinfo is fully-masked. */
2403 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2404 if (estimated_max_niter != -1)
2405 {
2406 if (known_le (estimated_max_niter, new_vf))
2407 new_vf = estimated_max_niter;
2408 if (known_le (estimated_max_niter, old_vf))
2409 old_vf = estimated_max_niter;
2410 }
2411
2412 /* Check whether the (fractional) cost per scalar iteration is lower
2413 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2414 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2415 * poly_widest_int (old_vf));
2416 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2417 * poly_widest_int (new_vf));
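  /* For example, new_inside_cost = 6 with new_vf = 4 against
     old_inside_cost = 4 with old_vf = 2 gives rel_new = 6 * 2 = 12 and
     rel_old = 4 * 4 = 16, i.e. the new loop body is cheaper per scalar
     iteration (6/4 < 4/2).  */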
2418 if (maybe_lt (rel_old, rel_new))
2419 {
2420 /* When old_loop_vinfo uses a variable vectorization factor,
2421 we know that it has a lower cost for at least one runtime VF.
2422 However, we don't know how likely that VF is.
2423
2424 One option would be to compare the costs for the estimated VFs.
2425 The problem is that that can put too much pressure on the cost
2426 model. E.g. if the estimated VF is also the lowest possible VF,
2427 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2428 for the estimated VF, we'd then choose new_loop_vinfo even
2429 though (a) new_loop_vinfo might not actually be better than
2430 old_loop_vinfo for that VF and (b) it would be significantly
2431 worse at larger VFs.
2432
2433 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2434 no more expensive than old_loop_vinfo even after doubling the
2435 estimated old_loop_vinfo VF. For all but trivial loops, this
2436 ensures that we only pick new_loop_vinfo if it is significantly
2437 better than old_loop_vinfo at the estimated VF. */
2438 if (rel_new.is_constant ())
2439 return false;
2440
2441 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2442 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2443 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2444 * widest_int (old_estimated_vf));
2445 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2446 * widest_int (new_estimated_vf));
2447 return estimated_rel_new * 2 <= estimated_rel_old;
2448 }
2449 if (known_lt (rel_new, rel_old))
2450 return true;
2451
2452 /* If there's nothing to choose between the loop bodies, see whether
2453 there's a difference in the prologue and epilogue costs. */
2454 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2455 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2456
2457 return false;
2458 }
2459
2460 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2461 true if we should. */
2462
2463 static bool
2464 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2465 loop_vec_info old_loop_vinfo)
2466 {
2467 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2468 return false;
2469
2470 if (dump_enabled_p ())
2471 dump_printf_loc (MSG_NOTE, vect_location,
2472 "***** Preferring vector mode %s to vector mode %s\n",
2473 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2474 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2475 return true;
2476 }
2477
2478 /* Function vect_analyze_loop.
2479
2480 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2481 for it. The different analyses will record information in the
2482 loop_vec_info struct. */
2483 opt_loop_vec_info
2484 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2485 {
2486 auto_vector_modes vector_modes;
2487
2488   /* Autodetect the first vector mode to try.  */
2489 unsigned int autovec_flags
2490 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2491 loop->simdlen != 0);
2492 unsigned int mode_i = 0;
2493
2494 DUMP_VECT_SCOPE ("analyze_loop_nest");
2495
2496 if (loop_outer (loop)
2497 && loop_vec_info_for_loop (loop_outer (loop))
2498 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2499 return opt_loop_vec_info::failure_at (vect_location,
2500 "outer-loop already vectorized.\n");
2501
2502 if (!find_loop_nest (loop, &shared->loop_nest))
2503 return opt_loop_vec_info::failure_at
2504 (vect_location,
2505 "not vectorized: loop nest containing two or more consecutive inner"
2506 " loops cannot be vectorized\n");
2507
2508 unsigned n_stmts = 0;
2509 machine_mode autodetected_vector_mode = VOIDmode;
2510 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2511 machine_mode next_vector_mode = VOIDmode;
2512 poly_uint64 lowest_th = 0;
2513 unsigned vectorized_loops = 0;
2514 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2515 && !unlimited_cost_model (loop));
2516
2517 bool vect_epilogues = false;
2518 opt_result res = opt_result::success ();
2519 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2520 while (1)
2521 {
2522 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2523 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2524 if (!loop_vinfo)
2525 {
2526 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "bad loop form.\n");
2529 gcc_checking_assert (first_loop_vinfo == NULL);
2530 return loop_vinfo;
2531 }
2532 loop_vinfo->vector_mode = next_vector_mode;
2533
2534 bool fatal = false;
2535
2536 /* When pick_lowest_cost_p is true, we should in principle iterate
2537 over all the loop_vec_infos that LOOP_VINFO could replace and
2538 try to vectorize LOOP_VINFO under the same conditions.
2539 E.g. when trying to replace an epilogue loop, we should vectorize
2540 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2541 to replace the main loop, we should vectorize LOOP_VINFO as a main
2542 loop too.
2543
2544 However, autovectorize_vector_modes is usually sorted as follows:
2545
2546 - Modes that naturally produce lower VFs usually follow modes that
2547 naturally produce higher VFs.
2548
2549 - When modes naturally produce the same VF, maskable modes
2550 usually follow unmaskable ones, so that the maskable mode
2551 can be used to vectorize the epilogue of the unmaskable mode.
2552
2553 This order is preferred because it leads to the maximum
2554 epilogue vectorization opportunities. Targets should only use
2555 a different order if they want to make wide modes available while
2556 disparaging them relative to earlier, smaller modes. The assumption
2557 in that case is that the wider modes are more expensive in some
2558 way that isn't reflected directly in the costs.
2559
2560 There should therefore be few interesting cases in which
2561 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2562 treated as a standalone loop, and ends up being genuinely cheaper
2563 than FIRST_LOOP_VINFO. */
2564 if (vect_epilogues)
2565 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2566
2567 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2568 if (mode_i == 0)
2569 autodetected_vector_mode = loop_vinfo->vector_mode;
2570 if (dump_enabled_p ())
2571 {
2572 if (res)
2573 dump_printf_loc (MSG_NOTE, vect_location,
2574 "***** Analysis succeeded with vector mode %s\n",
2575 GET_MODE_NAME (loop_vinfo->vector_mode));
2576 else
2577 dump_printf_loc (MSG_NOTE, vect_location,
2578 "***** Analysis failed with vector mode %s\n",
2579 GET_MODE_NAME (loop_vinfo->vector_mode));
2580 }
2581
2582 loop->aux = NULL;
2583
2584 if (!fatal)
2585 while (mode_i < vector_modes.length ()
2586 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2587 {
2588 if (dump_enabled_p ())
2589 dump_printf_loc (MSG_NOTE, vect_location,
2590 "***** The result for vector mode %s would"
2591 " be the same\n",
2592 GET_MODE_NAME (vector_modes[mode_i]));
2593 mode_i += 1;
2594 }
2595
2596 if (res)
2597 {
2598 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2599 vectorized_loops++;
2600
2601 /* Once we hit the desired simdlen for the first time,
2602 discard any previous attempts. */
2603 if (simdlen
2604 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2605 {
2606 delete first_loop_vinfo;
2607 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2608 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2609 simdlen = 0;
2610 }
2611 else if (pick_lowest_cost_p && first_loop_vinfo)
2612 {
2613 /* Keep trying to roll back vectorization attempts while the
2614 loop_vec_infos they produced were worse than this one. */
2615 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2616 while (!vinfos.is_empty ()
2617 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2618 {
2619 gcc_assert (vect_epilogues);
2620 delete vinfos.pop ();
2621 }
2622 if (vinfos.is_empty ()
2623 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2624 {
2625 delete first_loop_vinfo;
2626 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2627 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2628 }
2629 }
2630
2631 if (first_loop_vinfo == NULL)
2632 {
2633 first_loop_vinfo = loop_vinfo;
2634 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2635 }
2636 else if (vect_epilogues
2637 /* For now only allow one epilogue loop. */
2638 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2639 {
2640 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2641 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2642 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2643 || maybe_ne (lowest_th, 0U));
2644 /* Keep track of the known smallest versioning
2645 threshold. */
2646 if (ordered_p (lowest_th, th))
2647 lowest_th = ordered_min (lowest_th, th);
2648 }
2649 else
2650 delete loop_vinfo;
2651
2652 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2653 enabled, SIMDUID is not set, it is the innermost loop and we have
2654 either already found the loop's SIMDLEN or there was no SIMDLEN to
2655 begin with.
2656 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2657 vect_epilogues = (!simdlen
2658 && loop->inner == NULL
2659 && param_vect_epilogues_nomask
2660 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2661 && !loop->simduid
2662 /* For now only allow one epilogue loop, but allow
2663 pick_lowest_cost_p to replace it. */
2664 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2665 || pick_lowest_cost_p));
2666
2667 /* Commit to first_loop_vinfo if we have no reason to try
2668 alternatives. */
2669 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2670 break;
2671 }
2672 else
2673 {
2674 delete loop_vinfo;
2675 if (fatal)
2676 {
2677 gcc_checking_assert (first_loop_vinfo == NULL);
2678 break;
2679 }
2680 }
2681
2682 if (mode_i < vector_modes.length ()
2683 && VECTOR_MODE_P (autodetected_vector_mode)
2684 && (related_vector_mode (vector_modes[mode_i],
2685 GET_MODE_INNER (autodetected_vector_mode))
2686 == autodetected_vector_mode)
2687 && (related_vector_mode (autodetected_vector_mode,
2688 GET_MODE_INNER (vector_modes[mode_i]))
2689 == vector_modes[mode_i]))
2690 {
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_NOTE, vect_location,
2693 "***** Skipping vector mode %s, which would"
2694 " repeat the analysis for %s\n",
2695 GET_MODE_NAME (vector_modes[mode_i]),
2696 GET_MODE_NAME (autodetected_vector_mode));
2697 mode_i += 1;
2698 }
2699
2700 if (mode_i == vector_modes.length ()
2701 || autodetected_vector_mode == VOIDmode)
2702 break;
2703
2704 /* Try the next biggest vector size. */
2705 next_vector_mode = vector_modes[mode_i++];
2706 if (dump_enabled_p ())
2707 dump_printf_loc (MSG_NOTE, vect_location,
2708 "***** Re-trying analysis with vector mode %s\n",
2709 GET_MODE_NAME (next_vector_mode));
2710 }
2711
2712 if (first_loop_vinfo)
2713 {
2714 loop->aux = (loop_vec_info) first_loop_vinfo;
2715 if (dump_enabled_p ())
2716 dump_printf_loc (MSG_NOTE, vect_location,
2717 "***** Choosing vector mode %s\n",
2718 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2719 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2720 return first_loop_vinfo;
2721 }
2722
2723 return opt_loop_vec_info::propagate_failure (res);
2724 }
2725
2726 /* Return true if there is an in-order reduction function for CODE, storing
2727 it in *REDUC_FN if so. */
2728
2729 static bool
2730 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2731 {
2732 switch (code)
2733 {
2734 case PLUS_EXPR:
2735 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2736 return true;
2737
2738 default:
2739 return false;
2740 }
2741 }
2742
2743 /* Function reduction_fn_for_scalar_code
2744
2745 Input:
2746    CODE - tree_code of the reduction operation.
2747
2748 Output:
2749 REDUC_FN - the corresponding internal function to be used to reduce the
2750 vector of partial results into a single scalar result, or IFN_LAST
2751 if the operation is a supported reduction operation, but does not have
2752 such an internal function.
2753
2754 Return FALSE if CODE currently cannot be vectorized as reduction. */
2755
2756 static bool
2757 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2758 {
2759 switch (code)
2760 {
2761 case MAX_EXPR:
2762 *reduc_fn = IFN_REDUC_MAX;
2763 return true;
2764
2765 case MIN_EXPR:
2766 *reduc_fn = IFN_REDUC_MIN;
2767 return true;
2768
2769 case PLUS_EXPR:
2770 *reduc_fn = IFN_REDUC_PLUS;
2771 return true;
2772
2773 case BIT_AND_EXPR:
2774 *reduc_fn = IFN_REDUC_AND;
2775 return true;
2776
2777 case BIT_IOR_EXPR:
2778 *reduc_fn = IFN_REDUC_IOR;
2779 return true;
2780
2781 case BIT_XOR_EXPR:
2782 *reduc_fn = IFN_REDUC_XOR;
2783 return true;
2784
2785 case MULT_EXPR:
2786 case MINUS_EXPR:
2787 *reduc_fn = IFN_LAST;
2788 return true;
2789
2790 default:
2791 return false;
2792 }
2793 }
2794
2795 /* If there is a neutral value X such that SLP reduction SLP_NODE would not
2796 be affected by the introduction of additional X elements, return that X,
2797 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2798 is the vector type that would hold element X. REDUC_CHAIN is true if
2799 the SLP statements perform a single reduction, false if each statement
2800 performs an independent reduction. */
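/* For example, the neutral element is 0 for PLUS_EXPR, 1 for MULT_EXPR and
   an all-ones constant for BIT_AND_EXPR, while MIN_EXPR and MAX_EXPR only
   have a neutral value (the initial value) for reduction chains.  */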
2801
2802 static tree
2803 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2804 tree_code code, bool reduc_chain)
2805 {
2806 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2807 stmt_vec_info stmt_vinfo = stmts[0];
2808 tree scalar_type = TREE_TYPE (vector_type);
2809 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2810 gcc_assert (loop);
2811
2812 switch (code)
2813 {
2814 case WIDEN_SUM_EXPR:
2815 case DOT_PROD_EXPR:
2816 case SAD_EXPR:
2817 case PLUS_EXPR:
2818 case MINUS_EXPR:
2819 case BIT_IOR_EXPR:
2820 case BIT_XOR_EXPR:
2821 return build_zero_cst (scalar_type);
2822
2823 case MULT_EXPR:
2824 return build_one_cst (scalar_type);
2825
2826 case BIT_AND_EXPR:
2827 return build_all_ones_cst (scalar_type);
2828
2829 case MAX_EXPR:
2830 case MIN_EXPR:
2831 /* For MIN/MAX the initial values are neutral. A reduction chain
2832 has only a single initial value, so that value is neutral for
2833 all statements. */
2834 if (reduc_chain)
2835 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2836 loop_preheader_edge (loop));
2837 return NULL_TREE;
2838
2839 default:
2840 return NULL_TREE;
2841 }
2842 }
2843
2844 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2845 STMT is printed with a message MSG. */
2846
2847 static void
2848 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2849 {
2850 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2851 }
2852
2853 /* Return true if we need an in-order reduction for operation CODE
2854    on type TYPE.  */
2856
2857 bool
2858 needs_fold_left_reduction_p (tree type, tree_code code)
2859 {
2860 /* CHECKME: check for !flag_finite_math_only too? */
2861 if (SCALAR_FLOAT_TYPE_P (type))
2862 switch (code)
2863 {
2864 case MIN_EXPR:
2865 case MAX_EXPR:
2866 return false;
2867
2868 default:
2869 return !flag_associative_math;
2870 }
2871
2872 if (INTEGRAL_TYPE_P (type))
2873 {
2874 if (!operation_no_trapping_overflow (type, code))
2875 return true;
2876 return false;
2877 }
2878
2879 if (SAT_FIXED_POINT_TYPE_P (type))
2880 return true;
2881
2882 return false;
2883 }
2884
2885 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2886 has a handled computation expression. Store the main reduction
2887 operation in *CODE. */
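/* For example, for  sum_1 = PHI <sum_0, sum_2>  with loop body
   sum_2 = sum_1 + a[i], the recorded path leads from the PHI's use of
   sum_2 to the addition's use of sum_1, and *CODE is set to PLUS_EXPR.  */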
2888
2889 static bool
2890 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2891 tree loop_arg, enum tree_code *code,
2892 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2893 {
2894 auto_bitmap visited;
2895 tree lookfor = PHI_RESULT (phi);
2896 ssa_op_iter curri;
2897 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2898 while (USE_FROM_PTR (curr) != loop_arg)
2899 curr = op_iter_next_use (&curri);
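  /* Mark the PHI's use iterator as exhausted so that the backtracking
     below does not wander into the PHI's other arguments.  */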
2900 curri.i = curri.numops;
2901 do
2902 {
2903 path.safe_push (std::make_pair (curri, curr));
2904 tree use = USE_FROM_PTR (curr);
2905 if (use == lookfor)
2906 break;
2907 gimple *def = SSA_NAME_DEF_STMT (use);
2908 if (gimple_nop_p (def)
2909 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2910 {
2911 pop:
2912 do
2913 {
2914 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2915 curri = x.first;
2916 curr = x.second;
2917 do
2918 curr = op_iter_next_use (&curri);
2919 /* Skip already visited or non-SSA operands (from iterating
2920 over PHI args). */
2921 while (curr != NULL_USE_OPERAND_P
2922 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2923 || ! bitmap_set_bit (visited,
2924 SSA_NAME_VERSION
2925 (USE_FROM_PTR (curr)))));
2926 }
2927 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2928 if (curr == NULL_USE_OPERAND_P)
2929 break;
2930 }
2931 else
2932 {
2933 if (gimple_code (def) == GIMPLE_PHI)
2934 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2935 else
2936 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2937 while (curr != NULL_USE_OPERAND_P
2938 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2939 || ! bitmap_set_bit (visited,
2940 SSA_NAME_VERSION
2941 (USE_FROM_PTR (curr)))))
2942 curr = op_iter_next_use (&curri);
2943 if (curr == NULL_USE_OPERAND_P)
2944 goto pop;
2945 }
2946 }
2947 while (1);
2948 if (dump_file && (dump_flags & TDF_DETAILS))
2949 {
2950 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2951 unsigned i;
2952 std::pair<ssa_op_iter, use_operand_p> *x;
2953 FOR_EACH_VEC_ELT (path, i, x)
2954 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2955 dump_printf (MSG_NOTE, "\n");
2956 }
2957
2958 /* Check whether the reduction path detected is valid. */
2959 bool fail = path.length () == 0;
2960 bool neg = false;
2961 int sign = -1;
2962 *code = ERROR_MARK;
2963 for (unsigned i = 1; i < path.length (); ++i)
2964 {
2965 gimple *use_stmt = USE_STMT (path[i].second);
2966 tree op = USE_FROM_PTR (path[i].second);
2967 if (! is_gimple_assign (use_stmt)
2968 	  /* The following makes sure we can compute the operand index
2969 easily plus it mostly disallows chaining via COND_EXPR condition
2970 operands. */
2971 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2972 && (gimple_num_ops (use_stmt) <= 2
2973 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2974 && (gimple_num_ops (use_stmt) <= 3
2975 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2976 {
2977 fail = true;
2978 break;
2979 }
2980 /* Check there's only a single stmt the op is used on inside
2981 of the loop. */
2982 imm_use_iterator imm_iter;
2983 gimple *op_use_stmt;
2984 unsigned cnt = 0;
2985 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2986 if (!is_gimple_debug (op_use_stmt)
2987 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2988 {
2989 /* We want to allow x + x but not x < 1 ? x : 2. */
2990 if (is_gimple_assign (op_use_stmt)
2991 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2992 {
2993 use_operand_p use_p;
2994 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2995 cnt++;
2996 }
2997 else
2998 cnt++;
2999 }
3000 if (cnt != 1)
3001 {
3002 fail = true;
3003 break;
3004 }
3005 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3006 if (use_code == MINUS_EXPR)
3007 {
3008 use_code = PLUS_EXPR;
3009 /* Track whether we negate the reduction value each iteration. */
3010 if (gimple_assign_rhs2 (use_stmt) == op)
3011 neg = ! neg;
3012 }
3013 if (CONVERT_EXPR_CODE_P (use_code)
3014 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3015 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3016 ;
3017 else if (*code == ERROR_MARK)
3018 {
3019 *code = use_code;
3020 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3021 }
3022 else if (use_code != *code)
3023 {
3024 fail = true;
3025 break;
3026 }
3027 else if ((use_code == MIN_EXPR
3028 || use_code == MAX_EXPR)
3029 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3030 {
3031 fail = true;
3032 break;
3033 }
3034 }
3035 return ! fail && ! neg && *code != ERROR_MARK;
3036 }
3037
3038 bool
3039 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3040 tree loop_arg, enum tree_code code)
3041 {
3042 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3043 enum tree_code code_;
3044 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3045 && code_ == code);
3046 }
3047
3048
3049
3050 /* Function vect_is_simple_reduction
3051
3052 (1) Detect a cross-iteration def-use cycle that represents a simple
3053 reduction computation. We look for the following pattern:
3054
3055 loop_header:
3056 a1 = phi < a0, a2 >
3057 a3 = ...
3058 a2 = operation (a3, a1)
3059
3060 or
3061
3062 a3 = ...
3063 loop_header:
3064 a1 = phi < a0, a2 >
3065 a2 = operation (a3, a1)
3066
3067 such that:
3068 1. operation is commutative and associative and it is safe to
3069 change the order of the computation
3070 2. no uses for a2 in the loop (a2 is used out of the loop)
3071 3. no uses of a1 in the loop besides the reduction operation
3072 4. no uses of a1 outside the loop.
3073
3074 Conditions 1,4 are tested here.
3075 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3076
3077 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3078 nested cycles.
3079
3080 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3081 reductions:
3082
3083 a1 = phi < a0, a2 >
3084 inner loop (def of a3)
3085 a2 = phi < a3 >
3086
3087 (4) Detect condition expressions, ie:
3088 for (int i = 0; i < N; i++)
3089 if (a[i] < val)
3090 ret_val = a[i];
3091
3092 */
3093
3094 static stmt_vec_info
3095 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3096 bool *double_reduc, bool *reduc_chain_p)
3097 {
3098 gphi *phi = as_a <gphi *> (phi_info->stmt);
3099 gimple *phi_use_stmt = NULL;
3100 imm_use_iterator imm_iter;
3101 use_operand_p use_p;
3102
3103 *double_reduc = false;
3104 *reduc_chain_p = false;
3105 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3106
3107 tree phi_name = PHI_RESULT (phi);
3108 /* ??? If there are no uses of the PHI result the inner loop reduction
3109 won't be detected as possibly double-reduction by vectorizable_reduction
3110 because that tries to walk the PHI arg from the preheader edge which
3111 can be constant. See PR60382. */
3112 if (has_zero_uses (phi_name))
3113 return NULL;
3114 class loop *loop = (gimple_bb (phi))->loop_father;
3115 unsigned nphi_def_loop_uses = 0;
3116 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3117 {
3118 gimple *use_stmt = USE_STMT (use_p);
3119 if (is_gimple_debug (use_stmt))
3120 continue;
3121
3122 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3123 {
3124 if (dump_enabled_p ())
3125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3126 "intermediate value used outside loop.\n");
3127
3128 return NULL;
3129 }
3130
3131 nphi_def_loop_uses++;
3132 phi_use_stmt = use_stmt;
3133 }
3134
3135 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3136 if (TREE_CODE (latch_def) != SSA_NAME)
3137 {
3138 if (dump_enabled_p ())
3139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140 "reduction: not ssa_name: %T\n", latch_def);
3141 return NULL;
3142 }
3143
3144 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3145 if (!def_stmt_info
3146 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3147 return NULL;
3148
3149 bool nested_in_vect_loop
3150 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3151 unsigned nlatch_def_loop_uses = 0;
3152 auto_vec<gphi *, 3> lcphis;
3153 bool inner_loop_of_double_reduc = false;
3154 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3155 {
3156 gimple *use_stmt = USE_STMT (use_p);
3157 if (is_gimple_debug (use_stmt))
3158 continue;
3159 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3160 nlatch_def_loop_uses++;
3161 else
3162 {
3163 /* We can have more than one loop-closed PHI. */
3164 lcphis.safe_push (as_a <gphi *> (use_stmt));
3165 if (nested_in_vect_loop
3166 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3167 == vect_double_reduction_def))
3168 inner_loop_of_double_reduc = true;
3169 }
3170 }
3171
3172   /* If we are vectorizing an inner (nested) reduction, it is executed in
3173      the original order only when we are not dealing with a double
3174      reduction.  */
3175 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3176 {
3177 if (dump_enabled_p ())
3178 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3179 "detected nested cycle: ");
3180 return def_stmt_info;
3181 }
3182
3183 /* If this isn't a nested cycle or if the nested cycle reduction value
3184      is used outside of the inner loop, we cannot handle uses of the reduction
3185 value. */
3186 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3187 {
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3190 "reduction used in loop.\n");
3191 return NULL;
3192 }
3193
3194 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3195 defined in the inner loop. */
3196 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3197 {
3198 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3199 if (gimple_phi_num_args (def_stmt) != 1
3200 || TREE_CODE (op1) != SSA_NAME)
3201 {
3202 if (dump_enabled_p ())
3203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3204 "unsupported phi node definition.\n");
3205
3206 return NULL;
3207 }
3208
3209 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3210 if (gimple_bb (def1)
3211 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3212 && loop->inner
3213 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3214 && is_gimple_assign (def1)
3215 && is_a <gphi *> (phi_use_stmt)
3216 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3217 {
3218 if (dump_enabled_p ())
3219 report_vect_op (MSG_NOTE, def_stmt,
3220 "detected double reduction: ");
3221
3222 *double_reduc = true;
3223 return def_stmt_info;
3224 }
3225
3226 return NULL;
3227 }
3228
3229   /* Look for the expression computing latch_def from the loop PHI result.  */
3230 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3231 enum tree_code code;
3232 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3233 path))
3234 {
3235 STMT_VINFO_REDUC_CODE (phi_info) = code;
3236 if (code == COND_EXPR && !nested_in_vect_loop)
3237 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3238
3239 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3240 reduction chain for which the additional restriction is that
3241 all operations in the chain are the same. */
3242 auto_vec<stmt_vec_info, 8> reduc_chain;
3243 unsigned i;
3244 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3245 for (i = path.length () - 1; i >= 1; --i)
3246 {
3247 gimple *stmt = USE_STMT (path[i].second);
3248 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3249 STMT_VINFO_REDUC_IDX (stmt_info)
3250 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3251 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3252 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3253 && (i == 1 || i == path.length () - 1));
3254 if ((stmt_code != code && !leading_conversion)
3255 /* We can only handle the final value in epilogue
3256 generation for reduction chains. */
3257 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3258 is_slp_reduc = false;
3259 	  /* For reduction chains we support trailing/leading
3260 conversions. We do not store those in the actual chain. */
3261 if (leading_conversion)
3262 continue;
3263 reduc_chain.safe_push (stmt_info);
3264 }
3265 if (is_slp_reduc && reduc_chain.length () > 1)
3266 {
3267 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3268 {
3269 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3270 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3271 }
3272 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3273 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3274
3275 /* Save the chain for further analysis in SLP detection. */
3276 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3277 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3278
3279 *reduc_chain_p = true;
3280 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "reduction: detected reduction chain\n");
3283 }
3284 else if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 "reduction: detected reduction\n");
3287
3288 return def_stmt_info;
3289 }
3290
3291 if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "reduction: unknown pattern\n");
3294
3295 return NULL;
3296 }
3297
3298 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
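/* For example (purely illustrative numbers): if the scalar loop is known to
   run 100 iterations, PEEL_ITERS_PROLOGUE is 3 and the vectorization factor
   used for costing is 8, then *PEEL_ITERS_EPILOGUE below becomes
   (100 - 3) % 8 = 1.  */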
3299 int
3300 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3301 int *peel_iters_epilogue,
3302 stmt_vector_for_cost *scalar_cost_vec,
3303 stmt_vector_for_cost *prologue_cost_vec,
3304 stmt_vector_for_cost *epilogue_cost_vec)
3305 {
3306 int retval = 0;
3307 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3308
3309 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3310 {
3311 *peel_iters_epilogue = assumed_vf / 2;
3312 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "cost model: epilogue peel iters set to vf/2 "
3315 "because loop iterations are unknown .\n");
3316
3317 /* If peeled iterations are known but the number of scalar loop
3318 iterations is unknown, count a taken branch per peeled loop. */
3319 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3320 NULL, 0, vect_prologue);
3321 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3322 NULL, 0, vect_epilogue);
3323 }
3324 else
3325 {
3326 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3327 peel_iters_prologue = niters < peel_iters_prologue ?
3328 niters : peel_iters_prologue;
3329 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3330 /* If we need to peel for gaps, but no epilogue peeling is otherwise
3331 required, we have to peel VF iterations. */
3332 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3333 *peel_iters_epilogue = assumed_vf;
3334 }
3335
3336 stmt_info_for_cost *si;
3337 int j;
3338 if (peel_iters_prologue)
3339 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3340 retval += record_stmt_cost (prologue_cost_vec,
3341 si->count * peel_iters_prologue,
3342 si->kind, si->stmt_info, si->misalign,
3343 vect_prologue);
3344 if (*peel_iters_epilogue)
3345 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3346 retval += record_stmt_cost (epilogue_cost_vec,
3347 si->count * *peel_iters_epilogue,
3348 si->kind, si->stmt_info, si->misalign,
3349 vect_epilogue);
3350
3351 return retval;
3352 }
3353
3354 /* Function vect_estimate_min_profitable_iters
3355
3356 Return the number of iterations required for the vector version of the
3357 loop to be profitable relative to the cost of the scalar version of the
3358 loop.
3359
3360 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3361 of iterations for vectorization. -1 value means loop vectorization
3362 is not profitable. This returned value may be used for dynamic
3363 profitability check.
3364
3365 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3366 for static check against estimated number of iterations. */
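/* A rough sketch of how callers are expected to use these values
   (simplified, not the literal code that gets emitted): the runtime value
   feeds a guard of the form

     if (niters >= th)      <-- th derived from *RET_MIN_PROFITABLE_NITERS
       goto vectorized_loop;
     else
       goto scalar_loop;

   while *RET_MIN_PROFITABLE_ESTIMATE is compared against the estimated
   trip count at compile time to reject vectorization statically.  */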
3367
3368 static void
3369 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3370 int *ret_min_profitable_niters,
3371 int *ret_min_profitable_estimate)
3372 {
3373 int min_profitable_iters;
3374 int min_profitable_estimate;
3375 int peel_iters_prologue;
3376 int peel_iters_epilogue;
3377 unsigned vec_inside_cost = 0;
3378 int vec_outside_cost = 0;
3379 unsigned vec_prologue_cost = 0;
3380 unsigned vec_epilogue_cost = 0;
3381 int scalar_single_iter_cost = 0;
3382 int scalar_outside_cost = 0;
3383 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3384 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3385 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3386
3387 /* Cost model disabled. */
3388 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3389 {
3390 if (dump_enabled_p ())
3391 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3392 *ret_min_profitable_niters = 0;
3393 *ret_min_profitable_estimate = 0;
3394 return;
3395 }
3396
3397 /* Requires loop versioning tests to handle misalignment. */
3398 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3399 {
3400 /* FIXME: Make cost depend on complexity of individual check. */
3401 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3402 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3403 vect_prologue);
3404 if (dump_enabled_p ())
3405 dump_printf (MSG_NOTE,
3406 "cost model: Adding cost of checks for loop "
3407 "versioning to treat misalignment.\n");
3408 }
3409
3410 /* Requires loop versioning with alias checks. */
3411 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3412 {
3413 /* FIXME: Make cost depend on complexity of individual check. */
3414 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3415 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3416 vect_prologue);
3417 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3418 if (len)
3419 /* Count LEN - 1 ANDs and LEN comparisons. */
3420 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3421 NULL, 0, vect_prologue);
3422 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3423 if (len)
3424 {
3425 /* Count LEN - 1 ANDs and LEN comparisons. */
3426 unsigned int nstmts = len * 2 - 1;
3427 /* +1 for each bias that needs adding. */
3428 for (unsigned int i = 0; i < len; ++i)
3429 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3430 nstmts += 1;
3431 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3432 NULL, 0, vect_prologue);
3433 }
3434 if (dump_enabled_p ())
3435 dump_printf (MSG_NOTE,
3436 "cost model: Adding cost of checks for loop "
3437 "versioning aliasing.\n");
3438 }
3439
3440 /* Requires loop versioning with niter checks. */
3441 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3442 {
3443 /* FIXME: Make cost depend on complexity of individual check. */
3444 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3445 vect_prologue);
3446 if (dump_enabled_p ())
3447 dump_printf (MSG_NOTE,
3448 "cost model: Adding cost of checks for loop "
3449 "versioning niters.\n");
3450 }
3451
3452 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3453 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3454 vect_prologue);
3455
3456 /* Count statements in scalar loop. Using this as scalar cost for a single
3457 iteration for now.
3458
3459 TODO: Add outer loop support.
3460
3461 TODO: Consider assigning different costs to different scalar
3462 statements. */
3463
3464 scalar_single_iter_cost
3465 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3466
3467 /* Add additional cost for the peeled instructions in prologue and epilogue
3468 loop. (For fully-masked loops there will be no peeling.)
3469
3470 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3471 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3472
3473 TODO: Build an expression that represents peel_iters for prologue and
3474 epilogue to be used in a run-time test. */
3475
3476 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3477 {
3478 peel_iters_prologue = 0;
3479 peel_iters_epilogue = 0;
3480
3481 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3482 {
3483 /* We need to peel exactly one iteration. */
3484 peel_iters_epilogue += 1;
3485 stmt_info_for_cost *si;
3486 int j;
3487 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3488 j, si)
3489 (void) add_stmt_cost (target_cost_data, si->count,
3490 si->kind, si->stmt_info, si->misalign,
3491 vect_epilogue);
3492 }
3493
3494 /* Calculate how many masks we need to generate. */
3495 unsigned int num_masks = 0;
3496 rgroup_masks *rgm;
3497 unsigned int num_vectors_m1;
3498 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3499 if (rgm->mask_type)
3500 num_masks += num_vectors_m1 + 1;
3501 gcc_assert (num_masks > 0);
3502
3503 /* In the worst case, we need to generate each mask in the prologue
3504 and in the loop body. One of the loop body mask instructions
3505 replaces the comparison in the scalar loop, and since we don't
3506 count the scalar comparison against the scalar body, we shouldn't
3507 count that vector instruction against the vector body either.
3508
3509 Sometimes we can use unpacks instead of generating prologue
3510 masks and sometimes the prologue mask will fold to a constant,
3511 so the actual prologue cost might be smaller. However, it's
3512 simpler and safer to use the worst-case cost; if this ends up
3513 being the tie-breaker between vectorizing or not, then it's
3514 probably better not to vectorize. */
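/* Illustrative example: two mask rgroups needing one and two mask vectors
   respectively give num_masks = 3, so we add three prologue statements
   and two loop-body statements below.  */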
3515 (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3516 NULL, 0, vect_prologue);
3517 (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3518 NULL, 0, vect_body);
3519 }
3520 else if (npeel < 0)
3521 {
3522 peel_iters_prologue = assumed_vf / 2;
3523 if (dump_enabled_p ())
3524 dump_printf (MSG_NOTE, "cost model: "
3525 "prologue peel iters set to vf/2.\n");
3526
3527 /* If peeling for alignment is unknown, the loop bound of the main loop
3528 becomes unknown. */
3529 peel_iters_epilogue = assumed_vf / 2;
3530 if (dump_enabled_p ())
3531 dump_printf (MSG_NOTE, "cost model: "
3532 "epilogue peel iters set to vf/2 because "
3533 "peeling for alignment is unknown.\n");
3534
3535 /* If peeled iterations are unknown, count a taken branch and a not taken
3536 branch per peeled loop. Even if scalar loop iterations are known,
3537 vector iterations are not known since peeled prologue iterations are
3538 not known. Hence guards remain the same. */
3539 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3540 NULL, 0, vect_prologue);
3541 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3542 NULL, 0, vect_prologue);
3543 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3544 NULL, 0, vect_epilogue);
3545 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3546 NULL, 0, vect_epilogue);
3547 stmt_info_for_cost *si;
3548 int j;
3549 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3550 {
3551 (void) add_stmt_cost (target_cost_data,
3552 si->count * peel_iters_prologue,
3553 si->kind, si->stmt_info, si->misalign,
3554 vect_prologue);
3555 (void) add_stmt_cost (target_cost_data,
3556 si->count * peel_iters_epilogue,
3557 si->kind, si->stmt_info, si->misalign,
3558 vect_epilogue);
3559 }
3560 }
3561 else
3562 {
3563 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3564 stmt_info_for_cost *si;
3565 int j;
3566 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3567
3568 prologue_cost_vec.create (2);
3569 epilogue_cost_vec.create (2);
3570 peel_iters_prologue = npeel;
3571
3572 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3573 &peel_iters_epilogue,
3574 &LOOP_VINFO_SCALAR_ITERATION_COST
3575 (loop_vinfo),
3576 &prologue_cost_vec,
3577 &epilogue_cost_vec);
3578
3579 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3580 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3581 si->misalign, vect_prologue);
3582
3583 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3584 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3585 si->misalign, vect_epilogue);
3586
3587 prologue_cost_vec.release ();
3588 epilogue_cost_vec.release ();
3589 }
3590
3591 /* FORNOW: The scalar outside cost is incremented in one of the
3592 following ways:
3593
3594 1. The vectorizer checks for alignment and aliasing and generates
3595 a condition that allows dynamic vectorization. A cost model
3596 check is ANDED with the versioning condition. Hence scalar code
3597 path now has the added cost of the versioning check.
3598
3599 if (cost > th & versioning_check)
3600 jmp to vector code
3601
3602 Hence run-time scalar is incremented by not-taken branch cost.
3603
3604 2. The vectorizer then checks if a prologue is required. If the
3605 cost model check was not done before during versioning, it has to
3606 be done before the prologue check.
3607
3608 if (cost <= th)
3609 prologue = scalar_iters
3610 if (prologue == 0)
3611 jmp to vector code
3612 else
3613 execute prologue
3614 if (prologue == num_iters)
3615 go to exit
3616
3617 Hence the run-time scalar cost is incremented by a taken branch,
3618 plus a not-taken branch, plus a taken branch cost.
3619
3620 3. The vectorizer then checks if an epilogue is required. If the
3621 cost model check was not done before during prologue check, it
3622 has to be done with the epilogue check.
3623
3624 if (prologue == 0)
3625 jmp to vector code
3626 else
3627 execute prologue
3628 if (prologue == num_iters)
3629 go to exit
3630 vector code:
3631 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3632 jmp to epilogue
3633
3634 Hence the run-time scalar cost should be incremented by 2 taken
3635 branches.
3636
3637 TODO: The back end may reorder the BBS's differently and reverse
3638 conditions/branch directions. Change the estimates below to
3639 something more reasonable. */
3640
3641 /* If the number of iterations is known and we do not do versioning, we can
3642 decide whether to vectorize at compile time. Hence the scalar version
3643 does not carry cost model guard costs. */
3644 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3645 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3646 {
3647 /* Cost model check occurs at versioning. */
3648 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3649 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3650 else
3651 {
3652 /* Cost model check occurs at prologue generation. */
3653 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3654 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3655 + vect_get_stmt_cost (cond_branch_not_taken);
3656 /* Cost model check occurs at epilogue generation. */
3657 else
3658 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3659 }
3660 }
3661
3662 /* Complete the target-specific cost calculations. */
3663 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3664 &vec_inside_cost, &vec_epilogue_cost);
3665
3666 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3667
3668 /* Stash the costs so that we can compare two loop_vec_infos. */
3669 loop_vinfo->vec_inside_cost = vec_inside_cost;
3670 loop_vinfo->vec_outside_cost = vec_outside_cost;
3671
3672 if (dump_enabled_p ())
3673 {
3674 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3675 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3676 vec_inside_cost);
3677 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3678 vec_prologue_cost);
3679 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3680 vec_epilogue_cost);
3681 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3682 scalar_single_iter_cost);
3683 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3684 scalar_outside_cost);
3685 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3686 vec_outside_cost);
3687 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3688 peel_iters_prologue);
3689 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3690 peel_iters_epilogue);
3691 }
3692
3693 /* Calculate number of iterations required to make the vector version
3694 profitable, relative to the loop bodies only. The following condition
3695 must hold true:
3696 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3697 where
3698 SIC = scalar iteration cost, VIC = vector iteration cost,
3699 VOC = vector outside cost, VF = vectorization factor,
3700 NPEEL = prologue iterations + epilogue iterations,
3701 SOC = scalar outside cost for run time cost model check. */
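/* A purely illustrative example (made-up costs, not from any target):
   with SIC = 4, VIC = 6, VF = 4, VOC = 10, SOC = 0 and NPEEL = 0, the code
   below computes saving_per_viter = 4 * 4 - 6 = 10 and, on the
   non-fully-masked path, min_profitable_iters = (10 * 4) / 10 = 4, which
   the subsequent check bumps to 5 (since 64 <= 64).  That matches the
   condition above: at niters = 4 the scalar cost 16 equals the vector
   cost 6 * 1 + 10, while at niters = 5 the scalar cost 20 exceeds it.  */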
3702
3703 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3704 - vec_inside_cost);
3705 if (saving_per_viter <= 0)
3706 {
3707 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3708 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3709 "vectorization did not happen for a simd loop");
3710
3711 if (dump_enabled_p ())
3712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3713 "cost model: the vector iteration cost = %d "
3714 "divided by the scalar iteration cost = %d "
3715 "is greater or equal to the vectorization factor = %d"
3716 ".\n",
3717 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3718 *ret_min_profitable_niters = -1;
3719 *ret_min_profitable_estimate = -1;
3720 return;
3721 }
3722
3723 /* ??? The "if" arm is written to handle all cases; see below for what
3724 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3725 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3726 {
3727 /* Rewriting the condition above in terms of the number of
3728 vector iterations (vniters) rather than the number of
3729 scalar iterations (niters) gives:
3730
3731 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3732
3733 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3734
3735 For integer N, X and Y when X > 0:
3736
3737 N * X > Y <==> N >= (Y /[floor] X) + 1. */
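/* For instance, with Y = 7 and X = 3 this gives N >= 7 / 3 + 1 = 3,
   and indeed 2 * 3 = 6 <= 7 while 3 * 3 = 9 > 7.  */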
3738 int outside_overhead = (vec_outside_cost
3739 - scalar_single_iter_cost * peel_iters_prologue
3740 - scalar_single_iter_cost * peel_iters_epilogue
3741 - scalar_outside_cost);
3742 /* We're only interested in cases that require at least one
3743 vector iteration. */
3744 int min_vec_niters = 1;
3745 if (outside_overhead > 0)
3746 min_vec_niters = outside_overhead / saving_per_viter + 1;
3747
3748 if (dump_enabled_p ())
3749 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3750 min_vec_niters);
3751
3752 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3753 {
3754 /* Now that we know the minimum number of vector iterations,
3755 find the minimum niters for which the scalar cost is larger:
3756
3757 SIC * niters > VIC * vniters + VOC - SOC
3758
3759 We know that the minimum niters is no more than
3760 vniters * VF + NPEEL, but it might be (and often is) less
3761 than that if a partial vector iteration is cheaper than the
3762 equivalent scalar code. */
3763 int threshold = (vec_inside_cost * min_vec_niters
3764 + vec_outside_cost
3765 - scalar_outside_cost);
3766 if (threshold <= 0)
3767 min_profitable_iters = 1;
3768 else
3769 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3770 }
3771 else
3772 /* Convert the number of vector iterations into a number of
3773 scalar iterations. */
3774 min_profitable_iters = (min_vec_niters * assumed_vf
3775 + peel_iters_prologue
3776 + peel_iters_epilogue);
3777 }
3778 else
3779 {
3780 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3781 * assumed_vf
3782 - vec_inside_cost * peel_iters_prologue
3783 - vec_inside_cost * peel_iters_epilogue);
3784 if (min_profitable_iters <= 0)
3785 min_profitable_iters = 0;
3786 else
3787 {
3788 min_profitable_iters /= saving_per_viter;
3789
3790 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3791 <= (((int) vec_inside_cost * min_profitable_iters)
3792 + (((int) vec_outside_cost - scalar_outside_cost)
3793 * assumed_vf)))
3794 min_profitable_iters++;
3795 }
3796 }
3797
3798 if (dump_enabled_p ())
3799 dump_printf (MSG_NOTE,
3800 " Calculated minimum iters for profitability: %d\n",
3801 min_profitable_iters);
3802
3803 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3804 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3805 /* We want the vectorized loop to execute at least once. */
3806 min_profitable_iters = assumed_vf + peel_iters_prologue;
3807
3808 if (dump_enabled_p ())
3809 dump_printf_loc (MSG_NOTE, vect_location,
3810 " Runtime profitability threshold = %d\n",
3811 min_profitable_iters);
3812
3813 *ret_min_profitable_niters = min_profitable_iters;
3814
3815 /* Calculate number of iterations required to make the vector version
3816 profitable, relative to the loop bodies only.
3817
3818 The cost of the non-vectorized variant is SIC * niters and it must win over
3819 the vector variant on the expected loop trip count. The following condition must hold true:
3820 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
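/* Continuing the illustrative numbers used for the runtime threshold above
   (SIC = 4, VIC = 6, VF = 4, VOC = 10, SOC = 0, NPEEL = 0), the else branch
   below yields (10 + 0) * 4 / (4 * 4 - 6) = 4, which the MAX () with the
   runtime threshold then raises to 5.  */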
3821
3822 if (vec_outside_cost <= 0)
3823 min_profitable_estimate = 0;
3824 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3825 {
3826 /* This is a repeat of the code above, but with + SOC rather
3827 than - SOC. */
3828 int outside_overhead = (vec_outside_cost
3829 - scalar_single_iter_cost * peel_iters_prologue
3830 - scalar_single_iter_cost * peel_iters_epilogue
3831 + scalar_outside_cost);
3832 int min_vec_niters = 1;
3833 if (outside_overhead > 0)
3834 min_vec_niters = outside_overhead / saving_per_viter + 1;
3835
3836 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3837 {
3838 int threshold = (vec_inside_cost * min_vec_niters
3839 + vec_outside_cost
3840 + scalar_outside_cost);
3841 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3842 }
3843 else
3844 min_profitable_estimate = (min_vec_niters * assumed_vf
3845 + peel_iters_prologue
3846 + peel_iters_epilogue);
3847 }
3848 else
3849 {
3850 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3851 * assumed_vf
3852 - vec_inside_cost * peel_iters_prologue
3853 - vec_inside_cost * peel_iters_epilogue)
3854 / ((scalar_single_iter_cost * assumed_vf)
3855 - vec_inside_cost);
3856 }
3857 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3858 if (dump_enabled_p ())
3859 dump_printf_loc (MSG_NOTE, vect_location,
3860 " Static estimate profitability threshold = %d\n",
3861 min_profitable_estimate);
3862
3863 *ret_min_profitable_estimate = min_profitable_estimate;
3864 }
3865
3866 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3867 vector elements (not bits) for a vector with NELT elements. */
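/* For example, OFFSET = 2 and NELT = 8 push the stepped pattern {2, 3, 4},
   which vec_perm_indices extends to the selector {2, 3, ..., 9}, i.e. the
   last six elements of the first input followed by the first two elements
   of the second input.  */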
3868 static void
3869 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3870 vec_perm_builder *sel)
3871 {
3872 /* The encoding is a single stepped pattern. Any wrap-around is handled
3873 by vec_perm_indices. */
3874 sel->new_vector (nelt, 1, 3);
3875 for (unsigned int i = 0; i < 3; i++)
3876 sel->quick_push (i + offset);
3877 }
3878
3879 /* Checks whether the target supports whole-vector shifts for vectors of mode
3880 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3881 it supports vec_perm_const with masks for all necessary shift amounts. */
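/* For example, for NELT = 8 the loop below tries the shift amounts
   4, 2 and 1.  */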
3882 static bool
3883 have_whole_vector_shift (machine_mode mode)
3884 {
3885 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3886 return true;
3887
3888 /* Variable-length vectors should be handled via the optab. */
3889 unsigned int nelt;
3890 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3891 return false;
3892
3893 vec_perm_builder sel;
3894 vec_perm_indices indices;
3895 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3896 {
3897 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3898 indices.new_vector (sel, 2, nelt);
3899 if (!can_vec_perm_const_p (mode, indices, false))
3900 return false;
3901 }
3902 return true;
3903 }
3904
3905 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3906 functions. Design better to avoid maintenance issues. */
3907
3908 /* Function vect_model_reduction_cost.
3909
3910 Models cost for a reduction operation, including the vector ops
3911 generated within the strip-mine loop, the initial definition before
3912 the loop, and the epilogue code that must be generated. */
3913
3914 static void
3915 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3916 vect_reduction_type reduction_type,
3917 int ncopies, stmt_vector_for_cost *cost_vec)
3918 {
3919 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3920 enum tree_code code;
3921 optab optab;
3922 tree vectype;
3923 machine_mode mode;
3924 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3925 class loop *loop = NULL;
3926
3927 if (loop_vinfo)
3928 loop = LOOP_VINFO_LOOP (loop_vinfo);
3929
3930 /* Condition reductions generate two reductions in the loop. */
3931 if (reduction_type == COND_REDUCTION)
3932 ncopies *= 2;
3933
3934 vectype = STMT_VINFO_VECTYPE (stmt_info);
3935 mode = TYPE_MODE (vectype);
3936 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3937
3938 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3939
3940 if (reduction_type == EXTRACT_LAST_REDUCTION)
3941 /* No extra instructions are needed in the prologue. The loop body
3942 operations are costed in vectorizable_condition. */
3943 inside_cost = 0;
3944 else if (reduction_type == FOLD_LEFT_REDUCTION)
3945 {
3946 /* No extra instructions needed in the prologue. */
3947 prologue_cost = 0;
3948
3949 if (reduc_fn != IFN_LAST)
3950 /* Count one reduction-like operation per vector. */
3951 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3952 stmt_info, 0, vect_body);
3953 else
3954 {
3955 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3956 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3957 inside_cost = record_stmt_cost (cost_vec, nelements,
3958 vec_to_scalar, stmt_info, 0,
3959 vect_body);
3960 inside_cost += record_stmt_cost (cost_vec, nelements,
3961 scalar_stmt, stmt_info, 0,
3962 vect_body);
3963 }
3964 }
3965 else
3966 {
3967 /* Add in cost for initial definition.
3968 For cond reduction we have four vectors: initial index, step,
3969 initial result of the data reduction, initial value of the index
3970 reduction. */
3971 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3972 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3973 scalar_to_vec, stmt_info, 0,
3974 vect_prologue);
3975
3976 /* Cost of reduction op inside loop. */
3977 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3978 stmt_info, 0, vect_body);
3979 }
3980
3981 /* Determine cost of epilogue code.
3982
3983 We have a reduction operator that will reduce the vector in one statement.
3984 Also requires scalar extract. */
3985
3986 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3987 {
3988 if (reduc_fn != IFN_LAST)
3989 {
3990 if (reduction_type == COND_REDUCTION)
3991 {
3992 /* An EQ stmt and a COND_EXPR stmt. */
3993 epilogue_cost += record_stmt_cost (cost_vec, 2,
3994 vector_stmt, stmt_info, 0,
3995 vect_epilogue);
3996 /* Reduction of the max index and a reduction of the found
3997 values. */
3998 epilogue_cost += record_stmt_cost (cost_vec, 2,
3999 vec_to_scalar, stmt_info, 0,
4000 vect_epilogue);
4001 /* A broadcast of the max value. */
4002 epilogue_cost += record_stmt_cost (cost_vec, 1,
4003 scalar_to_vec, stmt_info, 0,
4004 vect_epilogue);
4005 }
4006 else
4007 {
4008 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4009 stmt_info, 0, vect_epilogue);
4010 epilogue_cost += record_stmt_cost (cost_vec, 1,
4011 vec_to_scalar, stmt_info, 0,
4012 vect_epilogue);
4013 }
4014 }
4015 else if (reduction_type == COND_REDUCTION)
4016 {
4017 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4018 /* Extraction of scalar elements. */
4019 epilogue_cost += record_stmt_cost (cost_vec,
4020 2 * estimated_nunits,
4021 vec_to_scalar, stmt_info, 0,
4022 vect_epilogue);
4023 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4024 epilogue_cost += record_stmt_cost (cost_vec,
4025 2 * estimated_nunits - 3,
4026 scalar_stmt, stmt_info, 0,
4027 vect_epilogue);
4028 }
4029 else if (reduction_type == EXTRACT_LAST_REDUCTION
4030 || reduction_type == FOLD_LEFT_REDUCTION)
4031 /* No extra instructions are needed in the epilogue. */
4032 ;
4033 else
4034 {
4035 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4036 tree bitsize =
4037 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4038 int element_bitsize = tree_to_uhwi (bitsize);
4039 int nelements = vec_size_in_bits / element_bitsize;
4040
4041 if (code == COND_EXPR)
4042 code = MAX_EXPR;
4043
4044 optab = optab_for_tree_code (code, vectype, optab_default);
4045
4046 /* We have a whole vector shift available. */
4047 if (optab != unknown_optab
4048 && VECTOR_MODE_P (mode)
4049 && optab_handler (optab, mode) != CODE_FOR_nothing
4050 && have_whole_vector_shift (mode))
4051 {
4052 /* Final reduction via vector shifts and the reduction operator.
4053 Also requires scalar extract. */
4054 epilogue_cost += record_stmt_cost (cost_vec,
4055 exact_log2 (nelements) * 2,
4056 vector_stmt, stmt_info, 0,
4057 vect_epilogue);
4058 epilogue_cost += record_stmt_cost (cost_vec, 1,
4059 vec_to_scalar, stmt_info, 0,
4060 vect_epilogue);
4061 }
4062 else
4063 /* Use extracts and reduction op for final reduction. For N
4064 elements, we have N extracts and N-1 reduction ops. */
4065 epilogue_cost += record_stmt_cost (cost_vec,
4066 nelements + nelements - 1,
4067 vector_stmt, stmt_info, 0,
4068 vect_epilogue);
4069 }
4070 }
4071
4072 if (dump_enabled_p ())
4073 dump_printf (MSG_NOTE,
4074 "vect_model_reduction_cost: inside_cost = %d, "
4075 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4076 prologue_cost, epilogue_cost);
4077 }
4078
4079
4080 /* Function vect_model_induction_cost.
4081
4082 Models cost for induction operations. */
4083
4084 static void
4085 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4086 stmt_vector_for_cost *cost_vec)
4087 {
4088 unsigned inside_cost, prologue_cost;
4089
4090 if (PURE_SLP_STMT (stmt_info))
4091 return;
4092
4093 /* loop cost for vec_loop. */
4094 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4095 stmt_info, 0, vect_body);
4096
4097 /* prologue cost for vec_init and vec_step. */
4098 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4099 stmt_info, 0, vect_prologue);
4100
4101 if (dump_enabled_p ())
4102 dump_printf_loc (MSG_NOTE, vect_location,
4103 "vect_model_induction_cost: inside_cost = %d, "
4104 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4105 }
4106
4107
4108
4109 /* Function get_initial_def_for_reduction
4110
4111 Input:
4112 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4113 INIT_VAL - the initial value of the reduction variable
4114
4115 Output:
4116 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4117 of the reduction (used for adjusting the epilog - see below).
4118 Return a vector variable, initialized according to the operation that
4119 STMT_VINFO performs. This vector will be used as the initial value
4120 of the vector of partial results.
4121
4122 Option1 (adjust in epilog): Initialize the vector as follows:
4123 add/bit or/xor: [0,0,...,0,0]
4124 mult/bit and: [1,1,...,1,1]
4125 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4126 and when necessary (e.g. add/mult case) let the caller know
4127 that it needs to adjust the result by init_val.
4128
4129 Option2: Initialize the vector as follows:
4130 add/bit or/xor: [init_val,0,0,...,0]
4131 mult/bit and: [init_val,1,1,...,1]
4132 min/max/cond_expr: [init_val,init_val,...,init_val]
4133 and no adjustments are needed.
4134
4135 For example, for the following code:
4136
4137 s = init_val;
4138 for (i=0;i<n;i++)
4139 s = s + a[i];
4140
4141 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4142 For a vector of 4 units, we want to return either [0,0,0,init_val],
4143 or [0,0,0,0] and let the caller know that it needs to adjust
4144 the result at the end by 'init_val'.
4145
4146 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4147 is not NULL, because this way the initialization vector is simpler (the
4148 same element in all entries), and Option2 otherwise.
4149
4150 A cost model should help decide between these two schemes. */
4151
4152 static tree
4153 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4154 enum tree_code code, tree init_val,
4155 tree *adjustment_def)
4156 {
4157 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4158 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4159 tree scalar_type = TREE_TYPE (init_val);
4160 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4161 tree def_for_init;
4162 tree init_def;
4163 REAL_VALUE_TYPE real_init_val = dconst0;
4164 int int_init_val = 0;
4165 gimple_seq stmts = NULL;
4166
4167 gcc_assert (vectype);
4168
4169 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4170 || SCALAR_FLOAT_TYPE_P (scalar_type));
4171
4172 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4173 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4174
4175 /* ADJUSTMENT_DEF is NULL when called from
4176 vect_create_epilog_for_reduction to vectorize a double reduction. */
4177 if (adjustment_def)
4178 *adjustment_def = NULL;
4179
4180 switch (code)
4181 {
4182 case WIDEN_SUM_EXPR:
4183 case DOT_PROD_EXPR:
4184 case SAD_EXPR:
4185 case PLUS_EXPR:
4186 case MINUS_EXPR:
4187 case BIT_IOR_EXPR:
4188 case BIT_XOR_EXPR:
4189 case MULT_EXPR:
4190 case BIT_AND_EXPR:
4191 {
4192 if (code == MULT_EXPR)
4193 {
4194 real_init_val = dconst1;
4195 int_init_val = 1;
4196 }
4197
4198 if (code == BIT_AND_EXPR)
4199 int_init_val = -1;
4200
4201 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4202 def_for_init = build_real (scalar_type, real_init_val);
4203 else
4204 def_for_init = build_int_cst (scalar_type, int_init_val);
4205
4206 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4207 {
4208 /* Option1: the first element is '0' or '1' as well. */
4209 if (!operand_equal_p (def_for_init, init_val, 0))
4210 *adjustment_def = init_val;
4211 init_def = gimple_build_vector_from_val (&stmts, vectype,
4212 def_for_init);
4213 }
4214 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4215 {
4216 /* Option2 (variable length): the first element is INIT_VAL. */
4217 init_def = gimple_build_vector_from_val (&stmts, vectype,
4218 def_for_init);
4219 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4220 vectype, init_def, init_val);
4221 }
4222 else
4223 {
4224 /* Option2: the first element is INIT_VAL. */
4225 tree_vector_builder elts (vectype, 1, 2);
4226 elts.quick_push (init_val);
4227 elts.quick_push (def_for_init);
4228 init_def = gimple_build_vector (&stmts, &elts);
4229 }
4230 }
4231 break;
4232
4233 case MIN_EXPR:
4234 case MAX_EXPR:
4235 case COND_EXPR:
4236 {
4237 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4238 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4239 }
4240 break;
4241
4242 default:
4243 gcc_unreachable ();
4244 }
4245
4246 if (stmts)
4247 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4248 return init_def;
4249 }
4250
4251 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4252 NUMBER_OF_VECTORS is the number of vector defs to create.
4253 If NEUTRAL_OP is nonnull, introducing extra elements of that
4254 value will not change the result. */
4255
4256 static void
4257 get_initial_defs_for_reduction (slp_tree slp_node,
4258 vec<tree> *vec_oprnds,
4259 unsigned int number_of_vectors,
4260 bool reduc_chain, tree neutral_op)
4261 {
4262 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4263 stmt_vec_info stmt_vinfo = stmts[0];
4264 vec_info *vinfo = stmt_vinfo->vinfo;
4265 unsigned HOST_WIDE_INT nunits;
4266 unsigned j, number_of_places_left_in_vector;
4267 tree vector_type;
4268 unsigned int group_size = stmts.length ();
4269 unsigned int i;
4270 class loop *loop;
4271
4272 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4273
4274 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4275
4276 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4277 gcc_assert (loop);
4278 edge pe = loop_preheader_edge (loop);
4279
4280 gcc_assert (!reduc_chain || neutral_op);
4281
4282 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4283 created vectors. It is greater than 1 if unrolling is performed.
4284
4285 For example, we have two scalar operands, s1 and s2 (e.g., group of
4286 strided accesses of size two), while NUNITS is four (i.e., four scalars
4287 of this type can be packed in a vector). The output vector will contain
4288 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4289 will be 2).
4290
4291 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4292 vectors containing the operands.
4293
4294 For example, NUNITS is four as before, and the group size is 8
4295 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4296 {s5, s6, s7, s8}. */
4297
4298 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4299 nunits = group_size;
4300
4301 number_of_places_left_in_vector = nunits;
4302 bool constant_p = true;
4303 tree_vector_builder elts (vector_type, nunits, 1);
4304 elts.quick_grow (nunits);
4305 gimple_seq ctor_seq = NULL;
4306 for (j = 0; j < nunits * number_of_vectors; ++j)
4307 {
4308 tree op;
4309 i = j % group_size;
4310 stmt_vinfo = stmts[i];
4311
4312 /* Get the def before the loop. In a reduction chain we have only one
4313 initial value, otherwise we have as many as there are PHIs in the group. */
4314 if (reduc_chain)
4315 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4316 else if (((vec_oprnds->length () + 1) * nunits
4317 - number_of_places_left_in_vector >= group_size)
4318 && neutral_op)
4319 op = neutral_op;
4320 else
4321 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4322
4323 /* Create 'vect_ = {op0,op1,...,opn}'. */
4324 number_of_places_left_in_vector--;
4325 elts[nunits - number_of_places_left_in_vector - 1] = op;
4326 if (!CONSTANT_CLASS_P (op))
4327 constant_p = false;
4328
4329 if (number_of_places_left_in_vector == 0)
4330 {
4331 tree init;
4332 if (constant_p && !neutral_op
4333 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4334 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4335 /* Build the vector directly from ELTS. */
4336 init = gimple_build_vector (&ctor_seq, &elts);
4337 else if (neutral_op)
4338 {
4339 /* Build a vector of the neutral value and shift the
4340 other elements into place. */
4341 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4342 neutral_op);
4343 int k = nunits;
4344 while (k > 0 && elts[k - 1] == neutral_op)
4345 k -= 1;
4346 while (k > 0)
4347 {
4348 k -= 1;
4349 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4350 vector_type, init, elts[k]);
4351 }
4352 }
4353 else
4354 {
4355 /* First time round, duplicate ELTS to fill the
4356 required number of vectors. */
4357 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4358 number_of_vectors, *vec_oprnds);
4359 break;
4360 }
4361 vec_oprnds->quick_push (init);
4362
4363 number_of_places_left_in_vector = nunits;
4364 elts.new_vector (vector_type, nunits, 1);
4365 elts.quick_grow (nunits);
4366 constant_p = true;
4367 }
4368 }
4369 if (ctor_seq != NULL)
4370 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4371 }
4372
4373 /* For a statement STMT_INFO taking part in a reduction operation return
4374 the stmt_vec_info the meta information is stored on. */
4375
4376 stmt_vec_info
4377 info_for_reduction (stmt_vec_info stmt_info)
4378 {
4379 stmt_info = vect_orig_stmt (stmt_info);
4380 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4381 if (!is_a <gphi *> (stmt_info->stmt))
4382 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4383 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4384 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4385 {
4386 if (gimple_phi_num_args (phi) == 1)
4387 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4388 }
4389 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4390 {
4391 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4392 stmt_vec_info info
4393 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4394 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4395 stmt_info = info;
4396 }
4397 return stmt_info;
4398 }
4399
4400 /* Function vect_create_epilog_for_reduction
4401
4402 Create code at the loop-epilog to finalize the result of a reduction
4403 computation.
4404
4405 STMT_INFO is the scalar reduction stmt that is being vectorized.
4406 SLP_NODE is an SLP node containing a group of reduction statements. The
4407 first one in this group is STMT_INFO.
4408 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4409 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4410 (counting from 0)
4411
4412 This function:
4413 1. Completes the reduction def-use cycles.
4414 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4415 by calling the function specified by REDUC_FN if available, or by
4416 other means (whole-vector shifts or a scalar loop).
4417 The function also creates a new phi node at the loop exit to preserve
4418 loop-closed form, as illustrated below.
4419
4420 The flow at the entry to this function:
4421
4422 loop:
4423 vec_def = phi <vec_init, null> # REDUCTION_PHI
4424 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4425 s_loop = scalar_stmt # (scalar) STMT_INFO
4426 loop_exit:
4427 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4428 use <s_out0>
4429 use <s_out0>
4430
4431 The above is transformed by this function into:
4432
4433 loop:
4434 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4435 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4436 s_loop = scalar_stmt # (scalar) STMT_INFO
4437 loop_exit:
4438 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4439 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4440 v_out2 = reduce <v_out1>
4441 s_out3 = extract_field <v_out2, 0>
4442 s_out4 = adjust_result <s_out3>
4443 use <s_out4>
4444 use <s_out4>
4445 */
4446
4447 static void
4448 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4449 slp_tree slp_node,
4450 slp_instance slp_node_instance)
4451 {
4452 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4453 gcc_assert (reduc_info->is_reduc_info);
4454 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4455 /* For double reductions we need to get at the inner loop reduction
4456 stmt which has the meta info attached. Our stmt_info is that of the
4457 loop-closed PHI of the inner loop which we remember as
4458 def for the reduction PHI generation. */
4459 bool double_reduc = false;
4460 stmt_vec_info rdef_info = stmt_info;
4461 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4462 {
4463 gcc_assert (!slp_node);
4464 double_reduc = true;
4465 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4466 (stmt_info->stmt, 0));
4467 stmt_info = vect_stmt_to_vectorize (stmt_info);
4468 }
4469 gphi *reduc_def_stmt
4470 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4471 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4472 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4473 stmt_vec_info prev_phi_info;
4474 tree vectype;
4475 machine_mode mode;
4476 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4477 basic_block exit_bb;
4478 tree scalar_dest;
4479 tree scalar_type;
4480 gimple *new_phi = NULL, *phi;
4481 stmt_vec_info phi_info;
4482 gimple_stmt_iterator exit_gsi;
4483 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4484 gimple *epilog_stmt = NULL;
4485 gimple *exit_phi;
4486 tree bitsize;
4487 tree def;
4488 tree orig_name, scalar_result;
4489 imm_use_iterator imm_iter, phi_imm_iter;
4490 use_operand_p use_p, phi_use_p;
4491 gimple *use_stmt;
4492 bool nested_in_vect_loop = false;
4493 auto_vec<gimple *> new_phis;
4494 int j, i;
4495 auto_vec<tree> scalar_results;
4496 unsigned int group_size = 1, k;
4497 auto_vec<gimple *> phis;
4498 bool slp_reduc = false;
4499 bool direct_slp_reduc;
4500 tree new_phi_result;
4501 tree induction_index = NULL_TREE;
4502
4503 if (slp_node)
4504 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4505
4506 if (nested_in_vect_loop_p (loop, stmt_info))
4507 {
4508 outer_loop = loop;
4509 loop = loop->inner;
4510 nested_in_vect_loop = true;
4511 gcc_assert (!slp_node);
4512 }
4513 gcc_assert (!nested_in_vect_loop || double_reduc);
4514
4515 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4516 gcc_assert (vectype);
4517 mode = TYPE_MODE (vectype);
4518
4519 tree initial_def = NULL;
4520 tree induc_val = NULL_TREE;
4521 tree adjustment_def = NULL;
4522 if (slp_node)
4523 ;
4524 else
4525 {
4526 /* Get at the scalar def before the loop, that defines the initial value
4527 of the reduction variable. */
4528 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4529 loop_preheader_edge (loop));
4530 /* Optimize: for induction condition reduction, if we can't use zero
4531 for induc_val, use initial_def. */
4532 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4533 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4534 else if (double_reduc)
4535 ;
4536 else if (nested_in_vect_loop)
4537 ;
4538 else
4539 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4540 }
4541
4542 unsigned vec_num;
4543 int ncopies;
4544 if (slp_node)
4545 {
4546 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4547 ncopies = 1;
4548 }
4549 else
4550 {
4551 vec_num = 1;
4552 ncopies = 0;
4553 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4554 do
4555 {
4556 ncopies++;
4557 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4558 }
4559 while (phi_info);
4560 }
4561
4562 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4563 which is updated with the current index of the loop for every match of
4564 the original loop's cond_expr (VEC_STMT). This results in a vector
4565 containing the last time the condition passed for that vector lane.
4566 The first match will be a 1 to allow 0 to be used for non-matching
4567 indexes. If there are no matches at all then the vector will be all
4568 zeroes.
4569
4570 PR92772: This algorithm is broken for architectures that support
4571 masked vectors, but do not provide fold_extract_last. */
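/* As an illustration: with four lanes and two vector iterations the index
   IV takes the values {1,2,3,4} and then {5,6,7,8}.  If lane 0's condition
   matches in both iterations and lane 2's only in the first, the final
   vector is {5, 0, 3, 0} - each lane holds the index of the last iteration
   in which its condition held, or 0 if it never held.  */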
4572 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4573 {
4574 auto_vec<std::pair<tree, bool>, 2> ccompares;
4575 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4576 cond_info = vect_stmt_to_vectorize (cond_info);
4577 while (cond_info != reduc_info)
4578 {
4579 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4580 {
4581 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4582 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4583 ccompares.safe_push
4584 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4585 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4586 }
4587 cond_info
4588 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4589 1 + STMT_VINFO_REDUC_IDX
4590 (cond_info)));
4591 cond_info = vect_stmt_to_vectorize (cond_info);
4592 }
4593 gcc_assert (ccompares.length () != 0);
4594
4595 tree indx_before_incr, indx_after_incr;
4596 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4597 int scalar_precision
4598 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4599 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4600 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4601 (TYPE_MODE (vectype), cr_index_scalar_type,
4602 TYPE_VECTOR_SUBPARTS (vectype));
4603
4604 /* First we create a simple vector induction variable which starts
4605 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4606 vector size (STEP). */
4607
4608 /* Create a {1,2,3,...} vector. */
4609 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4610
4611 /* Create a vector of the step value. */
4612 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4613 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4614
4615 /* Create an induction variable. */
4616 gimple_stmt_iterator incr_gsi;
4617 bool insert_after;
4618 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4619 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4620 insert_after, &indx_before_incr, &indx_after_incr);
4621
4622 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4623 filled with zeros (VEC_ZERO). */
4624
4625 /* Create a vector of 0s. */
4626 tree zero = build_zero_cst (cr_index_scalar_type);
4627 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4628
4629 /* Create a vector phi node. */
4630 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4631 new_phi = create_phi_node (new_phi_tree, loop->header);
4632 loop_vinfo->add_stmt (new_phi);
4633 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4634 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4635
4636 /* Now take the condition from the loop's original cond_exprs
4637 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4638 every match uses values from the induction variable
4639 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4640 (NEW_PHI_TREE).
4641 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4642 the new cond_expr (INDEX_COND_EXPR). */
4643 gimple_seq stmts = NULL;
4644 for (int i = ccompares.length () - 1; i != -1; --i)
4645 {
4646 tree ccompare = ccompares[i].first;
4647 if (ccompares[i].second)
4648 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4649 cr_index_vector_type,
4650 ccompare,
4651 indx_before_incr, new_phi_tree);
4652 else
4653 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4654 cr_index_vector_type,
4655 ccompare,
4656 new_phi_tree, indx_before_incr);
4657 }
4658 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4659 stmt_vec_info index_vec_info
4660 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4661 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4662
4663 /* Update the phi with the vec cond. */
4664 induction_index = new_phi_tree;
4665 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4666 loop_latch_edge (loop), UNKNOWN_LOCATION);
4667 }
4668
4669 /* 2. Create epilog code.
4670 The reduction epilog code operates across the elements of the vector
4671 of partial results computed by the vectorized loop.
4672 The reduction epilog code consists of:
4673
4674 step 1: compute the scalar result in a vector (v_out2)
4675 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4676 step 3: adjust the scalar result (s_out3) if needed.
4677
4678 Step 1 can be accomplished using one of the following three schemes:
4679 (scheme 1) using reduc_fn, if available.
4680 (scheme 2) using whole-vector shifts, if available.
4681 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4682 combined.
4683
4684 The overall epilog code looks like this:
4685
4686 s_out0 = phi <s_loop> # original EXIT_PHI
4687 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4688 v_out2 = reduce <v_out1> # step 1
4689 s_out3 = extract_field <v_out2, 0> # step 2
4690 s_out4 = adjust_result <s_out3> # step 3
4691
4692 (step 3 is optional, and steps 1 and 2 may be combined).
4693 Lastly, the uses of s_out0 are replaced by s_out4. */
4694
4695
4696 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4697 v_out1 = phi <VECT_DEF>
4698 Store them in NEW_PHIS. */
4699 if (double_reduc)
4700 loop = outer_loop;
4701 exit_bb = single_exit (loop)->dest;
4702 prev_phi_info = NULL;
4703 new_phis.create (slp_node ? vec_num : ncopies);
4704 for (unsigned i = 0; i < vec_num; i++)
4705 {
4706 if (slp_node)
4707 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4708 else
4709 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4710 for (j = 0; j < ncopies; j++)
4711 {
4712 tree new_def = copy_ssa_name (def);
4713 phi = create_phi_node (new_def, exit_bb);
4714 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4715 if (j == 0)
4716 new_phis.quick_push (phi);
4717 else
4718 {
4719 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4720 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4721 }
4722
4723 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4724 prev_phi_info = phi_info;
4725 }
4726 }
4727
4728 exit_gsi = gsi_after_labels (exit_bb);
4729
4730 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4731 (i.e. when reduc_fn is not available) and in the final adjustment
4732 code (if needed). Also get the original scalar reduction variable as
4733 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4734 represents a reduction pattern), the tree-code and scalar-def are
4735 taken from the original stmt that the pattern-stmt (STMT) replaces.
4736 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4737 are taken from STMT. */
4738
4739 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4740 if (orig_stmt_info != stmt_info)
4741 {
4742 /* Reduction pattern */
4743 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4744 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4745 }
4746
4747 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4748 scalar_type = TREE_TYPE (scalar_dest);
4749 scalar_results.create (group_size);
4750 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4751 bitsize = TYPE_SIZE (scalar_type);
4752
4753 /* SLP reduction without reduction chain, e.g.,
4754 # a1 = phi <a2, a0>
4755 # b1 = phi <b2, b0>
4756 a2 = operation (a1)
4757 b2 = operation (b1) */
4758 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4759
4760 /* True if we should implement SLP_REDUC using native reduction operations
4761 instead of scalar operations. */
4762 direct_slp_reduc = (reduc_fn != IFN_LAST
4763 && slp_reduc
4764 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4765
4766 /* In case of reduction chain, e.g.,
4767 # a1 = phi <a3, a0>
4768 a2 = operation (a1)
4769 a3 = operation (a2),
4770
4771 we may end up with more than one vector result. Here we reduce them to
4772 one vector. */
4773 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4774 {
4775 gimple_seq stmts = NULL;
4776 tree first_vect = PHI_RESULT (new_phis[0]);
4777 first_vect = gimple_convert (&stmts, vectype, first_vect);
4778 for (k = 1; k < new_phis.length (); k++)
4779 {
4780 gimple *next_phi = new_phis[k];
4781 tree second_vect = PHI_RESULT (next_phi);
4782 second_vect = gimple_convert (&stmts, vectype, second_vect);
4783 first_vect = gimple_build (&stmts, code, vectype,
4784 first_vect, second_vect);
4785 }
4786 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4787
4788 new_phi_result = first_vect;
4789 new_phis.truncate (0);
4790 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4791 }
4792 /* Likewise if we couldn't use a single def-use cycle. */
4793 else if (ncopies > 1)
4794 {
4795 gcc_assert (new_phis.length () == 1);
4796 gimple_seq stmts = NULL;
4797 tree first_vect = PHI_RESULT (new_phis[0]);
4798 first_vect = gimple_convert (&stmts, vectype, first_vect);
4799 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4800 for (int k = 1; k < ncopies; ++k)
4801 {
4802 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4803 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4804 second_vect = gimple_convert (&stmts, vectype, second_vect);
4805 first_vect = gimple_build (&stmts, code, vectype,
4806 first_vect, second_vect);
4807 }
4808 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4809 new_phi_result = first_vect;
4810 new_phis.truncate (0);
4811 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4812 }
4813 else
4814 new_phi_result = PHI_RESULT (new_phis[0]);
4815
4816 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4817 && reduc_fn != IFN_LAST)
4818 {
4819 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4820 various data values where the condition matched and another vector
4821 (INDUCTION_INDEX) containing all the indexes of those matches. We
4822 need to extract the last matching index (which will be the index with
4823 highest value) and use this to index into the data vector.
4824 For the case where there were no matches, the data vector will contain
4825 all default values and the index vector will be all zeros. */
4826
4827 /* Get various versions of the type of the vector of indexes. */
4828 tree index_vec_type = TREE_TYPE (induction_index);
4829 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4830 tree index_scalar_type = TREE_TYPE (index_vec_type);
4831 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4832
4833 /* Get an unsigned integer version of the type of the data vector. */
4834 int scalar_precision
4835 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4836 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4837 tree vectype_unsigned = build_vector_type
4838 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4839
4840 /* First we need to create a vector (ZERO_VEC) of zeros and another
4841 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4842 can create using a MAX reduction and then expanding.
4843 In the case where the loop never made any matches, the max index will
4844 be zero. */
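 /* For example, if INDUCTION_INDEX is {0, 3, 0, 2}, the IFN_REDUC_MAX
 below yields 3 and MAX_INDEX_VEC becomes {3, 3, 3, 3}; if no iteration
 matched, both stay all zeros. */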
4845
4846 /* Vector of {0, 0, 0,...}. */
4847 tree zero_vec = build_zero_cst (vectype);
4848
4849 gimple_seq stmts = NULL;
4850 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4851 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4852
4853 /* Find maximum value from the vector of found indexes. */
4854 tree max_index = make_ssa_name (index_scalar_type);
4855 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4856 1, induction_index);
4857 gimple_call_set_lhs (max_index_stmt, max_index);
4858 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4859
4860 /* Vector of {max_index, max_index, max_index,...}. */
4861 tree max_index_vec = make_ssa_name (index_vec_type);
4862 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4863 max_index);
4864 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4865 max_index_vec_rhs);
4866 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4867
4868 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4869 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4870 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4871 otherwise. Only one value should match, resulting in a vector
4872 (VEC_COND) with one data value and the rest zeros.
4873 In the case where the loop never made any matches, every index will
4874 match, resulting in a vector with all data values (which will all be
4875 the default value). */
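 /* Continuing the example above, comparing {0, 3, 0, 2} with {3, 3, 3, 3}
 selects only the lane whose index equals the maximum, so the VEC_COND
 keeps that single data value and zeros out the other lanes. */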
4876
4877 /* Compare the max index vector to the vector of found indexes to find
4878 the position of the max value. */
4879 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4880 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4881 induction_index,
4882 max_index_vec);
4883 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4884
4885 /* Use the compare to choose either values from the data vector or
4886 zero. */
4887 tree vec_cond = make_ssa_name (vectype);
4888 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4889 vec_compare, new_phi_result,
4890 zero_vec);
4891 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4892
4893 /* Finally we need to extract the data value from the vector (VEC_COND)
4894 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4895 reduction, but because this doesn't exist, we can use a MAX reduction
4896 instead. The data value might be signed or a float so we need to cast
4897 it first.
4898 In the case where the loop never made any matches, the data values are
4899 all identical, and so will reduce down correctly. */
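 /* E.g. a VEC_COND of {0, 42, 0, 0}, viewed as an unsigned vector,
 reduces to 42 under IFN_REDUC_MAX because every unselected lane is
 zero. */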
4900
4901 /* Make the matched data values unsigned. */
4902 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4903 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4904 vec_cond);
4905 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4906 VIEW_CONVERT_EXPR,
4907 vec_cond_cast_rhs);
4908 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4909
4910 /* Reduce down to a scalar value. */
4911 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4912 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4913 1, vec_cond_cast);
4914 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4915 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4916
4917 /* Convert the reduced value back to the result type and set as the
4918 result. */
4919 stmts = NULL;
4920 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4921 data_reduc);
4922 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4923 scalar_results.safe_push (new_temp);
4924 }
4925 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4926 && reduc_fn == IFN_LAST)
4927 {
4928 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4929 idx = 0;
4930 idx_val = induction_index[0];
4931 val = data_reduc[0];
4932 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4933 if (induction_index[i] > idx_val)
4934 val = data_reduc[i], idx_val = induction_index[i];
4935 return val; */
4936
4937 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4938 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4939 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4940 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4941 /* Enforced by vectorizable_reduction, which ensures we have target
4942 support before allowing a conditional reduction on variable-length
4943 vectors. */
4944 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4945 tree idx_val = NULL_TREE, val = NULL_TREE;
4946 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4947 {
4948 tree old_idx_val = idx_val;
4949 tree old_val = val;
4950 idx_val = make_ssa_name (idx_eltype);
4951 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4952 build3 (BIT_FIELD_REF, idx_eltype,
4953 induction_index,
4954 bitsize_int (el_size),
4955 bitsize_int (off)));
4956 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4957 val = make_ssa_name (data_eltype);
4958 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4959 build3 (BIT_FIELD_REF,
4960 data_eltype,
4961 new_phi_result,
4962 bitsize_int (el_size),
4963 bitsize_int (off)));
4964 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4965 if (off != 0)
4966 {
4967 tree new_idx_val = idx_val;
4968 if (off != v_size - el_size)
4969 {
4970 new_idx_val = make_ssa_name (idx_eltype);
4971 epilog_stmt = gimple_build_assign (new_idx_val,
4972 MAX_EXPR, idx_val,
4973 old_idx_val);
4974 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975 }
4976 tree new_val = make_ssa_name (data_eltype);
4977 epilog_stmt = gimple_build_assign (new_val,
4978 COND_EXPR,
4979 build2 (GT_EXPR,
4980 boolean_type_node,
4981 idx_val,
4982 old_idx_val),
4983 val, old_val);
4984 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4985 idx_val = new_idx_val;
4986 val = new_val;
4987 }
4988 }
4989 /* Convert the reduced value back to the result type and set as the
4990 result. */
4991 gimple_seq stmts = NULL;
4992 val = gimple_convert (&stmts, scalar_type, val);
4993 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4994 scalar_results.safe_push (val);
4995 }
4996
4997 /* 2.3 Create the reduction code, using one of the three schemes described
4998 above. In SLP we simply need to extract all the elements from the
4999 vector (without reducing them), so we use scalar shifts. */
5000 else if (reduc_fn != IFN_LAST && !slp_reduc)
5001 {
5002 tree tmp;
5003 tree vec_elem_type;
5004
5005 /* Case 1: Create:
5006 v_out2 = reduc_expr <v_out1> */
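 /* E.g. IFN_REDUC_PLUS on a V4SI {1, 2, 3, 4} produces the scalar 10. */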
5007
5008 if (dump_enabled_p ())
5009 dump_printf_loc (MSG_NOTE, vect_location,
5010 "Reduce using direct vector reduction.\n");
5011
5012 gimple_seq stmts = NULL;
5013 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5014 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5015 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5016 vec_elem_type, new_phi_result);
5017 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5018 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5019
5020 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5021 && induc_val)
5022 {
5023 /* Earlier we set the initial value to be a vector of induc_val
5024 values. Check the result and if it is induc_val then replace
5025 it with the original initial value, unless induc_val is
5026 the same as initial_def already. */
5027 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5028 induc_val);
5029
5030 tmp = make_ssa_name (new_scalar_dest);
5031 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5032 initial_def, new_temp);
5033 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5034 new_temp = tmp;
5035 }
5036
5037 scalar_results.safe_push (new_temp);
5038 }
5039 else if (direct_slp_reduc)
5040 {
5041 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5042 with the elements for other SLP statements replaced with the
5043 neutral value. We can then do a normal reduction on each vector. */
5044
5045 /* Enforced by vectorizable_reduction. */
5046 gcc_assert (new_phis.length () == 1);
5047 gcc_assert (pow2p_hwi (group_size));
5048
5049 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5050 vec<stmt_vec_info> orig_phis
5051 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5052 gimple_seq seq = NULL;
5053
5054 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5055 and the same element size as VECTYPE. */
5056 tree index = build_index_vector (vectype, 0, 1);
5057 tree index_type = TREE_TYPE (index);
5058 tree index_elt_type = TREE_TYPE (index_type);
5059 tree mask_type = truth_type_for (index_type);
5060
5061 /* Create a vector that, for each element, identifies which of
5062 the REDUC_GROUP_SIZE results should use it. */
5063 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5064 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5065 build_vector_from_val (index_type, index_mask));
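 /* E.g. with a group size of 2 and four lanes, INDEX becomes
 {0, 1, 0, 1}: even lanes belong to the first SLP result and odd
 lanes to the second. */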
5066
5067 /* Get a neutral vector value. This is simply a splat of the neutral
5068 scalar value if we have one, otherwise the initial scalar value
5069 is itself a neutral value. */
5070 tree vector_identity = NULL_TREE;
5071 tree neutral_op = NULL_TREE;
5072 if (slp_node)
5073 {
5074 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5075 neutral_op
5076 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5077 vectype, code, first != NULL);
5078 }
5079 if (neutral_op)
5080 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5081 neutral_op);
5082 for (unsigned int i = 0; i < group_size; ++i)
5083 {
5084 /* If there's no universal neutral value, we can use the
5085 initial scalar value from the original PHI. This is used
5086 for MIN and MAX reduction, for example. */
5087 if (!neutral_op)
5088 {
5089 tree scalar_value
5090 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5091 loop_preheader_edge (loop));
5092 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5093 scalar_value);
5094 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5095 scalar_value);
5096 }
5097
5098 /* Calculate the equivalent of:
5099
5100 sel[j] = (index[j] == i);
5101
5102 which selects the elements of NEW_PHI_RESULT that should
5103 be included in the result. */
5104 tree compare_val = build_int_cst (index_elt_type, i);
5105 compare_val = build_vector_from_val (index_type, compare_val);
5106 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5107 index, compare_val);
5108
5109 /* Calculate the equivalent of:
5110
5111 vec = sel ? new_phi_result : vector_identity;
5112
5113 VEC is now suitable for a full vector reduction. */
5114 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5115 sel, new_phi_result, vector_identity);
5116
5117 /* Do the reduction and convert it to the appropriate type. */
5118 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5119 TREE_TYPE (vectype), vec);
5120 scalar = gimple_convert (&seq, scalar_type, scalar);
5121 scalar_results.safe_push (scalar);
5122 }
5123 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5124 }
5125 else
5126 {
5127 bool reduce_with_shift;
5128 tree vec_temp;
5129
5130 gcc_assert (slp_reduc || new_phis.length () == 1);
5131
5132 /* See if the target wants to do the final (shift) reduction
5133 in a vector mode of smaller size and first reduce upper/lower
5134 halves against each other. */
5135 enum machine_mode mode1 = mode;
5136 tree stype = TREE_TYPE (vectype);
5137 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5138 unsigned nunits1 = nunits;
5139 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5140 && new_phis.length () == 1)
5141 {
5142 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5143 /* For SLP reductions we have to make sure lanes match up, but
5144 since we're doing individual element final reduction, reducing
5145 the vector width here is even more important.
5146 ??? We can also separate lanes with permutes, for the common
5147 case of power-of-two group-size odd/even extracts would work. */
5148 if (slp_reduc && nunits != nunits1)
5149 {
5150 nunits1 = least_common_multiple (nunits1, group_size);
5151 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5152 }
5153 }
5154 if (!slp_reduc
5155 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5156 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5157
5158 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5159 stype, nunits1);
5160 reduce_with_shift = have_whole_vector_shift (mode1);
5161 if (!VECTOR_MODE_P (mode1))
5162 reduce_with_shift = false;
5163 else
5164 {
5165 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5166 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5167 reduce_with_shift = false;
5168 }
5169
5170 /* First reduce the vector to the desired vector size on which we
5171 should do the shift reduction, by combining upper and lower halves. */
5172 new_temp = new_phi_result;
5173 while (nunits > nunits1)
5174 {
5175 nunits /= 2;
5176 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5177 stype, nunits);
5178 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5179
5180 /* The target has to make sure we support lowpart/highpart
5181 extraction, either via direct vector extract or through
5182 integer mode punning. */
5183 tree dst1, dst2;
5184 if (convert_optab_handler (vec_extract_optab,
5185 TYPE_MODE (TREE_TYPE (new_temp)),
5186 TYPE_MODE (vectype1))
5187 != CODE_FOR_nothing)
5188 {
5189 /* Extract sub-vectors directly once vec_extract becomes
5190 a conversion optab. */
5191 dst1 = make_ssa_name (vectype1);
5192 epilog_stmt
5193 = gimple_build_assign (dst1, BIT_FIELD_REF,
5194 build3 (BIT_FIELD_REF, vectype1,
5195 new_temp, TYPE_SIZE (vectype1),
5196 bitsize_int (0)));
5197 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5198 dst2 = make_ssa_name (vectype1);
5199 epilog_stmt
5200 = gimple_build_assign (dst2, BIT_FIELD_REF,
5201 build3 (BIT_FIELD_REF, vectype1,
5202 new_temp, TYPE_SIZE (vectype1),
5203 bitsize_int (bitsize)));
5204 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5205 }
5206 else
5207 {
5208 /* Extract via punning to appropriately sized integer mode
5209 vector. */
5210 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5211 tree etype = build_vector_type (eltype, 2);
5212 gcc_assert (convert_optab_handler (vec_extract_optab,
5213 TYPE_MODE (etype),
5214 TYPE_MODE (eltype))
5215 != CODE_FOR_nothing);
5216 tree tem = make_ssa_name (etype);
5217 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5218 build1 (VIEW_CONVERT_EXPR,
5219 etype, new_temp));
5220 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5221 new_temp = tem;
5222 tem = make_ssa_name (eltype);
5223 epilog_stmt
5224 = gimple_build_assign (tem, BIT_FIELD_REF,
5225 build3 (BIT_FIELD_REF, eltype,
5226 new_temp, TYPE_SIZE (eltype),
5227 bitsize_int (0)));
5228 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5229 dst1 = make_ssa_name (vectype1);
5230 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5231 build1 (VIEW_CONVERT_EXPR,
5232 vectype1, tem));
5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 tem = make_ssa_name (eltype);
5235 epilog_stmt
5236 = gimple_build_assign (tem, BIT_FIELD_REF,
5237 build3 (BIT_FIELD_REF, eltype,
5238 new_temp, TYPE_SIZE (eltype),
5239 bitsize_int (bitsize)));
5240 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5241 dst2 = make_ssa_name (vectype1);
5242 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5243 build1 (VIEW_CONVERT_EXPR,
5244 vectype1, tem));
5245 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5246 }
5247
5248 new_temp = make_ssa_name (vectype1);
5249 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 new_phis[0] = epilog_stmt;
5252 }
5253
5254 if (reduce_with_shift && !slp_reduc)
5255 {
5256 int element_bitsize = tree_to_uhwi (bitsize);
5257 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5258 for variable-length vectors and also requires direct target support
5259 for loop reductions. */
5260 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5261 int nelements = vec_size_in_bits / element_bitsize;
5262 vec_perm_builder sel;
5263 vec_perm_indices indices;
5264
5265 int elt_offset;
5266
5267 tree zero_vec = build_zero_cst (vectype1);
5268 /* Case 2: Create:
5269 for (offset = nelements/2; offset >= 1; offset/=2)
5270 {
5271 Create: va' = vec_shift <va, offset>
5272 Create: va = vop <va, va'>
5273 } */
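 /* E.g. for a V4SI {a, b, c, d} the first iteration computes
 {c, d, 0, 0} vop {a, b, c, d}, and the second combines that with
 its own one-element shift, leaving the full reduction in element 0,
 which is then extracted below. */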
5274
5275 tree rhs;
5276
5277 if (dump_enabled_p ())
5278 dump_printf_loc (MSG_NOTE, vect_location,
5279 "Reduce using vector shifts\n");
5280
5281 gimple_seq stmts = NULL;
5282 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5283 for (elt_offset = nelements / 2;
5284 elt_offset >= 1;
5285 elt_offset /= 2)
5286 {
5287 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5288 indices.new_vector (sel, 2, nelements);
5289 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5290 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5291 new_temp, zero_vec, mask);
5292 new_temp = gimple_build (&stmts, code,
5293 vectype1, new_name, new_temp);
5294 }
5295 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5296
5297 /* 2.4 Extract the final scalar result. Create:
5298 s_out3 = extract_field <v_out2, bitpos> */
5299
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_NOTE, vect_location,
5302 "extract scalar result\n");
5303
5304 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5305 bitsize, bitsize_zero_node);
5306 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5307 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5308 gimple_assign_set_lhs (epilog_stmt, new_temp);
5309 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5310 scalar_results.safe_push (new_temp);
5311 }
5312 else
5313 {
5314 /* Case 3: Create:
5315 s = extract_field <v_out2, 0>
5316 for (offset = element_size;
5317 offset < vector_size;
5318 offset += element_size)
5319 {
5320 Create: s' = extract_field <v_out2, offset>
5321 Create: s = op <s, s'> // For non SLP cases
5322 } */
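 /* E.g. for a single V4SI accumulator this extracts the four elements
 and combines them with three scalar operations; in the SLP case the
 extracted elements are pushed to SCALAR_RESULTS instead. */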
5323
5324 if (dump_enabled_p ())
5325 dump_printf_loc (MSG_NOTE, vect_location,
5326 "Reduce using scalar code.\n");
5327
5328 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5329 int element_bitsize = tree_to_uhwi (bitsize);
5330 tree compute_type = TREE_TYPE (vectype);
5331 gimple_seq stmts = NULL;
5332 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5333 {
5334 int bit_offset;
5335 if (gimple_code (new_phi) == GIMPLE_PHI)
5336 vec_temp = PHI_RESULT (new_phi);
5337 else
5338 vec_temp = gimple_assign_lhs (new_phi);
5339 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5340 vec_temp, bitsize, bitsize_zero_node);
5341
5342 /* In SLP we don't need to apply the reduction operation, so we just
5343 collect s' values in SCALAR_RESULTS. */
5344 if (slp_reduc)
5345 scalar_results.safe_push (new_temp);
5346
5347 for (bit_offset = element_bitsize;
5348 bit_offset < vec_size_in_bits;
5349 bit_offset += element_bitsize)
5350 {
5351 tree bitpos = bitsize_int (bit_offset);
5352 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5353 compute_type, vec_temp,
5354 bitsize, bitpos);
5355 if (slp_reduc)
5356 {
5357 /* In SLP we don't need to apply the reduction operation, so
5358 we just collect s' values in SCALAR_RESULTS. */
5359 new_temp = new_name;
5360 scalar_results.safe_push (new_name);
5361 }
5362 else
5363 new_temp = gimple_build (&stmts, code, compute_type,
5364 new_name, new_temp);
5365 }
5366 }
5367
5368 /* The only case where we need to reduce scalar results in SLP is
5369 unrolling. If the size of SCALAR_RESULTS is greater than
5370 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5371 REDUC_GROUP_SIZE. */
5372 if (slp_reduc)
5373 {
5374 tree res, first_res, new_res;
5375
5376 /* Reduce multiple scalar results in case of SLP unrolling. */
5377 for (j = group_size; scalar_results.iterate (j, &res);
5378 j++)
5379 {
5380 first_res = scalar_results[j % group_size];
5381 new_res = gimple_build (&stmts, code, compute_type,
5382 first_res, res);
5383 scalar_results[j % group_size] = new_res;
5384 }
5385 for (k = 0; k < group_size; k++)
5386 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5387 scalar_results[k]);
5388 }
5389 else
5390 {
5391 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5392 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5393 scalar_results.safe_push (new_temp);
5394 }
5395
5396 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5397 }
5398
5399 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5400 && induc_val)
5401 {
5402 /* Earlier we set the initial value to be a vector of induc_val
5403 values. Check the result and if it is induc_val then replace
5404 it with the original initial value, unless induc_val is
5405 the same as initial_def already. */
5406 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5407 induc_val);
5408
5409 tree tmp = make_ssa_name (new_scalar_dest);
5410 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5411 initial_def, new_temp);
5412 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5413 scalar_results[0] = tmp;
5414 }
5415 }
5416
5417 /* 2.5 Adjust the final result by the initial value of the reduction
5418 variable. (When such adjustment is not needed, then
5419 'adjustment_def' is zero). For example, if code is PLUS we create:
5420 new_temp = loop_exit_def + adjustment_def */
5421
5422 if (adjustment_def)
5423 {
5424 gcc_assert (!slp_reduc);
5425 gimple_seq stmts = NULL;
5426 if (nested_in_vect_loop)
5427 {
5428 new_phi = new_phis[0];
5429 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5430 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5431 new_temp = gimple_build (&stmts, code, vectype,
5432 PHI_RESULT (new_phi), adjustment_def);
5433 }
5434 else
5435 {
5436 new_temp = scalar_results[0];
5437 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5438 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5439 new_temp = gimple_build (&stmts, code, scalar_type,
5440 new_temp, adjustment_def);
5441 }
5442
5443 epilog_stmt = gimple_seq_last_stmt (stmts);
5444 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5445 if (nested_in_vect_loop)
5446 {
5447 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5448 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5449 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5450
5451 if (!double_reduc)
5452 scalar_results.quick_push (new_temp);
5453 else
5454 scalar_results[0] = new_temp;
5455 }
5456 else
5457 scalar_results[0] = new_temp;
5458
5459 new_phis[0] = epilog_stmt;
5460 }
5461
5462 if (double_reduc)
5463 loop = loop->inner;
5464
5465 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5466 phis with new adjusted scalar results, i.e., replace use <s_out0>
5467 with use <s_out4>.
5468
5469 Transform:
5470 loop_exit:
5471 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5472 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5473 v_out2 = reduce <v_out1>
5474 s_out3 = extract_field <v_out2, 0>
5475 s_out4 = adjust_result <s_out3>
5476 use <s_out0>
5477 use <s_out0>
5478
5479 into:
5480
5481 loop_exit:
5482 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5483 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5484 v_out2 = reduce <v_out1>
5485 s_out3 = extract_field <v_out2, 0>
5486 s_out4 = adjust_result <s_out3>
5487 use <s_out4>
5488 use <s_out4> */
5489
5490
5491 /* In an SLP reduction chain we reduce vector results into one vector if
5492 necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5493 LHS of the last stmt in the reduction chain, since we are looking for
5494 the loop exit phi node. */
5495 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5496 {
5497 stmt_vec_info dest_stmt_info
5498 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5499 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5500 group_size = 1;
5501 }
5502
5503 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5504 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5505 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5506 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5507 correspond to the first vector stmt, etc.
5508 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5509 if (group_size > new_phis.length ())
5510 gcc_assert (!(group_size % new_phis.length ()));
5511
5512 for (k = 0; k < group_size; k++)
5513 {
5514 if (slp_reduc)
5515 {
5516 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5517
5518 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5519 /* SLP statements can't participate in patterns. */
5520 gcc_assert (!orig_stmt_info);
5521 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5522 }
5523
5524 if (nested_in_vect_loop)
5525 {
5526 if (double_reduc)
5527 loop = outer_loop;
5528 else
5529 gcc_unreachable ();
5530 }
5531
5532 phis.create (3);
5533 /* Find the loop-closed-use at the loop exit of the original scalar
5534 result. (The reduction result is expected to have two immediate uses,
5535 one at the latch block, and one at the loop exit). For double
5536 reductions we are looking for exit phis of the outer loop. */
5537 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5538 {
5539 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5540 {
5541 if (!is_gimple_debug (USE_STMT (use_p)))
5542 phis.safe_push (USE_STMT (use_p));
5543 }
5544 else
5545 {
5546 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5547 {
5548 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5549
5550 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5551 {
5552 if (!flow_bb_inside_loop_p (loop,
5553 gimple_bb (USE_STMT (phi_use_p)))
5554 && !is_gimple_debug (USE_STMT (phi_use_p)))
5555 phis.safe_push (USE_STMT (phi_use_p));
5556 }
5557 }
5558 }
5559 }
5560
5561 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5562 {
5563 /* Replace the uses: */
5564 orig_name = PHI_RESULT (exit_phi);
5565 scalar_result = scalar_results[k];
5566 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5567 {
5568 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5569 SET_USE (use_p, scalar_result);
5570 update_stmt (use_stmt);
5571 }
5572 }
5573
5574 phis.release ();
5575 }
5576 }
5577
5578 /* Return a vector of type VECTYPE that is equal to the vector select
5579 operation "MASK ? VEC : IDENTITY". Insert the select statements
5580 before GSI. */
5581
5582 static tree
5583 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5584 tree vec, tree identity)
5585 {
5586 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5587 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5588 mask, vec, identity);
5589 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5590 return cond;
5591 }
5592
5593 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5594 order, starting with LHS. Insert the extraction statements before GSI and
5595 associate the new scalar SSA names with variable SCALAR_DEST.
5596 Return the SSA name for the result. */
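 /* For example, for a four-element VECTOR_RHS this produces the chain
 (((LHS code v[0]) code v[1]) code v[2]) code v[3], preserving the
 original left-to-right evaluation order. */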
5597
5598 static tree
5599 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5600 tree_code code, tree lhs, tree vector_rhs)
5601 {
5602 tree vectype = TREE_TYPE (vector_rhs);
5603 tree scalar_type = TREE_TYPE (vectype);
5604 tree bitsize = TYPE_SIZE (scalar_type);
5605 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5606 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5607
5608 for (unsigned HOST_WIDE_INT bit_offset = 0;
5609 bit_offset < vec_size_in_bits;
5610 bit_offset += element_bitsize)
5611 {
5612 tree bitpos = bitsize_int (bit_offset);
5613 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5614 bitsize, bitpos);
5615
5616 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5617 rhs = make_ssa_name (scalar_dest, stmt);
5618 gimple_assign_set_lhs (stmt, rhs);
5619 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5620
5621 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5622 tree new_name = make_ssa_name (scalar_dest, stmt);
5623 gimple_assign_set_lhs (stmt, new_name);
5624 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5625 lhs = new_name;
5626 }
5627 return lhs;
5628 }
5629
5630 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5631 type of the vector input. */
5632
5633 static internal_fn
5634 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5635 {
5636 internal_fn mask_reduc_fn;
5637
5638 switch (reduc_fn)
5639 {
5640 case IFN_FOLD_LEFT_PLUS:
5641 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5642 break;
5643
5644 default:
5645 return IFN_LAST;
5646 }
5647
5648 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5649 OPTIMIZE_FOR_SPEED))
5650 return mask_reduc_fn;
5651 return IFN_LAST;
5652 }
5653
5654 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5655 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5656 statement. CODE is the operation performed by STMT_INFO and OPS are
5657 its scalar operands. REDUC_INDEX is the index of the operand in
5658 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5659 implements in-order reduction, or IFN_LAST if we should open-code it.
5660 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5661 that should be used to control the operation in a fully-masked loop. */
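 /* For example, with IFN_FOLD_LEFT_PLUS each vector operand DEF becomes
 reduc_var = IFN_FOLD_LEFT_PLUS (reduc_var, DEF);
 in a fully-masked loop with IFN_MASK_FOLD_LEFT_PLUS available the mask
 is passed as a third argument instead of pre-merging DEF with the
 identity vector. */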
5662
5663 static bool
5664 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5665 gimple_stmt_iterator *gsi,
5666 stmt_vec_info *vec_stmt, slp_tree slp_node,
5667 gimple *reduc_def_stmt,
5668 tree_code code, internal_fn reduc_fn,
5669 tree ops[3], tree vectype_in,
5670 int reduc_index, vec_loop_masks *masks)
5671 {
5672 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5673 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5674 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5675 stmt_vec_info new_stmt_info = NULL;
5676 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5677
5678 int ncopies;
5679 if (slp_node)
5680 ncopies = 1;
5681 else
5682 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5683
5684 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5685 gcc_assert (ncopies == 1);
5686 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5687
5688 if (slp_node)
5689 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5690 TYPE_VECTOR_SUBPARTS (vectype_in)));
5691
5692 tree op0 = ops[1 - reduc_index];
5693
5694 int group_size = 1;
5695 stmt_vec_info scalar_dest_def_info;
5696 auto_vec<tree> vec_oprnds0;
5697 if (slp_node)
5698 {
5699 auto_vec<vec<tree> > vec_defs (2);
5700 vect_get_slp_defs (slp_node, &vec_defs);
5701 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5702 vec_defs[0].release ();
5703 vec_defs[1].release ();
5704 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5705 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5706 }
5707 else
5708 {
5709 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5710 vec_oprnds0.create (1);
5711 vec_oprnds0.quick_push (loop_vec_def0);
5712 scalar_dest_def_info = stmt_info;
5713 }
5714
5715 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5716 tree scalar_type = TREE_TYPE (scalar_dest);
5717 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5718
5719 int vec_num = vec_oprnds0.length ();
5720 gcc_assert (vec_num == 1 || slp_node);
5721 tree vec_elem_type = TREE_TYPE (vectype_out);
5722 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5723
5724 tree vector_identity = NULL_TREE;
5725 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5726 vector_identity = build_zero_cst (vectype_out);
5727
5728 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5729 int i;
5730 tree def0;
5731 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5732 {
5733 gimple *new_stmt;
5734 tree mask = NULL_TREE;
5735 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5736 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5737
5738 /* Handle MINUS by adding the negative. */
5739 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5740 {
5741 tree negated = make_ssa_name (vectype_out);
5742 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5743 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5744 def0 = negated;
5745 }
5746
5747 if (mask && mask_reduc_fn == IFN_LAST)
5748 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5749 vector_identity);
5750
5751 /* On the first iteration the input is simply the scalar phi
5752 result, and for subsequent iterations it is the output of
5753 the preceding operation. */
5754 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5755 {
5756 if (mask && mask_reduc_fn != IFN_LAST)
5757 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5758 def0, mask);
5759 else
5760 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5761 def0);
5762 /* For chained SLP reductions the output of the previous reduction
5763 operation serves as the input of the next. For the final statement
5764 the output cannot be a temporary - we reuse the original
5765 scalar destination of the last statement. */
5766 if (i != vec_num - 1)
5767 {
5768 gimple_set_lhs (new_stmt, scalar_dest_var);
5769 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5770 gimple_set_lhs (new_stmt, reduc_var);
5771 }
5772 }
5773 else
5774 {
5775 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5776 reduc_var, def0);
5777 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5778 /* Remove the statement, so that we can use the same code paths
5779 as for statements that we've just created. */
5780 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5781 gsi_remove (&tmp_gsi, true);
5782 }
5783
5784 if (i == vec_num - 1)
5785 {
5786 gimple_set_lhs (new_stmt, scalar_dest);
5787 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5788 new_stmt);
5789 }
5790 else
5791 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5792 new_stmt, gsi);
5793
5794 if (slp_node)
5795 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5796 }
5797
5798 if (!slp_node)
5799 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5800
5801 return true;
5802 }
5803
5804 /* Function is_nonwrapping_integer_induction.
5805
5806 Check if STMT_VINFO (which is part of loop LOOP) is an integer induction
5807 that both increments and does not cause overflow. */
5808
5809 static bool
5810 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5811 {
5812 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5813 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5814 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5815 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5816 widest_int ni, max_loop_value, lhs_max;
5817 wi::overflow_type overflow = wi::OVF_NONE;
5818
5819 /* Make sure the loop is integer based. */
5820 if (TREE_CODE (base) != INTEGER_CST
5821 || TREE_CODE (step) != INTEGER_CST)
5822 return false;
5823
5824 /* Check that the max size of the loop will not wrap. */
5825
5826 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5827 return true;
5828
5829 if (! max_stmt_executions (loop, &ni))
5830 return false;
5831
5832 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5833 &overflow);
5834 if (overflow)
5835 return false;
5836
5837 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5838 TYPE_SIGN (lhs_type), &overflow);
5839 if (overflow)
5840 return false;
5841
5842 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5843 <= TYPE_PRECISION (lhs_type));
5844 }
5845
5846 /* Check if masking can be supported by inserting a conditional expression.
5847 CODE is the code for the operation. COND_FN is the conditional internal
5848 function, if it exists. VECTYPE_IN is the type of the vector input. */
5849 static bool
5850 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5851 tree vectype_in)
5852 {
5853 if (cond_fn != IFN_LAST
5854 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5855 OPTIMIZE_FOR_SPEED))
5856 return false;
5857
5858 switch (code)
5859 {
5860 case DOT_PROD_EXPR:
5861 case SAD_EXPR:
5862 return true;
5863
5864 default:
5865 return false;
5866 }
5867 }
5868
5869 /* Insert a conditional expression to enable masked vectorization. CODE is the
5870 code for the operation. VOP is the array of operands. MASK is the loop
5871 mask. GSI is a statement iterator used to place the new conditional
5872 expression. */
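 /* For DOT_PROD_EXPR, replacing inactive lanes of the multiplicand with
 zero makes them contribute nothing to the accumulator; for SAD_EXPR,
 replacing them with the other operand makes the absolute difference of
 inactive lanes zero. */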
5873 static void
5874 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5875 gimple_stmt_iterator *gsi)
5876 {
5877 switch (code)
5878 {
5879 case DOT_PROD_EXPR:
5880 {
5881 tree vectype = TREE_TYPE (vop[1]);
5882 tree zero = build_zero_cst (vectype);
5883 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5884 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5885 mask, vop[1], zero);
5886 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5887 vop[1] = masked_op1;
5888 break;
5889 }
5890
5891 case SAD_EXPR:
5892 {
5893 tree vectype = TREE_TYPE (vop[1]);
5894 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5895 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5896 mask, vop[1], vop[0]);
5897 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5898 vop[1] = masked_op1;
5899 break;
5900 }
5901
5902 default:
5903 gcc_unreachable ();
5904 }
5905 }
5906
5907 /* Function vectorizable_reduction.
5908
5909 Check if STMT_INFO performs a reduction operation that can be vectorized.
5910 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5911 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5912 Return true if STMT_INFO is vectorizable in this way.
5913
5914 This function also handles reduction idioms (patterns) that have been
5915 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5916 may be of this form:
5917 X = pattern_expr (arg0, arg1, ..., X)
5918 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5919 sequence that had been detected and replaced by the pattern-stmt
5920 (STMT_INFO).
5921
5922 This function also handles reduction of condition expressions, for example:
5923 for (int i = 0; i < N; i++)
5924 if (a[i] < value)
5925 last = a[i];
5926 This is handled by vectorising the loop and creating an additional vector
5927 containing the loop indexes for which "a[i] < value" was true. In the
5928 function epilogue this is reduced to a single max value and then used to
5929 index into the vector of results.
5930
5931 In some cases of reduction patterns, the type of the reduction variable X is
5932 different than the type of the other arguments of STMT_INFO.
5933 In such cases, the vectype that is used when transforming STMT_INFO into
5934 a vector stmt is different than the vectype that is used to determine the
5935 vectorization factor, because it consists of a different number of elements
5936 than the actual number of elements that are being operated upon in parallel.
5937
5938 For example, consider an accumulation of shorts into an int accumulator.
5939 On some targets it's possible to vectorize this pattern operating on 8
5940 shorts at a time (hence, the vectype for purposes of determining the
5941 vectorization factor should be V8HI); on the other hand, the vectype that
5942 is used to create the vector form is actually V4SI (the type of the result).
5943
5944 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5945 indicates what is the actual level of parallelism (V8HI in the example), so
5946 that the right vectorization factor would be derived. This vectype
5947 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5948 be used to create the vectorized stmt. The right vectype for the vectorized
5949 stmt is obtained from the type of the result X:
5950 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5951
5952 This means that, contrary to "regular" reductions (or "regular" stmts in
5953 general), the following equation:
5954 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5955 does *NOT* necessarily hold for reduction patterns. */
5956
5957 bool
5958 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5959 slp_instance slp_node_instance,
5960 stmt_vector_for_cost *cost_vec)
5961 {
5962 tree scalar_dest;
5963 tree vectype_in = NULL_TREE;
5964 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5965 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5966 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5967 stmt_vec_info cond_stmt_vinfo = NULL;
5968 tree scalar_type;
5969 int i;
5970 int ncopies;
5971 bool single_defuse_cycle = false;
5972 bool nested_cycle = false;
5973 bool double_reduc = false;
5974 int vec_num;
5975 tree tem;
5976 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5977 tree cond_reduc_val = NULL_TREE;
5978
5979 /* Make sure it was already recognized as a reduction computation. */
5980 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5981 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5982 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5983 return false;
5984
5985 /* The stmt we store reduction analysis meta on. */
5986 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5987 reduc_info->is_reduc_info = true;
5988
5989 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5990 {
5991 if (is_a <gphi *> (stmt_info->stmt))
5992 /* Analysis for double-reduction is done on the outer
5993 loop PHI, nested cycles have no further restrictions. */
5994 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5995 else
5996 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5997 return true;
5998 }
5999
6000 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6001 stmt_vec_info phi_info = stmt_info;
6002 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6003 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6004 {
6005 if (!is_a <gphi *> (stmt_info->stmt))
6006 {
6007 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6008 return true;
6009 }
6010 if (slp_node)
6011 {
6012 slp_node_instance->reduc_phis = slp_node;
6013 /* ??? We're leaving slp_node to point to the PHIs, we only
6014 need it to get at the number of vector stmts which wasn't
6015 yet initialized for the instance root. */
6016 }
6017 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6018 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6019 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6020 {
6021 use_operand_p use_p;
6022 gimple *use_stmt;
6023 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6024 &use_p, &use_stmt);
6025 gcc_assert (res);
6026 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6027 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6028 }
6029 }
6030
6031 /* PHIs should not participate in patterns. */
6032 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6033 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6034
6035 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6036 and compute the reduction chain length. */
6037 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6038 loop_latch_edge (loop));
6039 unsigned reduc_chain_length = 0;
6040 bool only_slp_reduc_chain = true;
6041 stmt_info = NULL;
6042 while (reduc_def != PHI_RESULT (reduc_def_phi))
6043 {
6044 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6045 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6046 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6047 {
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050 "reduction chain broken by patterns.\n");
6051 return false;
6052 }
6053 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6054 only_slp_reduc_chain = false;
6055 /* ??? For epilogue generation live members of the chain need
6056 to point back to the PHI via their original stmt for
6057 info_for_reduction to work. */
6058 if (STMT_VINFO_LIVE_P (vdef))
6059 STMT_VINFO_REDUC_DEF (def) = phi_info;
6060 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6061 if (!assign)
6062 {
6063 if (dump_enabled_p ())
6064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6065 "reduction chain includes calls.\n");
6066 return false;
6067 }
6068 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6069 {
6070 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6071 TREE_TYPE (gimple_assign_rhs1 (assign))))
6072 {
6073 if (dump_enabled_p ())
6074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6075 "conversion in the reduction chain.\n");
6076 return false;
6077 }
6078 }
6079 else if (!stmt_info)
6080 /* First non-conversion stmt. */
6081 stmt_info = vdef;
6082 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6083 reduc_chain_length++;
6084 }
6085 /* PHIs should not participate in patterns. */
6086 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6087
6088 if (nested_in_vect_loop_p (loop, stmt_info))
6089 {
6090 loop = loop->inner;
6091 nested_cycle = true;
6092 }
6093
6094 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6095 element. */
6096 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6097 {
6098 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6099 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6100 }
6101 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6102 gcc_assert (slp_node
6103 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6104
6105 /* 1. Is vectorizable reduction? */
6106 /* Not supportable if the reduction variable is used in the loop, unless
6107 it's a reduction chain. */
6108 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6109 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6110 return false;
6111
6112 /* Reductions that are not used even in an enclosing outer-loop
6113 are expected to be "live" (used out of the loop). */
6114 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6115 && !STMT_VINFO_LIVE_P (stmt_info))
6116 return false;
6117
6118 /* 2. Has this been recognized as a reduction pattern?
6119
6120 Check if STMT represents a pattern that has been recognized
6121 in earlier analysis stages. For stmts that represent a pattern,
6122 the STMT_VINFO_RELATED_STMT field records the last stmt in
6123 the original sequence that constitutes the pattern. */
6124
6125 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6126 if (orig_stmt_info)
6127 {
6128 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6129 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6130 }
6131
6132 /* 3. Check the operands of the operation. The first operands are defined
6133 inside the loop body. The last operand is the reduction variable,
6134 which is defined by the loop-header-phi. */
6135
6136 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6137 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6138 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6139 enum tree_code code = gimple_assign_rhs_code (stmt);
6140 bool lane_reduc_code_p
6141 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
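 /* These are the "lane-reducing" operations: they combine several
 narrow input lanes into one wider accumulator lane (e.g. DOT_PROD
 of V8HI inputs accumulating into V4SI), so the vector types of the
 inputs and of the reduction differ. */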
6142 int op_type = TREE_CODE_LENGTH (code);
6143
6144 scalar_dest = gimple_assign_lhs (stmt);
6145 scalar_type = TREE_TYPE (scalar_dest);
6146 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6147 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6148 return false;
6149
6150 /* Do not try to vectorize bit-precision reductions. */
6151 if (!type_has_mode_precision_p (scalar_type))
6152 return false;
6153
6154 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6155 which means the only use of the PHI result may be the lane-reducing operation. */
6156 if (lane_reduc_code_p
6157 && reduc_chain_length != 1
6158 && !only_slp_reduc_chain)
6159 {
6160 if (dump_enabled_p ())
6161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6162 "lane-reducing reduction with extra stmts.\n");
6163 return false;
6164 }
6165
6166 /* All uses but the last are expected to be defined in the loop.
6167 The last use is the reduction variable. In case of nested cycle this
6168 assumption is not true: we use reduc_index to record the index of the
6169 reduction variable. */
6170 reduc_def = PHI_RESULT (reduc_def_phi);
6171 for (i = 0; i < op_type; i++)
6172 {
6173 tree op = gimple_op (stmt, i + 1);
6174 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6175 if (i == 0 && code == COND_EXPR)
6176 continue;
6177
6178 stmt_vec_info def_stmt_info;
6179 enum vect_def_type dt;
6180 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6181 &def_stmt_info))
6182 {
6183 if (dump_enabled_p ())
6184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6185 "use not simple.\n");
6186 return false;
6187 }
6188 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6189 continue;
6190
6191 /* There should be only one cycle def in the stmt, the one
6192 leading to reduc_def. */
6193 if (VECTORIZABLE_CYCLE_DEF (dt))
6194 return false;
6195
6196 /* To properly compute ncopies we are interested in the widest
6197 non-reduction input type in case we're looking at a widening
6198 accumulation that we later handle in vect_transform_reduction. */
6199 if (lane_reduc_code_p
6200 && tem
6201 && (!vectype_in
6202 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6203 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6204 vectype_in = tem;
6205
6206 if (code == COND_EXPR)
6207 {
6208 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6209 if (dt == vect_constant_def)
6210 {
6211 cond_reduc_dt = dt;
6212 cond_reduc_val = op;
6213 }
6214 if (dt == vect_induction_def
6215 && def_stmt_info
6216 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6217 {
6218 cond_reduc_dt = dt;
6219 cond_stmt_vinfo = def_stmt_info;
6220 }
6221 }
6222 }
6223 if (!vectype_in)
6224 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6225 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6226
6227 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6228 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6229 /* If we have a condition reduction, see if we can simplify it further. */
6230 if (v_reduc_type == COND_REDUCTION)
6231 {
6232 if (slp_node)
6233 return false;
6234
6235 /* When the condition uses the reduction value in the condition, fail. */
6236 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6237 {
6238 if (dump_enabled_p ())
6239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6240 "condition depends on previous iteration\n");
6241 return false;
6242 }
6243
6244 if (reduc_chain_length == 1
6245 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6246 vectype_in, OPTIMIZE_FOR_SPEED))
6247 {
6248 if (dump_enabled_p ())
6249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6250 "optimizing condition reduction with"
6251 " FOLD_EXTRACT_LAST.\n");
6252 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6253 }
6254 else if (cond_reduc_dt == vect_induction_def)
6255 {
6256 tree base
6257 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6258 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6259
6260 gcc_assert (TREE_CODE (base) == INTEGER_CST
6261 && TREE_CODE (step) == INTEGER_CST);
6262 cond_reduc_val = NULL_TREE;
6263 enum tree_code cond_reduc_op_code = ERROR_MARK;
6264 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6265 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6266 ;
6267 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6268 MIN_EXPR; punt for now if BASE is the minimum value of the type for
6269 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6270 else if (tree_int_cst_sgn (step) == -1)
6271 {
6272 cond_reduc_op_code = MIN_EXPR;
6273 if (tree_int_cst_sgn (base) == -1)
6274 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6275 else if (tree_int_cst_lt (base,
6276 TYPE_MAX_VALUE (TREE_TYPE (base))))
6277 cond_reduc_val
6278 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6279 }
6280 else
6281 {
6282 cond_reduc_op_code = MAX_EXPR;
6283 if (tree_int_cst_sgn (base) == 1)
6284 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6285 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6286 base))
6287 cond_reduc_val
6288 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6289 }
6290 if (cond_reduc_val)
6291 {
6292 if (dump_enabled_p ())
6293 dump_printf_loc (MSG_NOTE, vect_location,
6294 "condition expression based on "
6295 "integer induction.\n");
6296 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6297 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6298 = cond_reduc_val;
6299 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6300 }
6301 }
6302 else if (cond_reduc_dt == vect_constant_def)
6303 {
6304 enum vect_def_type cond_initial_dt;
6305 tree cond_initial_val
6306 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6307
6308 gcc_assert (cond_reduc_val != NULL_TREE);
6309 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6310 if (cond_initial_dt == vect_constant_def
6311 && types_compatible_p (TREE_TYPE (cond_initial_val),
6312 TREE_TYPE (cond_reduc_val)))
6313 {
6314 tree e = fold_binary (LE_EXPR, boolean_type_node,
6315 cond_initial_val, cond_reduc_val);
6316 if (e && (integer_onep (e) || integer_zerop (e)))
6317 {
6318 if (dump_enabled_p ())
6319 dump_printf_loc (MSG_NOTE, vect_location,
6320 "condition expression based on "
6321 "compile time constant.\n");
6322 /* Record reduction code at analysis stage. */
6323 STMT_VINFO_REDUC_CODE (reduc_info)
6324 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6325 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6326 }
6327 }
6328 }
6329 }
6330
6331 if (STMT_VINFO_LIVE_P (phi_info))
6332 return false;
6333
6334 if (slp_node)
6335 ncopies = 1;
6336 else
6337 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6338
6339 gcc_assert (ncopies >= 1);
6340
6341 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6342
6343 if (nested_cycle)
6344 {
6345 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6346 == vect_double_reduction_def);
6347 double_reduc = true;
6348 }
6349
6350 /* 4.2. Check support for the epilog operation.
6351
6352 If STMT represents a reduction pattern, then the type of the
6353 reduction variable may be different than the type of the rest
6354 of the arguments. For example, consider the case of accumulation
6355      of shorts into an int accumulator.  The original code:
6356 S1: int_a = (int) short_a;
6357 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6358
6359 was replaced with:
6360 STMT: int_acc = widen_sum <short_a, int_acc>
6361
6362 This means that:
6363 1. The tree-code that is used to create the vector operation in the
6364 epilog code (that reduces the partial results) is not the
6365 tree-code of STMT, but is rather the tree-code of the original
6366      stmt from the pattern that STMT is replacing.  I.e., in the example
6367 above we want to use 'widen_sum' in the loop, but 'plus' in the
6368 epilog.
6369 2. The type (mode) we use to check available target support
6370 for the vector operation to be created in the *epilog*, is
6371 determined by the type of the reduction variable (in the example
6372      above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6373 However the type (mode) we use to check available target support
6374 for the vector operation to be created *inside the loop*, is
6375 determined by the type of the other arguments to STMT (in the
6376 example we'd check this: optab_handler (widen_sum_optab,
6377 vect_short_mode)).
6378
6379 This is contrary to "regular" reductions, in which the types of all
6380 the arguments are the same as the type of the reduction variable.
6381 For "regular" reductions we can therefore use the same vector type
6382 (and also the same tree-code) when generating the epilog code and
6383 when generating the code inside the loop. */
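   /* For illustration: with 128-bit vectors the widen_sum example above
      would use a vector of shorts (e.g. V8HI) as vectype_in for the
      in-loop check and a vector of ints (e.g. V4SI) as vectype_out for
      the epilog check.  */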
6384
6385 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6386 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6387
6388 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6389 if (reduction_type == TREE_CODE_REDUCTION)
6390 {
6391 /* Check whether it's ok to change the order of the computation.
6392 Generally, when vectorizing a reduction we change the order of the
6393 computation. This may change the behavior of the program in some
6394 cases, so we need to check that this is ok. One exception is when
6395 vectorizing an outer-loop: the inner-loop is executed sequentially,
6396 and therefore vectorizing reductions in the inner-loop during
6397 outer-loop vectorization is safe. */
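      /* E.g. for a float sum with VF == 4 the scalar loop accumulates
	 ((((init + a[0]) + a[1]) + a[2]) + a[3]) + ... while the vector
	 loop keeps four partial sums that are only combined in the epilog;
	 for FP types this re-association can change rounding, which is why
	 needs_fold_left_reduction_p forces an in-order scheme below.  */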
6398 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6399 {
6400 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6401 	 is not directly used in stmt.  */
6402 if (!only_slp_reduc_chain
6403 && reduc_chain_length != 1)
6404 {
6405 if (dump_enabled_p ())
6406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6407 "in-order reduction chain without SLP.\n");
6408 return false;
6409 }
6410 STMT_VINFO_REDUC_TYPE (reduc_info)
6411 = reduction_type = FOLD_LEFT_REDUCTION;
6412 }
6413 else if (!commutative_tree_code (orig_code)
6414 || !associative_tree_code (orig_code))
6415 {
6416 if (dump_enabled_p ())
6417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6418 "reduction: not commutative/associative");
6419 return false;
6420 }
6421 }
6422
6423 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6424 && ncopies > 1)
6425 {
6426 if (dump_enabled_p ())
6427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 "multiple types in double reduction or condition "
6429 "reduction or fold-left reduction.\n");
6430 return false;
6431 }
6432
6433 internal_fn reduc_fn = IFN_LAST;
6434 if (reduction_type == TREE_CODE_REDUCTION
6435 || reduction_type == FOLD_LEFT_REDUCTION
6436 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6437 || reduction_type == CONST_COND_REDUCTION)
6438 {
6439 if (reduction_type == FOLD_LEFT_REDUCTION
6440 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6441 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6442 {
6443 if (reduc_fn != IFN_LAST
6444 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6445 OPTIMIZE_FOR_SPEED))
6446 {
6447 if (dump_enabled_p ())
6448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6449 "reduc op not supported by target.\n");
6450
6451 reduc_fn = IFN_LAST;
6452 }
6453 }
6454 else
6455 {
6456 if (!nested_cycle || double_reduc)
6457 {
6458 if (dump_enabled_p ())
6459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6460 "no reduc code for scalar code.\n");
6461
6462 return false;
6463 }
6464 }
6465 }
6466 else if (reduction_type == COND_REDUCTION)
6467 {
6468 int scalar_precision
6469 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6470 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6471 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6472 nunits_out);
6473
6474 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6475 OPTIMIZE_FOR_SPEED))
6476 reduc_fn = IFN_REDUC_MAX;
6477 }
6478 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6479
6480 if (reduction_type != EXTRACT_LAST_REDUCTION
6481 && (!nested_cycle || double_reduc)
6482 && reduc_fn == IFN_LAST
6483 && !nunits_out.is_constant ())
6484 {
6485 if (dump_enabled_p ())
6486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6487 "missing target support for reduction on"
6488 " variable-length vectors.\n");
6489 return false;
6490 }
6491
6492 /* For SLP reductions, see if there is a neutral value we can use. */
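  /* E.g. 0 for a sum or bitwise or/xor, 1 for a product, all-ones for a
     bitwise and; if no neutral value exists, the initial values of the
     individual SLP statements are used instead (see below).  */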
6493 tree neutral_op = NULL_TREE;
6494 if (slp_node)
6495 neutral_op = neutral_op_for_slp_reduction
6496 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6497 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6498
6499 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6500 {
6501 /* We can't support in-order reductions of code such as this:
6502
6503 for (int i = 0; i < n1; ++i)
6504 for (int j = 0; j < n2; ++j)
6505 l += a[j];
6506
6507 since GCC effectively transforms the loop when vectorizing:
6508
6509 for (int i = 0; i < n1 / VF; ++i)
6510 for (int j = 0; j < n2; ++j)
6511 for (int k = 0; k < VF; ++k)
6512 l += a[j];
6513
6514 which is a reassociation of the original operation. */
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 "in-order double reduction not supported.\n");
6518
6519 return false;
6520 }
6521
6522 if (reduction_type == FOLD_LEFT_REDUCTION
6523 && slp_node
6524 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6525 {
6526 /* We cannot use in-order reductions in this case because there is
6527 an implicit reassociation of the operations involved. */
6528 if (dump_enabled_p ())
6529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6530 "in-order unchained SLP reductions not supported.\n");
6531 return false;
6532 }
6533
6534 /* For double reductions, and for SLP reductions with a neutral value,
6535 we construct a variable-length initial vector by loading a vector
6536 full of the neutral value and then shift-and-inserting the start
6537 values into the low-numbered elements. */
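  /* E.g. for a sum with start value s this builds { s, 0, 0, ... } by
     shifting s into a vector of the neutral value 0, which also works
     when the number of lanes is not known at compile time.  */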
6538 if ((double_reduc || neutral_op)
6539 && !nunits_out.is_constant ()
6540 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6541 vectype_out, OPTIMIZE_FOR_SPEED))
6542 {
6543 if (dump_enabled_p ())
6544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6545 "reduction on variable-length vectors requires"
6546 " target support for a vector-shift-and-insert"
6547 " operation.\n");
6548 return false;
6549 }
6550
6551 /* Check extra constraints for variable-length unchained SLP reductions. */
6552 if (STMT_SLP_TYPE (stmt_info)
6553 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6554 && !nunits_out.is_constant ())
6555 {
6556 /* We checked above that we could build the initial vector when
6557 there's a neutral element value. Check here for the case in
6558 which each SLP statement has its own initial value and in which
6559 that value needs to be repeated for every instance of the
6560 statement within the initial vector. */
6561 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6562 if (!neutral_op
6563 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6564 TREE_TYPE (vectype_out)))
6565 {
6566 if (dump_enabled_p ())
6567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6568 "unsupported form of SLP reduction for"
6569 " variable-length vectors: cannot build"
6570 " initial vector.\n");
6571 return false;
6572 }
6573 /* The epilogue code relies on the number of elements being a multiple
6574 of the group size. The duplicate-and-interleave approach to setting
6575 up the initial vector does too. */
6576 if (!multiple_p (nunits_out, group_size))
6577 {
6578 if (dump_enabled_p ())
6579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6580 "unsupported form of SLP reduction for"
6581 " variable-length vectors: the vector size"
6582 " is not a multiple of the number of results.\n");
6583 return false;
6584 }
6585 }
6586
6587 if (reduction_type == COND_REDUCTION)
6588 {
6589 widest_int ni;
6590
6591 if (! max_loop_iterations (loop, &ni))
6592 {
6593 if (dump_enabled_p ())
6594 dump_printf_loc (MSG_NOTE, vect_location,
6595 "loop count not known, cannot create cond "
6596 "reduction.\n");
6597 return false;
6598 }
6599 /* Convert backedges to iterations. */
6600 ni += 1;
6601
6602       /* The additional index will be the same type as the condition.  Check
6603 	 that the loop iteration count fits into this type less one (the
6604 	 zero slot is reserved for when there are no matches).  */
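      /* E.g. with a 16-bit condition element type the index values are
	 also 16-bit, so at most 65534 iterations can be handled (65535
	 minus the reserved zero slot).  */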
6605 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6606 if (wi::geu_p (ni, wi::to_widest (max_index)))
6607 {
6608 if (dump_enabled_p ())
6609 dump_printf_loc (MSG_NOTE, vect_location,
6610 "loop size is greater than data size.\n");
6611 return false;
6612 }
6613 }
6614
6615 /* In case the vectorization factor (VF) is bigger than the number
6616 of elements that we can fit in a vectype (nunits), we have to generate
6617 more than one vector stmt - i.e - we need to "unroll" the
6618 vector stmt by a factor VF/nunits. For more details see documentation
6619 in vectorizable_operation. */
6620
6621 /* If the reduction is used in an outer loop we need to generate
6622 VF intermediate results, like so (e.g. for ncopies=2):
6623 r0 = phi (init, r0)
6624 r1 = phi (init, r1)
6625 r0 = x0 + r0;
6626 r1 = x1 + r1;
6627 (i.e. we generate VF results in 2 registers).
6628 In this case we have a separate def-use cycle for each copy, and therefore
6629 for each copy we get the vector def for the reduction variable from the
6630 respective phi node created for this copy.
6631
6632 Otherwise (the reduction is unused in the loop nest), we can combine
6633 together intermediate results, like so (e.g. for ncopies=2):
6634 r = phi (init, r)
6635 r = x0 + r;
6636 r = x1 + r;
6637 (i.e. we generate VF/2 results in a single register).
6638 In this case for each copy we get the vector def for the reduction variable
6639 from the vectorized reduction operation generated in the previous iteration.
6640
6641 This only works when we see both the reduction PHI and its only consumer
6642 in vectorizable_reduction and there are no intermediate stmts
6643 participating. */
6644 if (ncopies > 1
6645 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6646 && reduc_chain_length == 1)
6647 single_defuse_cycle = true;
6648
6649 if (single_defuse_cycle || lane_reduc_code_p)
6650 {
6651 gcc_assert (code != COND_EXPR);
6652
6653 /* 4. Supportable by target? */
6654 bool ok = true;
6655
6656 /* 4.1. check support for the operation in the loop */
6657 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6658 if (!optab)
6659 {
6660 if (dump_enabled_p ())
6661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6662 "no optab.\n");
6663 ok = false;
6664 }
6665
6666 machine_mode vec_mode = TYPE_MODE (vectype_in);
6667 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6668 {
6669 if (dump_enabled_p ())
6670 dump_printf (MSG_NOTE, "op not supported by target.\n");
6671 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6672 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6673 ok = false;
6674 else
6675 if (dump_enabled_p ())
6676 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6677 }
6678
6679 /* Worthwhile without SIMD support? */
6680 if (ok
6681 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6682 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6683 {
6684 if (dump_enabled_p ())
6685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6686 "not worthwhile without SIMD support.\n");
6687 ok = false;
6688 }
6689
6690 /* lane-reducing operations have to go through vect_transform_reduction.
6691 For the other cases try without the single cycle optimization. */
6692 if (!ok)
6693 {
6694 if (lane_reduc_code_p)
6695 return false;
6696 else
6697 single_defuse_cycle = false;
6698 }
6699 }
6700 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6701
6702 /* If the reduction stmt is one of the patterns that have lane
6703 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6704 if ((ncopies > 1 && ! single_defuse_cycle)
6705 && lane_reduc_code_p)
6706 {
6707 if (dump_enabled_p ())
6708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6709 "multi def-use cycle not possible for lane-reducing "
6710 "reduction operation\n");
6711 return false;
6712 }
6713
6714 if (slp_node)
6715 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6716 else
6717 vec_num = 1;
6718
6719 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6720 cost_vec);
6721 if (dump_enabled_p ()
6722 && reduction_type == FOLD_LEFT_REDUCTION)
6723 dump_printf_loc (MSG_NOTE, vect_location,
6724 "using an in-order (fold-left) reduction.\n");
6725 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6726 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6727 reductions go through their own vectorizable_* routines. */
6728 if (!single_defuse_cycle
6729 && code != DOT_PROD_EXPR
6730 && code != WIDEN_SUM_EXPR
6731 && code != SAD_EXPR
6732 && reduction_type != FOLD_LEFT_REDUCTION)
6733 {
6734 stmt_vec_info tem
6735 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6736 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6737 {
6738 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6739 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6740 }
6741 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6742 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6743 }
6744 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6745 {
6746 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6747 internal_fn cond_fn = get_conditional_internal_fn (code);
6748
6749 if (reduction_type != FOLD_LEFT_REDUCTION
6750 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6751 && (cond_fn == IFN_LAST
6752 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6753 OPTIMIZE_FOR_SPEED)))
6754 {
6755 if (dump_enabled_p ())
6756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6757 "can't use a fully-masked loop because no"
6758 " conditional operation is available.\n");
6759 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6760 }
6761 else if (reduction_type == FOLD_LEFT_REDUCTION
6762 && reduc_fn == IFN_LAST
6763 && !expand_vec_cond_expr_p (vectype_in,
6764 truth_type_for (vectype_in),
6765 SSA_NAME))
6766 {
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "can't use a fully-masked loop because no"
6770 " conditional operation is available.\n");
6771 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6772 }
6773 else
6774 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6775 vectype_in, NULL);
6776 }
6777 return true;
6778 }
6779
6780 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6781 value. */
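/* E.g. for the scalar statement sum_1 = _5 + sum_0, where sum_0 is the
   result of the reduction PHI and sum_1 the value flowing back over the
   loop latch, this emits the corresponding vector statement(s)
   vect_sum_1 = vect__5 + vect_sum_0; combining the lanes of the final
   vector is left to vect_create_epilog_for_reduction.  */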
6782
6783 bool
6784 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6785 stmt_vec_info *vec_stmt, slp_tree slp_node)
6786 {
6787 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6788 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6789 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6790 int i;
6791 int ncopies;
6792 int j;
6793 int vec_num;
6794
6795 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6796 gcc_assert (reduc_info->is_reduc_info);
6797
6798 if (nested_in_vect_loop_p (loop, stmt_info))
6799 {
6800 loop = loop->inner;
6801 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6802 }
6803
6804 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6805 enum tree_code code = gimple_assign_rhs_code (stmt);
6806 int op_type = TREE_CODE_LENGTH (code);
6807
6808 /* Flatten RHS. */
6809 tree ops[3];
6810 switch (get_gimple_rhs_class (code))
6811 {
6812 case GIMPLE_TERNARY_RHS:
6813 ops[2] = gimple_assign_rhs3 (stmt);
6814 /* Fall thru. */
6815 case GIMPLE_BINARY_RHS:
6816 ops[0] = gimple_assign_rhs1 (stmt);
6817 ops[1] = gimple_assign_rhs2 (stmt);
6818 break;
6819 default:
6820 gcc_unreachable ();
6821 }
6822
6823 /* All uses but the last are expected to be defined in the loop.
6824 The last use is the reduction variable. In case of nested cycle this
6825 assumption is not true: we use reduc_index to record the index of the
6826 reduction variable. */
6827 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6828 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6829 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6830 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6831
6832 if (slp_node)
6833 {
6834 ncopies = 1;
6835 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6836 }
6837 else
6838 {
6839 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6840 vec_num = 1;
6841 }
6842
6843 internal_fn cond_fn = get_conditional_internal_fn (code);
6844 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6845 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6846
6847 /* Transform. */
6848 stmt_vec_info new_stmt_info = NULL;
6849 stmt_vec_info prev_stmt_info;
6850 tree new_temp = NULL_TREE;
6851 auto_vec<tree> vec_oprnds0;
6852 auto_vec<tree> vec_oprnds1;
6853 auto_vec<tree> vec_oprnds2;
6854 tree def0;
6855
6856 if (dump_enabled_p ())
6857 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6858
6859 /* FORNOW: Multiple types are not supported for condition. */
6860 if (code == COND_EXPR)
6861 gcc_assert (ncopies == 1);
6862
6863 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6864
6865 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6866 if (reduction_type == FOLD_LEFT_REDUCTION)
6867 {
6868 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6869 return vectorize_fold_left_reduction
6870 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6871 reduc_fn, ops, vectype_in, reduc_index, masks);
6872 }
6873
6874 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6875 gcc_assert (single_defuse_cycle
6876 || code == DOT_PROD_EXPR
6877 || code == WIDEN_SUM_EXPR
6878 || code == SAD_EXPR);
6879
6880 /* Create the destination vector */
6881 tree scalar_dest = gimple_assign_lhs (stmt);
6882 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6883
6884 prev_stmt_info = NULL;
6885 if (!slp_node)
6886 {
6887 vec_oprnds0.create (1);
6888 vec_oprnds1.create (1);
6889 if (op_type == ternary_op)
6890 vec_oprnds2.create (1);
6891 }
6892
6893 for (j = 0; j < ncopies; j++)
6894 {
6895 /* Handle uses. */
6896 if (j == 0)
6897 {
6898 if (slp_node)
6899 {
6900 /* Get vec defs for all the operands except the reduction index,
6901 ensuring the ordering of the ops in the vector is kept. */
6902 auto_vec<vec<tree>, 3> vec_defs;
6903 vect_get_slp_defs (slp_node, &vec_defs);
6904 vec_oprnds0.safe_splice (vec_defs[0]);
6905 vec_defs[0].release ();
6906 vec_oprnds1.safe_splice (vec_defs[1]);
6907 vec_defs[1].release ();
6908 if (op_type == ternary_op)
6909 {
6910 vec_oprnds2.safe_splice (vec_defs[2]);
6911 vec_defs[2].release ();
6912 }
6913 }
6914 else
6915 {
6916 vec_oprnds0.quick_push
6917 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6918 vec_oprnds1.quick_push
6919 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6920 if (op_type == ternary_op)
6921 vec_oprnds2.quick_push
6922 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6923 }
6924 }
6925 else
6926 {
6927 if (!slp_node)
6928 {
6929 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6930
6931 if (single_defuse_cycle && reduc_index == 0)
6932 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6933 else
6934 vec_oprnds0[0]
6935 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6936 vec_oprnds0[0]);
6937 if (single_defuse_cycle && reduc_index == 1)
6938 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6939 else
6940 vec_oprnds1[0]
6941 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6942 vec_oprnds1[0]);
6943 if (op_type == ternary_op)
6944 {
6945 if (single_defuse_cycle && reduc_index == 2)
6946 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6947 else
6948 vec_oprnds2[0]
6949 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6950 vec_oprnds2[0]);
6951 }
6952 }
6953 }
6954
6955 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6956 {
6957 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6958 if (masked_loop_p && !mask_by_cond_expr)
6959 {
6960 /* Make sure that the reduction accumulator is vop[0]. */
6961 if (reduc_index == 1)
6962 {
6963 gcc_assert (commutative_tree_code (code));
6964 std::swap (vop[0], vop[1]);
6965 }
6966 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6967 vectype_in, i * ncopies + j);
6968 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6969 vop[0], vop[1],
6970 vop[0]);
6971 new_temp = make_ssa_name (vec_dest, call);
6972 gimple_call_set_lhs (call, new_temp);
6973 gimple_call_set_nothrow (call, true);
6974 new_stmt_info
6975 = vect_finish_stmt_generation (stmt_info, call, gsi);
6976 }
6977 else
6978 {
6979 if (op_type == ternary_op)
6980 vop[2] = vec_oprnds2[i];
6981
6982 if (masked_loop_p && mask_by_cond_expr)
6983 {
6984 tree mask = vect_get_loop_mask (gsi, masks,
6985 vec_num * ncopies,
6986 vectype_in, i * ncopies + j);
6987 build_vect_cond_expr (code, vop, mask, gsi);
6988 }
6989
6990 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6991 vop[0], vop[1], vop[2]);
6992 new_temp = make_ssa_name (vec_dest, new_stmt);
6993 gimple_assign_set_lhs (new_stmt, new_temp);
6994 new_stmt_info
6995 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6996 }
6997
6998 if (slp_node)
6999 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7000 }
7001
7002 if (slp_node || single_defuse_cycle)
7003 continue;
7004
7005 if (j == 0)
7006 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7007 else
7008 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7009
7010 prev_stmt_info = new_stmt_info;
7011 }
7012
7013 if (single_defuse_cycle && !slp_node)
7014 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7015
7016 return true;
7017 }
7018
7019 /* Transform phase of a cycle PHI. */
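/* E.g. for the scalar reduction PHI sum_0 = PHI <init(preheader), sum_1(latch)>
   this creates the vector PHI(s), sets their preheader argument to the
   initial vector computed by get_initial_defs_for_reduction or
   get_initial_def_for_reduction, and leaves the latch argument to be
   filled in by epilogue processing once the backedge value has been
   vectorized.  */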
7020
7021 bool
7022 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7023 slp_tree slp_node, slp_instance slp_node_instance)
7024 {
7025 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7026 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7027 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7028 int i;
7029 int ncopies;
7030 stmt_vec_info prev_phi_info;
7031 int j;
7032 bool nested_cycle = false;
7033 int vec_num;
7034
7035 if (nested_in_vect_loop_p (loop, stmt_info))
7036 {
7037 loop = loop->inner;
7038 nested_cycle = true;
7039 }
7040
7041 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7042 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7043 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7044 gcc_assert (reduc_info->is_reduc_info);
7045
7046 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7047 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7048 /* Leave the scalar phi in place. */
7049 return true;
7050
7051 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7052 /* For a nested cycle we do not fill the above. */
7053 if (!vectype_in)
7054 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7055 gcc_assert (vectype_in);
7056
7057 if (slp_node)
7058 {
7059 /* The size vect_schedule_slp_instance computes is off for us. */
7060 vec_num = vect_get_num_vectors
7061 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7062 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7063 ncopies = 1;
7064 }
7065 else
7066 {
7067 vec_num = 1;
7068 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7069 }
7070
7071 /* Check whether we should use a single PHI node and accumulate
7072 vectors to one before the backedge. */
7073 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7074 ncopies = 1;
7075
7076 /* Create the destination vector */
7077 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7078 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7079 vectype_out);
7080
7081 /* Get the loop-entry arguments. */
7082 tree vec_initial_def;
7083 auto_vec<tree> vec_initial_defs;
7084 if (slp_node)
7085 {
7086 vec_initial_defs.reserve (vec_num);
7087 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7088 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7089 tree neutral_op
7090 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7091 STMT_VINFO_REDUC_CODE (reduc_info),
7092 first != NULL);
7093 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7094 &vec_initial_defs, vec_num,
7095 first != NULL, neutral_op);
7096 }
7097 else
7098 {
7099 /* Get at the scalar def before the loop, that defines the initial
7100 value of the reduction variable. */
7101 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7102 loop_preheader_edge (loop));
7103 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7104 and we can't use zero for induc_val, use initial_def. Similarly
7105 for REDUC_MIN and initial_def larger than the base. */
7106 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7107 {
7108 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7109 if (TREE_CODE (initial_def) == INTEGER_CST
7110 && !integer_zerop (induc_val)
7111 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7112 && tree_int_cst_lt (initial_def, induc_val))
7113 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7114 && tree_int_cst_lt (induc_val, initial_def))))
7115 {
7116 induc_val = initial_def;
7117 	      /* Communicate to epilogue generation that we used
7118 		 the initial_def.  */
7119 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7120 }
7121 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7122 }
7123 else if (nested_cycle)
7124 {
7125 /* Do not use an adjustment def as that case is not supported
7126 correctly if ncopies is not one. */
7127 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7128 reduc_stmt_info);
7129 }
7130 else
7131 {
7132 tree adjustment_def = NULL_TREE;
7133 tree *adjustment_defp = &adjustment_def;
7134 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7135 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7136 adjustment_defp = NULL;
7137 vec_initial_def
7138 = get_initial_def_for_reduction (reduc_stmt_info, code,
7139 initial_def, adjustment_defp);
7140 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7141 }
7142 vec_initial_defs.create (1);
7143 vec_initial_defs.quick_push (vec_initial_def);
7144 }
7145
7146 /* Generate the reduction PHIs upfront. */
7147 prev_phi_info = NULL;
7148 for (i = 0; i < vec_num; i++)
7149 {
7150 tree vec_init_def = vec_initial_defs[i];
7151 for (j = 0; j < ncopies; j++)
7152 {
7153 /* Create the reduction-phi that defines the reduction
7154 operand. */
7155 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7156 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7157
7158 /* Set the loop-entry arg of the reduction-phi. */
7159 if (j != 0 && nested_cycle)
7160 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7161 vec_init_def);
7162 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7163 UNKNOWN_LOCATION);
7164
7165 /* The loop-latch arg is set in epilogue processing. */
7166
7167 if (slp_node)
7168 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7169 else
7170 {
7171 if (j == 0)
7172 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7173 else
7174 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7175 prev_phi_info = new_phi_info;
7176 }
7177 }
7178 }
7179
7180 return true;
7181 }
7182
7183 /* Vectorizes LC PHIs. */
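/* An LC (loop-closed) PHI is a degenerate single-argument PHI in a block
   immediately following a loop, e.g. x_2 = PHI <x_1(loop exit)>; it is
   replaced here by equivalent single-argument PHI(s) over the vectorized
   definition of its argument.  */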
7184
7185 bool
7186 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7187 slp_tree slp_node)
7188 {
7189 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7190 if (!loop_vinfo
7191 || !is_a <gphi *> (stmt_info->stmt)
7192 || gimple_phi_num_args (stmt_info->stmt) != 1)
7193 return false;
7194
7195 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7196 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7197 return false;
7198
7199 if (!vec_stmt) /* transformation not required. */
7200 {
7201 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7202 return true;
7203 }
7204
7205 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7206 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7207 basic_block bb = gimple_bb (stmt_info->stmt);
7208 edge e = single_pred_edge (bb);
7209 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7210 vec<tree> vec_oprnds = vNULL;
7211 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7212 stmt_info, &vec_oprnds, NULL, slp_node);
7213 if (slp_node)
7214 {
7215 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7216 gcc_assert (vec_oprnds.length () == vec_num);
7217 for (unsigned i = 0; i < vec_num; i++)
7218 {
7219 /* Create the vectorized LC PHI node. */
7220 gphi *new_phi = create_phi_node (vec_dest, bb);
7221 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7222 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7223 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7224 }
7225 }
7226 else
7227 {
7228 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7229 stmt_vec_info prev_phi_info = NULL;
7230 for (unsigned i = 0; i < ncopies; i++)
7231 {
7232 if (i != 0)
7233 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7234 /* Create the vectorized LC PHI node. */
7235 gphi *new_phi = create_phi_node (vec_dest, bb);
7236 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7237 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7238 if (i == 0)
7239 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7240 else
7241 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7242 prev_phi_info = new_phi_info;
7243 }
7244 }
7245 vec_oprnds.release ();
7246
7247 return true;
7248 }
7249
7250
7251 /* Function vect_min_worthwhile_factor.
7252
7253 For a loop where we could vectorize the operation indicated by CODE,
7254 return the minimum vectorization factor that makes it worthwhile
7255 to use generic vectors. */
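/* E.g. emulating vector addition with scalar word-mode arithmetic is only
   considered worthwhile when at least four elements are processed per
   vector operation, while bitwise operations pay off from two elements on;
   codes not listed below effectively disable the emulation via INT_MAX.  */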
7256 static unsigned int
7257 vect_min_worthwhile_factor (enum tree_code code)
7258 {
7259 switch (code)
7260 {
7261 case PLUS_EXPR:
7262 case MINUS_EXPR:
7263 case NEGATE_EXPR:
7264 return 4;
7265
7266 case BIT_AND_EXPR:
7267 case BIT_IOR_EXPR:
7268 case BIT_XOR_EXPR:
7269 case BIT_NOT_EXPR:
7270 return 2;
7271
7272 default:
7273 return INT_MAX;
7274 }
7275 }
7276
7277 /* Return true if VINFO indicates we are doing loop vectorization and if
7278 it is worth decomposing CODE operations into scalar operations for
7279 that loop's vectorization factor. */
7280
7281 bool
7282 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7283 {
7284 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7285 unsigned HOST_WIDE_INT value;
7286 return (loop_vinfo
7287 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7288 && value >= vect_min_worthwhile_factor (code));
7289 }
7290
7291 /* Function vectorizable_induction
7292
7293 Check if STMT_INFO performs an induction computation that can be vectorized.
7294 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7295 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7296 Return true if STMT_INFO is vectorizable in this way. */
7297
7298 bool
7299 vectorizable_induction (stmt_vec_info stmt_info,
7300 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7301 stmt_vec_info *vec_stmt, slp_tree slp_node,
7302 stmt_vector_for_cost *cost_vec)
7303 {
7304 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7305 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7306 unsigned ncopies;
7307 bool nested_in_vect_loop = false;
7308 class loop *iv_loop;
7309 tree vec_def;
7310 edge pe = loop_preheader_edge (loop);
7311 basic_block new_bb;
7312 tree new_vec, vec_init, vec_step, t;
7313 tree new_name;
7314 gimple *new_stmt;
7315 gphi *induction_phi;
7316 tree induc_def, vec_dest;
7317 tree init_expr, step_expr;
7318 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7319 unsigned i;
7320 tree expr;
7321 gimple_seq stmts;
7322 imm_use_iterator imm_iter;
7323 use_operand_p use_p;
7324 gimple *exit_phi;
7325 edge latch_e;
7326 tree loop_arg;
7327 gimple_stmt_iterator si;
7328
7329 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7330 if (!phi)
7331 return false;
7332
7333 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7334 return false;
7335
7336 /* Make sure it was recognized as induction computation. */
7337 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7338 return false;
7339
7340 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7341 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7342
7343 if (slp_node)
7344 ncopies = 1;
7345 else
7346 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7347 gcc_assert (ncopies >= 1);
7348
7349 /* FORNOW. These restrictions should be relaxed. */
7350 if (nested_in_vect_loop_p (loop, stmt_info))
7351 {
7352 imm_use_iterator imm_iter;
7353 use_operand_p use_p;
7354 gimple *exit_phi;
7355 edge latch_e;
7356 tree loop_arg;
7357
7358 if (ncopies > 1)
7359 {
7360 if (dump_enabled_p ())
7361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7362 "multiple types in nested loop.\n");
7363 return false;
7364 }
7365
7366 /* FORNOW: outer loop induction with SLP not supported. */
7367 if (STMT_SLP_TYPE (stmt_info))
7368 return false;
7369
7370 exit_phi = NULL;
7371 latch_e = loop_latch_edge (loop->inner);
7372 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7373 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7374 {
7375 gimple *use_stmt = USE_STMT (use_p);
7376 if (is_gimple_debug (use_stmt))
7377 continue;
7378
7379 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7380 {
7381 exit_phi = use_stmt;
7382 break;
7383 }
7384 }
7385 if (exit_phi)
7386 {
7387 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7388 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7389 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7390 {
7391 if (dump_enabled_p ())
7392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7393 "inner-loop induction only used outside "
7394 "of the outer vectorized loop.\n");
7395 return false;
7396 }
7397 }
7398
7399 nested_in_vect_loop = true;
7400 iv_loop = loop->inner;
7401 }
7402 else
7403 iv_loop = loop;
7404 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7405
7406 if (slp_node && !nunits.is_constant ())
7407 {
7408 /* The current SLP code creates the initial value element-by-element. */
7409 if (dump_enabled_p ())
7410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7411 "SLP induction not supported for variable-length"
7412 " vectors.\n");
7413 return false;
7414 }
7415
7416 if (!vec_stmt) /* transformation not required. */
7417 {
7418 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7419 DUMP_VECT_SCOPE ("vectorizable_induction");
7420 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7421 return true;
7422 }
7423
7424 /* Transform. */
7425
7426 /* Compute a vector variable, initialized with the first VF values of
7427 the induction variable. E.g., for an iv with IV_PHI='X' and
7428 evolution S, for a vector of 4 units, we want to compute:
7429 [X, X + S, X + 2*S, X + 3*S]. */
7430
7431 if (dump_enabled_p ())
7432 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7433
7434 latch_e = loop_latch_edge (iv_loop);
7435 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7436
7437 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7438 gcc_assert (step_expr != NULL_TREE);
7439 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7440
7441 pe = loop_preheader_edge (iv_loop);
7442 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7443 loop_preheader_edge (iv_loop));
7444
7445 stmts = NULL;
7446 if (!nested_in_vect_loop)
7447 {
7448 /* Convert the initial value to the IV update type. */
7449 tree new_type = TREE_TYPE (step_expr);
7450 init_expr = gimple_convert (&stmts, new_type, init_expr);
7451
7452 /* If we are using the loop mask to "peel" for alignment then we need
7453 to adjust the start value here. */
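      /* E.g. if the first SKIP_NITERS iterations are masked off, starting
	 the IV at INIT - SKIP_NITERS * STEP makes lane SKIP_NITERS of the
	 first vector iteration compute the original initial value.  */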
7454 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7455 if (skip_niters != NULL_TREE)
7456 {
7457 if (FLOAT_TYPE_P (vectype))
7458 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7459 skip_niters);
7460 else
7461 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7462 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7463 skip_niters, step_expr);
7464 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7465 init_expr, skip_step);
7466 }
7467 }
7468
7469 if (stmts)
7470 {
7471 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7472 gcc_assert (!new_bb);
7473 }
7474
7475 /* Find the first insertion point in the BB. */
7476 basic_block bb = gimple_bb (phi);
7477 si = gsi_after_labels (bb);
7478
7479 /* For SLP induction we have to generate several IVs as for example
7480 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7481 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7482 [VF*S, VF*S, VF*S, VF*S] for all. */
7483 if (slp_node)
7484 {
7485 /* Enforced above. */
7486 unsigned int const_nunits = nunits.to_constant ();
7487
7488 /* Generate [VF*S, VF*S, ... ]. */
7489 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7490 {
7491 expr = build_int_cst (integer_type_node, vf);
7492 expr = fold_convert (TREE_TYPE (step_expr), expr);
7493 }
7494 else
7495 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7496 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7497 expr, step_expr);
7498 if (! CONSTANT_CLASS_P (new_name))
7499 new_name = vect_init_vector (stmt_info, new_name,
7500 TREE_TYPE (step_expr), NULL);
7501 new_vec = build_vector_from_val (step_vectype, new_name);
7502 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7503
7504 /* Now generate the IVs. */
7505 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7506 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7507 unsigned elts = const_nunits * nvects;
7508 unsigned nivs = least_common_multiple (group_size,
7509 const_nunits) / const_nunits;
7510 gcc_assert (elts % group_size == 0);
7511 tree elt = init_expr;
7512 unsigned ivn;
7513 for (ivn = 0; ivn < nivs; ++ivn)
7514 {
7515 tree_vector_builder elts (step_vectype, const_nunits, 1);
7516 stmts = NULL;
7517 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7518 {
7519 if (ivn*const_nunits + eltn >= group_size
7520 && (ivn * const_nunits + eltn) % group_size == 0)
7521 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7522 elt, step_expr);
7523 elts.quick_push (elt);
7524 }
7525 vec_init = gimple_build_vector (&stmts, &elts);
7526 vec_init = gimple_convert (&stmts, vectype, vec_init);
7527 if (stmts)
7528 {
7529 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7530 gcc_assert (!new_bb);
7531 }
7532
7533 /* Create the induction-phi that defines the induction-operand. */
7534 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7535 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7536 stmt_vec_info induction_phi_info
7537 = loop_vinfo->add_stmt (induction_phi);
7538 induc_def = PHI_RESULT (induction_phi);
7539
7540 /* Create the iv update inside the loop */
7541 gimple_seq stmts = NULL;
7542 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7543 vec_def = gimple_build (&stmts,
7544 PLUS_EXPR, step_vectype, vec_def, vec_step);
7545 vec_def = gimple_convert (&stmts, vectype, vec_def);
7546 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7547 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7548
7549 /* Set the arguments of the phi node: */
7550 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7551 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7552 UNKNOWN_LOCATION);
7553
7554 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7555 }
7556
7557 /* Re-use IVs when we can. */
7558 if (ivn < nvects)
7559 {
7560 unsigned vfp
7561 = least_common_multiple (group_size, const_nunits) / group_size;
7562 /* Generate [VF'*S, VF'*S, ... ]. */
7563 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7564 {
7565 expr = build_int_cst (integer_type_node, vfp);
7566 expr = fold_convert (TREE_TYPE (step_expr), expr);
7567 }
7568 else
7569 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7570 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7571 expr, step_expr);
7572 if (! CONSTANT_CLASS_P (new_name))
7573 new_name = vect_init_vector (stmt_info, new_name,
7574 TREE_TYPE (step_expr), NULL);
7575 new_vec = build_vector_from_val (step_vectype, new_name);
7576 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7577 for (; ivn < nvects; ++ivn)
7578 {
7579 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7580 tree def;
7581 if (gimple_code (iv) == GIMPLE_PHI)
7582 def = gimple_phi_result (iv);
7583 else
7584 def = gimple_assign_lhs (iv);
7585 gimple_seq stmts = NULL;
7586 def = gimple_convert (&stmts, step_vectype, def);
7587 def = gimple_build (&stmts,
7588 PLUS_EXPR, step_vectype, def, vec_step);
7589 def = gimple_convert (&stmts, vectype, def);
7590 if (gimple_code (iv) == GIMPLE_PHI)
7591 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7592 else
7593 {
7594 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7595 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7596 }
7597 SLP_TREE_VEC_STMTS (slp_node).quick_push
7598 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7599 }
7600 }
7601
7602 return true;
7603 }
7604
7605 /* Create the vector that holds the initial_value of the induction. */
7606 if (nested_in_vect_loop)
7607 {
7608 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7609 been created during vectorization of previous stmts. We obtain it
7610 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7611 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7612 /* If the initial value is not of proper type, convert it. */
7613 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7614 {
7615 new_stmt
7616 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7617 vect_simple_var,
7618 "vec_iv_"),
7619 VIEW_CONVERT_EXPR,
7620 build1 (VIEW_CONVERT_EXPR, vectype,
7621 vec_init));
7622 vec_init = gimple_assign_lhs (new_stmt);
7623 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7624 new_stmt);
7625 gcc_assert (!new_bb);
7626 loop_vinfo->add_stmt (new_stmt);
7627 }
7628 }
7629 else
7630 {
7631 /* iv_loop is the loop to be vectorized. Create:
7632 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7633 stmts = NULL;
7634 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7635
7636 unsigned HOST_WIDE_INT const_nunits;
7637 if (nunits.is_constant (&const_nunits))
7638 {
7639 tree_vector_builder elts (step_vectype, const_nunits, 1);
7640 elts.quick_push (new_name);
7641 for (i = 1; i < const_nunits; i++)
7642 {
7643 /* Create: new_name_i = new_name + step_expr */
7644 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7645 new_name, step_expr);
7646 elts.quick_push (new_name);
7647 }
7648 /* Create a vector from [new_name_0, new_name_1, ...,
7649 new_name_nunits-1] */
7650 vec_init = gimple_build_vector (&stmts, &elts);
7651 }
7652 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7653 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7654 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7655 new_name, step_expr);
7656 else
7657 {
7658 /* Build:
7659 [base, base, base, ...]
7660 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7661 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7662 gcc_assert (flag_associative_math);
7663 tree index = build_index_vector (step_vectype, 0, 1);
7664 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7665 new_name);
7666 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7667 step_expr);
7668 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7669 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7670 vec_init, step_vec);
7671 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7672 vec_init, base_vec);
7673 }
7674 vec_init = gimple_convert (&stmts, vectype, vec_init);
7675
7676 if (stmts)
7677 {
7678 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7679 gcc_assert (!new_bb);
7680 }
7681 }
7682
7683
7684 /* Create the vector that holds the step of the induction. */
7685 if (nested_in_vect_loop)
7686 /* iv_loop is nested in the loop to be vectorized. Generate:
7687 vec_step = [S, S, S, S] */
7688 new_name = step_expr;
7689 else
7690 {
7691 /* iv_loop is the loop to be vectorized. Generate:
7692 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7693 gimple_seq seq = NULL;
7694 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7695 {
7696 expr = build_int_cst (integer_type_node, vf);
7697 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7698 }
7699 else
7700 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7701 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7702 expr, step_expr);
7703 if (seq)
7704 {
7705 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7706 gcc_assert (!new_bb);
7707 }
7708 }
7709
7710 t = unshare_expr (new_name);
7711 gcc_assert (CONSTANT_CLASS_P (new_name)
7712 || TREE_CODE (new_name) == SSA_NAME);
7713 new_vec = build_vector_from_val (step_vectype, t);
7714 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7715
7716
7717 /* Create the following def-use cycle:
7718 loop prolog:
7719 vec_init = ...
7720 vec_step = ...
7721 loop:
7722 vec_iv = PHI <vec_init, vec_loop>
7723 ...
7724 STMT
7725 ...
7726 vec_loop = vec_iv + vec_step; */
7727
7728 /* Create the induction-phi that defines the induction-operand. */
7729 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7730 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7731 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7732 induc_def = PHI_RESULT (induction_phi);
7733
7734 /* Create the iv update inside the loop */
7735 stmts = NULL;
7736 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7737 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7738 vec_def = gimple_convert (&stmts, vectype, vec_def);
7739 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7740 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7741 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7742
7743 /* Set the arguments of the phi node: */
7744 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7745 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7746 UNKNOWN_LOCATION);
7747
7748 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7749
7750   /* In case the vectorization factor (VF) is bigger than the number
7751 of elements that we can fit in a vectype (nunits), we have to generate
7752 more than one vector stmt - i.e - we need to "unroll" the
7753 vector stmt by a factor VF/nunits. For more details see documentation
7754 in vectorizable_operation. */
7755
7756 if (ncopies > 1)
7757 {
7758 gimple_seq seq = NULL;
7759 stmt_vec_info prev_stmt_vinfo;
7760 /* FORNOW. This restriction should be relaxed. */
7761 gcc_assert (!nested_in_vect_loop);
7762
7763 /* Create the vector that holds the step of the induction. */
7764 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7765 {
7766 expr = build_int_cst (integer_type_node, nunits);
7767 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7768 }
7769 else
7770 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7771 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7772 expr, step_expr);
7773 if (seq)
7774 {
7775 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7776 gcc_assert (!new_bb);
7777 }
7778
7779 t = unshare_expr (new_name);
7780 gcc_assert (CONSTANT_CLASS_P (new_name)
7781 || TREE_CODE (new_name) == SSA_NAME);
7782 new_vec = build_vector_from_val (step_vectype, t);
7783 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7784
7785 vec_def = induc_def;
7786 prev_stmt_vinfo = induction_phi_info;
7787 for (i = 1; i < ncopies; i++)
7788 {
7789 /* vec_i = vec_prev + vec_step */
7790 gimple_seq stmts = NULL;
7791 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7792 vec_def = gimple_build (&stmts,
7793 PLUS_EXPR, step_vectype, vec_def, vec_step);
7794 vec_def = gimple_convert (&stmts, vectype, vec_def);
7795
7796 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7797 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7798 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7799 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7800 prev_stmt_vinfo = new_stmt_info;
7801 }
7802 }
7803
7804 if (nested_in_vect_loop)
7805 {
7806 /* Find the loop-closed exit-phi of the induction, and record
7807 the final vector of induction results: */
7808 exit_phi = NULL;
7809 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7810 {
7811 gimple *use_stmt = USE_STMT (use_p);
7812 if (is_gimple_debug (use_stmt))
7813 continue;
7814
7815 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7816 {
7817 exit_phi = use_stmt;
7818 break;
7819 }
7820 }
7821 if (exit_phi)
7822 {
7823 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7824 /* FORNOW. Currently not supporting the case that an inner-loop induction
7825 is not used in the outer-loop (i.e. only outside the outer-loop). */
7826 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7827 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7828
7829 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7830 if (dump_enabled_p ())
7831 dump_printf_loc (MSG_NOTE, vect_location,
7832 "vector of inductions after inner-loop:%G",
7833 new_stmt);
7834 }
7835 }
7836
7837
7838 if (dump_enabled_p ())
7839 dump_printf_loc (MSG_NOTE, vect_location,
7840 "transform induction: created def-use cycle: %G%G",
7841 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7842
7843 return true;
7844 }
7845
7846 /* Function vectorizable_live_operation.
7847
7848 STMT_INFO computes a value that is used outside the loop. Check if
7849 it can be supported. */
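/* E.g. for

     for (i = 0; i < n; i++)
       last = a[i];
     ... use of last after the loop ...

   the scalar result is recovered by extracting the final lane of the last
   vector of loads (or via the EXTRACT_LAST internal function when the
   loop is fully masked).  */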
7850
7851 bool
7852 vectorizable_live_operation (stmt_vec_info stmt_info,
7853 gimple_stmt_iterator *gsi,
7854 slp_tree slp_node, slp_instance slp_node_instance,
7855 int slp_index, bool vec_stmt_p,
7856 stmt_vector_for_cost *)
7857 {
7858 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7859 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7860 imm_use_iterator imm_iter;
7861 tree lhs, lhs_type, bitsize, vec_bitsize;
7862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7863 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7864 int ncopies;
7865 gimple *use_stmt;
7866 auto_vec<tree> vec_oprnds;
7867 int vec_entry = 0;
7868 poly_uint64 vec_index = 0;
7869
7870 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7871
7872 /* If a stmt of a reduction is live, vectorize it via
7873 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7874 validity so just trigger the transform here. */
7875 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7876 {
7877 if (!vec_stmt_p)
7878 return true;
7879 if (slp_node)
7880 {
7881 /* For reduction chains the meta-info is attached to
7882 the group leader. */
7883 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7884 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7885 /* For SLP reductions we vectorize the epilogue for
7886 all involved stmts together. */
7887 else if (slp_index != 0)
7888 return true;
7889 }
7890 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7891 gcc_assert (reduc_info->is_reduc_info);
7892 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7893 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7894 return true;
7895 vect_create_epilog_for_reduction (stmt_info, slp_node,
7896 slp_node_instance);
7897 return true;
7898 }
7899
7900 /* FORNOW. CHECKME. */
7901 if (nested_in_vect_loop_p (loop, stmt_info))
7902 return false;
7903
7904 /* If STMT is not relevant and it is a simple assignment and its inputs are
7905 invariant then it can remain in place, unvectorized. The original last
7906 scalar value that it computes will be used. */
7907 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7908 {
7909 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7910 if (dump_enabled_p ())
7911 dump_printf_loc (MSG_NOTE, vect_location,
7912 "statement is simple and uses invariant. Leaving in "
7913 "place.\n");
7914 return true;
7915 }
7916
7917 if (slp_node)
7918 ncopies = 1;
7919 else
7920 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7921
7922 if (slp_node)
7923 {
7924 gcc_assert (slp_index >= 0);
7925
7926 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7927 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7928
7929 /* Get the last occurrence of the scalar index from the concatenation of
7930 all the slp vectors. Calculate which slp vector it is and the index
7931 within. */
7932 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7933
7934 /* Calculate which vector contains the result, and which lane of
7935 that vector we need. */
7936 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7937 {
7938 if (dump_enabled_p ())
7939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7940 "Cannot determine which vector holds the"
7941 " final result.\n");
7942 return false;
7943 }
7944 }
7945
7946 if (!vec_stmt_p)
7947 {
7948 /* No transformation required. */
7949 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7950 {
7951 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7952 OPTIMIZE_FOR_SPEED))
7953 {
7954 if (dump_enabled_p ())
7955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7956 "can't use a fully-masked loop because "
7957 "the target doesn't support extract last "
7958 "reduction.\n");
7959 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7960 }
7961 else if (slp_node)
7962 {
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "can't use a fully-masked loop because an "
7966 "SLP statement is live after the loop.\n");
7967 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7968 }
7969 else if (ncopies > 1)
7970 {
7971 if (dump_enabled_p ())
7972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7973 "can't use a fully-masked loop because"
7974 " ncopies is greater than 1.\n");
7975 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7976 }
7977 else
7978 {
7979 gcc_assert (ncopies == 1 && !slp_node);
7980 vect_record_loop_mask (loop_vinfo,
7981 &LOOP_VINFO_MASKS (loop_vinfo),
7982 1, vectype, NULL);
7983 }
7984 }
7985 return true;
7986 }
7987
7988 /* Use the lhs of the original scalar statement. */
7989 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7990
7991 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7992 : gimple_get_lhs (stmt);
7993 lhs_type = TREE_TYPE (lhs);
7994
7995 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7996 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7997 : TYPE_SIZE (TREE_TYPE (vectype)));
7998 vec_bitsize = TYPE_SIZE (vectype);
7999
8000 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8001 tree vec_lhs, bitstart;
8002 if (slp_node)
8003 {
8004 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8005
8006 /* Get the correct slp vectorized stmt. */
8007 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8008 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8009 vec_lhs = gimple_phi_result (phi);
8010 else
8011 vec_lhs = gimple_get_lhs (vec_stmt);
8012
8013 /* Get entry to use. */
8014 bitstart = bitsize_int (vec_index);
8015 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8016 }
8017 else
8018 {
8019 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8020 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8021 gcc_checking_assert (ncopies == 1
8022 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8023
8024 /* For multiple copies, get the last copy. */
8025 for (int i = 1; i < ncopies; ++i)
8026 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8027
8028 /* Get the last lane in the vector. */
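/* For example, for a V4SI vector this is bit 128 - 32 == 96.  */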
8029 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8030 }
8031
8032 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8033 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8034 loop;
8035 BB:
8036 # lhs' = PHI <lhs>
8037 ==>
8038 loop;
8039 BB:
8040 # vec_lhs' = PHI <vec_lhs>
8041 new_tree = lane_extract <vec_lhs', ...>;
8042 lhs' = new_tree; */
8043
8044 basic_block exit_bb = single_exit (loop)->dest;
8045 gcc_assert (single_pred_p (exit_bb));
8046
8047 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8048 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8049 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8050
8051 gimple_seq stmts = NULL;
8052 tree new_tree;
8053 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8054 {
8055 /* Emit:
8056
8057 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8058
8059 where VEC_LHS is the vectorized live-out result and MASK is
8060 the loop mask for the final iteration. */
8061 gcc_assert (ncopies == 1 && !slp_node);
8062 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8063 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8064 vectype, 0);
8065 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8066 mask, vec_lhs_phi);
8067
8068 /* Convert the extracted vector element to the required scalar type. */
8069 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8070 }
8071 else
8072 {
8073 tree bftype = TREE_TYPE (vectype);
8074 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8075 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8076 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8077 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8078 &stmts, true, NULL_TREE);
8079 }
8080
8081 if (stmts)
8082 {
8083 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8084 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8085
8086 /* Remove the existing PHI node fed by lhs and replace it with an assignment of new_tree to its result. */
8087 tree lhs_phi = NULL_TREE;
8088 gimple_stmt_iterator gsi;
8089 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8090 {
8091 gimple *phi = gsi_stmt (gsi);
8092 if ((gimple_phi_arg_def (phi, 0) == lhs))
8093 {
8094 remove_phi_node (&gsi, false);
8095 lhs_phi = gimple_phi_result (phi);
8096 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8097 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8098 break;
8099 }
8100 }
8101 }
8102
8103 /* Replace uses of LHS with the newly computed result. If the use stmt is a
8104 single-argument PHI, just replace all uses of the PHI result. This is necessary
8105 because the lcssa PHI defining LHS may appear before the newly inserted stmt. */
8106 use_operand_p use_p;
8107 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8108 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8109 && !is_gimple_debug (use_stmt))
8110 {
8111 if (gimple_code (use_stmt) == GIMPLE_PHI
8112 && gimple_phi_num_args (use_stmt) == 1)
8113 {
8114 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8115 }
8116 else
8117 {
8118 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8119 SET_USE (use_p, new_tree);
8120 }
8121 update_stmt (use_stmt);
8122 }
8123
8124 return true;
8125 }
8126
8127 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8128
8129 static void
8130 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8131 {
8132 ssa_op_iter op_iter;
8133 imm_use_iterator imm_iter;
8134 def_operand_p def_p;
8135 gimple *ustmt;
8136
8137 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8138 {
8139 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8140 {
8141 basic_block bb;
8142
8143 if (!is_gimple_debug (ustmt))
8144 continue;
8145
8146 bb = gimple_bb (ustmt);
8147
8148 if (!flow_bb_inside_loop_p (loop, bb))
8149 {
8150 if (gimple_debug_bind_p (ustmt))
8151 {
8152 if (dump_enabled_p ())
8153 dump_printf_loc (MSG_NOTE, vect_location,
8154 "killing debug use\n");
8155
8156 gimple_debug_bind_reset_value (ustmt);
8157 update_stmt (ustmt);
8158 }
8159 else
8160 gcc_unreachable ();
8161 }
8162 }
8163 }
8164 }
8165
8166 /* Given loop represented by LOOP_VINFO, return true if computation of
8167 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8168 otherwise. */
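/* For example, if NITERSM1 has a 32-bit unsigned type and its maximum value
   is 0xffffffff, then NITERS == NITERSM1 + 1 wraps to zero and the function
   returns false.  */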
8169
8170 static bool
8171 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8172 {
8173 /* Constant case. */
8174 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8175 {
8176 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8177 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8178
8179 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8180 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8181 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8182 return true;
8183 }
8184
8185 widest_int max;
8186 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8187 /* Check the upper bound of loop niters. */
8188 if (get_max_loop_iterations (loop, &max))
8189 {
8190 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8191 signop sgn = TYPE_SIGN (type);
8192 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8193 if (max < type_max)
8194 return true;
8195 }
8196 return false;
8197 }
8198
8199 /* Return a mask type with half the number of elements as OLD_TYPE,
8200 given that it should have mode NEW_MODE. */
8201
8202 tree
8203 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8204 {
8205 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8206 return build_truth_vector_type_for_mode (nunits, new_mode);
8207 }
8208
8209 /* Return a mask type with twice as many elements as OLD_TYPE,
8210 given that it should have mode NEW_MODE. */
8211
8212 tree
8213 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8214 {
8215 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8216 return build_truth_vector_type_for_mode (nunits, new_mode);
8217 }
8218
8219 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8220 contain a sequence of NVECTORS masks that each control a vector of type
8221 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8222 these vector masks with the vector version of SCALAR_MASK. */
8223
8224 void
8225 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8226 unsigned int nvectors, tree vectype, tree scalar_mask)
8227 {
8228 gcc_assert (nvectors != 0);
8229 if (masks->length () < nvectors)
8230 masks->safe_grow_cleared (nvectors);
8231 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8232 /* The number of scalars per iteration and the number of vectors are
8233 both compile-time constants. */
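/* For example (illustrative values): with a vectorization factor of 16 and
   NVECTORS == 4 vectors of 8 elements each, this rgroup handles
   4 * 8 / 16 == 2 scalars per iteration.  */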
8234 unsigned int nscalars_per_iter
8235 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8236 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8237
8238 if (scalar_mask)
8239 {
8240 scalar_cond_masked_key cond (scalar_mask, nvectors);
8241 loop_vinfo->scalar_cond_masked_set.add (cond);
8242 }
8243
8244 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8245 {
8246 rgm->max_nscalars_per_iter = nscalars_per_iter;
8247 rgm->mask_type = truth_type_for (vectype);
8248 }
8249 }
8250
8251 /* Given a complete set of masks MASKS, extract mask number INDEX
8252 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8253 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8254
8255 See the comment above vec_loop_masks for more details about the mask
8256 arrangement. */
8257
8258 tree
8259 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8260 unsigned int nvectors, tree vectype, unsigned int index)
8261 {
8262 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8263 tree mask_type = rgm->mask_type;
8264
8265 /* Populate the rgroup's mask array, if this is the first time we've
8266 used it. */
8267 if (rgm->masks.is_empty ())
8268 {
8269 rgm->masks.safe_grow_cleared (nvectors);
8270 for (unsigned int i = 0; i < nvectors; ++i)
8271 {
8272 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8273 /* Provide a dummy definition until the real one is available. */
8274 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8275 rgm->masks[i] = mask;
8276 }
8277 }
8278
8279 tree mask = rgm->masks[index];
8280 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8281 TYPE_VECTOR_SUBPARTS (vectype)))
8282 {
8283 /* A loop mask for data type X can be reused for data type Y
8284 if X has N times more elements than Y and if Y's elements
8285 are N times bigger than X's. In this case each sequence
8286 of N elements in the loop mask will be all-zero or all-one.
8287 We can then view-convert the mask so that each sequence of
8288 N elements is replaced by a single element. */
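/* For instance, with N == 2 a mask whose lanes control 16 narrow elements
   can be view-converted to control 8 elements of twice the width: each
   adjacent pair of mask lanes is known to be identical, so every pair
   collapses to a single lane of the wider mask type.  */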
8289 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8290 TYPE_VECTOR_SUBPARTS (vectype)));
8291 gimple_seq seq = NULL;
8292 mask_type = truth_type_for (vectype);
8293 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8294 if (seq)
8295 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8296 }
8297 return mask;
8298 }
8299
8300 /* Scale profiling counters by estimation for LOOP which is vectorized
8301 by factor VF. */
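/* As a rough illustration (hypothetical profile): if the scalar loop was
   estimated to iterate 40 times per entry and VF is 4, the vectorized loop
   is expected to iterate about 10 times, so the header count is scaled down
   accordingly and the exit edge probability becomes roughly 1/11.  */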
8302
8303 static void
8304 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8305 {
8306 edge preheader = loop_preheader_edge (loop);
8307 /* Reduce loop iterations by the vectorization factor. */
8308 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8309 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8310
8311 if (freq_h.nonzero_p ())
8312 {
8313 profile_probability p;
8314
8315 /* Avoid dropping loop body profile counter to 0 because of zero count
8316 in loop's preheader. */
8317 if (!(freq_e == profile_count::zero ()))
8318 freq_e = freq_e.force_nonzero ();
8319 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8320 scale_loop_frequencies (loop, p);
8321 }
8322
8323 edge exit_e = single_exit (loop);
8324 exit_e->probability = profile_probability::always ()
8325 .apply_scale (1, new_est_niter + 1);
8326
8327 edge exit_l = single_pred_edge (loop->latch);
8328 profile_probability prob = exit_l->probability;
8329 exit_l->probability = exit_e->probability.invert ();
8330 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8331 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8332 }
8333
8334 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8335 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8336 stmt_vec_info. */
8337
8338 static void
8339 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8340 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8341 {
8342 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8343 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8344
8345 if (dump_enabled_p ())
8346 dump_printf_loc (MSG_NOTE, vect_location,
8347 "------>vectorizing statement: %G", stmt_info->stmt);
8348
8349 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8350 vect_loop_kill_debug_uses (loop, stmt_info);
8351
8352 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8353 && !STMT_VINFO_LIVE_P (stmt_info))
8354 return;
8355
8356 if (STMT_VINFO_VECTYPE (stmt_info))
8357 {
8358 poly_uint64 nunits
8359 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8360 if (!STMT_SLP_TYPE (stmt_info)
8361 && maybe_ne (nunits, vf)
8362 && dump_enabled_p ())
8363 /* For SLP, VF is set according to the unrolling factor, and not
8364 to the vector size, hence this message is not valid for SLP. */
8365 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8366 }
8367
8368 /* Pure SLP statements have already been vectorized. We still need
8369 to apply loop vectorization to hybrid SLP statements. */
8370 if (PURE_SLP_STMT (stmt_info))
8371 return;
8372
8373 if (dump_enabled_p ())
8374 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8375
8376 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8377 *seen_store = stmt_info;
8378 }
8379
8380 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8381 in the hash_map with their corresponding values. */
8382
8383 static tree
8384 find_in_mapping (tree t, void *context)
8385 {
8386 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8387
8388 tree *value = mapping->get (t);
8389 return value ? *value : t;
8390 }
8391
8392 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8393 original loop that has now been vectorized.
8394
8395 The inits of the data_references need to be advanced with the number of
8396 iterations of the main loop. This has been computed in vect_do_peeling and
8397 is stored in parameter ADVANCE. We first restore the data_references'
8398 initial offsets with the values recorded in ORIG_DRS_INIT.
8399
8400 Since the loop_vec_info of this EPILOGUE was constructed for the original
8401 loop, its stmt_vec_infos all point to the original statements. These need
8402 to be updated to point to their corresponding copies as well as the SSA_NAMES
8403 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8404
8405 The data_references' connections also need to be updated. Their
8406 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8407 stmt_vec_infos, their statements need to point to their corresponding copy,
8408 if they are gather loads or scatter stores then their reference needs to be
8409 updated to point to its corresponding copy and finally we set
8410 'base_misaligned' to false as we have already peeled for alignment in the
8411 prologue of the main loop. */
8412
8413 static void
8414 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8415 {
8416 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8417 auto_vec<gimple *> stmt_worklist;
8418 hash_map<tree,tree> mapping;
8419 gimple *orig_stmt, *new_stmt;
8420 gimple_stmt_iterator epilogue_gsi;
8421 gphi_iterator epilogue_phi_gsi;
8422 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8423 basic_block *epilogue_bbs = get_loop_body (epilogue);
8424 unsigned i;
8425
8426 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8427
8428 /* Advance data_reference's with the number of iterations of the previous
8429 loop and its prologue. */
8430 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8431
8432
8433 /* The EPILOGUE loop is a copy of the original loop so they share the same
8434 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8435 point to the copied statements. We also create a mapping of all LHS' in
8436 the original loop and all the LHS' in the EPILOGUE and create worklists to
8437 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8438 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8439 {
8440 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8441 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8442 {
8443 new_stmt = epilogue_phi_gsi.phi ();
8444
8445 gcc_assert (gimple_uid (new_stmt) > 0);
8446 stmt_vinfo
8447 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8448
8449 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8450 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8451
8452 mapping.put (gimple_phi_result (orig_stmt),
8453 gimple_phi_result (new_stmt));
8454 /* PHI nodes can not have patterns or related statements. */
8455 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8456 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8457 }
8458
8459 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8460 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8461 {
8462 new_stmt = gsi_stmt (epilogue_gsi);
8463
8464 gcc_assert (gimple_uid (new_stmt) > 0);
8465 stmt_vinfo
8466 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8467
8468 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8469 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8470
8471 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8472 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8473
8474 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8475 {
8476 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8477 for (gimple_stmt_iterator gsi = gsi_start (seq);
8478 !gsi_end_p (gsi); gsi_next (&gsi))
8479 stmt_worklist.safe_push (gsi_stmt (gsi));
8480 }
8481
8482 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8483 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8484 {
8485 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8486 stmt_worklist.safe_push (stmt);
8487 /* Set BB such that the assert in
8488 'get_initial_def_for_reduction' is able to determine that
8489 the BB of the related stmt is inside this loop. */
8490 gimple_set_bb (stmt,
8491 gimple_bb (new_stmt));
8492 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8493 gcc_assert (related_vinfo == NULL
8494 || related_vinfo == stmt_vinfo);
8495 }
8496 }
8497 }
8498
8499 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8500 using the original main loop and thus need to be updated to refer to the
8501 cloned variables used in the epilogue. */
8502 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8503 {
8504 gimple *stmt = stmt_worklist[i];
8505 tree *new_op;
8506
8507 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8508 {
8509 tree op = gimple_op (stmt, j);
8510 if ((new_op = mapping.get(op)))
8511 gimple_set_op (stmt, j, *new_op);
8512 else
8513 {
8514 /* PR92429: The last argument of simplify_replace_tree disables
8515 folding when replacing arguments. This is required as
8516 otherwise you might end up with different statements than the
8517 ones analyzed in vect_loop_analyze, leading to different
8518 vectorization. */
8519 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8520 &find_in_mapping, &mapping, false);
8521 gimple_set_op (stmt, j, op);
8522 }
8523 }
8524 }
8525
8526 struct data_reference *dr;
8527 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8528 FOR_EACH_VEC_ELT (datarefs, i, dr)
8529 {
8530 orig_stmt = DR_STMT (dr);
8531 gcc_assert (gimple_uid (orig_stmt) > 0);
8532 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8533 /* Data references for gather loads and scatter stores do not use the
8534 updated offset we set using ADVANCE. Instead we have to make sure the
8535 reference in each data reference points to the corresponding copy of
8536 the original in the epilogue. */
8537 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8538 == VMAT_GATHER_SCATTER)
8539 {
8540 DR_REF (dr)
8541 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8542 &find_in_mapping, &mapping);
8543 DR_BASE_ADDRESS (dr)
8544 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8545 &find_in_mapping, &mapping);
8546 }
8547 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8548 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8549 /* The vector size of the epilogue is smaller than that of the main loop,
8550 so the alignment requirement is the same or lower. This means the dr
8551 is by definition aligned. */
8552 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8553 }
8554
8555 epilogue_vinfo->shared->datarefs_copy.release ();
8556 epilogue_vinfo->shared->save_datarefs ();
8557 }
8558
8559 /* Function vect_transform_loop.
8560
8561 The analysis phase has determined that the loop is vectorizable.
8562 Vectorize the loop - create vectorized stmts to replace the scalar
8563 stmts in the loop, and update the loop exit condition.
8564 Returns scalar epilogue loop if any. */
8565
8566 class loop *
8567 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8568 {
8569 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8570 class loop *epilogue = NULL;
8571 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8572 int nbbs = loop->num_nodes;
8573 int i;
8574 tree niters_vector = NULL_TREE;
8575 tree step_vector = NULL_TREE;
8576 tree niters_vector_mult_vf = NULL_TREE;
8577 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8578 unsigned int lowest_vf = constant_lower_bound (vf);
8579 gimple *stmt;
8580 bool check_profitability = false;
8581 unsigned int th;
8582
8583 DUMP_VECT_SCOPE ("vec_transform_loop");
8584
8585 loop_vinfo->shared->check_datarefs ();
8586
8587 /* Use the more conservative vectorization threshold. If the number
8588 of iterations is constant, assume the cost check has been performed
8589 by our caller. If the threshold makes all loops profitable that
8590 run at least the (estimated) vectorization factor number of times,
8591 checking is pointless, too. */
8592 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8593 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8594 {
8595 if (dump_enabled_p ())
8596 dump_printf_loc (MSG_NOTE, vect_location,
8597 "Profitability threshold is %d loop iterations.\n",
8598 th);
8599 check_profitability = true;
8600 }
8601
8602 /* Make sure there exists a single-predecessor exit bb. Do this before
8603 versioning. */
8604 edge e = single_exit (loop);
8605 if (! single_pred_p (e->dest))
8606 {
8607 split_loop_exit_edge (e, true);
8608 if (dump_enabled_p ())
8609 dump_printf (MSG_NOTE, "split exit edge\n");
8610 }
8611
8612 /* Version the loop first, if required, so the profitability check
8613 comes first. */
8614
8615 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8616 {
8617 class loop *sloop
8618 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8619 sloop->force_vectorize = false;
8620 check_profitability = false;
8621 }
8622
8623 /* Make sure there exists a single-predecessor exit bb also on the
8624 scalar loop copy. Do this after versioning but before peeling
8625 so CFG structure is fine for both scalar and if-converted loop
8626 to make slpeel_duplicate_current_defs_from_edges face matched
8627 loop closed PHI nodes on the exit. */
8628 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8629 {
8630 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8631 if (! single_pred_p (e->dest))
8632 {
8633 split_loop_exit_edge (e, true);
8634 if (dump_enabled_p ())
8635 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8636 }
8637 }
8638
8639 tree niters = vect_build_loop_niters (loop_vinfo);
8640 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8641 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8642 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8643 tree advance;
8644 drs_init_vec orig_drs_init;
8645
8646 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8647 &step_vector, &niters_vector_mult_vf, th,
8648 check_profitability, niters_no_overflow,
8649 &advance);
8650
8651 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8652 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8653 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8654 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8655
8656 if (niters_vector == NULL_TREE)
8657 {
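/* For example, with 103 known iterations, a constant VF of 8 and no
   masking, niters_vector becomes 103 / 8 == 12 below and the remaining
   7 iterations are left to the epilogue loop.  */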
8658 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8659 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8660 && known_eq (lowest_vf, vf))
8661 {
8662 niters_vector
8663 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8664 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8665 step_vector = build_one_cst (TREE_TYPE (niters));
8666 }
8667 else
8668 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8669 &step_vector, niters_no_overflow);
8670 }
8671
8672 /* 1) Make sure the loop header has exactly two entries
8673 2) Make sure we have a preheader basic block. */
8674
8675 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8676
8677 split_edge (loop_preheader_edge (loop));
8678
8679 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8680 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8681 /* This will deal with any possible peeling. */
8682 vect_prepare_for_masked_peels (loop_vinfo);
8683
8684 /* Schedule the SLP instances first, then handle loop vectorization
8685 below. */
8686 if (!loop_vinfo->slp_instances.is_empty ())
8687 {
8688 DUMP_VECT_SCOPE ("scheduling SLP instances");
8689 vect_schedule_slp (loop_vinfo);
8690 }
8691
8692 /* FORNOW: the vectorizer supports only loops whose body consists
8693 of one basic block (header + empty latch). When the vectorizer
8694 supports more involved loop forms, the order in which the BBs are
8695 traversed will need to be reconsidered. */
8696
8697 for (i = 0; i < nbbs; i++)
8698 {
8699 basic_block bb = bbs[i];
8700 stmt_vec_info stmt_info;
8701
8702 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8703 gsi_next (&si))
8704 {
8705 gphi *phi = si.phi ();
8706 if (dump_enabled_p ())
8707 dump_printf_loc (MSG_NOTE, vect_location,
8708 "------>vectorizing phi: %G", phi);
8709 stmt_info = loop_vinfo->lookup_stmt (phi);
8710 if (!stmt_info)
8711 continue;
8712
8713 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8714 vect_loop_kill_debug_uses (loop, stmt_info);
8715
8716 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8717 && !STMT_VINFO_LIVE_P (stmt_info))
8718 continue;
8719
8720 if (STMT_VINFO_VECTYPE (stmt_info)
8721 && (maybe_ne
8722 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8723 && dump_enabled_p ())
8724 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8725
8726 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8727 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8728 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8729 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8731 && ! PURE_SLP_STMT (stmt_info))
8732 {
8733 if (dump_enabled_p ())
8734 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8735 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8736 }
8737 }
8738
8739 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8740 !gsi_end_p (si);)
8741 {
8742 stmt = gsi_stmt (si);
8743 /* During vectorization remove existing clobber stmts. */
8744 if (gimple_clobber_p (stmt))
8745 {
8746 unlink_stmt_vdef (stmt);
8747 gsi_remove (&si, true);
8748 release_defs (stmt);
8749 }
8750 else
8751 {
8752 stmt_info = loop_vinfo->lookup_stmt (stmt);
8753
8754 /* vector stmts created in the outer-loop during vectorization of
8755 stmts in an inner-loop may not have a stmt_info, and do not
8756 need to be vectorized. */
8757 stmt_vec_info seen_store = NULL;
8758 if (stmt_info)
8759 {
8760 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8761 {
8762 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8763 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8764 !gsi_end_p (subsi); gsi_next (&subsi))
8765 {
8766 stmt_vec_info pat_stmt_info
8767 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8768 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8769 &si, &seen_store);
8770 }
8771 stmt_vec_info pat_stmt_info
8772 = STMT_VINFO_RELATED_STMT (stmt_info);
8773 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8774 &seen_store);
8775 }
8776 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8777 &seen_store);
8778 }
8779 gsi_next (&si);
8780 if (seen_store)
8781 {
8782 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8783 /* Interleaving. The vectorization of the
8784 interleaving chain was completed; free all
8785 the stores in the chain. */
8786 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8787 else
8788 /* Free the attached stmt_vec_info and remove the stmt. */
8789 loop_vinfo->remove_stmt (stmt_info);
8790 }
8791 }
8792 }
8793
8794 /* Stub out scalar statements that must not survive vectorization.
8795 Doing this here helps with grouped statements, or statements that
8796 are involved in patterns. */
8797 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8798 !gsi_end_p (gsi); gsi_next (&gsi))
8799 {
8800 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8801 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8802 {
8803 tree lhs = gimple_get_lhs (call);
8804 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8805 {
8806 tree zero = build_zero_cst (TREE_TYPE (lhs));
8807 gimple *new_stmt = gimple_build_assign (lhs, zero);
8808 gsi_replace (&gsi, new_stmt, true);
8809 }
8810 }
8811 }
8812 } /* BBs in loop */
8813
8814 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8815 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8816 if (integer_onep (step_vector))
8817 niters_no_overflow = true;
8818 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8819 niters_vector_mult_vf, !niters_no_overflow);
8820
8821 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8822 scale_profile_for_vect_loop (loop, assumed_vf);
8823
8824 /* True if the final iteration might not handle a full vector's
8825 worth of scalar iterations. */
8826 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8827 /* The minimum number of iterations performed by the epilogue. This
8828 is 1 when peeling for gaps because we always need a final scalar
8829 iteration. */
8830 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8831 /* +1 to convert latch counts to loop iteration counts,
8832 -min_epilogue_iters to remove iterations that cannot be performed
8833 by the vector code. */
8834 int bias_for_lowest = 1 - min_epilogue_iters;
8835 int bias_for_assumed = bias_for_lowest;
8836 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8837 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8838 {
8839 /* When the amount of peeling is known at compile time, the first
8840 iteration will have exactly alignment_npeels active elements.
8841 In the worst case it will have at least one. */
8842 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8843 bias_for_lowest += lowest_vf - min_first_active;
8844 bias_for_assumed += assumed_vf - min_first_active;
8845 }
8846 /* In these calculations the "- 1" converts loop iteration counts
8847 back to latch counts. */
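/* For example (illustrative numbers): with an upper bound of 17 latch
   iterations (18 loop iterations), lowest_vf == 4, no epilogue peeling and
   no masking, bias_for_lowest is 1 and the bound below becomes
   (17 + 1) / 4 - 1 == 3 latch iterations, i.e. 4 vector iterations.  */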
8848 if (loop->any_upper_bound)
8849 loop->nb_iterations_upper_bound
8850 = (final_iter_may_be_partial
8851 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8852 lowest_vf) - 1
8853 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8854 lowest_vf) - 1);
8855 if (loop->any_likely_upper_bound)
8856 loop->nb_iterations_likely_upper_bound
8857 = (final_iter_may_be_partial
8858 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8859 + bias_for_lowest, lowest_vf) - 1
8860 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8861 + bias_for_lowest, lowest_vf) - 1);
8862 if (loop->any_estimate)
8863 loop->nb_iterations_estimate
8864 = (final_iter_may_be_partial
8865 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8866 assumed_vf) - 1
8867 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8868 assumed_vf) - 1);
8869
8870 if (dump_enabled_p ())
8871 {
8872 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8873 {
8874 dump_printf_loc (MSG_NOTE, vect_location,
8875 "LOOP VECTORIZED\n");
8876 if (loop->inner)
8877 dump_printf_loc (MSG_NOTE, vect_location,
8878 "OUTER LOOP VECTORIZED\n");
8879 dump_printf (MSG_NOTE, "\n");
8880 }
8881 else
8882 dump_printf_loc (MSG_NOTE, vect_location,
8883 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8884 GET_MODE_NAME (loop_vinfo->vector_mode));
8885 }
8886
8887 /* Loops vectorized with a variable factor won't benefit from
8888 unrolling/peeling. */
8889 if (!vf.is_constant ())
8890 {
8891 loop->unroll = 1;
8892 if (dump_enabled_p ())
8893 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8894 " variable-length vectorization factor\n");
8895 }
8896 /* Free SLP instances here because otherwise stmt reference counting
8897 won't work. */
8898 slp_instance instance;
8899 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8900 vect_free_slp_instance (instance, true);
8901 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8902 /* Clear the safelen field since its value is invalid after vectorization,
8903 as the vectorized loop can have loop-carried dependencies. */
8904 loop->safelen = 0;
8905
8906 if (epilogue)
8907 {
8908 update_epilogue_loop_vinfo (epilogue, advance);
8909
8910 epilogue->simduid = loop->simduid;
8911 epilogue->force_vectorize = loop->force_vectorize;
8912 epilogue->dont_vectorize = false;
8913 }
8914
8915 return epilogue;
8916 }
8917
8918 /* The code below tries to perform a simple optimization - revert
8919 if-conversion for masked stores, i.e. if the mask of a store is zero,
8920 do not perform it, and if possible also skip the producers of the stored values.
8921 For example,
8922 for (i=0; i<n; i++)
8923 if (c[i])
8924 {
8925 p1[i] += 1;
8926 p2[i] = p3[i] +2;
8927 }
8928 this transformation will produce the following semi-hammock:
8929
8930 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8931 {
8932 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8933 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8934 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8935 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8936 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8937 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8938 }
8939 */
8940
8941 void
8942 optimize_mask_stores (class loop *loop)
8943 {
8944 basic_block *bbs = get_loop_body (loop);
8945 unsigned nbbs = loop->num_nodes;
8946 unsigned i;
8947 basic_block bb;
8948 class loop *bb_loop;
8949 gimple_stmt_iterator gsi;
8950 gimple *stmt;
8951 auto_vec<gimple *> worklist;
8952 auto_purge_vect_location sentinel;
8953
8954 vect_location = find_loop_location (loop);
8955 /* Pick up all masked stores in loop if any. */
8956 for (i = 0; i < nbbs; i++)
8957 {
8958 bb = bbs[i];
8959 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8960 gsi_next (&gsi))
8961 {
8962 stmt = gsi_stmt (gsi);
8963 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8964 worklist.safe_push (stmt);
8965 }
8966 }
8967
8968 free (bbs);
8969 if (worklist.is_empty ())
8970 return;
8971
8972 /* Loop has masked stores. */
8973 while (!worklist.is_empty ())
8974 {
8975 gimple *last, *last_store;
8976 edge e, efalse;
8977 tree mask;
8978 basic_block store_bb, join_bb;
8979 gimple_stmt_iterator gsi_to;
8980 tree vdef, new_vdef;
8981 gphi *phi;
8982 tree vectype;
8983 tree zero;
8984
8985 last = worklist.pop ();
8986 mask = gimple_call_arg (last, 2);
8987 bb = gimple_bb (last);
8988 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
8989 the same loop as if_bb. It can differ from LOOP when a two-level
8990 loop nest is vectorized and the mask_store belongs to the inner
8991 loop. */
8992 e = split_block (bb, last);
8993 bb_loop = bb->loop_father;
8994 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8995 join_bb = e->dest;
8996 store_bb = create_empty_bb (bb);
8997 add_bb_to_loop (store_bb, bb_loop);
8998 e->flags = EDGE_TRUE_VALUE;
8999 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9000 /* Put STORE_BB to likely part. */
9001 efalse->probability = profile_probability::unlikely ();
9002 store_bb->count = efalse->count ();
9003 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9004 if (dom_info_available_p (CDI_DOMINATORS))
9005 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9006 if (dump_enabled_p ())
9007 dump_printf_loc (MSG_NOTE, vect_location,
9008 "Create new block %d to sink mask stores.",
9009 store_bb->index);
9010 /* Create vector comparison with boolean result. */
9011 vectype = TREE_TYPE (mask);
9012 zero = build_zero_cst (vectype);
9013 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9014 gsi = gsi_last_bb (bb);
9015 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9016 /* Create new PHI node for vdef of the last masked store:
9017 .MEM_2 = VDEF <.MEM_1>
9018 will be converted to
9019 .MEM.3 = VDEF <.MEM_1>
9020 and new PHI node will be created in join bb
9021 .MEM_2 = PHI <.MEM_1, .MEM_3>
9022 */
9023 vdef = gimple_vdef (last);
9024 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9025 gimple_set_vdef (last, new_vdef);
9026 phi = create_phi_node (vdef, join_bb);
9027 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9028
9029 /* Put all masked stores with the same mask to STORE_BB if possible. */
9030 while (true)
9031 {
9032 gimple_stmt_iterator gsi_from;
9033 gimple *stmt1 = NULL;
9034
9035 /* Move masked store to STORE_BB. */
9036 last_store = last;
9037 gsi = gsi_for_stmt (last);
9038 gsi_from = gsi;
9039 /* Shift GSI to the previous stmt for further traversal. */
9040 gsi_prev (&gsi);
9041 gsi_to = gsi_start_bb (store_bb);
9042 gsi_move_before (&gsi_from, &gsi_to);
9043 /* Setup GSI_TO to the non-empty block start. */
9044 gsi_to = gsi_start_bb (store_bb);
9045 if (dump_enabled_p ())
9046 dump_printf_loc (MSG_NOTE, vect_location,
9047 "Move stmt to created bb\n%G", last);
9048 /* Move all stored value producers if possible. */
9049 while (!gsi_end_p (gsi))
9050 {
9051 tree lhs;
9052 imm_use_iterator imm_iter;
9053 use_operand_p use_p;
9054 bool res;
9055
9056 /* Skip debug statements. */
9057 if (is_gimple_debug (gsi_stmt (gsi)))
9058 {
9059 gsi_prev (&gsi);
9060 continue;
9061 }
9062 stmt1 = gsi_stmt (gsi);
9063 /* Do not consider statements writing to memory or having
9064 a volatile operand. */
9065 if (gimple_vdef (stmt1)
9066 || gimple_has_volatile_ops (stmt1))
9067 break;
9068 gsi_from = gsi;
9069 gsi_prev (&gsi);
9070 lhs = gimple_get_lhs (stmt1);
9071 if (!lhs)
9072 break;
9073
9074 /* LHS of vectorized stmt must be SSA_NAME. */
9075 if (TREE_CODE (lhs) != SSA_NAME)
9076 break;
9077
9078 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9079 {
9080 /* Remove dead scalar statement. */
9081 if (has_zero_uses (lhs))
9082 {
9083 gsi_remove (&gsi_from, true);
9084 continue;
9085 }
9086 }
9087
9088 /* Check that LHS does not have uses outside of STORE_BB. */
9089 res = true;
9090 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9091 {
9092 gimple *use_stmt;
9093 use_stmt = USE_STMT (use_p);
9094 if (is_gimple_debug (use_stmt))
9095 continue;
9096 if (gimple_bb (use_stmt) != store_bb)
9097 {
9098 res = false;
9099 break;
9100 }
9101 }
9102 if (!res)
9103 break;
9104
9105 if (gimple_vuse (stmt1)
9106 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9107 break;
9108
9109 /* Can move STMT1 to STORE_BB. */
9110 if (dump_enabled_p ())
9111 dump_printf_loc (MSG_NOTE, vect_location,
9112 "Move stmt to created bb\n%G", stmt1);
9113 gsi_move_before (&gsi_from, &gsi_to);
9114 /* Shift GSI_TO for further insertion. */
9115 gsi_prev (&gsi_to);
9116 }
9117 /* Put other masked stores with the same mask to STORE_BB. */
9118 if (worklist.is_empty ()
9119 || gimple_call_arg (worklist.last (), 2) != mask
9120 || worklist.last () != stmt1)
9121 break;
9122 last = worklist.pop ();
9123 }
9124 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9125 }
9126 }
9127
9128 /* Decide whether it is possible to use a zero-based induction variable
9129 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9130 return the value that the induction variable must be able to hold
9131 in order to ensure that the loop ends with an all-false mask.
9132 Return -1 otherwise. */
9133 widest_int
9134 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9135 {
9136 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9137 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9138 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9139
9140 /* Calculate the value that the induction variable must be able
9141 to hit in order to ensure that we end the loop with an all-false mask.
9142 This involves adding the maximum number of inactive trailing scalar
9143 iterations. */
9144 widest_int iv_limit = -1;
9145 if (max_loop_iterations (loop, &iv_limit))
9146 {
9147 if (niters_skip)
9148 {
9149 /* Add the maximum number of skipped iterations to the
9150 maximum iteration count. */
9151 if (TREE_CODE (niters_skip) == INTEGER_CST)
9152 iv_limit += wi::to_widest (niters_skip);
9153 else
9154 iv_limit += max_vf - 1;
9155 }
9156 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9157 /* Make a conservatively-correct assumption. */
9158 iv_limit += max_vf - 1;
9159
9160 /* IV_LIMIT is the maximum number of latch iterations, which is also
9161 the maximum in-range IV value. Round this value down to the previous
9162 vector alignment boundary and then add an extra full iteration. */
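/* For example (assuming a constant VF, so known_alignment (vf) == max_vf):
   with iv_limit == 102 and VF == 8 this gives (102 & -8) + 8 == 104.  */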
9163 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9164 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9165 }
9166 return iv_limit;
9167 }
9168
9169