1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 #define vec_step vec_step_
59
60 /* Loop Vectorization Pass.
61
62 This pass tries to vectorize loops.
63
64 For example, the vectorizer transforms the following simple loop:
65
66 short a[N]; short b[N]; short c[N]; int i;
67
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
70 }
71
72    as if it had been manually vectorized by rewriting the source code into:
73
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
78
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
84 }
85
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
94    vectorizer currently supports are ARRAY_REFs whose base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
97
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
103
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
108
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of following
117 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
119
120 For example, say stmt S1 was vectorized into stmt VS1:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
125
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
130
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135
136         Operands that are not SSA_NAMEs are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
138
139 Target modeling:
140 =================
141 Currently the only target specific information that is used is the
142 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
143    Targets that can support different sizes of vectors will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
145 flexibility will be added in the future.
146
147         Since we only vectorize operations whose vector form can be
148 expressed using existing tree codes, to verify that an operation is
149 supported, the vectorizer checks the relevant optab at the relevant
150 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
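
        As a minimal sketch (not the exact code used elsewhere in this file),
        such a check might look like:

          optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
          if (!op
              || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
            return false;   // no target support for this operation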
153
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
157
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159
160 /* Function vect_determine_vectorization_factor
161
162 Determine the vectorization factor (VF). VF is the number of data elements
163 that are operated upon in parallel in a single iteration of the vectorized
164    loop. For example, when vectorizing a loop that operates on 4-byte elements,
165    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
166 elements can fit in a single vector register.
167
168 We currently support vectorization of loops in which all types operated upon
169 are of the same size. Therefore this function currently sets VF according to
170 the size of the types operated upon, and fails if there are multiple sizes
171 in the loop.
172
173 VF is also the factor by which the loop iterations are strip-mined, e.g.:
174 original loop:
175 for (i=0; i<N; i++){
176 a[i] = b[i] + c[i];
177 }
178
179 vectorized loop:
180 for (i=0; i<N; i+=VF){
181 a[i:VF] = b[i:VF] + c[i:VF];
182 }
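
   For instance, with the 16-byte (V8HI) vectors and 2-byte shorts from the
   example at the top of this file, the VF would be 8 and the loop would be
   strip-mined by 8.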
183 */
184
185 static bool
186 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
187 {
188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
190 unsigned nbbs = loop->num_nodes;
191 poly_uint64 vectorization_factor = 1;
192 tree scalar_type = NULL_TREE;
193 gphi *phi;
194 tree vectype;
195 stmt_vec_info stmt_info;
196 unsigned i;
197 HOST_WIDE_INT dummy;
198 gimple *stmt, *pattern_stmt = NULL;
199 gimple_seq pattern_def_seq = NULL;
200 gimple_stmt_iterator pattern_def_si = gsi_none ();
201 bool analyze_pattern_stmt = false;
202 bool bool_result;
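  /* Statements that produce a boolean (mask) result.  Their vector type
     cannot be computed until the vectorization factor of the whole loop is
     known, so they are collected here and processed after the main scan.  */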
203 auto_vec<stmt_vec_info> mask_producers;
204
205 if (dump_enabled_p ())
206 dump_printf_loc (MSG_NOTE, vect_location,
207 "=== vect_determine_vectorization_factor ===\n");
208
209 for (i = 0; i < nbbs; i++)
210 {
211 basic_block bb = bbs[i];
212
213 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
214 gsi_next (&si))
215 {
216 phi = si.phi ();
217 stmt_info = vinfo_for_stmt (phi);
218 if (dump_enabled_p ())
219 {
220 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
221 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 }
223
224 gcc_assert (stmt_info);
225
226 if (STMT_VINFO_RELEVANT_P (stmt_info)
227 || STMT_VINFO_LIVE_P (stmt_info))
228 {
229 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
230 scalar_type = TREE_TYPE (PHI_RESULT (phi));
231
232 if (dump_enabled_p ())
233 {
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "get vectype for scalar type: ");
236 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
237 dump_printf (MSG_NOTE, "\n");
238 }
239
240 vectype = get_vectype_for_scalar_type (scalar_type);
241 if (!vectype)
242 {
243 if (dump_enabled_p ())
244 {
245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
246 "not vectorized: unsupported "
247 "data-type ");
248 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
249 scalar_type);
250 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
251 }
252 return false;
253 }
254 STMT_VINFO_VECTYPE (stmt_info) = vectype;
255
256 if (dump_enabled_p ())
257 {
258 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
259 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
260 dump_printf (MSG_NOTE, "\n");
261 }
262
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
266 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
267 dump_printf (MSG_NOTE, "\n");
268 }
269
270 vect_update_max_nunits (&vectorization_factor, vectype);
271 }
272 }
273
274 for (gimple_stmt_iterator si = gsi_start_bb (bb);
275 !gsi_end_p (si) || analyze_pattern_stmt;)
276 {
277 tree vf_vectype;
278
279 if (analyze_pattern_stmt)
280 stmt = pattern_stmt;
281 else
282 stmt = gsi_stmt (si);
283
284 stmt_info = vinfo_for_stmt (stmt);
285
286 if (dump_enabled_p ())
287 {
288 dump_printf_loc (MSG_NOTE, vect_location,
289 "==> examining statement: ");
290 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 }
292
293 gcc_assert (stmt_info);
294
295 /* Skip stmts which do not need to be vectorized. */
296 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
297 && !STMT_VINFO_LIVE_P (stmt_info))
298 || gimple_clobber_p (stmt))
299 {
300 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
301 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
302 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
303 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
304 {
305 stmt = pattern_stmt;
306 stmt_info = vinfo_for_stmt (pattern_stmt);
307 if (dump_enabled_p ())
308 {
309 dump_printf_loc (MSG_NOTE, vect_location,
310 "==> examining pattern statement: ");
311 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312 }
313 }
314 else
315 {
316 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
318 gsi_next (&si);
319 continue;
320 }
321 }
322 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
323 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
324 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
325 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
326 analyze_pattern_stmt = true;
327
328 /* If a pattern statement has def stmts, analyze them too. */
329 if (is_pattern_stmt_p (stmt_info))
330 {
331 if (pattern_def_seq == NULL)
332 {
333 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
334 pattern_def_si = gsi_start (pattern_def_seq);
335 }
336 else if (!gsi_end_p (pattern_def_si))
337 gsi_next (&pattern_def_si);
338 if (pattern_def_seq != NULL)
339 {
340 gimple *pattern_def_stmt = NULL;
341 stmt_vec_info pattern_def_stmt_info = NULL;
342
343 while (!gsi_end_p (pattern_def_si))
344 {
345 pattern_def_stmt = gsi_stmt (pattern_def_si);
346 pattern_def_stmt_info
347 = vinfo_for_stmt (pattern_def_stmt);
348 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
349 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
350 break;
351 gsi_next (&pattern_def_si);
352 }
353
354 if (!gsi_end_p (pattern_def_si))
355 {
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location,
359 "==> examining pattern def stmt: ");
360 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
361 pattern_def_stmt, 0);
362 }
363
364 stmt = pattern_def_stmt;
365 stmt_info = pattern_def_stmt_info;
366 }
367 else
368 {
369 pattern_def_si = gsi_none ();
370 analyze_pattern_stmt = false;
371 }
372 }
373 else
374 analyze_pattern_stmt = false;
375 }
376
377 if (gimple_get_lhs (stmt) == NULL_TREE
378 /* MASK_STORE has no lhs, but is ok. */
379 && (!is_gimple_call (stmt)
380 || !gimple_call_internal_p (stmt)
381 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
382 {
383 if (is_gimple_call (stmt))
384 {
385 /* Ignore calls with no lhs. These must be calls to
386 #pragma omp simd functions, and what vectorization factor
387                  they really need can't be determined until
388 vectorizable_simd_clone_call. */
389 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
390 {
391 pattern_def_seq = NULL;
392 gsi_next (&si);
393 }
394 continue;
395 }
396 if (dump_enabled_p ())
397 {
398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399 "not vectorized: irregular stmt.");
400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 0);
402 }
403 return false;
404 }
405
406 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
407 {
408 if (dump_enabled_p ())
409 {
410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
411 "not vectorized: vector stmt in loop:");
412 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
413 }
414 return false;
415 }
416
417 bool_result = false;
418
419 if (STMT_VINFO_VECTYPE (stmt_info))
420 {
421              /* The only case in which a vectype has already been set is for stmts
422 that contain a dataref, or for "pattern-stmts" (stmts
423 generated by the vectorizer to represent/replace a certain
424 idiom). */
425 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
426 || is_pattern_stmt_p (stmt_info)
427 || !gsi_end_p (pattern_def_si));
428 vectype = STMT_VINFO_VECTYPE (stmt_info);
429 }
430 else
431 {
432 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
433 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
434 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
435 else
436 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
437
438              /* Bool ops don't participate in the vectorization factor
439                 computation.  For comparisons, use the compared types to
440                 compute a factor.  */
441 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
442 && is_gimple_assign (stmt)
443 && gimple_assign_rhs_code (stmt) != COND_EXPR)
444 {
445 if (STMT_VINFO_RELEVANT_P (stmt_info)
446 || STMT_VINFO_LIVE_P (stmt_info))
447 mask_producers.safe_push (stmt_info);
448 bool_result = true;
449
450 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
451 == tcc_comparison
452 && !VECT_SCALAR_BOOLEAN_TYPE_P
453 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
454 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
455 else
456 {
457 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
458 {
459 pattern_def_seq = NULL;
460 gsi_next (&si);
461 }
462 continue;
463 }
464 }
465
466 if (dump_enabled_p ())
467 {
468 dump_printf_loc (MSG_NOTE, vect_location,
469 "get vectype for scalar type: ");
470 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
471 dump_printf (MSG_NOTE, "\n");
472 }
473 vectype = get_vectype_for_scalar_type (scalar_type);
474 if (!vectype)
475 {
476 if (dump_enabled_p ())
477 {
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "not vectorized: unsupported "
480 "data-type ");
481 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
482 scalar_type);
483 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
484 }
485 return false;
486 }
487
488 if (!bool_result)
489 STMT_VINFO_VECTYPE (stmt_info) = vectype;
490
491 if (dump_enabled_p ())
492 {
493 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
494 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
495 dump_printf (MSG_NOTE, "\n");
496 }
497 }
498
499           /* Don't try to compute the VF out of the scalar types if the stmt
500              produces a boolean vector.  Use the result vectype instead.  */
501 if (VECTOR_BOOLEAN_TYPE_P (vectype))
502 vf_vectype = vectype;
503 else
504 {
505              /* The vectorization factor is determined by the smallest
506 scalar type (or the largest vector size, but we only
507 support one vector size per loop). */
508 if (!bool_result)
509 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
510 &dummy);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "get vectype for scalar type: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 vf_vectype = get_vectype_for_scalar_type (scalar_type);
519 }
520 if (!vf_vectype)
521 {
522 if (dump_enabled_p ())
523 {
524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
525 "not vectorized: unsupported data-type ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
527 scalar_type);
528 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
529 }
530 return false;
531 }
532
533 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
534 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
535 {
536 if (dump_enabled_p ())
537 {
538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
539 "not vectorized: different sized vector "
540 "types in statement, ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542 vectype);
543 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
544 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
545 vf_vectype);
546 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
547 }
548 return false;
549 }
550
551 if (dump_enabled_p ())
552 {
553 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
554 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
555 dump_printf (MSG_NOTE, "\n");
556 }
557
558 if (dump_enabled_p ())
559 {
560 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
561 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
562 dump_printf (MSG_NOTE, "\n");
563 }
564
565 vect_update_max_nunits (&vectorization_factor, vf_vectype);
566
567 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
568 {
569 pattern_def_seq = NULL;
570 gsi_next (&si);
571 }
572 }
573 }
574
575 /* TODO: Analyze cost. Decide if worth while to vectorize. */
576 if (dump_enabled_p ())
577 {
578 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
579 dump_dec (MSG_NOTE, vectorization_factor);
580 dump_printf (MSG_NOTE, "\n");
581 }
582
583 if (known_le (vectorization_factor, 1U))
584 {
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "not vectorized: unsupported data-type\n");
588 return false;
589 }
590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
591
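  /* Now that the loop-wide vectorization factor is fixed, compute the
     vector mask type for each statement that produces a boolean result.  */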
592 for (i = 0; i < mask_producers.length (); i++)
593 {
594 tree mask_type = NULL;
595
596 stmt = STMT_VINFO_STMT (mask_producers[i]);
597
598 if (is_gimple_assign (stmt)
599 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
600 && !VECT_SCALAR_BOOLEAN_TYPE_P
601 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
602 {
603 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
604 mask_type = get_mask_type_for_scalar_type (scalar_type);
605
606 if (!mask_type)
607 {
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 "not vectorized: unsupported mask\n");
611 return false;
612 }
613 }
614 else
615 {
616 tree rhs;
617 ssa_op_iter iter;
618 gimple *def_stmt;
619 enum vect_def_type dt;
620
621 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
622 {
623 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
624 &def_stmt, &dt, &vectype))
625 {
626 if (dump_enabled_p ())
627 {
628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
629 "not vectorized: can't compute mask type "
630 "for statement, ");
631 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
632 0);
633 }
634 return false;
635 }
636
637              /* No vectype probably means an external definition.
638                 Allow it in case there is another operand from which
639                 the mask type can be determined.  */
640 if (!vectype)
641 continue;
642
643 if (!mask_type)
644 mask_type = vectype;
645 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
646 TYPE_VECTOR_SUBPARTS (vectype)))
647 {
648 if (dump_enabled_p ())
649 {
650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
651 "not vectorized: different sized masks "
652 "types in statement, ");
653 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
654 mask_type);
655 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
656 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
657 vectype);
658 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
659 }
660 return false;
661 }
662 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
663 != VECTOR_BOOLEAN_TYPE_P (vectype))
664 {
665 if (dump_enabled_p ())
666 {
667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
668 "not vectorized: mixed mask and "
669 "nonmask vector types in statement, ");
670 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
671 mask_type);
672 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
673 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
674 vectype);
675 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
676 }
677 return false;
678 }
679 }
680
681          /* We may compare a boolean value loaded as a vector of integers.
682             Fix mask_type in such a case.  */
683 if (mask_type
684 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
685 && gimple_code (stmt) == GIMPLE_ASSIGN
686 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
687 mask_type = build_same_sized_truth_vector_type (mask_type);
688 }
689
690       /* No mask_type should mean a loop-invariant predicate.
691 This is probably a subject for optimization in
692 if-conversion. */
693 if (!mask_type)
694 {
695 if (dump_enabled_p ())
696 {
697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
698 "not vectorized: can't compute mask type "
699 "for statement, ");
700 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
701 0);
702 }
703 return false;
704 }
705
706 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707 }
708
709 return true;
710 }
711
712
713 /* Function vect_is_simple_iv_evolution.
714
715    FORNOW: A simple evolution of an induction variable in the loop is
716 considered a polynomial evolution. */
717
718 static bool
719 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
720 tree * step)
721 {
722 tree init_expr;
723 tree step_expr;
724 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
725 basic_block bb;
726
727 /* When there is no evolution in this loop, the evolution function
728 is not "simple". */
729 if (evolution_part == NULL_TREE)
730 return false;
731
732 /* When the evolution is a polynomial of degree >= 2
733 the evolution function is not "simple". */
734 if (tree_is_chrec (evolution_part))
735 return false;
736
737 step_expr = evolution_part;
738 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
739
740 if (dump_enabled_p ())
741 {
742 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
744 dump_printf (MSG_NOTE, ", init: ");
745 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
746 dump_printf (MSG_NOTE, "\n");
747 }
748
749 *init = init_expr;
750 *step = step_expr;
751
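  /* The step is acceptable only if it is loop-invariant: an integer
     constant, an SSA name (of integral type, or of float type with
     -fassociative-math) that is not defined inside the loop, or a
     REAL_CST with -fassociative-math.  */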
752 if (TREE_CODE (step_expr) != INTEGER_CST
753 && (TREE_CODE (step_expr) != SSA_NAME
754 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
755 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
756 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
757 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
758 || !flag_associative_math)))
759 && (TREE_CODE (step_expr) != REAL_CST
760 || !flag_associative_math))
761 {
762 if (dump_enabled_p ())
763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
764 "step unknown.\n");
765 return false;
766 }
767
768 return true;
769 }
770
771 /* Function vect_analyze_scalar_cycles_1.
772
773 Examine the cross iteration def-use cycles of scalar variables
774 in LOOP. LOOP_VINFO represents the loop that is now being
775 considered for vectorization (can be LOOP, or an outer-loop
776 enclosing LOOP). */
777
778 static void
779 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
780 {
781 basic_block bb = loop->header;
782 tree init, step;
783 auto_vec<gimple *, 64> worklist;
784 gphi_iterator gsi;
785 bool double_reduc;
786
787 if (dump_enabled_p ())
788 dump_printf_loc (MSG_NOTE, vect_location,
789 "=== vect_analyze_scalar_cycles ===\n");
790
791 /* First - identify all inductions. Reduction detection assumes that all the
792 inductions have been identified, therefore, this order must not be
793 changed. */
794 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
795 {
796 gphi *phi = gsi.phi ();
797 tree access_fn = NULL;
798 tree def = PHI_RESULT (phi);
799 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
800
801 if (dump_enabled_p ())
802 {
803 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
804 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 }
806
807 /* Skip virtual phi's. The data dependences that are associated with
808 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
809 if (virtual_operand_p (def))
810 continue;
811
812 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
813
814 /* Analyze the evolution function. */
815 access_fn = analyze_scalar_evolution (loop, def);
816 if (access_fn)
817 {
818 STRIP_NOPS (access_fn);
819 if (dump_enabled_p ())
820 {
821 dump_printf_loc (MSG_NOTE, vect_location,
822 "Access function of PHI: ");
823 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
824 dump_printf (MSG_NOTE, "\n");
825 }
826 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
827 = initial_condition_in_loop_num (access_fn, loop->num);
828 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
829 = evolution_part_in_loop_num (access_fn, loop->num);
830 }
831
832 if (!access_fn
833 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
834 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
835 && TREE_CODE (step) != INTEGER_CST))
836 {
837 worklist.safe_push (phi);
838 continue;
839 }
840
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
842 != NULL_TREE);
843 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
844
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
847 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
848 }
849
850
851 /* Second - identify all reductions and nested cycles. */
852 while (worklist.length () > 0)
853 {
854 gimple *phi = worklist.pop ();
855 tree def = PHI_RESULT (phi);
856 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
857 gimple *reduc_stmt;
858
859 if (dump_enabled_p ())
860 {
861 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
862 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863 }
864
865 gcc_assert (!virtual_operand_p (def)
866 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
867
868 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
869 &double_reduc, false);
870 if (reduc_stmt)
871 {
872 if (double_reduc)
873 {
874 if (dump_enabled_p ())
875 dump_printf_loc (MSG_NOTE, vect_location,
876 "Detected double reduction.\n");
877
878 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
879 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
880 vect_double_reduction_def;
881 }
882 else
883 {
884 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
885 {
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE, vect_location,
888 "Detected vectorizable nested cycle.\n");
889
890 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
891 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
892 vect_nested_cycle;
893 }
894 else
895 {
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE, vect_location,
898 "Detected reduction.\n");
899
900 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
901 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
902 vect_reduction_def;
903 /* Store the reduction cycles for possible vectorization in
904 loop-aware SLP if it was not detected as reduction
905 chain. */
906 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
907 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
908 }
909 }
910 }
911 else
912 if (dump_enabled_p ())
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "Unknown def-use cycle pattern.\n");
915 }
916 }
917
918
919 /* Function vect_analyze_scalar_cycles.
920
921 Examine the cross iteration def-use cycles of scalar variables, by
922 analyzing the loop-header PHIs of scalar variables. Classify each
923 cycle as one of the following: invariant, induction, reduction, unknown.
924    We do that for the loop represented by LOOP_VINFO, and also for its
925    inner-loop, if it exists.
926 Examples for scalar cycles:
927
928 Example1: reduction:
929
930 loop1:
931 for (i=0; i<N; i++)
932 sum += a[i];
933
934 Example2: induction:
935
936 loop2:
937 for (i=0; i<N; i++)
938 a[i] = i; */
939
940 static void
941 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
942 {
943 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
944
945 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
946
947 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
948 Reductions in such inner-loop therefore have different properties than
949 the reductions in the nest that gets vectorized:
950 1. When vectorized, they are executed in the same order as in the original
951 scalar loop, so we can't change the order of computation when
952 vectorizing them.
953 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
954 current checks are too strict. */
955
956 if (loop->inner)
957 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 }
959
960 /* Transfer group and reduction information from STMT to its pattern stmt. */
961
962 static void
963 vect_fixup_reduc_chain (gimple *stmt)
964 {
965 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
966 gimple *stmtp;
967 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
968 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
969 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
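  /* Walk the scalar reduction chain and give each pattern stmt the group
     links of the scalar stmt it replaces, so that the pattern stmts form
     the same chain.  */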
970 do
971 {
972 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
973 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
974 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
975 if (stmt)
976 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
977 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
978 }
979 while (stmt);
980 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 }
982
983 /* Fixup scalar cycles that now have their stmts detected as patterns. */
984
985 static void
986 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
987 {
988 gimple *first;
989 unsigned i;
990
991 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
992 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
993 {
994 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
995 while (next)
996 {
997 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
998 break;
999 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1000 }
1001       /* If not all stmts in the chain are patterns, try to handle
1002          the chain without patterns.  */
1003 if (! next)
1004 {
1005 vect_fixup_reduc_chain (first);
1006 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1007 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1008 }
1009 }
1010 }
1011
1012 /* Function vect_get_loop_niters.
1013
1014    Determine the number of iterations the loop executes and place it
1015 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1016 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1017 niter information holds in ASSUMPTIONS.
1018
1019 Return the loop exit condition. */
1020
1021
1022 static gcond *
1023 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1024 tree *number_of_iterations, tree *number_of_iterationsm1)
1025 {
1026 edge exit = single_exit (loop);
1027 struct tree_niter_desc niter_desc;
1028 tree niter_assumptions, niter, may_be_zero;
1029 gcond *cond = get_loop_exit_condition (loop);
1030
1031 *assumptions = boolean_true_node;
1032 *number_of_iterationsm1 = chrec_dont_know;
1033 *number_of_iterations = chrec_dont_know;
1034 if (dump_enabled_p ())
1035 dump_printf_loc (MSG_NOTE, vect_location,
1036 "=== get_loop_niters ===\n");
1037
1038 if (!exit)
1039 return cond;
1040
1041 niter = chrec_dont_know;
1042 may_be_zero = NULL_TREE;
1043 niter_assumptions = boolean_true_node;
1044 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1045 || chrec_contains_undetermined (niter_desc.niter))
1046 return cond;
1047
1048 niter_assumptions = niter_desc.assumptions;
1049 may_be_zero = niter_desc.may_be_zero;
1050 niter = niter_desc.niter;
1051
1052 if (may_be_zero && integer_zerop (may_be_zero))
1053 may_be_zero = NULL_TREE;
1054
1055 if (may_be_zero)
1056 {
1057 if (COMPARISON_CLASS_P (may_be_zero))
1058 {
1059 /* Try to combine may_be_zero with assumptions, this can simplify
1060 computation of niter expression. */
1061 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1062 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1063 niter_assumptions,
1064 fold_build1 (TRUTH_NOT_EXPR,
1065 boolean_type_node,
1066 may_be_zero));
1067 else
1068 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1069 build_int_cst (TREE_TYPE (niter), 0),
1070 rewrite_to_non_trapping_overflow (niter));
1071
1072 may_be_zero = NULL_TREE;
1073 }
1074 else if (integer_nonzerop (may_be_zero))
1075 {
1076 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1077 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1078 return cond;
1079 }
1080 else
1081 return cond;
1082 }
1083
1084 *assumptions = niter_assumptions;
1085 *number_of_iterationsm1 = niter;
1086
1087 /* We want the number of loop header executions which is the number
1088 of latch executions plus one.
1089 ??? For UINT_MAX latch executions this number overflows to zero
1090 for loops like do { n++; } while (n != 0); */
1091 if (niter && !chrec_contains_undetermined (niter))
1092 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1093 build_int_cst (TREE_TYPE (niter), 1));
1094 *number_of_iterations = niter;
1095
1096 return cond;
1097 }
1098
1099 /* Function bb_in_loop_p
1100
1101 Used as predicate for dfs order traversal of the loop bbs. */
1102
1103 static bool
1104 bb_in_loop_p (const_basic_block bb, const void *data)
1105 {
1106 const struct loop *const loop = (const struct loop *)data;
1107 if (flow_bb_inside_loop_p (loop, bb))
1108 return true;
1109 return false;
1110 }
1111
1112
1113 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1114 stmt_vec_info structs for all the stmts in LOOP_IN. */
1115
1116 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1117 : vec_info (vec_info::loop, init_cost (loop_in)),
1118 loop (loop_in),
1119 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1120 num_itersm1 (NULL_TREE),
1121 num_iters (NULL_TREE),
1122 num_iters_unchanged (NULL_TREE),
1123 num_iters_assumptions (NULL_TREE),
1124 th (0),
1125 versioning_threshold (0),
1126 vectorization_factor (0),
1127 max_vectorization_factor (0),
1128 mask_skip_niters (NULL_TREE),
1129 mask_compare_type (NULL_TREE),
1130 unaligned_dr (NULL),
1131 peeling_for_alignment (0),
1132 ptr_mask (0),
1133 ivexpr_map (NULL),
1134 slp_unrolling_factor (1),
1135 single_scalar_iteration_cost (0),
1136 vectorizable (false),
1137 can_fully_mask_p (true),
1138 fully_masked_p (false),
1139 peeling_for_gaps (false),
1140 peeling_for_niter (false),
1141 operands_swapped (false),
1142 no_data_dependencies (false),
1143 has_mask_store (false),
1144 scalar_loop (NULL),
1145 orig_loop_info (NULL)
1146 {
1147 /* Create/Update stmt_info for all stmts in the loop. */
1148 basic_block *body = get_loop_body (loop);
1149 for (unsigned int i = 0; i < loop->num_nodes; i++)
1150 {
1151 basic_block bb = body[i];
1152 gimple_stmt_iterator si;
1153
1154 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1155 {
1156 gimple *phi = gsi_stmt (si);
1157 gimple_set_uid (phi, 0);
1158 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1159 }
1160
1161 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1162 {
1163 gimple *stmt = gsi_stmt (si);
1164 gimple_set_uid (stmt, 0);
1165 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1166 }
1167 }
1168 free (body);
1169
1170 /* CHECKME: We want to visit all BBs before their successors (except for
1171 latch blocks, for which this assertion wouldn't hold). In the simple
1172      case of the loop forms we allow, a dfs order of the BBs would be the same
1173 as reversed postorder traversal, so we are safe. */
1174
1175 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1176 bbs, loop->num_nodes, loop);
1177 gcc_assert (nbbs == loop->num_nodes);
1178 }
1179
1180 /* Free all levels of MASKS. */
1181
1182 void
1183 release_vec_loop_masks (vec_loop_masks *masks)
1184 {
1185 rgroup_masks *rgm;
1186 unsigned int i;
1187 FOR_EACH_VEC_ELT (*masks, i, rgm)
1188 rgm->masks.release ();
1189 masks->release ();
1190 }
1191
1192 /* Free all memory used by the _loop_vec_info, as well as all the
1193 stmt_vec_info structs of all the stmts in the loop. */
1194
1195 _loop_vec_info::~_loop_vec_info ()
1196 {
1197 int nbbs;
1198 gimple_stmt_iterator si;
1199 int j;
1200
1201 nbbs = loop->num_nodes;
1202 for (j = 0; j < nbbs; j++)
1203 {
1204 basic_block bb = bbs[j];
1205 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1206 free_stmt_vec_info (gsi_stmt (si));
1207
1208 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1209 {
1210 gimple *stmt = gsi_stmt (si);
1211
1212 /* We may have broken canonical form by moving a constant
1213 into RHS1 of a commutative op. Fix such occurrences. */
1214 if (operands_swapped && is_gimple_assign (stmt))
1215 {
1216 enum tree_code code = gimple_assign_rhs_code (stmt);
1217
1218 if ((code == PLUS_EXPR
1219 || code == POINTER_PLUS_EXPR
1220 || code == MULT_EXPR)
1221 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1222 swap_ssa_operands (stmt,
1223 gimple_assign_rhs1_ptr (stmt),
1224 gimple_assign_rhs2_ptr (stmt));
1225 else if (code == COND_EXPR
1226 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1227 {
1228 tree cond_expr = gimple_assign_rhs1 (stmt);
1229 enum tree_code cond_code = TREE_CODE (cond_expr);
1230
1231 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1232 {
1233 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1234 0));
1235 cond_code = invert_tree_comparison (cond_code,
1236 honor_nans);
1237 if (cond_code != ERROR_MARK)
1238 {
1239 TREE_SET_CODE (cond_expr, cond_code);
1240 swap_ssa_operands (stmt,
1241 gimple_assign_rhs2_ptr (stmt),
1242 gimple_assign_rhs3_ptr (stmt));
1243 }
1244 }
1245 }
1246 }
1247
1248 /* Free stmt_vec_info. */
1249 free_stmt_vec_info (stmt);
1250 gsi_next (&si);
1251 }
1252 }
1253
1254 free (bbs);
1255
1256 release_vec_loop_masks (&masks);
1257 delete ivexpr_map;
1258
1259 loop->aux = NULL;
1260 }
1261
1262 /* Return an invariant or register for EXPR and emit necessary
1263 computations in the LOOP_VINFO loop preheader. */
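
/* As a usage sketch (hypothetical variable names, purely for illustration):

     tree niters_reg = cse_and_gimplify_to_preheader (loop_vinfo, niters_expr);

   the returned invariant or SSA name can then be used wherever the
   expression is needed by the vectorized loop.  */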
1264
1265 tree
1266 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1267 {
1268 if (is_gimple_reg (expr)
1269 || is_gimple_min_invariant (expr))
1270 return expr;
1271
1272 if (! loop_vinfo->ivexpr_map)
1273 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1274 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1275 if (! cached)
1276 {
1277 gimple_seq stmts = NULL;
1278 cached = force_gimple_operand (unshare_expr (expr),
1279 &stmts, true, NULL_TREE);
1280 if (stmts)
1281 {
1282 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1283 gsi_insert_seq_on_edge_immediate (e, stmts);
1284 }
1285 }
1286 return cached;
1287 }
1288
1289 /* Return true if we can use CMP_TYPE as the comparison type to produce
1290 all masks required to mask LOOP_VINFO. */
1291
1292 static bool
1293 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1294 {
1295 rgroup_masks *rgm;
1296 unsigned int i;
1297 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1298 if (rgm->mask_type != NULL_TREE
1299 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1300 cmp_type, rgm->mask_type,
1301 OPTIMIZE_FOR_SPEED))
1302 return false;
1303 return true;
1304 }
1305
1306 /* Calculate the maximum number of scalars per iteration for every
1307 rgroup in LOOP_VINFO. */
1308
1309 static unsigned int
1310 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1311 {
1312 unsigned int res = 1;
1313 unsigned int i;
1314 rgroup_masks *rgm;
1315 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1316 res = MAX (res, rgm->max_nscalars_per_iter);
1317 return res;
1318 }
1319
1320 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1321 whether we can actually generate the masks required. Return true if so,
1322 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1323
1324 static bool
1325 vect_verify_full_masking (loop_vec_info loop_vinfo)
1326 {
1327 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1328 unsigned int min_ni_width;
1329
1330 /* Use a normal loop if there are no statements that need masking.
1331 This only happens in rare degenerate cases: it means that the loop
1332 has no loads, no stores, and no live-out values. */
1333 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1334 return false;
1335
1336 /* Get the maximum number of iterations that is representable
1337 in the counter type. */
1338 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1339 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1340
1341 /* Get a more refined estimate for the number of iterations. */
1342 widest_int max_back_edges;
1343 if (max_loop_iterations (loop, &max_back_edges))
1344 max_ni = wi::smin (max_ni, max_back_edges + 1);
1345
1346 /* Account for rgroup masks, in which each bit is replicated N times. */
1347 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1348
1349 /* Work out how many bits we need to represent the limit. */
1350 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
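  /* For example, if the scaled MAX_NI were 1000, MIN_NI_WIDTH would be 10,
     since 1000 needs 10 bits to represent as an unsigned value.  */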
1351
1352 /* Find a scalar mode for which WHILE_ULT is supported. */
1353 opt_scalar_int_mode cmp_mode_iter;
1354 tree cmp_type = NULL_TREE;
1355 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1356 {
1357 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1358 if (cmp_bits >= min_ni_width
1359 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1360 {
1361 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1362 if (this_type
1363 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1364 {
1365 /* Although we could stop as soon as we find a valid mode,
1366 it's often better to continue until we hit Pmode, since the
1367 operands to the WHILE are more likely to be reusable in
1368 address calculations. */
1369 cmp_type = this_type;
1370 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1371 break;
1372 }
1373 }
1374 }
1375
1376 if (!cmp_type)
1377 return false;
1378
1379 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1380 return true;
1381 }
1382
1383 /* Calculate the cost of one scalar iteration of the loop. */
1384 static void
1385 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1386 {
1387 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1388 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1389 int nbbs = loop->num_nodes, factor;
1390 int innerloop_iters, i;
1391
1392 /* Gather costs for statements in the scalar loop. */
1393
1394 /* FORNOW. */
1395 innerloop_iters = 1;
1396 if (loop->inner)
1397 innerloop_iters = 50; /* FIXME */
1398
1399 for (i = 0; i < nbbs; i++)
1400 {
1401 gimple_stmt_iterator si;
1402 basic_block bb = bbs[i];
1403
1404 if (bb->loop_father == loop->inner)
1405 factor = innerloop_iters;
1406 else
1407 factor = 1;
1408
1409 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1410 {
1411 gimple *stmt = gsi_stmt (si);
1412 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1413
1414 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1415 continue;
1416
1417 /* Skip stmts that are not vectorized inside the loop. */
1418 if (stmt_info
1419 && !STMT_VINFO_RELEVANT_P (stmt_info)
1420 && (!STMT_VINFO_LIVE_P (stmt_info)
1421 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1422 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1423 continue;
1424
1425 vect_cost_for_stmt kind;
1426 if (STMT_VINFO_DATA_REF (stmt_info))
1427 {
1428 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1429 kind = scalar_load;
1430 else
1431 kind = scalar_store;
1432 }
1433 else
1434 kind = scalar_stmt;
1435
1436 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1437 factor, kind, stmt_info, 0, vect_prologue);
1438 }
1439 }
1440
1441 /* Now accumulate cost. */
1442 void *target_cost_data = init_cost (loop);
1443 stmt_info_for_cost *si;
1444 int j;
1445 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1446 j, si)
1447 {
1448 struct _stmt_vec_info *stmt_info
1449 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1450 (void) add_stmt_cost (target_cost_data, si->count,
1451 si->kind, stmt_info, si->misalign,
1452 vect_body);
1453 }
1454 unsigned dummy, body_cost = 0;
1455 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1456 destroy_cost_data (target_cost_data);
1457 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1458 }
1459
1460
1461 /* Function vect_analyze_loop_form_1.
1462
1463 Verify that certain CFG restrictions hold, including:
1464 - the loop has a pre-header
1465 - the loop has a single entry and exit
1466 - the loop exit condition is simple enough
1467    - the number of iterations can be analyzed, i.e., a countable loop.  The
1468 niter could be analyzed under some assumptions. */
1469
1470 bool
1471 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1472 tree *assumptions, tree *number_of_iterationsm1,
1473 tree *number_of_iterations, gcond **inner_loop_cond)
1474 {
1475 if (dump_enabled_p ())
1476 dump_printf_loc (MSG_NOTE, vect_location,
1477 "=== vect_analyze_loop_form ===\n");
1478
1479 /* Different restrictions apply when we are considering an inner-most loop,
1480 vs. an outer (nested) loop.
1481 (FORNOW. May want to relax some of these restrictions in the future). */
1482
1483 if (!loop->inner)
1484 {
1485 /* Inner-most loop. We currently require that the number of BBs is
1486 exactly 2 (the header and latch). Vectorizable inner-most loops
1487 look like this:
1488
1489 (pre-header)
1490 |
1491 header <--------+
1492 | | |
1493 | +--> latch --+
1494 |
1495 (exit-bb) */
1496
1497 if (loop->num_nodes != 2)
1498 {
1499 if (dump_enabled_p ())
1500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1501 "not vectorized: control flow in loop.\n");
1502 return false;
1503 }
1504
1505 if (empty_block_p (loop->header))
1506 {
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1509 "not vectorized: empty loop.\n");
1510 return false;
1511 }
1512 }
1513 else
1514 {
1515 struct loop *innerloop = loop->inner;
1516 edge entryedge;
1517
1518 /* Nested loop. We currently require that the loop is doubly-nested,
1519 contains a single inner loop, and the number of BBs is exactly 5.
1520 Vectorizable outer-loops look like this:
1521
1522 (pre-header)
1523 |
1524 header <---+
1525 | |
1526 inner-loop |
1527 | |
1528 tail ------+
1529 |
1530 (exit-bb)
1531
1532 The inner-loop has the properties expected of inner-most loops
1533 as described above. */
1534
1535 if ((loop->inner)->inner || (loop->inner)->next)
1536 {
1537 if (dump_enabled_p ())
1538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1539 "not vectorized: multiple nested loops.\n");
1540 return false;
1541 }
1542
1543 if (loop->num_nodes != 5)
1544 {
1545 if (dump_enabled_p ())
1546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1547 "not vectorized: control flow in loop.\n");
1548 return false;
1549 }
1550
1551 entryedge = loop_preheader_edge (innerloop);
1552 if (entryedge->src != loop->header
1553 || !single_exit (innerloop)
1554 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1555 {
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558 "not vectorized: unsupported outerloop form.\n");
1559 return false;
1560 }
1561
1562 /* Analyze the inner-loop. */
1563 tree inner_niterm1, inner_niter, inner_assumptions;
1564 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1565 &inner_assumptions, &inner_niterm1,
1566 &inner_niter, NULL)
1567 /* Don't support analyzing niter under assumptions for inner
1568 loop. */
1569 || !integer_onep (inner_assumptions))
1570 {
1571 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573 "not vectorized: Bad inner loop.\n");
1574 return false;
1575 }
1576
1577 if (!expr_invariant_in_loop_p (loop, inner_niter))
1578 {
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1581 "not vectorized: inner-loop count not"
1582 " invariant.\n");
1583 return false;
1584 }
1585
1586 if (dump_enabled_p ())
1587 dump_printf_loc (MSG_NOTE, vect_location,
1588 "Considering outer-loop vectorization.\n");
1589 }
1590
1591 if (!single_exit (loop)
1592 || EDGE_COUNT (loop->header->preds) != 2)
1593 {
1594 if (dump_enabled_p ())
1595 {
1596 if (!single_exit (loop))
1597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1598 "not vectorized: multiple exits.\n");
1599 else if (EDGE_COUNT (loop->header->preds) != 2)
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: too many incoming edges.\n");
1602 }
1603 return false;
1604 }
1605
1606   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1607 that the loop is represented as a do-while (with a proper if-guard
1608 before the loop if needed), where the loop header contains all the
1609 executable statements, and the latch is empty. */
1610 if (!empty_block_p (loop->latch)
1611 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1612 {
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615 "not vectorized: latch block not empty.\n");
1616 return false;
1617 }
1618
1619 /* Make sure the exit is not abnormal. */
1620 edge e = single_exit (loop);
1621 if (e->flags & EDGE_ABNORMAL)
1622 {
1623 if (dump_enabled_p ())
1624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1625 "not vectorized: abnormal loop exit edge.\n");
1626 return false;
1627 }
1628
1629 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1630 number_of_iterationsm1);
1631 if (!*loop_cond)
1632 {
1633 if (dump_enabled_p ())
1634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1635 "not vectorized: complicated exit condition.\n");
1636 return false;
1637 }
1638
1639 if (integer_zerop (*assumptions)
1640 || !*number_of_iterations
1641 || chrec_contains_undetermined (*number_of_iterations))
1642 {
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "not vectorized: number of iterations cannot be "
1646 "computed.\n");
1647 return false;
1648 }
1649
1650 if (integer_zerop (*number_of_iterations))
1651 {
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "not vectorized: number of iterations = 0.\n");
1655 return false;
1656 }
1657
1658 return true;
1659 }
1660
1661 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1662
1663 loop_vec_info
1664 vect_analyze_loop_form (struct loop *loop)
1665 {
1666 tree assumptions, number_of_iterations, number_of_iterationsm1;
1667 gcond *loop_cond, *inner_loop_cond = NULL;
1668
1669 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1670 &assumptions, &number_of_iterationsm1,
1671 &number_of_iterations, &inner_loop_cond))
1672 return NULL;
1673
1674 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1675 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1676 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1677 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1678 if (!integer_onep (assumptions))
1679 {
1680 /* We consider to vectorize this loop by versioning it under
1681 some assumptions. In order to do this, we need to clear
1682 existing information computed by scev and niter analyzer. */
1683 scev_reset_htab ();
1684 free_numbers_of_iterations_estimates (loop);
1685 /* Also set flag for this loop so that following scev and niter
1686 analysis are done under the assumptions. */
1687 loop_constraint_set (loop, LOOP_C_FINITE);
1688 /* Also record the assumptions for versioning. */
1689 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1690 }
1691
1692 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1693 {
1694 if (dump_enabled_p ())
1695 {
1696 dump_printf_loc (MSG_NOTE, vect_location,
1697 "Symbolic number of iterations is ");
1698 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1699 dump_printf (MSG_NOTE, "\n");
1700 }
1701 }
1702
1703 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1704 if (inner_loop_cond)
1705 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1706 = loop_exit_ctrl_vec_info_type;
1707
1708 gcc_assert (!loop->aux);
1709 loop->aux = loop_vinfo;
1710 return loop_vinfo;
1711 }
1712
1713
1714
1715 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1716 statements update the vectorization factor. */
1717
1718 static void
1719 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1720 {
1721 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1722 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1723 int nbbs = loop->num_nodes;
1724 poly_uint64 vectorization_factor;
1725 int i;
1726
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "=== vect_update_vf_for_slp ===\n");
1730
1731 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1732 gcc_assert (known_ne (vectorization_factor, 0U));
1733
1734 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1735 vectorization factor of the loop is the unrolling factor required by
1736      the SLP instances.  If that unrolling factor is 1, we say that we
1737      perform pure SLP on the loop - cross-iteration parallelism is not
1738 exploited. */
1739 bool only_slp_in_loop = true;
1740 for (i = 0; i < nbbs; i++)
1741 {
1742 basic_block bb = bbs[i];
1743 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1744 gsi_next (&si))
1745 {
1746 gimple *stmt = gsi_stmt (si);
1747 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1748 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1749 && STMT_VINFO_RELATED_STMT (stmt_info))
1750 {
1751 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1752 stmt_info = vinfo_for_stmt (stmt);
1753 }
1754 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1755 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1756 && !PURE_SLP_STMT (stmt_info))
1757 /* STMT needs both SLP and loop-based vectorization. */
1758 only_slp_in_loop = false;
1759 }
1760 }
1761
1762 if (only_slp_in_loop)
1763 {
1764 dump_printf_loc (MSG_NOTE, vect_location,
1765 "Loop contains only SLP stmts\n");
1766 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1767 }
1768 else
1769 {
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "Loop contains SLP and non-SLP stmts\n");
1772 /* Both the vectorization factor and unroll factor have the form
1773 current_vector_size * X for some rational X, so they must have
1774 a common multiple. */
1775 vectorization_factor
1776 = force_common_multiple (vectorization_factor,
1777 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
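/* The call above computes their least common multiple; as a hypothetical
   illustration, a loop vectorization factor of 4 combined with an SLP
   unrolling factor of 6 yields 12, while factors of 8 and 2 stay at 8.  */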
1778 }
1779
1780 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1781 if (dump_enabled_p ())
1782 {
1783 dump_printf_loc (MSG_NOTE, vect_location,
1784 "Updating vectorization factor to ");
1785 dump_dec (MSG_NOTE, vectorization_factor);
1786 dump_printf (MSG_NOTE, ".\n");
1787 }
1788 }
1789
1790 /* Return true if STMT_INFO describes a double reduction phi and if
1791 the other phi in the reduction is also relevant for vectorization.
1792 This rejects cases such as:
1793
1794 outer1:
1795 x_1 = PHI <x_3(outer2), ...>;
1796 ...
1797
1798 inner:
1799 x_2 = ...;
1800 ...
1801
1802 outer2:
1803 x_3 = PHI <x_2(inner)>;
1804
1805 if nothing in x_2 or elsewhere makes x_1 relevant. */
1806
1807 static bool
1808 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1809 {
1810 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1811 return false;
1812
1813 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1814 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1815 }
1816
1817 /* Function vect_analyze_loop_operations.
1818
1819 Scan the loop stmts and make sure they are all vectorizable. */
1820
1821 static bool
1822 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1823 {
1824 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1825 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1826 int nbbs = loop->num_nodes;
1827 int i;
1828 stmt_vec_info stmt_info;
1829 bool need_to_vectorize = false;
1830 bool ok;
1831
1832 if (dump_enabled_p ())
1833 dump_printf_loc (MSG_NOTE, vect_location,
1834 "=== vect_analyze_loop_operations ===\n");
1835
1836 for (i = 0; i < nbbs; i++)
1837 {
1838 basic_block bb = bbs[i];
1839
1840 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1841 gsi_next (&si))
1842 {
1843 gphi *phi = si.phi ();
1844 ok = true;
1845
1846 stmt_info = vinfo_for_stmt (phi);
1847 if (dump_enabled_p ())
1848 {
1849 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1850 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1851 }
1852 if (virtual_operand_p (gimple_phi_result (phi)))
1853 continue;
1854
1855 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1856 (i.e., a phi in the tail of the outer-loop). */
1857 if (! is_loop_header_bb_p (bb))
1858 {
1859 /* FORNOW: we currently don't support the case that these phis
1860 are not used in the outer loop (unless it is a double reduction,
1861 i.e., this phi is vect_reduction_def), because this case
1862 would require us to actually do something here. */
1863 if (STMT_VINFO_LIVE_P (stmt_info)
1864 && !vect_active_double_reduction_p (stmt_info))
1865 {
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 "Unsupported loop-closed phi in "
1869 "outer-loop.\n");
1870 return false;
1871 }
1872
1873 /* If PHI is used in the outer loop, we check that its operand
1874 is defined in the inner loop. */
1875 if (STMT_VINFO_RELEVANT_P (stmt_info))
1876 {
1877 tree phi_op;
1878 gimple *op_def_stmt;
1879
1880 if (gimple_phi_num_args (phi) != 1)
1881 return false;
1882
1883 phi_op = PHI_ARG_DEF (phi, 0);
1884 if (TREE_CODE (phi_op) != SSA_NAME)
1885 return false;
1886
1887 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1888 if (gimple_nop_p (op_def_stmt)
1889 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1890 || !vinfo_for_stmt (op_def_stmt))
1891 return false;
1892
1893 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894 != vect_used_in_outer
1895 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1896 != vect_used_in_outer_by_reduction)
1897 return false;
1898 }
1899
1900 continue;
1901 }
1902
1903 gcc_assert (stmt_info);
1904
1905 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1906 || STMT_VINFO_LIVE_P (stmt_info))
1907 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1908 {
1909 /* A scalar-dependence cycle that we don't support. */
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 "not vectorized: scalar dependence cycle.\n");
1913 return false;
1914 }
1915
1916 if (STMT_VINFO_RELEVANT_P (stmt_info))
1917 {
1918 need_to_vectorize = true;
1919 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1920 && ! PURE_SLP_STMT (stmt_info))
1921 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1922 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1923 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1924 && ! PURE_SLP_STMT (stmt_info))
1925 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1926 }
1927
1928 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1929 if (ok
1930 && STMT_VINFO_LIVE_P (stmt_info)
1931 && !PURE_SLP_STMT (stmt_info))
1932 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1933
1934 if (!ok)
1935 {
1936 if (dump_enabled_p ())
1937 {
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 "not vectorized: relevant phi not "
1940 "supported: ");
1941 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1942 }
1943 return false;
1944 }
1945 }
1946
1947 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1948 gsi_next (&si))
1949 {
1950 gimple *stmt = gsi_stmt (si);
1951 if (!gimple_clobber_p (stmt)
1952 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1953 return false;
1954 }
1955 } /* bbs */
1956
1957 /* All operations in the loop are either irrelevant (they deal with loop
1958 control, or are dead), or only used outside the loop and can be moved
1959 out of the loop (e.g. invariants, inductions).  The loop can be
1960 optimized away by scalar optimizations.  We're better off not
1961 touching this loop. */
1962 if (!need_to_vectorize)
1963 {
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_NOTE, vect_location,
1966 "All the computation can be taken out of the loop.\n");
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1969 "not vectorized: redundant loop. no profit to "
1970 "vectorize.\n");
1971 return false;
1972 }
1973
1974 return true;
1975 }
1976
1977 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1978 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1979 definitely no, or -1 if it's worth retrying. */
1980
1981 static int
1982 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1983 {
1984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1985 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1986
1987 /* Only fully-masked loops can have iteration counts less than the
1988 vectorization factor. */
1989 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1990 {
1991 HOST_WIDE_INT max_niter;
1992
1993 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1994 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1995 else
1996 max_niter = max_stmt_executions_int (loop);
1997
1998 if (max_niter != -1
1999 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2000 {
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "not vectorized: iteration count smaller than "
2004 "vectorization factor.\n");
2005 return 0;
2006 }
2007 }
2008
2009 int min_profitable_iters, min_profitable_estimate;
2010 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2011 &min_profitable_estimate);
2012
2013 if (min_profitable_iters < 0)
2014 {
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017 "not vectorized: vectorization not profitable.\n");
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "not vectorized: vector version will never be "
2021 "profitable.\n");
2022 return -1;
2023 }
2024
2025 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2026 * assumed_vf);
2027
2028 /* Use the cost model only if it is more conservative than user specified
2029 threshold. */
2030 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2031 min_profitable_iters);
2032
2033 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
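/* A hedged worked example (the parameter values are hypothetical): with
   --param min-vect-loop-bound=2 and an assumed vectorization factor of 8,
   min_scalar_loop_bound is 16; if the cost model reports that at least 12
   iterations are needed to profit, the threshold above becomes
   MAX (16, 12) = 16, and loops known to run fewer iterations than that
   are rejected below.  */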
2034
2035 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2036 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2037 {
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040 "not vectorized: vectorization not profitable.\n");
2041 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location,
2043 "not vectorized: iteration count smaller than user "
2044 "specified loop bound parameter or minimum profitable "
2045 "iterations (whichever is more conservative).\n");
2046 return 0;
2047 }
2048
2049 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2050 if (estimated_niter == -1)
2051 estimated_niter = likely_max_stmt_executions_int (loop);
2052 if (estimated_niter != -1
2053 && ((unsigned HOST_WIDE_INT) estimated_niter
2054 < MAX (th, (unsigned) min_profitable_estimate)))
2055 {
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "not vectorized: estimated iteration count too "
2059 "small.\n");
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "not vectorized: estimated iteration count smaller "
2063 "than specified loop bound parameter or minimum "
2064 "profitable iterations (whichever is more "
2065 "conservative).\n");
2066 return -1;
2067 }
2068
2069 return 1;
2070 }
2071
2072
2073 /* Function vect_analyze_loop_2.
2074
2075 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2076 for it. The different analyses will record information in the
2077 loop_vec_info struct. */
2078 static bool
2079 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2080 {
2081 bool ok;
2082 int res;
2083 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2084 poly_uint64 min_vf = 2;
2085 unsigned int n_stmts = 0;
2086
2087 /* The first group of checks is independent of the vector size. */
2088 fatal = true;
2089
2090 /* Find all data references in the loop (which correspond to vdefs/vuses)
2091 and analyze their evolution in the loop. */
2092
2093 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2094
2095 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2096 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2097 {
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "not vectorized: loop nest containing two "
2101 "or more consecutive inner loops cannot be "
2102 "vectorized\n");
2103 return false;
2104 }
2105
2106 for (unsigned i = 0; i < loop->num_nodes; i++)
2107 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2108 !gsi_end_p (gsi); gsi_next (&gsi))
2109 {
2110 gimple *stmt = gsi_stmt (gsi);
2111 if (is_gimple_debug (stmt))
2112 continue;
2113 ++n_stmts;
2114 if (!find_data_references_in_stmt (loop, stmt,
2115 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2116 {
2117 if (is_gimple_call (stmt) && loop->safelen)
2118 {
2119 tree fndecl = gimple_call_fndecl (stmt), op;
2120 if (fndecl != NULL_TREE)
2121 {
2122 cgraph_node *node = cgraph_node::get (fndecl);
2123 if (node != NULL && node->simd_clones != NULL)
2124 {
2125 unsigned int j, n = gimple_call_num_args (stmt);
2126 for (j = 0; j < n; j++)
2127 {
2128 op = gimple_call_arg (stmt, j);
2129 if (DECL_P (op)
2130 || (REFERENCE_CLASS_P (op)
2131 && get_base_address (op)))
2132 break;
2133 }
2134 op = gimple_call_lhs (stmt);
2135 /* Ignore #pragma omp declare simd functions
2136 if they don't have data references in the
2137 call stmt itself. */
2138 if (j == n
2139 && !(op
2140 && (DECL_P (op)
2141 || (REFERENCE_CLASS_P (op)
2142 && get_base_address (op)))))
2143 continue;
2144 }
2145 }
2146 }
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 "not vectorized: loop contains function "
2150 "calls or data references that cannot "
2151 "be analyzed\n");
2152 return false;
2153 }
2154 }
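/* A hypothetical sketch of the case the SIMD-clone handling above lets
   through: a call whose gimple statement carries no data references can
   still be vectorized later through its SIMD clone, e.g.

     #pragma omp declare simd
     float fn (float);

     #pragma omp simd
     for (i = 0; i < n; i++)
       a[i] = fn (b[i]);

   (fn, a, b, i and n are assumed declarations used only for illustration).  */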
2155
2156 /* Analyze the data references and also adjust the minimal
2157 vectorization factor according to the loads and stores. */
2158
2159 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2160 if (!ok)
2161 {
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2164 "bad data references.\n");
2165 return false;
2166 }
2167
2168 /* Classify all cross-iteration scalar data-flow cycles.
2169 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2170 vect_analyze_scalar_cycles (loop_vinfo);
2171
2172 vect_pattern_recog (loop_vinfo);
2173
2174 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2175
2176 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2177 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2178
2179 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2180 if (!ok)
2181 {
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "bad data access.\n");
2185 return false;
2186 }
2187
2188 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2189
2190 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2191 if (!ok)
2192 {
2193 if (dump_enabled_p ())
2194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2195 "unexpected pattern.\n");
2196 return false;
2197 }
2198
2199 /* The rest of the analysis below depends on the vector size in some way, so a failure from here on is not fatal and the analysis may be retried with a different vector size. */
2200 fatal = false;
2201
2202 /* Analyze data dependences between the data-refs in the loop
2203 and adjust the maximum vectorization factor according to
2204 the dependences.
2205 FORNOW: fail at the first data dependence that we encounter. */
2206
2207 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2208 if (!ok
2209 || (max_vf != MAX_VECTORIZATION_FACTOR
2210 && maybe_lt (max_vf, min_vf)))
2211 {
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "bad data dependence.\n");
2215 return false;
2216 }
2217 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2218
2219 ok = vect_determine_vectorization_factor (loop_vinfo);
2220 if (!ok)
2221 {
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "can't determine vectorization factor.\n");
2225 return false;
2226 }
2227 if (max_vf != MAX_VECTORIZATION_FACTOR
2228 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2229 {
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 "bad data dependence.\n");
2233 return false;
2234 }
2235
2236 /* Compute the scalar iteration cost. */
2237 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2238
2239 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2240 unsigned th;
2241
2242 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2243 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2244 if (!ok)
2245 return false;
2246
2247 /* If there are any SLP instances mark them as pure_slp. */
2248 bool slp = vect_make_slp_decision (loop_vinfo);
2249 if (slp)
2250 {
2251 /* Find stmts that need to be both vectorized and SLPed. */
2252 vect_detect_hybrid_slp (loop_vinfo);
2253
2254 /* Update the vectorization factor based on the SLP decision. */
2255 vect_update_vf_for_slp (loop_vinfo);
2256 }
2257
2258 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2259
2260 /* We don't expect to have to roll back to anything other than an empty
2261 set of rgroups. */
2262 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2263
2264 /* This is the point where we can re-start analysis with SLP forced off. */
2265 start_over:
2266
2267 /* Now the vectorization factor is final. */
2268 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2269 gcc_assert (known_ne (vectorization_factor, 0U));
2270
2271 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2272 {
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "vectorization_factor = ");
2275 dump_dec (MSG_NOTE, vectorization_factor);
2276 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2277 LOOP_VINFO_INT_NITERS (loop_vinfo));
2278 }
2279
2280 HOST_WIDE_INT max_niter
2281 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2282
2283 /* Analyze the alignment of the data-refs in the loop.
2284 Fail if a data reference is found that cannot be vectorized. */
2285
2286 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2287 if (!ok)
2288 {
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 "bad data alignment.\n");
2292 return false;
2293 }
2294
2295 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2296 It is important to call pruning after vect_analyze_data_ref_accesses,
2297 since we use grouping information gathered by interleaving analysis. */
2298 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2299 if (!ok)
2300 return false;
2301
2302 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2303 vectorization. */
2304 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2305 {
2306 /* This pass will decide on using loop versioning and/or loop peeling in
2307 order to enhance the alignment of data references in the loop. */
2308 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2309 if (!ok)
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313 "bad data alignment.\n");
2314 return false;
2315 }
2316 }
2317
2318 if (slp)
2319 {
2320 /* Analyze operations in the SLP instances. Note this may
2321 remove unsupported SLP instances which makes the above
2322 SLP kind detection invalid. */
2323 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2324 vect_slp_analyze_operations (loop_vinfo);
2325 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2326 goto again;
2327 }
2328
2329 /* Scan all the remaining operations in the loop that are not subject
2330 to SLP and make sure they are vectorizable. */
2331 ok = vect_analyze_loop_operations (loop_vinfo);
2332 if (!ok)
2333 {
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 "bad operation or unsupported loop bound.\n");
2337 return false;
2338 }
2339
2340 /* Decide whether to use a fully-masked loop for this vectorization
2341 factor. */
2342 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2343 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2344 && vect_verify_full_masking (loop_vinfo));
2345 if (dump_enabled_p ())
2346 {
2347 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2348 dump_printf_loc (MSG_NOTE, vect_location,
2349 "using a fully-masked loop.\n");
2350 else
2351 dump_printf_loc (MSG_NOTE, vect_location,
2352 "not using a fully-masked loop.\n");
2353 }
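/* As a hedged illustration of what full masking buys: with a vectorization
   factor of 8 and 13 scalar iterations (hypothetical numbers), a
   fully-masked loop executes two vector iterations, the second with only
   5 active lanes, so no scalar epilogue is needed; this is why
   LOOP_VINFO_PEELING_FOR_NITER is forced to false for such loops below.  */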
2354
2355 /* If an epilog loop is required because of data accesses with gaps,
2356 one additional iteration needs to be peeled.  Check if there are
2357 enough iterations for vectorization. */
2358 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2359 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2360 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2361 {
2362 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2363 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2364
2365 if (known_lt (wi::to_widest (scalar_niters), vf))
2366 {
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "loop has no enough iterations to support"
2370 " peeling for gaps.\n");
2371 return false;
2372 }
2373 }
2374
2375 /* Check the costings of the loop make vectorizing worthwhile. */
2376 res = vect_analyze_loop_costing (loop_vinfo);
2377 if (res < 0)
2378 goto again;
2379 if (!res)
2380 {
2381 if (dump_enabled_p ())
2382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383 "Loop costings not worthwhile.\n");
2384 return false;
2385 }
2386
2387 /* Decide whether we need to create an epilogue loop to handle
2388 remaining scalar iterations. */
2389 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2390
2391 unsigned HOST_WIDE_INT const_vf;
2392 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2393 /* The main loop handles all iterations. */
2394 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2395 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2396 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2397 {
2398 /* Work out the (constant) number of iterations that need to be
2399 peeled for reasons other than niters. */
2400 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2401 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2402 peel_niter += 1;
2403 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2404 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2405 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2406 }
2407 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2408 /* ??? When peeling for gaps but not alignment, we could
2409 try to check whether the (variable) niters is known to be
2410 VF * N + 1. That's something of a niche case though. */
2411 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2412 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2413 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2414 < (unsigned) exact_log2 (const_vf))
2415 /* In case of versioning, check if the maximum number of
2416 iterations is greater than th. If they are identical,
2417 the epilogue is unnecessary. */
2418 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2419 || ((unsigned HOST_WIDE_INT) max_niter
2420 > (th / const_vf) * const_vf))))
2421 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
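/* A hedged worked example of the decision above (all numbers hypothetical):
   with 17 known iterations, 3 iterations peeled for alignment and one extra
   iteration peeled for gaps, 17 - 4 = 13 iterations remain; with a
   vectorization factor of 4, 13 is not a multiple of 4, so an epilogue
   loop is required and LOOP_VINFO_PEELING_FOR_NITER is set.  */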
2422
2423 /* If an epilogue loop is required make sure we can create one. */
2424 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2425 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2426 {
2427 if (dump_enabled_p ())
2428 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2429 if (!vect_can_advance_ivs_p (loop_vinfo)
2430 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2431 single_exit (LOOP_VINFO_LOOP
2432 (loop_vinfo))))
2433 {
2434 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2436 "not vectorized: can't create required "
2437 "epilog loop\n");
2438 goto again;
2439 }
2440 }
2441
2442 /* During peeling, we need to check if the number of loop iterations is
2443 enough for both the peeled prolog loop and the vector loop.  This check
2444 can be merged with the threshold check of loop versioning, so
2445 increase the threshold for this case if necessary. */
2446 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2447 {
2448 poly_uint64 niters_th = 0;
2449
2450 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2451 {
2452 /* Niters for peeled prolog loop. */
2453 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2454 {
2455 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2456 tree vectype
2457 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2458 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2459 }
2460 else
2461 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2462 }
2463
2464 /* Niters for at least one iteration of vectorized loop. */
2465 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2466 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2467 /* One additional iteration because of peeling for gap. */
2468 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2469 niters_th += 1;
2470 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2471 }
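/* A hedged worked example of the threshold computed above (numbers are
   hypothetical): peeling 3 prologue iterations for alignment, requiring
   one full vector iteration with a vectorization factor of 8, and peeling
   one extra iteration for gaps give a versioning threshold of
   3 + 8 + 1 = 12 iterations.  */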
2472
2473 gcc_assert (known_eq (vectorization_factor,
2474 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2475
2476 /* Ok to vectorize! */
2477 return true;
2478
2479 again:
2480 /* Try again with SLP forced off, but if we didn't do any SLP there is
2481 no point in re-trying. */
2482 if (!slp)
2483 return false;
2484
2485 /* If there are reduction chains re-trying will fail anyway. */
2486 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2487 return false;
2488
2489 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2490 via interleaving or lane instructions. */
2491 slp_instance instance;
2492 slp_tree node;
2493 unsigned i, j;
2494 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2495 {
2496 stmt_vec_info vinfo;
2497 vinfo = vinfo_for_stmt
2498 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2499 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2500 continue;
2501 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2502 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2503 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2504 if (! vect_store_lanes_supported (vectype, size, false)
2505 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2506 && ! vect_grouped_store_supported (vectype, size))
2507 return false;
2508 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2509 {
2510 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2511 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2512 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2513 size = STMT_VINFO_GROUP_SIZE (vinfo);
2514 vectype = STMT_VINFO_VECTYPE (vinfo);
2515 if (! vect_load_lanes_supported (vectype, size, false)
2516 && ! vect_grouped_load_supported (vectype, single_element_p,
2517 size))
2518 return false;
2519 }
2520 }
2521
2522 if (dump_enabled_p ())
2523 dump_printf_loc (MSG_NOTE, vect_location,
2524 "re-trying with SLP disabled\n");
2525
2526 /* Roll back state appropriately. No SLP this time. */
2527 slp = false;
2528 /* Restore the vectorization factor as it was without SLP. */
2529 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2530 /* Free the SLP instances. */
2531 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2532 vect_free_slp_instance (instance);
2533 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2534 /* Reset SLP type to loop_vect on all stmts. */
2535 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2536 {
2537 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2538 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2539 !gsi_end_p (si); gsi_next (&si))
2540 {
2541 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2542 STMT_SLP_TYPE (stmt_info) = loop_vect;
2543 }
2544 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2545 !gsi_end_p (si); gsi_next (&si))
2546 {
2547 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2548 STMT_SLP_TYPE (stmt_info) = loop_vect;
2549 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2550 {
2551 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2552 STMT_SLP_TYPE (stmt_info) = loop_vect;
2553 for (gimple_stmt_iterator pi
2554 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2555 !gsi_end_p (pi); gsi_next (&pi))
2556 {
2557 gimple *pstmt = gsi_stmt (pi);
2558 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2559 }
2560 }
2561 }
2562 }
2563 /* Free optimized alias test DDRS. */
2564 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2565 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2566 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2567 /* Reset target cost data. */
2568 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2569 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2570 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2571 /* Reset accumulated rgroup information. */
2572 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2573 /* Reset assorted flags. */
2574 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2575 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2576 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2577 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2578 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2579
2580 goto start_over;
2581 }
2582
2583 /* Function vect_analyze_loop.
2584
2585 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2586 for it. The different analyses will record information in the
2587 loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL, an epilogue must
2588 be vectorized. */
2589 loop_vec_info
2590 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2591 {
2592 loop_vec_info loop_vinfo;
2593 auto_vector_sizes vector_sizes;
2594
2595 /* Autodetect first vector size we try. */
2596 current_vector_size = 0;
2597 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2598 unsigned int next_size = 0;
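/* The exact sizes and their order are target-dependent; as a hypothetical
   example, a target advertising 32-byte and 16-byte vectors makes the loop
   below analyze the loop with the autodetected size first and, on failure,
   re-try each remaining advertised size until one succeeds or the list is
   exhausted.  */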
2599
2600 if (dump_enabled_p ())
2601 dump_printf_loc (MSG_NOTE, vect_location,
2602 "===== analyze_loop_nest =====\n");
2603
2604 if (loop_outer (loop)
2605 && loop_vec_info_for_loop (loop_outer (loop))
2606 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2607 {
2608 if (dump_enabled_p ())
2609 dump_printf_loc (MSG_NOTE, vect_location,
2610 "outer-loop already vectorized.\n");
2611 return NULL;
2612 }
2613
2614 poly_uint64 autodetected_vector_size = 0;
2615 while (1)
2616 {
2617 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2618 loop_vinfo = vect_analyze_loop_form (loop);
2619 if (!loop_vinfo)
2620 {
2621 if (dump_enabled_p ())
2622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623 "bad loop form.\n");
2624 return NULL;
2625 }
2626
2627 bool fatal = false;
2628
2629 if (orig_loop_vinfo)
2630 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2631
2632 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2633 {
2634 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2635
2636 return loop_vinfo;
2637 }
2638
2639 delete loop_vinfo;
2640
2641 if (next_size == 0)
2642 autodetected_vector_size = current_vector_size;
2643
2644 if (next_size < vector_sizes.length ()
2645 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2646 next_size += 1;
2647
2648 if (fatal
2649 || next_size == vector_sizes.length ()
2650 || known_eq (current_vector_size, 0U))
2651 return NULL;
2652
2653 /* Try the next biggest vector size. */
2654 current_vector_size = vector_sizes[next_size++];
2655 if (dump_enabled_p ())
2656 {
2657 dump_printf_loc (MSG_NOTE, vect_location,
2658 "***** Re-trying analysis with "
2659 "vector size ");
2660 dump_dec (MSG_NOTE, current_vector_size);
2661 dump_printf (MSG_NOTE, "\n");
2662 }
2663 }
2664 }
2665
2666 /* Return true if there is an in-order reduction function for CODE, storing
2667 it in *REDUC_FN if so. */
2668
2669 static bool
2670 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2671 {
2672 switch (code)
2673 {
2674 case PLUS_EXPR:
2675 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2676 return true;
2677
2678 default:
2679 return false;
2680 }
2681 }
2682
2683 /* Function reduction_fn_for_scalar_code
2684
2685 Input:
2686 CODE - tree_code of a reduction operation.
2687
2688 Output:
2689 REDUC_FN - the corresponding internal function to be used to reduce the
2690 vector of partial results into a single scalar result, or IFN_LAST
2691 if the operation is a supported reduction operation, but does not have
2692 such an internal function.
2693
2694 Return FALSE if CODE currently cannot be vectorized as reduction. */
2695
2696 static bool
2697 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2698 {
2699 switch (code)
2700 {
2701 case MAX_EXPR:
2702 *reduc_fn = IFN_REDUC_MAX;
2703 return true;
2704
2705 case MIN_EXPR:
2706 *reduc_fn = IFN_REDUC_MIN;
2707 return true;
2708
2709 case PLUS_EXPR:
2710 *reduc_fn = IFN_REDUC_PLUS;
2711 return true;
2712
2713 case BIT_AND_EXPR:
2714 *reduc_fn = IFN_REDUC_AND;
2715 return true;
2716
2717 case BIT_IOR_EXPR:
2718 *reduc_fn = IFN_REDUC_IOR;
2719 return true;
2720
2721 case BIT_XOR_EXPR:
2722 *reduc_fn = IFN_REDUC_XOR;
2723 return true;
2724
2725 case MULT_EXPR:
2726 case MINUS_EXPR:
2727 *reduc_fn = IFN_LAST;
2728 return true;
2729
2730 default:
2731 return false;
2732 }
2733 }
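/* For instance, a maximum reduction such as the hypothetical loop

     for (i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   is recognized as a MAX_EXPR reduction and the vector of partial maxima
   is reduced to a single scalar with IFN_REDUC_MAX after the loop, whereas
   MULT_EXPR returns IFN_LAST above and the final reduction is open-coded.  */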
2734
2735 /* If there is a neutral value X such that SLP reduction NODE would not
2736 be affected by the introduction of additional X elements, return that X,
2737 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2738 is true if the SLP statements perform a single reduction, false if each
2739 statement performs an independent reduction. */
2740
2741 static tree
2742 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2743 bool reduc_chain)
2744 {
2745 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2746 gimple *stmt = stmts[0];
2747 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2748 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2749 tree scalar_type = TREE_TYPE (vector_type);
2750 struct loop *loop = gimple_bb (stmt)->loop_father;
2751 gcc_assert (loop);
2752
2753 switch (code)
2754 {
2755 case WIDEN_SUM_EXPR:
2756 case DOT_PROD_EXPR:
2757 case SAD_EXPR:
2758 case PLUS_EXPR:
2759 case MINUS_EXPR:
2760 case BIT_IOR_EXPR:
2761 case BIT_XOR_EXPR:
2762 return build_zero_cst (scalar_type);
2763
2764 case MULT_EXPR:
2765 return build_one_cst (scalar_type);
2766
2767 case BIT_AND_EXPR:
2768 return build_all_ones_cst (scalar_type);
2769
2770 case MAX_EXPR:
2771 case MIN_EXPR:
2772 /* For MIN/MAX the initial values are neutral. A reduction chain
2773 has only a single initial value, so that value is neutral for
2774 all statements. */
2775 if (reduc_chain)
2776 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2777 return NULL_TREE;
2778
2779 default:
2780 return NULL_TREE;
2781 }
2782 }
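/* As an illustration of how the value returned above is meant to be used:
   for a PLUS_EXPR reduction the neutral value is 0 and for a MULT_EXPR
   reduction it is 1, so an SLP reduction group of, say, 3 statements can
   be padded to a full vector with that value without changing the final
   result.  This is only a sketch of the callers' usage.  */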
2783
2784 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2785 STMT is printed with a message MSG. */
2786
2787 static void
2788 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2789 {
2790 dump_printf_loc (msg_type, vect_location, "%s", msg);
2791 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2792 }
2793
2794
2795 /* Detect SLP reduction of the form:
2796
2797 #a1 = phi <a5, a0>
2798 a2 = operation (a1)
2799 a3 = operation (a2)
2800 a4 = operation (a3)
2801 a5 = operation (a4)
2802
2803 #a = phi <a5>
2804
2805 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2806 FIRST_STMT is the first reduction stmt in the chain
2807 (a2 = operation (a1)).
2808
2809 Return TRUE if a reduction chain was detected. */
2810
2811 static bool
2812 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2813 gimple *first_stmt)
2814 {
2815 struct loop *loop = (gimple_bb (phi))->loop_father;
2816 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2817 enum tree_code code;
2818 gimple *loop_use_stmt = NULL;
2819 stmt_vec_info use_stmt_info;
2820 tree lhs;
2821 imm_use_iterator imm_iter;
2822 use_operand_p use_p;
2823 int nloop_uses, size = 0, n_out_of_loop_uses;
2824 bool found = false;
2825
2826 if (loop != vect_loop)
2827 return false;
2828
2829 auto_vec<stmt_vec_info, 8> reduc_chain;
2830 lhs = PHI_RESULT (phi);
2831 code = gimple_assign_rhs_code (first_stmt);
2832 while (1)
2833 {
2834 nloop_uses = 0;
2835 n_out_of_loop_uses = 0;
2836 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2837 {
2838 gimple *use_stmt = USE_STMT (use_p);
2839 if (is_gimple_debug (use_stmt))
2840 continue;
2841
2842 /* Check if we got back to the reduction phi. */
2843 if (use_stmt == phi)
2844 {
2845 loop_use_stmt = use_stmt;
2846 found = true;
2847 break;
2848 }
2849
2850 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2851 {
2852 loop_use_stmt = use_stmt;
2853 nloop_uses++;
2854 }
2855 else
2856 n_out_of_loop_uses++;
2857
2858 /* There can be either a single use in the loop or two uses in
2859 phi nodes. */
2860 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2861 return false;
2862 }
2863
2864 if (found)
2865 break;
2866
2867 /* We reached a statement with no loop uses. */
2868 if (nloop_uses == 0)
2869 return false;
2870
2871 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2872 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2873 return false;
2874
2875 if (!is_gimple_assign (loop_use_stmt)
2876 || code != gimple_assign_rhs_code (loop_use_stmt)
2877 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2878 return false;
2879
2880 /* Insert USE_STMT into reduction chain. */
2881 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2882 reduc_chain.safe_push (use_stmt_info);
2883
2884 lhs = gimple_assign_lhs (loop_use_stmt);
2885 size++;
2886 }
2887
2888 if (!found || loop_use_stmt != phi || size < 2)
2889 return false;
2890
2891 /* Swap the operands, if needed, to make the reduction operand be the second
2892 operand. */
2893 lhs = PHI_RESULT (phi);
2894 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2895 {
2896 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2897 if (gimple_assign_rhs2 (next_stmt) == lhs)
2898 {
2899 tree op = gimple_assign_rhs1 (next_stmt);
2900 gimple *def_stmt = NULL;
2901
2902 if (TREE_CODE (op) == SSA_NAME)
2903 def_stmt = SSA_NAME_DEF_STMT (op);
2904
2905 /* Check that the other def is either defined in the loop
2906 ("vect_internal_def"), or it's an induction (defined by a
2907 loop-header phi-node). */
2908 if (def_stmt
2909 && gimple_bb (def_stmt)
2910 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2911 && (is_gimple_assign (def_stmt)
2912 || is_gimple_call (def_stmt)
2913 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2914 == vect_induction_def
2915 || (gimple_code (def_stmt) == GIMPLE_PHI
2916 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2917 == vect_internal_def
2918 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2919 {
2920 lhs = gimple_assign_lhs (next_stmt);
2921 continue;
2922 }
2923
2924 return false;
2925 }
2926 else
2927 {
2928 tree op = gimple_assign_rhs2 (next_stmt);
2929 gimple *def_stmt = NULL;
2930
2931 if (TREE_CODE (op) == SSA_NAME)
2932 def_stmt = SSA_NAME_DEF_STMT (op);
2933
2934 /* Check that the other def is either defined in the loop
2935 ("vect_internal_def"), or it's an induction (defined by a
2936 loop-header phi-node). */
2937 if (def_stmt
2938 && gimple_bb (def_stmt)
2939 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2940 && (is_gimple_assign (def_stmt)
2941 || is_gimple_call (def_stmt)
2942 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2943 == vect_induction_def
2944 || (gimple_code (def_stmt) == GIMPLE_PHI
2945 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2946 == vect_internal_def
2947 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2948 {
2949 if (dump_enabled_p ())
2950 {
2951 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2952 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2953 }
2954
2955 swap_ssa_operands (next_stmt,
2956 gimple_assign_rhs1_ptr (next_stmt),
2957 gimple_assign_rhs2_ptr (next_stmt));
2958 update_stmt (next_stmt);
2959
2960 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2961 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2962 }
2963 else
2964 return false;
2965 }
2966
2967 lhs = gimple_assign_lhs (next_stmt);
2968 }
2969
2970 /* Build up the actual chain. */
2971 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2972 {
2973 GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt;
2974 GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt;
2975 }
2976 GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt;
2977 GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2978
2979 /* Save the chain for further analysis in SLP detection. */
2980 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt);
2981 GROUP_SIZE (reduc_chain[0]) = size;
2982
2983 return true;
2984 }
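/* At the source level, a reduction chain like the one detected above could
   come from a hypothetical manually unrolled sum such as

     for (i = 0; i < n; i++)
       {
         s += a[2 * i];
         s += a[2 * i + 1];
       }

   where each statement adds into the result of the previous one, forming a
   chain of size 2 that the SLP code can vectorize as a group.  */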
2985
2986 /* Return true if we need an in-order reduction for operation CODE
2987 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2988 overflow must wrap. */
2989
2990 static bool
2991 needs_fold_left_reduction_p (tree type, tree_code code,
2992 bool need_wrapping_integral_overflow)
2993 {
2994 /* CHECKME: check for !flag_finite_math_only too? */
2995 if (SCALAR_FLOAT_TYPE_P (type))
2996 switch (code)
2997 {
2998 case MIN_EXPR:
2999 case MAX_EXPR:
3000 return false;
3001
3002 default:
3003 return !flag_associative_math;
3004 }
3005
3006 if (INTEGRAL_TYPE_P (type))
3007 {
3008 if (!operation_no_trapping_overflow (type, code))
3009 return true;
3010 if (need_wrapping_integral_overflow
3011 && !TYPE_OVERFLOW_WRAPS (type)
3012 && operation_can_overflow (code))
3013 return true;
3014 return false;
3015 }
3016
3017 if (SAT_FIXED_POINT_TYPE_P (type))
3018 return true;
3019
3020 return false;
3021 }
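/* Two hedged examples of loops for which the function above requests an
   in-order (fold-left) reduction.  Without -fassociative-math a float sum

     for (i = 0; i < n; i++)
       s += a[i];

   must keep the original order of the additions, and with -ftrapv a signed
   integer sum of the same shape must not be reassociated either, because
   an intermediate partial sum might trap on overflow.  */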
3022
3023 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3024 reduction operation CODE has a handled computation expression. */
3025
3026 bool
3027 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3028 enum tree_code code)
3029 {
3030 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3031 auto_bitmap visited;
3032 tree lookfor = PHI_RESULT (phi);
3033 ssa_op_iter curri;
3034 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3035 while (USE_FROM_PTR (curr) != loop_arg)
3036 curr = op_iter_next_use (&curri);
3037 curri.i = curri.numops;
3038 do
3039 {
3040 path.safe_push (std::make_pair (curri, curr));
3041 tree use = USE_FROM_PTR (curr);
3042 if (use == lookfor)
3043 break;
3044 gimple *def = SSA_NAME_DEF_STMT (use);
3045 if (gimple_nop_p (def)
3046 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3047 {
3048 pop:
3049 do
3050 {
3051 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3052 curri = x.first;
3053 curr = x.second;
3054 do
3055 curr = op_iter_next_use (&curri);
3056 /* Skip already visited or non-SSA operands (from iterating
3057 over PHI args). */
3058 while (curr != NULL_USE_OPERAND_P
3059 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3060 || ! bitmap_set_bit (visited,
3061 SSA_NAME_VERSION
3062 (USE_FROM_PTR (curr)))));
3063 }
3064 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3065 if (curr == NULL_USE_OPERAND_P)
3066 break;
3067 }
3068 else
3069 {
3070 if (gimple_code (def) == GIMPLE_PHI)
3071 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3072 else
3073 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3074 while (curr != NULL_USE_OPERAND_P
3075 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3076 || ! bitmap_set_bit (visited,
3077 SSA_NAME_VERSION
3078 (USE_FROM_PTR (curr)))))
3079 curr = op_iter_next_use (&curri);
3080 if (curr == NULL_USE_OPERAND_P)
3081 goto pop;
3082 }
3083 }
3084 while (1);
3085 if (dump_file && (dump_flags & TDF_DETAILS))
3086 {
3087 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3088 unsigned i;
3089 std::pair<ssa_op_iter, use_operand_p> *x;
3090 FOR_EACH_VEC_ELT (path, i, x)
3091 {
3092 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3093 dump_printf (MSG_NOTE, " ");
3094 }
3095 dump_printf (MSG_NOTE, "\n");
3096 }
3097
3098 /* Check whether the reduction path detected is valid. */
3099 bool fail = path.length () == 0;
3100 bool neg = false;
3101 for (unsigned i = 1; i < path.length (); ++i)
3102 {
3103 gimple *use_stmt = USE_STMT (path[i].second);
3104 tree op = USE_FROM_PTR (path[i].second);
3105 if (! has_single_use (op)
3106 || ! is_gimple_assign (use_stmt))
3107 {
3108 fail = true;
3109 break;
3110 }
3111 if (gimple_assign_rhs_code (use_stmt) != code)
3112 {
3113 if (code == PLUS_EXPR
3114 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3115 {
3116 /* Track whether we negate the reduction value each iteration. */
3117 if (gimple_assign_rhs2 (use_stmt) == op)
3118 neg = ! neg;
3119 }
3120 else
3121 {
3122 fail = true;
3123 break;
3124 }
3125 }
3126 }
3127 return ! fail && ! neg;
3128 }
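/* A hedged sketch of what the walk above accepts and rejects: a path such as

     x_1 = PHI <x_0, x_3>
     x_2 = x_1 + a[i];
     x_3 = x_2 + b[i];

   consists only of single-use PLUS_EXPR statements and is accepted, whereas
   a statement of the form x_2 = a[i] - x_1 subtracts the accumulated value
   and so negates it every iteration; an odd number of such negations makes
   the path fail the final !neg check.  */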
3129
3130
3131 /* Function vect_is_simple_reduction
3132
3133 (1) Detect a cross-iteration def-use cycle that represents a simple
3134 reduction computation. We look for the following pattern:
3135
3136 loop_header:
3137 a1 = phi < a0, a2 >
3138 a3 = ...
3139 a2 = operation (a3, a1)
3140
3141 or
3142
3143 a3 = ...
3144 loop_header:
3145 a1 = phi < a0, a2 >
3146 a2 = operation (a3, a1)
3147
3148 such that:
3149 1. operation is commutative and associative and it is safe to
3150 change the order of the computation
3151 2. no uses for a2 in the loop (a2 is used out of the loop)
3152 3. no uses of a1 in the loop besides the reduction operation
3153 4. no uses of a1 outside the loop.
3154
3155 Conditions 1,4 are tested here.
3156 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3157
3158 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3159 nested cycles.
3160
3161 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3162 reductions:
3163
3164 a1 = phi < a0, a2 >
3165 inner loop (def of a3)
3166 a2 = phi < a3 >
3167
3168 (4) Detect condition expressions, i.e.:
3169 for (int i = 0; i < N; i++)
3170 if (a[i] < val)
3171 ret_val = a[i];
3172
3173 */
3174
3175 static gimple *
3176 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3177 bool *double_reduc,
3178 bool need_wrapping_integral_overflow,
3179 enum vect_reduction_type *v_reduc_type)
3180 {
3181 struct loop *loop = (gimple_bb (phi))->loop_father;
3182 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3183 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3184 enum tree_code orig_code, code;
3185 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3186 tree type;
3187 int nloop_uses;
3188 tree name;
3189 imm_use_iterator imm_iter;
3190 use_operand_p use_p;
3191 bool phi_def;
3192
3193 *double_reduc = false;
3194 *v_reduc_type = TREE_CODE_REDUCTION;
3195
3196 tree phi_name = PHI_RESULT (phi);
3197 /* ??? If there are no uses of the PHI result the inner loop reduction
3198 won't be detected as possibly double-reduction by vectorizable_reduction
3199 because that tries to walk the PHI arg from the preheader edge which
3200 can be constant. See PR60382. */
3201 if (has_zero_uses (phi_name))
3202 return NULL;
3203 nloop_uses = 0;
3204 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3205 {
3206 gimple *use_stmt = USE_STMT (use_p);
3207 if (is_gimple_debug (use_stmt))
3208 continue;
3209
3210 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3211 {
3212 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3214 "intermediate value used outside loop.\n");
3215
3216 return NULL;
3217 }
3218
3219 nloop_uses++;
3220 if (nloop_uses > 1)
3221 {
3222 if (dump_enabled_p ())
3223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3224 "reduction value used in loop.\n");
3225 return NULL;
3226 }
3227
3228 phi_use_stmt = use_stmt;
3229 }
3230
3231 edge latch_e = loop_latch_edge (loop);
3232 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3233 if (TREE_CODE (loop_arg) != SSA_NAME)
3234 {
3235 if (dump_enabled_p ())
3236 {
3237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3238 "reduction: not ssa_name: ");
3239 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3240 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3241 }
3242 return NULL;
3243 }
3244
3245 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3246 if (is_gimple_assign (def_stmt))
3247 {
3248 name = gimple_assign_lhs (def_stmt);
3249 phi_def = false;
3250 }
3251 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3252 {
3253 name = PHI_RESULT (def_stmt);
3254 phi_def = true;
3255 }
3256 else
3257 {
3258 if (dump_enabled_p ())
3259 {
3260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3261 "reduction: unhandled reduction operation: ");
3262 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3263 }
3264 return NULL;
3265 }
3266
3267 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3268 return NULL;
3269
3270 nloop_uses = 0;
3271 auto_vec<gphi *, 3> lcphis;
3272 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3273 {
3274 gimple *use_stmt = USE_STMT (use_p);
3275 if (is_gimple_debug (use_stmt))
3276 continue;
3277 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3278 nloop_uses++;
3279 else
3280 /* We can have more than one loop-closed PHI. */
3281 lcphis.safe_push (as_a <gphi *> (use_stmt));
3282 if (nloop_uses > 1)
3283 {
3284 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3286 "reduction used in loop.\n");
3287 return NULL;
3288 }
3289 }
3290
3291 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3292 defined in the inner loop. */
3293 if (phi_def)
3294 {
3295 op1 = PHI_ARG_DEF (def_stmt, 0);
3296
3297 if (gimple_phi_num_args (def_stmt) != 1
3298 || TREE_CODE (op1) != SSA_NAME)
3299 {
3300 if (dump_enabled_p ())
3301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3302 "unsupported phi node definition.\n");
3303
3304 return NULL;
3305 }
3306
3307 def1 = SSA_NAME_DEF_STMT (op1);
3308 if (gimple_bb (def1)
3309 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3310 && loop->inner
3311 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3312 && is_gimple_assign (def1)
3313 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3314 {
3315 if (dump_enabled_p ())
3316 report_vect_op (MSG_NOTE, def_stmt,
3317 "detected double reduction: ");
3318
3319 *double_reduc = true;
3320 return def_stmt;
3321 }
3322
3323 return NULL;
3324 }
3325
3326 /* If we are vectorizing an inner reduction, we execute it in the
3327 original order only when we are not dealing with a
3328 double reduction. */
3329 bool check_reduction = true;
3330 if (flow_loop_nested_p (vect_loop, loop))
3331 {
3332 gphi *lcphi;
3333 unsigned i;
3334 check_reduction = false;
3335 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3336 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3337 {
3338 gimple *use_stmt = USE_STMT (use_p);
3339 if (is_gimple_debug (use_stmt))
3340 continue;
3341 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3342 check_reduction = true;
3343 }
3344 }
3345
3346 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3347 code = orig_code = gimple_assign_rhs_code (def_stmt);
3348
3349 /* We can handle "res -= x[i]", which is non-associative, by
3350 simply rewriting it as "res += -x[i]".  Avoid changing the
3351 gimple instruction for the first simple tests and only do this
3352 if we're allowed to change code at all. */
3353 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3354 code = PLUS_EXPR;
3355
3356 if (code == COND_EXPR)
3357 {
3358 if (! nested_in_vect_loop)
3359 *v_reduc_type = COND_REDUCTION;
3360
3361 op3 = gimple_assign_rhs1 (def_stmt);
3362 if (COMPARISON_CLASS_P (op3))
3363 {
3364 op4 = TREE_OPERAND (op3, 1);
3365 op3 = TREE_OPERAND (op3, 0);
3366 }
3367 if (op3 == phi_name || op4 == phi_name)
3368 {
3369 if (dump_enabled_p ())
3370 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3371 "reduction: condition depends on previous"
3372 " iteration: ");
3373 return NULL;
3374 }
3375
3376 op1 = gimple_assign_rhs2 (def_stmt);
3377 op2 = gimple_assign_rhs3 (def_stmt);
3378 }
3379 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3380 {
3381 if (dump_enabled_p ())
3382 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3383 "reduction: not commutative/associative: ");
3384 return NULL;
3385 }
3386 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3387 {
3388 op1 = gimple_assign_rhs1 (def_stmt);
3389 op2 = gimple_assign_rhs2 (def_stmt);
3390 }
3391 else
3392 {
3393 if (dump_enabled_p ())
3394 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3395 "reduction: not handled operation: ");
3396 return NULL;
3397 }
3398
3399 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3400 {
3401 if (dump_enabled_p ())
3402 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3403 "reduction: both uses not ssa_names: ");
3404
3405 return NULL;
3406 }
3407
3408 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3409 if ((TREE_CODE (op1) == SSA_NAME
3410 && !types_compatible_p (type,TREE_TYPE (op1)))
3411 || (TREE_CODE (op2) == SSA_NAME
3412 && !types_compatible_p (type, TREE_TYPE (op2)))
3413 || (op3 && TREE_CODE (op3) == SSA_NAME
3414 && !types_compatible_p (type, TREE_TYPE (op3)))
3415 || (op4 && TREE_CODE (op4) == SSA_NAME
3416 && !types_compatible_p (type, TREE_TYPE (op4))))
3417 {
3418 if (dump_enabled_p ())
3419 {
3420 dump_printf_loc (MSG_NOTE, vect_location,
3421 "reduction: multiple types: operation type: ");
3422 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3423 dump_printf (MSG_NOTE, ", operands types: ");
3424 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3425 TREE_TYPE (op1));
3426 dump_printf (MSG_NOTE, ",");
3427 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3428 TREE_TYPE (op2));
3429 if (op3)
3430 {
3431 dump_printf (MSG_NOTE, ",");
3432 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3433 TREE_TYPE (op3));
3434 }
3435
3436 if (op4)
3437 {
3438 dump_printf (MSG_NOTE, ",");
3439 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3440 TREE_TYPE (op4));
3441 }
3442 dump_printf (MSG_NOTE, "\n");
3443 }
3444
3445 return NULL;
3446 }
3447
3448 /* Check whether it's ok to change the order of the computation.
3449 Generally, when vectorizing a reduction we change the order of the
3450 computation. This may change the behavior of the program in some
3451 cases, so we need to check that this is ok. One exception is when
3452 vectorizing an outer-loop: the inner-loop is executed sequentially,
3453 and therefore vectorizing reductions in the inner-loop during
3454 outer-loop vectorization is safe. */
3455 if (check_reduction
3456 && *v_reduc_type == TREE_CODE_REDUCTION
3457 && needs_fold_left_reduction_p (type, code,
3458 need_wrapping_integral_overflow))
3459 *v_reduc_type = FOLD_LEFT_REDUCTION;
3460
3461 /* Reduction is safe. We're dealing with one of the following:
3462 1) integer arithmetic and no trapv
3463 2) floating point arithmetic, and special flags permit this optimization
3464 3) nested cycle (i.e., outer loop vectorization). */
3465 if (TREE_CODE (op1) == SSA_NAME)
3466 def1 = SSA_NAME_DEF_STMT (op1);
3467
3468 if (TREE_CODE (op2) == SSA_NAME)
3469 def2 = SSA_NAME_DEF_STMT (op2);
3470
3471 if (code != COND_EXPR
3472 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3473 {
3474 if (dump_enabled_p ())
3475 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3476 return NULL;
3477 }
3478
3479 /* Check that one def is the reduction def, defined by PHI,
3480 the other def is either defined in the loop ("vect_internal_def"),
3481 or it's an induction (defined by a loop-header phi-node). */
3482
3483 if (def2 && def2 == phi
3484 && (code == COND_EXPR
3485 || !def1 || gimple_nop_p (def1)
3486 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3487 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3488 && (is_gimple_assign (def1)
3489 || is_gimple_call (def1)
3490 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3491 == vect_induction_def
3492 || (gimple_code (def1) == GIMPLE_PHI
3493 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3494 == vect_internal_def
3495 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3496 {
3497 if (dump_enabled_p ())
3498 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3499 return def_stmt;
3500 }
3501
3502 if (def1 && def1 == phi
3503 && (code == COND_EXPR
3504 || !def2 || gimple_nop_p (def2)
3505 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3506 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3507 && (is_gimple_assign (def2)
3508 || is_gimple_call (def2)
3509 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3510 == vect_induction_def
3511 || (gimple_code (def2) == GIMPLE_PHI
3512 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3513 == vect_internal_def
3514 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3515 {
3516 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3517 {
3518 /* Check if we can swap operands (just for simplicity - so that
3519 the rest of the code can assume that the reduction variable
3520 is always the last (second) argument). */
3521 if (code == COND_EXPR)
3522 {
3523 /* Swap cond_expr by inverting the condition. */
3524 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3525 enum tree_code invert_code = ERROR_MARK;
3526 enum tree_code cond_code = TREE_CODE (cond_expr);
3527
3528 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3529 {
3530 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3531 invert_code = invert_tree_comparison (cond_code, honor_nans);
3532 }
3533 if (invert_code != ERROR_MARK)
3534 {
3535 TREE_SET_CODE (cond_expr, invert_code);
3536 swap_ssa_operands (def_stmt,
3537 gimple_assign_rhs2_ptr (def_stmt),
3538 gimple_assign_rhs3_ptr (def_stmt));
3539 }
3540 else
3541 {
3542 if (dump_enabled_p ())
3543 report_vect_op (MSG_NOTE, def_stmt,
3544 "detected reduction: cannot swap operands "
3545 "for cond_expr");
3546 return NULL;
3547 }
3548 }
3549 else
3550 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3551 gimple_assign_rhs2_ptr (def_stmt));
3552
3553 if (dump_enabled_p ())
3554 report_vect_op (MSG_NOTE, def_stmt,
3555 "detected reduction: need to swap operands: ");
3556
3557 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3558 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3559 }
3560 else
3561 {
3562 if (dump_enabled_p ())
3563 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3564 }
3565
3566 return def_stmt;
3567 }
3568
3569 /* Try to find SLP reduction chain. */
3570 if (! nested_in_vect_loop
3571 && code != COND_EXPR
3572 && orig_code != MINUS_EXPR
3573 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3574 {
3575 if (dump_enabled_p ())
3576 report_vect_op (MSG_NOTE, def_stmt,
3577 "reduction: detected reduction chain: ");
3578
3579 return def_stmt;
3580 }
3581
3582 /* Dissolve any group possibly half-built by vect_is_slp_reduction. */
3583 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3584 while (first)
3585 {
3586 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3587 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3588 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3589 first = next;
3590 }
3591
3592 /* Look for the expression computing loop_arg from loop PHI result. */
3593 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3594 code))
3595 return def_stmt;
3596
3597 if (dump_enabled_p ())
3598 {
3599 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3600 "reduction: unknown pattern: ");
3601 }
3602
3603 return NULL;
3604 }
3605
3606 /* Wrapper around vect_is_simple_reduction, which will modify code
3607 in-place if it enables detection of more reductions. Arguments
3608 are the same as for that function. */
3609
3610 gimple *
3611 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3612 bool *double_reduc,
3613 bool need_wrapping_integral_overflow)
3614 {
3615 enum vect_reduction_type v_reduc_type;
3616 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3617 need_wrapping_integral_overflow,
3618 &v_reduc_type);
3619 if (def)
3620 {
3621 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3622 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3623 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3624 reduc_def_info = vinfo_for_stmt (def);
3625 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3626 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3627 }
3628 return def;
3629 }
3630
3631 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3632 int
3633 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3634 int *peel_iters_epilogue,
3635 stmt_vector_for_cost *scalar_cost_vec,
3636 stmt_vector_for_cost *prologue_cost_vec,
3637 stmt_vector_for_cost *epilogue_cost_vec)
3638 {
3639 int retval = 0;
3640 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3641
3642 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3643 {
3644 *peel_iters_epilogue = assumed_vf / 2;
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_NOTE, vect_location,
3647 "cost model: epilogue peel iters set to vf/2 "
3648 "because loop iterations are unknown .\n");
3649
3650 /* If peeled iterations are known but the number of scalar loop
3651 iterations is unknown, count a taken branch per peeled loop. */
3652 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3653 NULL, 0, vect_prologue);
3654 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3655 NULL, 0, vect_epilogue);
3656 }
3657 else
3658 {
3659 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3660 peel_iters_prologue = niters < peel_iters_prologue ?
3661 niters : peel_iters_prologue;
3662 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3663 /* If we need to peel for gaps, but no peeling is required, we have to
3664 peel VF iterations. */
3665 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3666 *peel_iters_epilogue = assumed_vf;
3667 }
3668
3669 stmt_info_for_cost *si;
3670 int j;
3671 if (peel_iters_prologue)
3672 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3673 {
3674 stmt_vec_info stmt_info
3675 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3676 retval += record_stmt_cost (prologue_cost_vec,
3677 si->count * peel_iters_prologue,
3678 si->kind, stmt_info, si->misalign,
3679 vect_prologue);
3680 }
3681 if (*peel_iters_epilogue)
3682 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3683 {
3684 stmt_vec_info stmt_info
3685 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3686 retval += record_stmt_cost (epilogue_cost_vec,
3687 si->count * *peel_iters_epilogue,
3688 si->kind, stmt_info, si->misalign,
3689 vect_epilogue);
3690 }
3691
3692 return retval;
3693 }
3694
3695 /* Function vect_estimate_min_profitable_iters
3696
3697 Return the number of iterations required for the vector version of the
3698 loop to be profitable relative to the cost of the scalar version of the
3699 loop.
3700
3701 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3702 of iterations for vectorization. -1 value means loop vectorization
3703 is not profitable. This returned value may be used for dynamic
3704 profitability check.
3705
3706 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3707 for static check against estimated number of iterations. */
3708
3709 static void
3710 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3711 int *ret_min_profitable_niters,
3712 int *ret_min_profitable_estimate)
3713 {
3714 int min_profitable_iters;
3715 int min_profitable_estimate;
3716 int peel_iters_prologue;
3717 int peel_iters_epilogue;
3718 unsigned vec_inside_cost = 0;
3719 int vec_outside_cost = 0;
3720 unsigned vec_prologue_cost = 0;
3721 unsigned vec_epilogue_cost = 0;
3722 int scalar_single_iter_cost = 0;
3723 int scalar_outside_cost = 0;
3724 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3725 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3726 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3727
3728 /* Cost model disabled. */
3729 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3730 {
3731 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3732 *ret_min_profitable_niters = 0;
3733 *ret_min_profitable_estimate = 0;
3734 return;
3735 }
3736
3737 /* Requires loop versioning tests to handle misalignment. */
3738 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3739 {
3740 /* FIXME: Make cost depend on complexity of individual check. */
3741 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3742 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3743 vect_prologue);
3744 dump_printf (MSG_NOTE,
3745 "cost model: Adding cost of checks for loop "
3746 "versioning to treat misalignment.\n");
3747 }
3748
3749 /* Requires loop versioning with alias checks. */
3750 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3751 {
3752 /* FIXME: Make cost depend on complexity of individual check. */
3753 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3754 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3755 vect_prologue);
3756 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3757 if (len)
3758 /* Count LEN - 1 ANDs and LEN comparisons. */
3759 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3760 NULL, 0, vect_prologue);
3761 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3762 if (len)
3763 {
3764 /* Count LEN - 1 ANDs and LEN comparisons. */
3765 unsigned int nstmts = len * 2 - 1;
3766 /* +1 for each bias that needs adding. */
3767 for (unsigned int i = 0; i < len; ++i)
3768 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3769 nstmts += 1;
3770 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3771 NULL, 0, vect_prologue);
3772 }
3773 dump_printf (MSG_NOTE,
3774 "cost model: Adding cost of checks for loop "
3775 "versioning aliasing.\n");
3776 }
3777
3778 /* Requires loop versioning with niter checks. */
3779 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3780 {
3781 /* FIXME: Make cost depend on complexity of individual check. */
3782 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3783 vect_prologue);
3784 dump_printf (MSG_NOTE,
3785 "cost model: Adding cost of checks for loop "
3786 "versioning niters.\n");
3787 }
3788
3789 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3790 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3791 vect_prologue);
3792
3793 /* Count statements in scalar loop. Using this as scalar cost for a single
3794 iteration for now.
3795
3796 TODO: Add outer loop support.
3797
3798 TODO: Consider assigning different costs to different scalar
3799 statements. */
3800
3801 scalar_single_iter_cost
3802 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3803
3804 /* Add additional cost for the peeled instructions in prologue and epilogue
3805 loop. (For fully-masked loops there will be no peeling.)
3806
3807 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3808 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3809
3810 TODO: Build an expression that represents peel_iters for prologue and
3811 epilogue to be used in a run-time test. */
3812
3813 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3814 {
3815 peel_iters_prologue = 0;
3816 peel_iters_epilogue = 0;
3817
3818 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3819 {
3820 /* We need to peel exactly one iteration. */
3821 peel_iters_epilogue += 1;
3822 stmt_info_for_cost *si;
3823 int j;
3824 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3825 j, si)
3826 {
3827 struct _stmt_vec_info *stmt_info
3828 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3829 (void) add_stmt_cost (target_cost_data, si->count,
3830 si->kind, stmt_info, si->misalign,
3831 vect_epilogue);
3832 }
3833 }
3834 }
3835 else if (npeel < 0)
3836 {
3837 peel_iters_prologue = assumed_vf / 2;
3838 dump_printf (MSG_NOTE, "cost model: "
3839 "prologue peel iters set to vf/2.\n");
3840
3841 /* If peeling for alignment is unknown, the loop bound of the main
3842 loop becomes unknown. */
3843 peel_iters_epilogue = assumed_vf / 2;
3844 dump_printf (MSG_NOTE, "cost model: "
3845 "epilogue peel iters set to vf/2 because "
3846 "peeling for alignment is unknown.\n");
3847
3848 /* If peeled iterations are unknown, count a taken branch and a not taken
3849 branch per peeled loop. Even if scalar loop iterations are known,
3850 vector iterations are not known since peeled prologue iterations are
3851 not known. Hence guards remain the same. */
3852 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3853 NULL, 0, vect_prologue);
3854 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3855 NULL, 0, vect_prologue);
3856 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3857 NULL, 0, vect_epilogue);
3858 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3859 NULL, 0, vect_epilogue);
3860 stmt_info_for_cost *si;
3861 int j;
3862 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3863 {
3864 struct _stmt_vec_info *stmt_info
3865 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3866 (void) add_stmt_cost (target_cost_data,
3867 si->count * peel_iters_prologue,
3868 si->kind, stmt_info, si->misalign,
3869 vect_prologue);
3870 (void) add_stmt_cost (target_cost_data,
3871 si->count * peel_iters_epilogue,
3872 si->kind, stmt_info, si->misalign,
3873 vect_epilogue);
3874 }
3875 }
3876 else
3877 {
3878 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3879 stmt_info_for_cost *si;
3880 int j;
3881 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3882
3883 prologue_cost_vec.create (2);
3884 epilogue_cost_vec.create (2);
3885 peel_iters_prologue = npeel;
3886
3887 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3888 &peel_iters_epilogue,
3889 &LOOP_VINFO_SCALAR_ITERATION_COST
3890 (loop_vinfo),
3891 &prologue_cost_vec,
3892 &epilogue_cost_vec);
3893
3894 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3895 {
3896 struct _stmt_vec_info *stmt_info
3897 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3898 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3899 si->misalign, vect_prologue);
3900 }
3901
3902 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3903 {
3904 struct _stmt_vec_info *stmt_info
3905 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3906 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3907 si->misalign, vect_epilogue);
3908 }
3909
3910 prologue_cost_vec.release ();
3911 epilogue_cost_vec.release ();
3912 }
3913
3914 /* FORNOW: The scalar outside cost is incremented in one of the
3915 following ways:
3916
3917 1. The vectorizer checks for alignment and aliasing and generates
3918 a condition that allows dynamic vectorization. A cost model
3919 check is ANDED with the versioning condition. Hence scalar code
3920 path now has the added cost of the versioning check.
3921
3922 if (cost > th & versioning_check)
3923 jmp to vector code
3924
3925 Hence run-time scalar is incremented by not-taken branch cost.
3926
3927 2. The vectorizer then checks if a prologue is required. If the
3928 cost model check was not done before during versioning, it has to
3929 be done before the prologue check.
3930
3931 if (cost <= th)
3932 prologue = scalar_iters
3933 if (prologue == 0)
3934 jmp to vector code
3935 else
3936 execute prologue
3937 if (prologue == num_iters)
3938 go to exit
3939
3940 Hence the run-time scalar cost is incremented by a taken branch,
3941 plus a not-taken branch, plus a taken branch cost.
3942
3943 3. The vectorizer then checks if an epilogue is required. If the
3944 cost model check was not done before during prologue check, it
3945 has to be done with the epilogue check.
3946
3947 if (prologue == 0)
3948 jmp to vector code
3949 else
3950 execute prologue
3951 if (prologue == num_iters)
3952 go to exit
3953 vector code:
3954 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3955 jmp to epilogue
3956
3957 Hence the run-time scalar cost should be incremented by 2 taken
3958 branches.
3959
3960 TODO: The back end may reorder the BBs differently and reverse
3961 conditions/branch directions. Change the estimates below to
3962 something more reasonable. */
3963
3964 /* If the number of iterations is known and we do not do versioning, we can
3965 decide whether to vectorize at compile time. Hence the scalar version
3966 does not carry cost model guard costs. */
3967 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3968 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3969 {
3970 /* Cost model check occurs at versioning. */
3971 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3972 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3973 else
3974 {
3975 /* Cost model check occurs at prologue generation. */
3976 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3977 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3978 + vect_get_stmt_cost (cond_branch_not_taken);
3979 /* Cost model check occurs at epilogue generation. */
3980 else
3981 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3982 }
3983 }
3984
3985 /* Complete the target-specific cost calculations. */
3986 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3987 &vec_inside_cost, &vec_epilogue_cost);
3988
3989 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3990
3991 if (dump_enabled_p ())
3992 {
3993 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3994 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3995 vec_inside_cost);
3996 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3997 vec_prologue_cost);
3998 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3999 vec_epilogue_cost);
4000 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4001 scalar_single_iter_cost);
4002 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4003 scalar_outside_cost);
4004 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4005 vec_outside_cost);
4006 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4007 peel_iters_prologue);
4008 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4009 peel_iters_epilogue);
4010 }
4011
4012 /* Calculate number of iterations required to make the vector version
4013 profitable, relative to the loop bodies only. The following condition
4014 must hold true:
4015 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4016 where
4017 SIC = scalar iteration cost, VIC = vector iteration cost,
4018 VOC = vector outside cost, VF = vectorization factor,
4019 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4020 SOC = scalar outside cost for run time cost model check. */
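/* Informal sketch of the rearrangement (assuming SIC * VF > VIC, which is
   exactly what the test just below checks, so the division is by a
   positive value):

     SIC * niters + SOC > VIC * (niters - PL_ITERS - EP_ITERS) / VF + VOC
     niters * (SIC * VF - VIC) > (VOC - SOC) * VF
                                 - VIC * (PL_ITERS + EP_ITERS)
     niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   which matches the integer computation below, with a final increment to
   compensate for the truncating division when needed.  */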
4021
4022 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4023 {
4024 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4025 * assumed_vf
4026 - vec_inside_cost * peel_iters_prologue
4027 - vec_inside_cost * peel_iters_epilogue);
4028 if (min_profitable_iters <= 0)
4029 min_profitable_iters = 0;
4030 else
4031 {
4032 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4033 - vec_inside_cost);
4034
4035 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4036 <= (((int) vec_inside_cost * min_profitable_iters)
4037 + (((int) vec_outside_cost - scalar_outside_cost)
4038 * assumed_vf)))
4039 min_profitable_iters++;
4040 }
4041 }
4042 /* vector version will never be profitable. */
4043 else
4044 {
4045 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4046 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4047 "did not happen for a simd loop");
4048
4049 if (dump_enabled_p ())
4050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4051 "cost model: the vector iteration cost = %d "
4052 "divided by the scalar iteration cost = %d "
4053 "is greater or equal to the vectorization factor = %d"
4054 ".\n",
4055 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4056 *ret_min_profitable_niters = -1;
4057 *ret_min_profitable_estimate = -1;
4058 return;
4059 }
4060
4061 dump_printf (MSG_NOTE,
4062 " Calculated minimum iters for profitability: %d\n",
4063 min_profitable_iters);
4064
4065 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4066 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4067 /* We want the vectorized loop to execute at least once. */
4068 min_profitable_iters = assumed_vf + peel_iters_prologue;
4069
4070 if (dump_enabled_p ())
4071 dump_printf_loc (MSG_NOTE, vect_location,
4072 " Runtime profitability threshold = %d\n",
4073 min_profitable_iters);
4074
4075 *ret_min_profitable_niters = min_profitable_iters;
4076
4077 /* Calculate number of iterations required to make the vector version
4078 profitable, relative to the loop bodies only.
4079
4080 Non-vectorized variant is SIC * niters and it must win over vector
4081 variant on the expected loop trip count. The following condition must hold true:
4082 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
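/* The same rearrangement as for the runtime threshold above applies here,
   except that SOC now appears on the vector side of the inequality, i.e.
   it is added to VOC rather than subtracted, giving the expression
   computed below.  */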
4083
4084 if (vec_outside_cost <= 0)
4085 min_profitable_estimate = 0;
4086 else
4087 {
4088 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4089 * assumed_vf
4090 - vec_inside_cost * peel_iters_prologue
4091 - vec_inside_cost * peel_iters_epilogue)
4092 / ((scalar_single_iter_cost * assumed_vf)
4093 - vec_inside_cost);
4094 }
4095 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4096 if (dump_enabled_p ())
4097 dump_printf_loc (MSG_NOTE, vect_location,
4098 " Static estimate profitability threshold = %d\n",
4099 min_profitable_estimate);
4100
4101 *ret_min_profitable_estimate = min_profitable_estimate;
4102 }
4103
4104 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4105 vector elements (not bits) for a vector with NELT elements. */
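/* As an illustration (example values only): for OFFSET == 2 and NELT == 8
   the builder encodes the single stepped pattern {2, 3, 4}, which
   vec_perm_indices expands to the full selector {2, 3, 4, 5, 6, 7, 8, 9}.
   With two input vectors, indices 8 and 9 select from the second input,
   so the permute behaves like shifting the first vector down by two
   elements.  */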
4106 static void
4107 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4108 vec_perm_builder *sel)
4109 {
4110 /* The encoding is a single stepped pattern. Any wrap-around is handled
4111 by vec_perm_indices. */
4112 sel->new_vector (nelt, 1, 3);
4113 for (unsigned int i = 0; i < 3; i++)
4114 sel->quick_push (i + offset);
4115 }
4116
4117 /* Checks whether the target supports whole-vector shifts for vectors of mode
4118 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4119 it supports vec_perm_const with masks for all necessary shift amounts. */
4120 static bool
4121 have_whole_vector_shift (machine_mode mode)
4122 {
4123 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4124 return true;
4125
4126 /* Variable-length vectors should be handled via the optab. */
4127 unsigned int nelt;
4128 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4129 return false;
4130
4131 vec_perm_builder sel;
4132 vec_perm_indices indices;
4133 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4134 {
4135 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4136 indices.new_vector (sel, 2, nelt);
4137 if (!can_vec_perm_const_p (mode, indices, false))
4138 return false;
4139 }
4140 return true;
4141 }
4142
4143 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4144 functions. Design better to avoid maintenance issues. */
4145
4146 /* Function vect_model_reduction_cost.
4147
4148 Models cost for a reduction operation, including the vector ops
4149 generated within the strip-mine loop, the initial definition before
4150 the loop, and the epilogue code that must be generated. */
4151
4152 static void
4153 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4154 int ncopies)
4155 {
4156 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4157 enum tree_code code;
4158 optab optab;
4159 tree vectype;
4160 gimple *orig_stmt;
4161 machine_mode mode;
4162 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4163 struct loop *loop = NULL;
4164 void *target_cost_data;
4165
4166 if (loop_vinfo)
4167 {
4168 loop = LOOP_VINFO_LOOP (loop_vinfo);
4169 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4170 }
4171 else
4172 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4173
4174 /* Condition reductions generate two reductions in the loop. */
4175 vect_reduction_type reduction_type
4176 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4177 if (reduction_type == COND_REDUCTION)
4178 ncopies *= 2;
4179
4180 vectype = STMT_VINFO_VECTYPE (stmt_info);
4181 mode = TYPE_MODE (vectype);
4182 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4183
4184 if (!orig_stmt)
4185 orig_stmt = STMT_VINFO_STMT (stmt_info);
4186
4187 code = gimple_assign_rhs_code (orig_stmt);
4188
4189 if (reduction_type == EXTRACT_LAST_REDUCTION
4190 || reduction_type == FOLD_LEFT_REDUCTION)
4191 {
4192 /* No extra instructions needed in the prologue. */
4193 prologue_cost = 0;
4194
4195 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4196 /* Count one reduction-like operation per vector. */
4197 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4198 stmt_info, 0, vect_body);
4199 else
4200 {
4201 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4202 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4203 inside_cost = add_stmt_cost (target_cost_data, nelements,
4204 vec_to_scalar, stmt_info, 0,
4205 vect_body);
4206 inside_cost += add_stmt_cost (target_cost_data, nelements,
4207 scalar_stmt, stmt_info, 0,
4208 vect_body);
4209 }
4210 }
4211 else
4212 {
4213 /* Add in cost for initial definition.
4214 For cond reduction we have four vectors: initial index, step,
4215 initial result of the data reduction, initial value of the index
4216 reduction. */
4217 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4218 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4219 scalar_to_vec, stmt_info, 0,
4220 vect_prologue);
4221
4222 /* Cost of reduction op inside loop. */
4223 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4224 stmt_info, 0, vect_body);
4225 }
4226
4227 /* Determine cost of epilogue code.
4228
4229 We have a reduction operator that will reduce the vector in one statement.
4230 Also requires scalar extract. */
4231
4232 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4233 {
4234 if (reduc_fn != IFN_LAST)
4235 {
4236 if (reduction_type == COND_REDUCTION)
4237 {
4238 /* An EQ stmt and a COND_EXPR stmt. */
4239 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4240 vector_stmt, stmt_info, 0,
4241 vect_epilogue);
4242 /* Reduction of the max index and a reduction of the found
4243 values. */
4244 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4245 vec_to_scalar, stmt_info, 0,
4246 vect_epilogue);
4247 /* A broadcast of the max value. */
4248 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4249 scalar_to_vec, stmt_info, 0,
4250 vect_epilogue);
4251 }
4252 else
4253 {
4254 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4255 stmt_info, 0, vect_epilogue);
4256 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4257 vec_to_scalar, stmt_info, 0,
4258 vect_epilogue);
4259 }
4260 }
4261 else if (reduction_type == COND_REDUCTION)
4262 {
4263 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4264 /* Extraction of scalar elements. */
4265 epilogue_cost += add_stmt_cost (target_cost_data,
4266 2 * estimated_nunits,
4267 vec_to_scalar, stmt_info, 0,
4268 vect_epilogue);
4269 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4270 epilogue_cost += add_stmt_cost (target_cost_data,
4271 2 * estimated_nunits - 3,
4272 scalar_stmt, stmt_info, 0,
4273 vect_epilogue);
4274 }
4275 else if (reduction_type == EXTRACT_LAST_REDUCTION
4276 || reduction_type == FOLD_LEFT_REDUCTION)
4277 /* No extra instructions are needed in the epilogue. */
4278 ;
4279 else
4280 {
4281 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4282 tree bitsize =
4283 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4284 int element_bitsize = tree_to_uhwi (bitsize);
4285 int nelements = vec_size_in_bits / element_bitsize;
4286
4287 if (code == COND_EXPR)
4288 code = MAX_EXPR;
4289
4290 optab = optab_for_tree_code (code, vectype, optab_default);
4291
4292 /* We have a whole vector shift available. */
4293 if (optab != unknown_optab
4294 && VECTOR_MODE_P (mode)
4295 && optab_handler (optab, mode) != CODE_FOR_nothing
4296 && have_whole_vector_shift (mode))
4297 {
4298 /* Final reduction via vector shifts and the reduction operator.
4299 Also requires scalar extract. */
4300 epilogue_cost += add_stmt_cost (target_cost_data,
4301 exact_log2 (nelements) * 2,
4302 vector_stmt, stmt_info, 0,
4303 vect_epilogue);
4304 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4305 vec_to_scalar, stmt_info, 0,
4306 vect_epilogue);
4307 }
4308 else
4309 /* Use extracts and reduction op for final reduction. For N
4310 elements, we have N extracts and N-1 reduction ops. */
4311 epilogue_cost += add_stmt_cost (target_cost_data,
4312 nelements + nelements - 1,
4313 vector_stmt, stmt_info, 0,
4314 vect_epilogue);
4315 }
4316 }
4317
4318 if (dump_enabled_p ())
4319 dump_printf (MSG_NOTE,
4320 "vect_model_reduction_cost: inside_cost = %d, "
4321 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4322 prologue_cost, epilogue_cost);
4323 }
4324
4325
4326 /* Function vect_model_induction_cost.
4327
4328 Models cost for induction operations. */
4329
4330 static void
4331 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4332 {
4333 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4334 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4335 unsigned inside_cost, prologue_cost;
4336
4337 if (PURE_SLP_STMT (stmt_info))
4338 return;
4339
4340 /* loop cost for vec_loop. */
4341 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4342 stmt_info, 0, vect_body);
4343
4344 /* prologue cost for vec_init and vec_step. */
4345 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4346 stmt_info, 0, vect_prologue);
4347
4348 if (dump_enabled_p ())
4349 dump_printf_loc (MSG_NOTE, vect_location,
4350 "vect_model_induction_cost: inside_cost = %d, "
4351 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4352 }
4353
4354
4355
4356 /* Function get_initial_def_for_reduction
4357
4358 Input:
4359 STMT - a stmt that performs a reduction operation in the loop.
4360 INIT_VAL - the initial value of the reduction variable
4361
4362 Output:
4363 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4364 of the reduction (used for adjusting the epilog - see below).
4365 Return a vector variable, initialized according to the operation that STMT
4366 performs. This vector will be used as the initial value of the
4367 vector of partial results.
4368
4369 Option1 (adjust in epilog): Initialize the vector as follows:
4370 add/bit or/xor: [0,0,...,0,0]
4371 mult/bit and: [1,1,...,1,1]
4372 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4373 and when necessary (e.g. add/mult case) let the caller know
4374 that it needs to adjust the result by init_val.
4375
4376 Option2: Initialize the vector as follows:
4377 add/bit or/xor: [init_val,0,0,...,0]
4378 mult/bit and: [init_val,1,1,...,1]
4379 min/max/cond_expr: [init_val,init_val,...,init_val]
4380 and no adjustments are needed.
4381
4382 For example, for the following code:
4383
4384 s = init_val;
4385 for (i=0;i<n;i++)
4386 s = s + a[i];
4387
4388 STMT is 's = s + a[i]', and the reduction variable is 's'.
4389 For a vector of 4 units, we want to return either [0,0,0,init_val],
4390 or [0,0,0,0] and let the caller know that it needs to adjust
4391 the result at the end by 'init_val'.
4392
4393 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4394 is not NULL, because its initialization vector is simpler (the same element
4395 in all entries), and Option2 otherwise.
4396
4397 A cost model should help decide between these two schemes. */
4398
4399 tree
4400 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4401 tree *adjustment_def)
4402 {
4403 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4404 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4405 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4406 tree scalar_type = TREE_TYPE (init_val);
4407 tree vectype = get_vectype_for_scalar_type (scalar_type);
4408 enum tree_code code = gimple_assign_rhs_code (stmt);
4409 tree def_for_init;
4410 tree init_def;
4411 bool nested_in_vect_loop = false;
4412 REAL_VALUE_TYPE real_init_val = dconst0;
4413 int int_init_val = 0;
4414 gimple *def_stmt = NULL;
4415 gimple_seq stmts = NULL;
4416
4417 gcc_assert (vectype);
4418
4419 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4420 || SCALAR_FLOAT_TYPE_P (scalar_type));
4421
4422 if (nested_in_vect_loop_p (loop, stmt))
4423 nested_in_vect_loop = true;
4424 else
4425 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4426
4427 /* In case of double reduction we only create a vector variable to be put
4428 in the reduction phi node. The actual statement creation is done in
4429 vect_create_epilog_for_reduction. */
4430 if (adjustment_def && nested_in_vect_loop
4431 && TREE_CODE (init_val) == SSA_NAME
4432 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4433 && gimple_code (def_stmt) == GIMPLE_PHI
4434 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4435 && vinfo_for_stmt (def_stmt)
4436 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4437 == vect_double_reduction_def)
4438 {
4439 *adjustment_def = NULL;
4440 return vect_create_destination_var (init_val, vectype);
4441 }
4442
4443 vect_reduction_type reduction_type
4444 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4445
4446 /* In case of a nested reduction do not use an adjustment def, as
4447 that case is not handled correctly by the epilogue generation
4448 if ncopies is not one. */
4449 if (adjustment_def && nested_in_vect_loop)
4450 {
4451 *adjustment_def = NULL;
4452 return vect_get_vec_def_for_operand (init_val, stmt);
4453 }
4454
4455 switch (code)
4456 {
4457 case WIDEN_SUM_EXPR:
4458 case DOT_PROD_EXPR:
4459 case SAD_EXPR:
4460 case PLUS_EXPR:
4461 case MINUS_EXPR:
4462 case BIT_IOR_EXPR:
4463 case BIT_XOR_EXPR:
4464 case MULT_EXPR:
4465 case BIT_AND_EXPR:
4466 {
4467 /* ADJUSTMENT_DEF is NULL when called from
4468 vect_create_epilog_for_reduction to vectorize double reduction. */
4469 if (adjustment_def)
4470 *adjustment_def = init_val;
4471
4472 if (code == MULT_EXPR)
4473 {
4474 real_init_val = dconst1;
4475 int_init_val = 1;
4476 }
4477
4478 if (code == BIT_AND_EXPR)
4479 int_init_val = -1;
4480
4481 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4482 def_for_init = build_real (scalar_type, real_init_val);
4483 else
4484 def_for_init = build_int_cst (scalar_type, int_init_val);
4485
4486 if (adjustment_def)
4487 /* Option1: the first element is '0' or '1' as well. */
4488 init_def = gimple_build_vector_from_val (&stmts, vectype,
4489 def_for_init);
4490 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4491 {
4492 /* Option2 (variable length): the first element is INIT_VAL. */
4493 init_def = build_vector_from_val (vectype, def_for_init);
4494 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4495 2, init_def, init_val);
4496 init_def = make_ssa_name (vectype);
4497 gimple_call_set_lhs (call, init_def);
4498 gimple_seq_add_stmt (&stmts, call);
4499 }
4500 else
4501 {
4502 /* Option2: the first element is INIT_VAL. */
4503 tree_vector_builder elts (vectype, 1, 2);
4504 elts.quick_push (init_val);
4505 elts.quick_push (def_for_init);
4506 init_def = gimple_build_vector (&stmts, &elts);
4507 }
4508 }
4509 break;
4510
4511 case MIN_EXPR:
4512 case MAX_EXPR:
4513 case COND_EXPR:
4514 {
4515 if (adjustment_def)
4516 {
4517 *adjustment_def = NULL_TREE;
4518 if (reduction_type != COND_REDUCTION
4519 && reduction_type != EXTRACT_LAST_REDUCTION)
4520 {
4521 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4522 break;
4523 }
4524 }
4525 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4526 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4527 }
4528 break;
4529
4530 default:
4531 gcc_unreachable ();
4532 }
4533
4534 if (stmts)
4535 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4536 return init_def;
4537 }
4538
4539 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4540 NUMBER_OF_VECTORS is the number of vector defs to create.
4541 If NEUTRAL_OP is nonnull, introducing extra elements of that
4542 value will not change the result. */
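/* A neutral value here is an element E with OP (X, E) == X for the
   reduction operation OP: for example 0 for PLUS/IOR/XOR, 1 for MULT and
   all-ones for BIT_AND, while for MIN/MAX the initial value itself can be
   repeated.  (This is only a general illustration; the actual value is
   chosen by the caller, e.g. via neutral_op_for_slp_reduction.)  */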
4543
4544 static void
4545 get_initial_defs_for_reduction (slp_tree slp_node,
4546 vec<tree> *vec_oprnds,
4547 unsigned int number_of_vectors,
4548 bool reduc_chain, tree neutral_op)
4549 {
4550 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4551 gimple *stmt = stmts[0];
4552 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4553 unsigned HOST_WIDE_INT nunits;
4554 unsigned j, number_of_places_left_in_vector;
4555 tree vector_type;
4556 unsigned int group_size = stmts.length ();
4557 unsigned int i;
4558 struct loop *loop;
4559
4560 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4561
4562 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4563
4564 loop = (gimple_bb (stmt))->loop_father;
4565 gcc_assert (loop);
4566 edge pe = loop_preheader_edge (loop);
4567
4568 gcc_assert (!reduc_chain || neutral_op);
4569
4570 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4571 created vectors. It is greater than 1 if unrolling is performed.
4572
4573 For example, we have two scalar operands, s1 and s2 (e.g., group of
4574 strided accesses of size two), while NUNITS is four (i.e., four scalars
4575 of this type can be packed in a vector). The output vector will contain
4576 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4577 will be 2).
4578
4579 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4580 containing the operands.
4581
4582 For example, NUNITS is four as before, and the group size is 8
4583 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4584 {s5, s6, s7, s8}. */
4585
4586 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4587 nunits = group_size;
4588
4589 number_of_places_left_in_vector = nunits;
4590 bool constant_p = true;
4591 tree_vector_builder elts (vector_type, nunits, 1);
4592 elts.quick_grow (nunits);
4593 gimple_seq ctor_seq = NULL;
4594 for (j = 0; j < nunits * number_of_vectors; ++j)
4595 {
4596 tree op;
4597 i = j % group_size;
4598 stmt_vinfo = vinfo_for_stmt (stmts[i]);
4599
4600 /* Get the def before the loop. In reduction chain we have only
4601 one initial value. Else we have as many as PHIs in the group. */
4602 if (reduc_chain)
4603 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4604 else if (((vec_oprnds->length () + 1) * nunits
4605 - number_of_places_left_in_vector >= group_size)
4606 && neutral_op)
4607 op = neutral_op;
4608 else
4609 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4610
4611 /* Create 'vect_ = {op0,op1,...,opn}'. */
4612 number_of_places_left_in_vector--;
4613 elts[nunits - number_of_places_left_in_vector - 1] = op;
4614 if (!CONSTANT_CLASS_P (op))
4615 constant_p = false;
4616
4617 if (number_of_places_left_in_vector == 0)
4618 {
4619 tree init;
4620 if (constant_p && !neutral_op
4621 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4622 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4623 /* Build the vector directly from ELTS. */
4624 init = gimple_build_vector (&ctor_seq, &elts);
4625 else if (neutral_op)
4626 {
4627 /* Build a vector of the neutral value and shift the
4628 other elements into place. */
4629 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4630 neutral_op);
4631 int k = nunits;
4632 while (k > 0 && elts[k - 1] == neutral_op)
4633 k -= 1;
4634 while (k > 0)
4635 {
4636 k -= 1;
4637 gcall *call = gimple_build_call_internal
4638 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4639 init = make_ssa_name (vector_type);
4640 gimple_call_set_lhs (call, init);
4641 gimple_seq_add_stmt (&ctor_seq, call);
4642 }
4643 }
4644 else
4645 {
4646 /* First time round, duplicate ELTS to fill the
4647 required number of vectors. */
4648 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4649 number_of_vectors, *vec_oprnds);
4650 break;
4651 }
4652 vec_oprnds->quick_push (init);
4653
4654 number_of_places_left_in_vector = nunits;
4655 elts.new_vector (vector_type, nunits, 1);
4656 elts.quick_grow (nunits);
4657 constant_p = true;
4658 }
4659 }
4660 if (ctor_seq != NULL)
4661 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4662 }
4663
4664
4665 /* Function vect_create_epilog_for_reduction
4666
4667 Create code at the loop-epilog to finalize the result of a reduction
4668 computation.
4669
4670 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4671 reduction statements.
4672 STMT is the scalar reduction stmt that is being vectorized.
4673 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4674 number of elements that we can fit in a vectype (nunits). In this case
4675 we have to generate more than one vector stmt - i.e - we need to "unroll"
4676 the vector stmt by a factor VF/nunits. For more details see documentation
4677 in vectorizable_operation.
4678 REDUC_FN is the internal function for the epilog reduction.
4679 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4680 computation.
4681 REDUC_INDEX is the index of the operand in the right hand side of the
4682 statement that is defined by REDUCTION_PHI.
4683 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4684 SLP_NODE is an SLP node containing a group of reduction statements. The
4685 first one in this group is STMT.
4686 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4687 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4688 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4689 any value of the IV in the loop.
4690 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4691 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4692 null if this is not an SLP reduction.
4693
4694 This function:
4695 1. Creates the reduction def-use cycles: sets the arguments for
4696 REDUCTION_PHIS:
4697 The loop-entry argument is the vectorized initial-value of the reduction.
4698 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4699 sums.
4700 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4701 by calling the function specified by REDUC_FN if available, or by
4702 other means (whole-vector shifts or a scalar loop).
4703 The function also creates a new phi node at the loop exit to preserve
4704 loop-closed form, as illustrated below.
4705
4706 The flow at the entry to this function:
4707
4708 loop:
4709 vec_def = phi <null, null> # REDUCTION_PHI
4710 VECT_DEF = vector_stmt # vectorized form of STMT
4711 s_loop = scalar_stmt # (scalar) STMT
4712 loop_exit:
4713 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4714 use <s_out0>
4715 use <s_out0>
4716
4717 The above is transformed by this function into:
4718
4719 loop:
4720 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4721 VECT_DEF = vector_stmt # vectorized form of STMT
4722 s_loop = scalar_stmt # (scalar) STMT
4723 loop_exit:
4724 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4725 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4726 v_out2 = reduce <v_out1>
4727 s_out3 = extract_field <v_out2, 0>
4728 s_out4 = adjust_result <s_out3>
4729 use <s_out4>
4730 use <s_out4>
4731 */
4732
4733 static void
4734 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4735 gimple *reduc_def_stmt,
4736 int ncopies, internal_fn reduc_fn,
4737 vec<gimple *> reduction_phis,
4738 bool double_reduc,
4739 slp_tree slp_node,
4740 slp_instance slp_node_instance,
4741 tree induc_val, enum tree_code induc_code,
4742 tree neutral_op)
4743 {
4744 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4745 stmt_vec_info prev_phi_info;
4746 tree vectype;
4747 machine_mode mode;
4748 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4749 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4750 basic_block exit_bb;
4751 tree scalar_dest;
4752 tree scalar_type;
4753 gimple *new_phi = NULL, *phi;
4754 gimple_stmt_iterator exit_gsi;
4755 tree vec_dest;
4756 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4757 gimple *epilog_stmt = NULL;
4758 enum tree_code code = gimple_assign_rhs_code (stmt);
4759 gimple *exit_phi;
4760 tree bitsize;
4761 tree adjustment_def = NULL;
4762 tree vec_initial_def = NULL;
4763 tree expr, def, initial_def = NULL;
4764 tree orig_name, scalar_result;
4765 imm_use_iterator imm_iter, phi_imm_iter;
4766 use_operand_p use_p, phi_use_p;
4767 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4768 bool nested_in_vect_loop = false;
4769 auto_vec<gimple *> new_phis;
4770 auto_vec<gimple *> inner_phis;
4771 enum vect_def_type dt = vect_unknown_def_type;
4772 int j, i;
4773 auto_vec<tree> scalar_results;
4774 unsigned int group_size = 1, k, ratio;
4775 auto_vec<tree> vec_initial_defs;
4776 auto_vec<gimple *> phis;
4777 bool slp_reduc = false;
4778 bool direct_slp_reduc;
4779 tree new_phi_result;
4780 gimple *inner_phi = NULL;
4781 tree induction_index = NULL_TREE;
4782
4783 if (slp_node)
4784 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4785
4786 if (nested_in_vect_loop_p (loop, stmt))
4787 {
4788 outer_loop = loop;
4789 loop = loop->inner;
4790 nested_in_vect_loop = true;
4791 gcc_assert (!slp_node);
4792 }
4793
4794 vectype = STMT_VINFO_VECTYPE (stmt_info);
4795 gcc_assert (vectype);
4796 mode = TYPE_MODE (vectype);
4797
4798 /* 1. Create the reduction def-use cycle:
4799 Set the arguments of REDUCTION_PHIS, i.e., transform
4800
4801 loop:
4802 vec_def = phi <null, null> # REDUCTION_PHI
4803 VECT_DEF = vector_stmt # vectorized form of STMT
4804 ...
4805
4806 into:
4807
4808 loop:
4809 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4810 VECT_DEF = vector_stmt # vectorized form of STMT
4811 ...
4812
4813 (in case of SLP, do it for all the phis). */
4814
4815 /* Get the loop-entry arguments. */
4816 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4817 if (slp_node)
4818 {
4819 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4820 vec_initial_defs.reserve (vec_num);
4821 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4822 &vec_initial_defs, vec_num,
4823 GROUP_FIRST_ELEMENT (stmt_info),
4824 neutral_op);
4825 }
4826 else
4827 {
4828 /* Get at the scalar def before the loop, that defines the initial value
4829 of the reduction variable. */
4830 gimple *def_stmt;
4831 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4832 loop_preheader_edge (loop));
4833 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4834 and we can't use zero for induc_val, use initial_def. Similarly
4835 for REDUC_MIN and initial_def larger than the base. */
4836 if (TREE_CODE (initial_def) == INTEGER_CST
4837 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4838 == INTEGER_INDUC_COND_REDUCTION)
4839 && !integer_zerop (induc_val)
4840 && ((induc_code == MAX_EXPR
4841 && tree_int_cst_lt (initial_def, induc_val))
4842 || (induc_code == MIN_EXPR
4843 && tree_int_cst_lt (induc_val, initial_def))))
4844 induc_val = initial_def;
4845 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4846 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4847 &adjustment_def);
4848 vec_initial_defs.create (1);
4849 vec_initial_defs.quick_push (vec_initial_def);
4850 }
4851
4852 /* Set phi nodes arguments. */
4853 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4854 {
4855 tree vec_init_def = vec_initial_defs[i];
4856 tree def = vect_defs[i];
4857 for (j = 0; j < ncopies; j++)
4858 {
4859 if (j != 0)
4860 {
4861 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4862 if (nested_in_vect_loop)
4863 vec_init_def
4864 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4865 vec_init_def);
4866 }
4867
4868 /* Set the loop-entry arg of the reduction-phi. */
4869
4870 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4871 == INTEGER_INDUC_COND_REDUCTION)
4872 {
4873 /* Initialise the reduction phi to zero. This prevents non-zero
4874 initial values from interfering with the reduction op. */
4875 gcc_assert (ncopies == 1);
4876 gcc_assert (i == 0);
4877
4878 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4879 tree induc_val_vec
4880 = build_vector_from_val (vec_init_def_type, induc_val);
4881
4882 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4883 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4884 }
4885 else
4886 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4887 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4888
4889 /* Set the loop-latch arg for the reduction-phi. */
4890 if (j > 0)
4891 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4892
4893 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4894 UNKNOWN_LOCATION);
4895
4896 if (dump_enabled_p ())
4897 {
4898 dump_printf_loc (MSG_NOTE, vect_location,
4899 "transform reduction: created def-use cycle: ");
4900 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4901 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4902 }
4903 }
4904 }
4905
4906 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4907 which is updated with the current index of the loop for every match of
4908 the original loop's cond_expr (VEC_STMT). This results in a vector
4909 containing the last time the condition passed for that vector lane.
4910 The first match will be a 1 to allow 0 to be used for non-matching
4911 indexes. If there are no matches at all then the vector will be all
4912 zeroes. */
4913 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4914 {
4915 tree indx_before_incr, indx_after_incr;
4916 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4917
4918 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4919 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4920
4921 int scalar_precision
4922 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4923 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4924 tree cr_index_vector_type = build_vector_type
4925 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4926
4927 /* First we create a simple vector induction variable which starts
4928 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4929 vector size (STEP). */
4930
4931 /* Create a {1,2,3,...} vector. */
4932 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4933
4934 /* Create a vector of the step value. */
4935 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4936 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4937
4938 /* Create an induction variable. */
4939 gimple_stmt_iterator incr_gsi;
4940 bool insert_after;
4941 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4942 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4943 insert_after, &indx_before_incr, &indx_after_incr);
4944
4945 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4946 filled with zeros (VEC_ZERO). */
4947
4948 /* Create a vector of 0s. */
4949 tree zero = build_zero_cst (cr_index_scalar_type);
4950 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4951
4952 /* Create a vector phi node. */
4953 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4954 new_phi = create_phi_node (new_phi_tree, loop->header);
4955 set_vinfo_for_stmt (new_phi,
4956 new_stmt_vec_info (new_phi, loop_vinfo));
4957 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4958 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4959
4960 /* Now take the condition from the loop's original cond_expr
4961 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4962 every match uses values from the induction variable
4963 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4964 (NEW_PHI_TREE).
4965 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4966 the new cond_expr (INDEX_COND_EXPR). */
4967
4968 /* Duplicate the condition from vec_stmt. */
4969 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4970
4971 /* Create a conditional, where the condition is taken from vec_stmt
4972 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4973 and the "else" value is the phi (NEW_PHI_TREE). */
4974 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4975 ccompare, indx_before_incr,
4976 new_phi_tree);
4977 induction_index = make_ssa_name (cr_index_vector_type);
4978 gimple *index_condition = gimple_build_assign (induction_index,
4979 index_cond_expr);
4980 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4981 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4982 loop_vinfo);
4983 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4984 set_vinfo_for_stmt (index_condition, index_vec_info);
4985
4986 /* Update the phi with the vec cond. */
4987 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4988 loop_latch_edge (loop), UNKNOWN_LOCATION);
4989 }
4990
4991 /* 2. Create epilog code.
4992 The reduction epilog code operates across the elements of the vector
4993 of partial results computed by the vectorized loop.
4994 The reduction epilog code consists of:
4995
4996 step 1: compute the scalar result in a vector (v_out2)
4997 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4998 step 3: adjust the scalar result (s_out3) if needed.
4999
5000 Step 1 can be accomplished using one of the following three schemes:
5001 (scheme 1) using reduc_fn, if available.
5002 (scheme 2) using whole-vector shifts, if available.
5003 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5004 combined.
5005
5006 The overall epilog code looks like this:
5007
5008 s_out0 = phi <s_loop> # original EXIT_PHI
5009 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5010 v_out2 = reduce <v_out1> # step 1
5011 s_out3 = extract_field <v_out2, 0> # step 2
5012 s_out4 = adjust_result <s_out3> # step 3
5013
5014 (step 3 is optional, and steps 1 and 2 may be combined).
5015 Lastly, the uses of s_out0 are replaced by s_out4. */
5016
5017
5018 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5019 v_out1 = phi <VECT_DEF>
5020 Store them in NEW_PHIS. */
5021
5022 exit_bb = single_exit (loop)->dest;
5023 prev_phi_info = NULL;
5024 new_phis.create (vect_defs.length ());
5025 FOR_EACH_VEC_ELT (vect_defs, i, def)
5026 {
5027 for (j = 0; j < ncopies; j++)
5028 {
5029 tree new_def = copy_ssa_name (def);
5030 phi = create_phi_node (new_def, exit_bb);
5031 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5032 if (j == 0)
5033 new_phis.quick_push (phi);
5034 else
5035 {
5036 def = vect_get_vec_def_for_stmt_copy (dt, def);
5037 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5038 }
5039
5040 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5041 prev_phi_info = vinfo_for_stmt (phi);
5042 }
5043 }
5044
5045 /* The epilogue is created for the outer-loop, i.e., for the loop being
5046 vectorized. Create exit phis for the outer loop. */
5047 if (double_reduc)
5048 {
5049 loop = outer_loop;
5050 exit_bb = single_exit (loop)->dest;
5051 inner_phis.create (vect_defs.length ());
5052 FOR_EACH_VEC_ELT (new_phis, i, phi)
5053 {
5054 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5055 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5056 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5057 PHI_RESULT (phi));
5058 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5059 loop_vinfo));
5060 inner_phis.quick_push (phi);
5061 new_phis[i] = outer_phi;
5062 prev_phi_info = vinfo_for_stmt (outer_phi);
5063 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5064 {
5065 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5066 new_result = copy_ssa_name (PHI_RESULT (phi));
5067 outer_phi = create_phi_node (new_result, exit_bb);
5068 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5069 PHI_RESULT (phi));
5070 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5071 loop_vinfo));
5072 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5073 prev_phi_info = vinfo_for_stmt (outer_phi);
5074 }
5075 }
5076 }
5077
5078 exit_gsi = gsi_after_labels (exit_bb);
5079
5080 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5081 (i.e. when reduc_fn is not available) and in the final adjustment
5082 code (if needed). Also get the original scalar reduction variable as
5083 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5084 represents a reduction pattern), the tree-code and scalar-def are
5085 taken from the original stmt that the pattern-stmt (STMT) replaces.
5086 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5087 are taken from STMT. */
5088
5089 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5090 if (!orig_stmt)
5091 {
5092 /* Regular reduction */
5093 orig_stmt = stmt;
5094 }
5095 else
5096 {
5097 /* Reduction pattern */
5098 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5099 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5100 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5101 }
5102
5103 code = gimple_assign_rhs_code (orig_stmt);
5104 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5105 partial results are added and not subtracted. */
5106 if (code == MINUS_EXPR)
5107 code = PLUS_EXPR;
5108
5109 scalar_dest = gimple_assign_lhs (orig_stmt);
5110 scalar_type = TREE_TYPE (scalar_dest);
5111 scalar_results.create (group_size);
5112 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5113 bitsize = TYPE_SIZE (scalar_type);
5114
5115 /* In case this is a reduction in an inner-loop while vectorizing an outer
5116 loop - we don't need to extract a single scalar result at the end of the
5117 inner-loop (unless it is double reduction, i.e., the use of reduction is
5118 outside the outer-loop). The final vector of partial results will be used
5119 in the vectorized outer-loop, or reduced to a scalar result at the end of
5120 the outer-loop. */
5121 if (nested_in_vect_loop && !double_reduc)
5122 goto vect_finalize_reduction;
5123
5124 /* SLP reduction without reduction chain, e.g.,
5125 # a1 = phi <a2, a0>
5126 # b1 = phi <b2, b0>
5127 a2 = operation (a1)
5128 b2 = operation (b1) */
5129 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5130
5131 /* True if we should implement SLP_REDUC using native reduction operations
5132 instead of scalar operations. */
5133 direct_slp_reduc = (reduc_fn != IFN_LAST
5134 && slp_reduc
5135 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5136
5137 /* In case of reduction chain, e.g.,
5138 # a1 = phi <a3, a0>
5139 a2 = operation (a1)
5140 a3 = operation (a2),
5141
5142 we may end up with more than one vector result. Here we reduce them to
5143 one vector. */
5144 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5145 {
5146 tree first_vect = PHI_RESULT (new_phis[0]);
5147 gassign *new_vec_stmt = NULL;
5148 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5149 for (k = 1; k < new_phis.length (); k++)
5150 {
5151 gimple *next_phi = new_phis[k];
5152 tree second_vect = PHI_RESULT (next_phi);
5153 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5154 new_vec_stmt = gimple_build_assign (tem, code,
5155 first_vect, second_vect);
5156 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5157 first_vect = tem;
5158 }
5159
5160 new_phi_result = first_vect;
5161 if (new_vec_stmt)
5162 {
5163 new_phis.truncate (0);
5164 new_phis.safe_push (new_vec_stmt);
5165 }
5166 }
5167 /* Likewise if we couldn't use a single defuse cycle. */
5168 else if (ncopies > 1)
5169 {
5170 gcc_assert (new_phis.length () == 1);
5171 tree first_vect = PHI_RESULT (new_phis[0]);
5172 gassign *new_vec_stmt = NULL;
5173 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5174 gimple *next_phi = new_phis[0];
5175 for (int k = 1; k < ncopies; ++k)
5176 {
5177 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5178 tree second_vect = PHI_RESULT (next_phi);
5179 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5180 new_vec_stmt = gimple_build_assign (tem, code,
5181 first_vect, second_vect);
5182 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5183 first_vect = tem;
5184 }
5185 new_phi_result = first_vect;
5186 new_phis.truncate (0);
5187 new_phis.safe_push (new_vec_stmt);
5188 }
5189 else
5190 new_phi_result = PHI_RESULT (new_phis[0]);
5191
5192 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5193 && reduc_fn != IFN_LAST)
5194 {
5195 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5196 various data values where the condition matched and another vector
5197 (INDUCTION_INDEX) containing all the indexes of those matches. We
5198 need to extract the last matching index (which will be the index with
5199 highest value) and use this to index into the data vector.
5200 For the case where there were no matches, the data vector will contain
5201 all default values and the index vector will be all zeros. */
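      /* An illustrative example (the values are made up): with four lanes
         and matches on iterations 2 and 4 landing in lanes 1 and 3,

           INDUCTION_INDEX = { 0, 2, 0, 4 }
           NEW_PHI_RESULT  = { d, a, d, b }

         the code below computes max_index = 4, keeps only lane 3 and
         extracts b.  With no matches at all, INDUCTION_INDEX is all zeros,
         every lane compares equal to the maximum, and the extracted value
         is simply the default d.  */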
5202
5203 /* Get various versions of the type of the vector of indexes. */
5204 tree index_vec_type = TREE_TYPE (induction_index);
5205 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5206 tree index_scalar_type = TREE_TYPE (index_vec_type);
5207 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5208 (index_vec_type);
5209
5210 /* Get an unsigned integer version of the type of the data vector. */
5211 int scalar_precision
5212 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5213 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5214 tree vectype_unsigned = build_vector_type
5215 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5216
5217 /* First we need to create a vector (ZERO_VEC) of zeros and another
5218 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5219 can create using a MAX reduction and then expanding.
5220 In the case where the loop never made any matches, the max index will
5221 be zero. */
5222
5223 /* Vector of {0, 0, 0,...}. */
5224 tree zero_vec = make_ssa_name (vectype);
5225 tree zero_vec_rhs = build_zero_cst (vectype);
5226 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5227 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5228
5229 /* Find maximum value from the vector of found indexes. */
5230 tree max_index = make_ssa_name (index_scalar_type);
5231 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5232 1, induction_index);
5233 gimple_call_set_lhs (max_index_stmt, max_index);
5234 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5235
5236 /* Vector of {max_index, max_index, max_index,...}. */
5237 tree max_index_vec = make_ssa_name (index_vec_type);
5238 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5239 max_index);
5240 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5241 max_index_vec_rhs);
5242 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5243
5244 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5245 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5246 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5247 otherwise. Only one value should match, resulting in a vector
5248 (VEC_COND) with one data value and the rest zeros.
5249 In the case where the loop never made any matches, every index will
5250 match, resulting in a vector with all data values (which will all be
5251 the default value). */
5252
5253 /* Compare the max index vector to the vector of found indexes to find
5254 the position of the max value. */
5255 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5256 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5257 induction_index,
5258 max_index_vec);
5259 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5260
5261 /* Use the compare to choose either values from the data vector or
5262 zero. */
5263 tree vec_cond = make_ssa_name (vectype);
5264 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5265 vec_compare, new_phi_result,
5266 zero_vec);
5267 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5268
5269 /* Finally we need to extract the data value from the vector (VEC_COND)
5270 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5271 reduction, but because this doesn't exist, we can use a MAX reduction
5272 instead. The data value might be signed or a float so we need to cast
5273 it first.
5274 In the case where the loop never made any matches, the data values are
5275 all identical, and so will reduce down correctly. */
5276
5277 /* Make the matched data values unsigned. */
5278 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5279 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5280 vec_cond);
5281 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5282 VIEW_CONVERT_EXPR,
5283 vec_cond_cast_rhs);
5284 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5285
5286 /* Reduce down to a scalar value. */
5287 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5288 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5289 1, vec_cond_cast);
5290 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5291 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5292
5293 /* Convert the reduced value back to the result type and set as the
5294 result. */
5295 gimple_seq stmts = NULL;
5296 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5297 data_reduc);
5298 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5299 scalar_results.safe_push (new_temp);
5300 }
5301 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5302 && reduc_fn == IFN_LAST)
5303 {
5304 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5305 idx = 0;
5306 idx_val = induction_index[0];
5307 val = data_reduc[0];
5308 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5309 if (induction_index[i] > idx_val)
5310 val = data_reduc[i], idx_val = induction_index[i];
5311 return val; */
5312
5313 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5314 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5315 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5316 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5317 /* Enforced by vectorizable_reduction, which ensures we have target
5318 support before allowing a conditional reduction on variable-length
5319 vectors. */
5320 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5321 tree idx_val = NULL_TREE, val = NULL_TREE;
5322 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5323 {
5324 tree old_idx_val = idx_val;
5325 tree old_val = val;
5326 idx_val = make_ssa_name (idx_eltype);
5327 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5328 build3 (BIT_FIELD_REF, idx_eltype,
5329 induction_index,
5330 bitsize_int (el_size),
5331 bitsize_int (off)));
5332 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5333 val = make_ssa_name (data_eltype);
5334 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5335 build3 (BIT_FIELD_REF,
5336 data_eltype,
5337 new_phi_result,
5338 bitsize_int (el_size),
5339 bitsize_int (off)));
5340 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5341 if (off != 0)
5342 {
5343 tree new_idx_val = idx_val;
5344 tree new_val = val;
5345 if (off != v_size - el_size)
5346 {
5347 new_idx_val = make_ssa_name (idx_eltype);
5348 epilog_stmt = gimple_build_assign (new_idx_val,
5349 MAX_EXPR, idx_val,
5350 old_idx_val);
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352 }
5353 new_val = make_ssa_name (data_eltype);
5354 epilog_stmt = gimple_build_assign (new_val,
5355 COND_EXPR,
5356 build2 (GT_EXPR,
5357 boolean_type_node,
5358 idx_val,
5359 old_idx_val),
5360 val, old_val);
5361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5362 idx_val = new_idx_val;
5363 val = new_val;
5364 }
5365 }
5366 /* Convert the reduced value back to the result type and set as the
5367 result. */
5368 gimple_seq stmts = NULL;
5369 val = gimple_convert (&stmts, scalar_type, val);
5370 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5371 scalar_results.safe_push (val);
5372 }
5373
5374 /* 2.3 Create the reduction code, using one of the three schemes described
5375 above. In SLP we simply need to extract all the elements from the
5376 vector (without reducing them), so we use scalar shifts. */
5377 else if (reduc_fn != IFN_LAST && !slp_reduc)
5378 {
5379 tree tmp;
5380 tree vec_elem_type;
5381
5382 /* Case 1: Create:
5383 v_out2 = reduc_expr <v_out1> */
5384
5385 if (dump_enabled_p ())
5386 dump_printf_loc (MSG_NOTE, vect_location,
5387 "Reduce using direct vector reduction.\n");
5388
5389 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5390 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5391 {
5392 tree tmp_dest
5393 = vect_create_destination_var (scalar_dest, vec_elem_type);
5394 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5395 new_phi_result);
5396 gimple_set_lhs (epilog_stmt, tmp_dest);
5397 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5398 gimple_set_lhs (epilog_stmt, new_temp);
5399 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5400
5401 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5402 new_temp);
5403 }
5404 else
5405 {
5406 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5407 new_phi_result);
5408 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5409 }
5410
5411 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5412 gimple_set_lhs (epilog_stmt, new_temp);
5413 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5414
5415 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5416 == INTEGER_INDUC_COND_REDUCTION)
5417 && !operand_equal_p (initial_def, induc_val, 0))
5418 {
5419 /* Earlier we set the initial value to be a vector of induc_val
5420 values. Check the result and if it is induc_val then replace
5421 with the original initial value, unless induc_val is
5422 the same as initial_def already. */
5423 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5424 induc_val);
5425
5426 tmp = make_ssa_name (new_scalar_dest);
5427 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5428 initial_def, new_temp);
5429 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5430 new_temp = tmp;
5431 }
5432
5433 scalar_results.safe_push (new_temp);
5434 }
5435 else if (direct_slp_reduc)
5436 {
5437 /* Here we create one vector for each of the GROUP_SIZE results,
5438 with the elements for other SLP statements replaced with the
5439 neutral value. We can then do a normal reduction on each vector. */
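      /* An illustrative sketch for GROUP_SIZE == 2 and four lanes: the index
         vector {0, 1, 2, 3} masked with GROUP_SIZE - 1 becomes {0, 1, 0, 1},
         so for result i == 0 the selection mask is {1, 0, 1, 0}: lanes 0 and
         2 come from NEW_PHI_RESULT while lanes 1 and 3 are replaced by the
         neutral (or initial) value before the full-vector reduction.  */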
5440
5441 /* Enforced by vectorizable_reduction. */
5442 gcc_assert (new_phis.length () == 1);
5443 gcc_assert (pow2p_hwi (group_size));
5444
5445 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5446 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5447 gimple_seq seq = NULL;
5448
5449 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5450 and the same element size as VECTYPE. */
5451 tree index = build_index_vector (vectype, 0, 1);
5452 tree index_type = TREE_TYPE (index);
5453 tree index_elt_type = TREE_TYPE (index_type);
5454 tree mask_type = build_same_sized_truth_vector_type (index_type);
5455
5456 /* Create a vector that, for each element, identifies which of
5457 the GROUP_SIZE results should use it. */
5458 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5459 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5460 build_vector_from_val (index_type, index_mask));
5461
5462 /* Get a neutral vector value. This is simply a splat of the neutral
5463 scalar value if we have one, otherwise the initial scalar value
5464 is itself a neutral value. */
5465 tree vector_identity = NULL_TREE;
5466 if (neutral_op)
5467 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5468 neutral_op);
5469 for (unsigned int i = 0; i < group_size; ++i)
5470 {
5471 /* If there's no universal neutral value, we can use the
5472 initial scalar value from the original PHI. This is used
5473 for MIN and MAX reduction, for example. */
5474 if (!neutral_op)
5475 {
5476 tree scalar_value
5477 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5478 loop_preheader_edge (loop));
5479 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5480 scalar_value);
5481 }
5482
5483 /* Calculate the equivalent of:
5484
5485 sel[j] = (index[j] == i);
5486
5487 which selects the elements of NEW_PHI_RESULT that should
5488 be included in the result. */
5489 tree compare_val = build_int_cst (index_elt_type, i);
5490 compare_val = build_vector_from_val (index_type, compare_val);
5491 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5492 index, compare_val);
5493
5494 /* Calculate the equivalent of:
5495
5496 vec = sel ? new_phi_result : vector_identity;
5497
5498 VEC is now suitable for a full vector reduction. */
5499 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5500 sel, new_phi_result, vector_identity);
5501
5502 /* Do the reduction and convert it to the appropriate type. */
5503 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5504 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5505 gimple_call_set_lhs (call, scalar);
5506 gimple_seq_add_stmt (&seq, call);
5507 scalar = gimple_convert (&seq, scalar_type, scalar);
5508 scalar_results.safe_push (scalar);
5509 }
5510 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5511 }
5512 else
5513 {
5514 bool reduce_with_shift;
5515 tree vec_temp;
5516
5517 /* COND reductions all do the final reduction with MAX_EXPR
5518 or MIN_EXPR. */
5519 if (code == COND_EXPR)
5520 {
5521 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5522 == INTEGER_INDUC_COND_REDUCTION)
5523 code = induc_code;
5524 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5525 == CONST_COND_REDUCTION)
5526 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5527 else
5528 code = MAX_EXPR;
5529 }
5530
5531 /* See if the target wants to do the final (shift) reduction
5532 in a vector mode of smaller size and first reduce upper/lower
5533 halves against each other. */
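      /* E.g. (illustrative), a V8SI accumulator may first be split into two
         V4SI halves that are combined with CODE, so that the shift or scalar
         reduction below only has to handle the narrower V4SI vector.  */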
5534 enum machine_mode mode1 = mode;
5535 tree vectype1 = vectype;
5536 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5537 unsigned sz1 = sz;
5538 if (!slp_reduc
5539 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5540 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5541
5542 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5543 reduce_with_shift = have_whole_vector_shift (mode1);
5544 if (!VECTOR_MODE_P (mode1))
5545 reduce_with_shift = false;
5546 else
5547 {
5548 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5549 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5550 reduce_with_shift = false;
5551 }
5552
5553 /* First reduce the vector to the size at which we should do the
5554 shift reduction, by combining upper and lower halves.  */
5555 new_temp = new_phi_result;
5556 while (sz > sz1)
5557 {
5558 gcc_assert (!slp_reduc);
5559 sz /= 2;
5560 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5561
5562 /* The target has to make sure we support lowpart/highpart
5563 extraction, either via direct vector extract or through
5564 integer-mode punning. */
5565 tree dst1, dst2;
5566 if (convert_optab_handler (vec_extract_optab,
5567 TYPE_MODE (TREE_TYPE (new_temp)),
5568 TYPE_MODE (vectype1))
5569 != CODE_FOR_nothing)
5570 {
5571 /* Extract sub-vectors directly once vec_extract becomes
5572 a conversion optab. */
5573 dst1 = make_ssa_name (vectype1);
5574 epilog_stmt
5575 = gimple_build_assign (dst1, BIT_FIELD_REF,
5576 build3 (BIT_FIELD_REF, vectype1,
5577 new_temp, TYPE_SIZE (vectype1),
5578 bitsize_int (0)));
5579 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5580 dst2 = make_ssa_name (vectype1);
5581 epilog_stmt
5582 = gimple_build_assign (dst2, BIT_FIELD_REF,
5583 build3 (BIT_FIELD_REF, vectype1,
5584 new_temp, TYPE_SIZE (vectype1),
5585 bitsize_int (sz * BITS_PER_UNIT)));
5586 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5587 }
5588 else
5589 {
5590 /* Extract via punning to appropriately sized integer mode
5591 vector. */
5592 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5593 1);
5594 tree etype = build_vector_type (eltype, 2);
5595 gcc_assert (convert_optab_handler (vec_extract_optab,
5596 TYPE_MODE (etype),
5597 TYPE_MODE (eltype))
5598 != CODE_FOR_nothing);
5599 tree tem = make_ssa_name (etype);
5600 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5601 build1 (VIEW_CONVERT_EXPR,
5602 etype, new_temp));
5603 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5604 new_temp = tem;
5605 tem = make_ssa_name (eltype);
5606 epilog_stmt
5607 = gimple_build_assign (tem, BIT_FIELD_REF,
5608 build3 (BIT_FIELD_REF, eltype,
5609 new_temp, TYPE_SIZE (eltype),
5610 bitsize_int (0)));
5611 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5612 dst1 = make_ssa_name (vectype1);
5613 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5614 build1 (VIEW_CONVERT_EXPR,
5615 vectype1, tem));
5616 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617 tem = make_ssa_name (eltype);
5618 epilog_stmt
5619 = gimple_build_assign (tem, BIT_FIELD_REF,
5620 build3 (BIT_FIELD_REF, eltype,
5621 new_temp, TYPE_SIZE (eltype),
5622 bitsize_int (sz * BITS_PER_UNIT)));
5623 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5624 dst2 = make_ssa_name (vectype1);
5625 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5626 build1 (VIEW_CONVERT_EXPR,
5627 vectype1, tem));
5628 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5629 }
5630
5631 new_temp = make_ssa_name (vectype1);
5632 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5633 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5634 }
5635
5636 if (reduce_with_shift && !slp_reduc)
5637 {
5638 int element_bitsize = tree_to_uhwi (bitsize);
5639 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5640 for variable-length vectors and also requires direct target support
5641 for loop reductions. */
5642 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5643 int nelements = vec_size_in_bits / element_bitsize;
5644 vec_perm_builder sel;
5645 vec_perm_indices indices;
5646
5647 int elt_offset;
5648
5649 tree zero_vec = build_zero_cst (vectype1);
5650 /* Case 2: Create:
5651 for (offset = nelements/2; offset >= 1; offset/=2)
5652 {
5653 Create: va' = vec_shift <va, offset>
5654 Create: va = vop <va, va'>
5655 } */
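          /* For example (illustrative), for a four-element vector
             va = {a0, a1, a2, a3} and CODE == PLUS_EXPR this emits:

               t1 = {a2, a3, 0, 0}            # va shifted by 2 elements
               t2 = t1 + va = {a0+a2, a1+a3, a2, a3}
               t3 = {t2[1], t2[2], t2[3], 0}  # t2 shifted by 1 element
               t4 = t3 + t2

             so element 0 of t4 holds a0+a1+a2+a3, which the BIT_FIELD_REF
             below then extracts.  */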
5656
5657 tree rhs;
5658
5659 if (dump_enabled_p ())
5660 dump_printf_loc (MSG_NOTE, vect_location,
5661 "Reduce using vector shifts\n");
5662
5663 mode1 = TYPE_MODE (vectype1);
5664 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5665 for (elt_offset = nelements / 2;
5666 elt_offset >= 1;
5667 elt_offset /= 2)
5668 {
5669 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5670 indices.new_vector (sel, 2, nelements);
5671 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5672 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5673 new_temp, zero_vec, mask);
5674 new_name = make_ssa_name (vec_dest, epilog_stmt);
5675 gimple_assign_set_lhs (epilog_stmt, new_name);
5676 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5677
5678 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5679 new_temp);
5680 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5681 gimple_assign_set_lhs (epilog_stmt, new_temp);
5682 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5683 }
5684
5685 /* 2.4 Extract the final scalar result. Create:
5686 s_out3 = extract_field <v_out2, bitpos> */
5687
5688 if (dump_enabled_p ())
5689 dump_printf_loc (MSG_NOTE, vect_location,
5690 "extract scalar result\n");
5691
5692 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5693 bitsize, bitsize_zero_node);
5694 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5695 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5696 gimple_assign_set_lhs (epilog_stmt, new_temp);
5697 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5698 scalar_results.safe_push (new_temp);
5699 }
5700 else
5701 {
5702 /* Case 3: Create:
5703 s = extract_field <v_out2, 0>
5704 for (offset = element_size;
5705 offset < vector_size;
5706 offset += element_size;)
5707 {
5708 Create: s' = extract_field <v_out2, offset>
5709 Create: s = op <s, s'> // For non SLP cases
5710 } */
5711
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 "Reduce using scalar code.\n");
5715
5716 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5717 int element_bitsize = tree_to_uhwi (bitsize);
5718 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5719 {
5720 int bit_offset;
5721 if (gimple_code (new_phi) == GIMPLE_PHI)
5722 vec_temp = PHI_RESULT (new_phi);
5723 else
5724 vec_temp = gimple_assign_lhs (new_phi);
5725 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5726 bitsize_zero_node);
5727 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5728 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5729 gimple_assign_set_lhs (epilog_stmt, new_temp);
5730 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5731
5732 /* In SLP we don't need to apply the reduction operation, so we just
5733 collect s' values in SCALAR_RESULTS. */
5734 if (slp_reduc)
5735 scalar_results.safe_push (new_temp);
5736
5737 for (bit_offset = element_bitsize;
5738 bit_offset < vec_size_in_bits;
5739 bit_offset += element_bitsize)
5740 {
5741 tree bitpos = bitsize_int (bit_offset);
5742 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5743 bitsize, bitpos);
5744
5745 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5746 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5747 gimple_assign_set_lhs (epilog_stmt, new_name);
5748 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5749
5750 if (slp_reduc)
5751 {
5752 /* In SLP we don't need to apply the reduction operation, so
5753 we just collect s' values in SCALAR_RESULTS. */
5754 new_temp = new_name;
5755 scalar_results.safe_push (new_name);
5756 }
5757 else
5758 {
5759 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5760 new_name, new_temp);
5761 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5762 gimple_assign_set_lhs (epilog_stmt, new_temp);
5763 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5764 }
5765 }
5766 }
5767
5768 /* The only case in which we need to reduce scalar results in SLP is
5769 unrolling. If the size of SCALAR_RESULTS is greater than
5770 GROUP_SIZE, we reduce them by combining elements modulo
5771 GROUP_SIZE. */
5772 if (slp_reduc)
5773 {
5774 tree res, first_res, new_res;
5775 gimple *new_stmt;
5776
5777 /* Reduce multiple scalar results in case of SLP unrolling. */
5778 for (j = group_size; scalar_results.iterate (j, &res);
5779 j++)
5780 {
5781 first_res = scalar_results[j % group_size];
5782 new_stmt = gimple_build_assign (new_scalar_dest, code,
5783 first_res, res);
5784 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5785 gimple_assign_set_lhs (new_stmt, new_res);
5786 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5787 scalar_results[j % group_size] = new_res;
5788 }
5789 }
5790 else
5791 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5792 scalar_results.safe_push (new_temp);
5793 }
5794
5795 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5796 == INTEGER_INDUC_COND_REDUCTION)
5797 && !operand_equal_p (initial_def, induc_val, 0))
5798 {
5799 /* Earlier we set the initial value to be a vector of induc_val
5800 values. Check the result and if it is induc_val then replace
5801 with the original initial value, unless induc_val is
5802 the same as initial_def already. */
5803 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5804 induc_val);
5805
5806 tree tmp = make_ssa_name (new_scalar_dest);
5807 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5808 initial_def, new_temp);
5809 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5810 scalar_results[0] = tmp;
5811 }
5812 }
5813
5814 vect_finalize_reduction:
5815
5816 if (double_reduc)
5817 loop = loop->inner;
5818
5819 /* 2.5 Adjust the final result by the initial value of the reduction
5820 variable. (When such adjustment is not needed, then
5821 'adjustment_def' is zero). For example, if code is PLUS we create:
5822 new_temp = loop_exit_def + adjustment_def */
5823
5824 if (adjustment_def)
5825 {
5826 gcc_assert (!slp_reduc);
5827 if (nested_in_vect_loop)
5828 {
5829 new_phi = new_phis[0];
5830 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5831 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5832 new_dest = vect_create_destination_var (scalar_dest, vectype);
5833 }
5834 else
5835 {
5836 new_temp = scalar_results[0];
5837 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5838 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5839 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5840 }
5841
5842 epilog_stmt = gimple_build_assign (new_dest, expr);
5843 new_temp = make_ssa_name (new_dest, epilog_stmt);
5844 gimple_assign_set_lhs (epilog_stmt, new_temp);
5845 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5846 if (nested_in_vect_loop)
5847 {
5848 set_vinfo_for_stmt (epilog_stmt,
5849 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5850 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5851 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5852
5853 if (!double_reduc)
5854 scalar_results.quick_push (new_temp);
5855 else
5856 scalar_results[0] = new_temp;
5857 }
5858 else
5859 scalar_results[0] = new_temp;
5860
5861 new_phis[0] = epilog_stmt;
5862 }
5863
5864 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5865 phis with new adjusted scalar results, i.e., replace use <s_out0>
5866 with use <s_out4>.
5867
5868 Transform:
5869 loop_exit:
5870 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5871 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5872 v_out2 = reduce <v_out1>
5873 s_out3 = extract_field <v_out2, 0>
5874 s_out4 = adjust_result <s_out3>
5875 use <s_out0>
5876 use <s_out0>
5877
5878 into:
5879
5880 loop_exit:
5881 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5882 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5883 v_out2 = reduce <v_out1>
5884 s_out3 = extract_field <v_out2, 0>
5885 s_out4 = adjust_result <s_out3>
5886 use <s_out4>
5887 use <s_out4> */
5888
5889
5890 /* In an SLP reduction chain we reduce the vector results into one vector if
5891 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5892 the last stmt in the reduction chain, since we are looking for the loop
5893 exit phi node. */
5894 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5895 {
5896 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5897 /* Handle reduction patterns. */
5898 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5899 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5900
5901 scalar_dest = gimple_assign_lhs (dest_stmt);
5902 group_size = 1;
5903 }
5904
5905 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5906 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5907 need to match SCALAR_RESULTS with corresponding statements. The first
5908 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5909 the first vector stmt, etc.
5910 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
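  /* E.g. (illustrative), with GROUP_SIZE == 4 and two new vector stmts,
     RATIO is 2: scalar results 0 and 1 belong to the first vector stmt,
     results 2 and 3 to the second.  */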
5911 if (group_size > new_phis.length ())
5912 {
5913 ratio = group_size / new_phis.length ();
5914 gcc_assert (!(group_size % new_phis.length ()));
5915 }
5916 else
5917 ratio = 1;
5918
5919 for (k = 0; k < group_size; k++)
5920 {
5921 if (k % ratio == 0)
5922 {
5923 epilog_stmt = new_phis[k / ratio];
5924 reduction_phi = reduction_phis[k / ratio];
5925 if (double_reduc)
5926 inner_phi = inner_phis[k / ratio];
5927 }
5928
5929 if (slp_reduc)
5930 {
5931 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5932
5933 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5934 /* SLP statements can't participate in patterns. */
5935 gcc_assert (!orig_stmt);
5936 scalar_dest = gimple_assign_lhs (current_stmt);
5937 }
5938
5939 phis.create (3);
5940 /* Find the loop-closed-use at the loop exit of the original scalar
5941 result. (The reduction result is expected to have two immediate uses -
5942 one at the latch block, and one at the loop exit). */
5943 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5944 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5945 && !is_gimple_debug (USE_STMT (use_p)))
5946 phis.safe_push (USE_STMT (use_p));
5947
5948 /* While we expect to have found an exit_phi because of loop-closed-ssa
5949 form, we can end up without one if the scalar cycle is dead. */
5950
5951 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5952 {
5953 if (outer_loop)
5954 {
5955 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5956 gphi *vect_phi;
5957
5958 /* FORNOW. Currently not supporting the case that an inner-loop
5959 reduction is not used in the outer-loop (but only outside the
5960 outer-loop), unless it is double reduction. */
5961 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5962 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5963 || double_reduc);
5964
5965 if (double_reduc)
5966 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5967 else
5968 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5969 if (!double_reduc
5970 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5971 != vect_double_reduction_def)
5972 continue;
5973
5974 /* Handle double reduction:
5975
5976 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5977 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5978 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5979 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5980
5981 At that point the regular reduction (stmt2 and stmt3) is
5982 already vectorized, as well as the exit phi node, stmt4.
5983 Here we vectorize the phi node of double reduction, stmt1, and
5984 update all relevant statements. */
5985
5986 /* Go through all the uses of s2 to find double reduction phi
5987 node, i.e., stmt1 above. */
5988 orig_name = PHI_RESULT (exit_phi);
5989 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5990 {
5991 stmt_vec_info use_stmt_vinfo;
5992 stmt_vec_info new_phi_vinfo;
5993 tree vect_phi_init, preheader_arg, vect_phi_res;
5994 basic_block bb = gimple_bb (use_stmt);
5995 gimple *use;
5996
5997 /* Check that USE_STMT is really double reduction phi
5998 node. */
5999 if (gimple_code (use_stmt) != GIMPLE_PHI
6000 || gimple_phi_num_args (use_stmt) != 2
6001 || bb->loop_father != outer_loop)
6002 continue;
6003 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6004 if (!use_stmt_vinfo
6005 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6006 != vect_double_reduction_def)
6007 continue;
6008
6009 /* Create vector phi node for double reduction:
6010 vs1 = phi <vs0, vs2>
6011 vs1 was created previously in this function by a call to
6012 vect_get_vec_def_for_operand and is stored in
6013 vec_initial_def;
6014 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6015 vs0 is created here. */
6016
6017 /* Create vector phi node. */
6018 vect_phi = create_phi_node (vec_initial_def, bb);
6019 new_phi_vinfo = new_stmt_vec_info (vect_phi,
6020 loop_vec_info_for_loop (outer_loop));
6021 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6022
6023 /* Create vs0 - initial def of the double reduction phi. */
6024 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6025 loop_preheader_edge (outer_loop));
6026 vect_phi_init = get_initial_def_for_reduction
6027 (stmt, preheader_arg, NULL);
6028
6029 /* Update phi node arguments with vs0 and vs2. */
6030 add_phi_arg (vect_phi, vect_phi_init,
6031 loop_preheader_edge (outer_loop),
6032 UNKNOWN_LOCATION);
6033 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6034 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6035 if (dump_enabled_p ())
6036 {
6037 dump_printf_loc (MSG_NOTE, vect_location,
6038 "created double reduction phi node: ");
6039 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6040 }
6041
6042 vect_phi_res = PHI_RESULT (vect_phi);
6043
6044 /* Replace the use, i.e., set the correct vs1 in the regular
6045 reduction phi node. FORNOW, NCOPIES is always 1, so the
6046 loop is redundant. */
6047 use = reduction_phi;
6048 for (j = 0; j < ncopies; j++)
6049 {
6050 edge pr_edge = loop_preheader_edge (loop);
6051 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6052 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6053 }
6054 }
6055 }
6056 }
6057
6058 phis.release ();
6059 if (nested_in_vect_loop)
6060 {
6061 if (double_reduc)
6062 loop = outer_loop;
6063 else
6064 continue;
6065 }
6066
6067 phis.create (3);
6068 /* Find the loop-closed-use at the loop exit of the original scalar
6069 result. (The reduction result is expected to have two immediate uses,
6070 one at the latch block, and one at the loop exit). For double
6071 reductions we are looking for exit phis of the outer loop. */
6072 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6073 {
6074 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6075 {
6076 if (!is_gimple_debug (USE_STMT (use_p)))
6077 phis.safe_push (USE_STMT (use_p));
6078 }
6079 else
6080 {
6081 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6082 {
6083 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6084
6085 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6086 {
6087 if (!flow_bb_inside_loop_p (loop,
6088 gimple_bb (USE_STMT (phi_use_p)))
6089 && !is_gimple_debug (USE_STMT (phi_use_p)))
6090 phis.safe_push (USE_STMT (phi_use_p));
6091 }
6092 }
6093 }
6094 }
6095
6096 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6097 {
6098 /* Replace the uses: */
6099 orig_name = PHI_RESULT (exit_phi);
6100 scalar_result = scalar_results[k];
6101 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6102 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6103 SET_USE (use_p, scalar_result);
6104 }
6105
6106 phis.release ();
6107 }
6108 }
6109
6110 /* Return a vector of type VECTYPE that is equal to the vector select
6111 operation "MASK ? VEC : IDENTITY". Insert the select statements
6112 before GSI. */
6113
6114 static tree
6115 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6116 tree vec, tree identity)
6117 {
6118 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6119 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6120 mask, vec, identity);
6121 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6122 return cond;
6123 }
6124
6125 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6126 order, starting with LHS. Insert the extraction statements before GSI and
6127 associate the new scalar SSA names with variable SCALAR_DEST.
6128 Return the SSA name for the result. */
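/* For example (illustrative), for CODE == PLUS_EXPR, LHS == acc and a
   four-element VECTOR_RHS v this emits the chain

     s0 = BIT_FIELD_REF <v, sz, 0>;      acc1 = acc  + s0;
     s1 = BIT_FIELD_REF <v, sz, sz>;     acc2 = acc1 + s1;
     s2 = BIT_FIELD_REF <v, sz, 2*sz>;   acc3 = acc2 + s2;
     s3 = BIT_FIELD_REF <v, sz, 3*sz>;   acc4 = acc3 + s3;

   and returns acc4, preserving the left-to-right evaluation order that an
   in-order reduction requires.  */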
6129
6130 static tree
6131 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6132 tree_code code, tree lhs, tree vector_rhs)
6133 {
6134 tree vectype = TREE_TYPE (vector_rhs);
6135 tree scalar_type = TREE_TYPE (vectype);
6136 tree bitsize = TYPE_SIZE (scalar_type);
6137 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6138 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6139
6140 for (unsigned HOST_WIDE_INT bit_offset = 0;
6141 bit_offset < vec_size_in_bits;
6142 bit_offset += element_bitsize)
6143 {
6144 tree bitpos = bitsize_int (bit_offset);
6145 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6146 bitsize, bitpos);
6147
6148 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6149 rhs = make_ssa_name (scalar_dest, stmt);
6150 gimple_assign_set_lhs (stmt, rhs);
6151 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6152
6153 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6154 tree new_name = make_ssa_name (scalar_dest, stmt);
6155 gimple_assign_set_lhs (stmt, new_name);
6156 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6157 lhs = new_name;
6158 }
6159 return lhs;
6160 }
6161
6162 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6163 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6164 statement. CODE is the operation performed by STMT and OPS are
6165 its scalar operands. REDUC_INDEX is the index of the operand in
6166 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6167 implements in-order reduction, or IFN_LAST if we should open-code it.
6168 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6169 that should be used to control the operation in a fully-masked loop. */
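/* For example (illustrative), this path handles something like

     double r = 0.0;
     for (int i = 0; i < n; ++i)
       r += a[i];

   when the additions may not be reassociated, so the vector elements have
   to be accumulated in their original order, either through an in-order
   reduction internal function if the target provides one or through the
   open-coded expansion above.  */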
6170
6171 static bool
6172 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6173 gimple **vec_stmt, slp_tree slp_node,
6174 gimple *reduc_def_stmt,
6175 tree_code code, internal_fn reduc_fn,
6176 tree ops[3], tree vectype_in,
6177 int reduc_index, vec_loop_masks *masks)
6178 {
6179 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6180 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6181 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6182 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6183 gimple *new_stmt = NULL;
6184
6185 int ncopies;
6186 if (slp_node)
6187 ncopies = 1;
6188 else
6189 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6190
6191 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6192 gcc_assert (ncopies == 1);
6193 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6194 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6195 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6196 == FOLD_LEFT_REDUCTION);
6197
6198 if (slp_node)
6199 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6200 TYPE_VECTOR_SUBPARTS (vectype_in)));
6201
6202 tree op0 = ops[1 - reduc_index];
6203
6204 int group_size = 1;
6205 gimple *scalar_dest_def;
6206 auto_vec<tree> vec_oprnds0;
6207 if (slp_node)
6208 {
6209 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6210 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6211 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6212 }
6213 else
6214 {
6215 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6216 vec_oprnds0.create (1);
6217 vec_oprnds0.quick_push (loop_vec_def0);
6218 scalar_dest_def = stmt;
6219 }
6220
6221 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6222 tree scalar_type = TREE_TYPE (scalar_dest);
6223 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6224
6225 int vec_num = vec_oprnds0.length ();
6226 gcc_assert (vec_num == 1 || slp_node);
6227 tree vec_elem_type = TREE_TYPE (vectype_out);
6228 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6229
6230 tree vector_identity = NULL_TREE;
6231 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6232 vector_identity = build_zero_cst (vectype_out);
6233
6234 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6235 int i;
6236 tree def0;
6237 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6238 {
6239 tree mask = NULL_TREE;
6240 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6241 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6242
6243 /* Handle MINUS by adding the negative. */
6244 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6245 {
6246 tree negated = make_ssa_name (vectype_out);
6247 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6248 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6249 def0 = negated;
6250 }
6251
6252 if (mask)
6253 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6254 vector_identity);
6255
6256 /* On the first iteration the input is simply the scalar phi
6257 result, and for subsequent iterations it is the output of
6258 the preceding operation. */
6259 if (reduc_fn != IFN_LAST)
6260 {
6261 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6262 /* For chained SLP reductions the output of the previous reduction
6263 operation serves as the input of the next. For the final statement
6264 the output cannot be a temporary - we reuse the original
6265 scalar destination of the last statement. */
6266 if (i != vec_num - 1)
6267 {
6268 gimple_set_lhs (new_stmt, scalar_dest_var);
6269 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6270 gimple_set_lhs (new_stmt, reduc_var);
6271 }
6272 }
6273 else
6274 {
6275 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6276 reduc_var, def0);
6277 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6278 /* Remove the statement, so that we can use the same code paths
6279 as for statements that we've just created. */
6280 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6281 gsi_remove (&tmp_gsi, true);
6282 }
6283
6284 if (i == vec_num - 1)
6285 {
6286 gimple_set_lhs (new_stmt, scalar_dest);
6287 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6288 }
6289 else
6290 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6291
6292 if (slp_node)
6293 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6294 }
6295
6296 if (!slp_node)
6297 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6298
6299 return true;
6300 }
6301
6302 /* Function is_nonwrapping_integer_induction.
6303
6304 Check that STMT (which is part of loop LOOP) is an increasing
6305 integer induction that cannot overflow. */
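/* E.g. (illustrative), with BASE == 0, STEP == 4 and at most 1000 loop
   iterations the induction reaches at most 4000, which easily fits the
   precision of a 32-bit type, so the checks below succeed.  */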
6306
6307 static bool
6308 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6309 {
6310 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6311 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6312 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6313 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6314 widest_int ni, max_loop_value, lhs_max;
6315 bool overflow = false;
6316
6317 /* Make sure the loop is integer based. */
6318 if (TREE_CODE (base) != INTEGER_CST
6319 || TREE_CODE (step) != INTEGER_CST)
6320 return false;
6321
6322 /* Check that the max size of the loop will not wrap. */
6323
6324 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6325 return true;
6326
6327 if (! max_stmt_executions (loop, &ni))
6328 return false;
6329
6330 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6331 &overflow);
6332 if (overflow)
6333 return false;
6334
6335 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6336 TYPE_SIGN (lhs_type), &overflow);
6337 if (overflow)
6338 return false;
6339
6340 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6341 <= TYPE_PRECISION (lhs_type));
6342 }
6343
6344 /* Function vectorizable_reduction.
6345
6346 Check if STMT performs a reduction operation that can be vectorized.
6347 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6348 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6349 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6350
6351 This function also handles reduction idioms (patterns) that have been
6352 recognized in advance during vect_pattern_recog. In this case, STMT may be
6353 of this form:
6354 X = pattern_expr (arg0, arg1, ..., X)
6355 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6356 sequence that had been detected and replaced by the pattern-stmt (STMT).
6357
6358 This function also handles reduction of condition expressions, for example:
6359 for (int i = 0; i < N; i++)
6360 if (a[i] < value)
6361 last = a[i];
6362 This is handled by vectorizing the loop and creating an additional vector
6363 containing the loop indexes for which "a[i] < value" was true. In the
6364 function epilogue this is reduced to a single max value and then used to
6365 index into the vector of results.
6366
6367 In some cases of reduction patterns, the type of the reduction variable X is
6368 different than the type of the other arguments of STMT.
6369 In such cases, the vectype that is used when transforming STMT into a vector
6370 stmt is different than the vectype that is used to determine the
6371 vectorization factor, because it consists of a different number of elements
6372 than the actual number of elements that are being operated upon in parallel.
6373
6374 For example, consider an accumulation of shorts into an int accumulator.
6375 On some targets it's possible to vectorize this pattern operating on 8
6376 shorts at a time (hence, the vectype for purposes of determining the
6377 vectorization factor should be V8HI); on the other hand, the vectype that
6378 is used to create the vector form is actually V4SI (the type of the result).
6379
6380 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6381 indicates what is the actual level of parallelism (V8HI in the example), so
6382 that the right vectorization factor would be derived. This vectype
6383 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6384 be used to create the vectorized stmt. The right vectype for the vectorized
6385 stmt is obtained from the type of the result X:
6386 get_vectype_for_scalar_type (TREE_TYPE (X))
6387
6388 This means that, contrary to "regular" reductions (or "regular" stmts in
6389 general), the following equation:
6390 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6391 does *NOT* necessarily hold for reduction patterns. */
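/* For example (illustrative), the widening situation described above arises
   for

     short a[N];  int sum = 0;
     for (int i = 0; i < N; ++i)
       sum += a[i];

   where the vectype recorded on the pattern stmt is V8HI (it determines the
   vectorization factor) while the vectorized statement itself produces V4SI
   values.  */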
6392
6393 bool
6394 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6395 gimple **vec_stmt, slp_tree slp_node,
6396 slp_instance slp_node_instance)
6397 {
6398 tree vec_dest;
6399 tree scalar_dest;
6400 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6401 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6402 tree vectype_in = NULL_TREE;
6403 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6404 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6405 enum tree_code code, orig_code;
6406 internal_fn reduc_fn;
6407 machine_mode vec_mode;
6408 int op_type;
6409 optab optab;
6410 tree new_temp = NULL_TREE;
6411 gimple *def_stmt;
6412 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6413 gimple *cond_reduc_def_stmt = NULL;
6414 enum tree_code cond_reduc_op_code = ERROR_MARK;
6415 tree scalar_type;
6416 bool is_simple_use;
6417 gimple *orig_stmt;
6418 stmt_vec_info orig_stmt_info = NULL;
6419 int i;
6420 int ncopies;
6421 int epilog_copies;
6422 stmt_vec_info prev_stmt_info, prev_phi_info;
6423 bool single_defuse_cycle = false;
6424 gimple *new_stmt = NULL;
6425 int j;
6426 tree ops[3];
6427 enum vect_def_type dts[3];
6428 bool nested_cycle = false, found_nested_cycle_def = false;
6429 bool double_reduc = false;
6430 basic_block def_bb;
6431 struct loop * def_stmt_loop, *outer_loop = NULL;
6432 tree def_arg;
6433 gimple *def_arg_stmt;
6434 auto_vec<tree> vec_oprnds0;
6435 auto_vec<tree> vec_oprnds1;
6436 auto_vec<tree> vec_oprnds2;
6437 auto_vec<tree> vect_defs;
6438 auto_vec<gimple *> phis;
6439 int vec_num;
6440 tree def0, tem;
6441 bool first_p = true;
6442 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6443 tree cond_reduc_val = NULL_TREE;
6444
6445 /* Make sure it was already recognized as a reduction computation. */
6446 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6447 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6448 return false;
6449
6450 if (nested_in_vect_loop_p (loop, stmt))
6451 {
6452 outer_loop = loop;
6453 loop = loop->inner;
6454 nested_cycle = true;
6455 }
6456
6457 /* In case of reduction chain we switch to the first stmt in the chain, but
6458 we don't update STMT_INFO, since only the last stmt is marked as reduction
6459 and has reduction properties. */
6460 if (GROUP_FIRST_ELEMENT (stmt_info)
6461 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6462 {
6463 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6464 first_p = false;
6465 }
6466
6467 if (gimple_code (stmt) == GIMPLE_PHI)
6468 {
6469 /* Analysis is fully done on the reduction stmt invocation. */
6470 if (! vec_stmt)
6471 {
6472 if (slp_node)
6473 slp_node_instance->reduc_phis = slp_node;
6474
6475 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6476 return true;
6477 }
6478
6479 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6480 /* Leave the scalar phi in place. Note that checking
6481 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6482 for reductions involving a single statement. */
6483 return true;
6484
6485 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6486 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6487 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6488
6489 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6490 == EXTRACT_LAST_REDUCTION)
6491 /* Leave the scalar phi in place. */
6492 return true;
6493
6494 gcc_assert (is_gimple_assign (reduc_stmt));
6495 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6496 {
6497 tree op = gimple_op (reduc_stmt, k);
6498 if (op == gimple_phi_result (stmt))
6499 continue;
6500 if (k == 1
6501 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6502 continue;
6503 if (!vectype_in
6504 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6505 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6506 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6507 break;
6508 }
6509 gcc_assert (vectype_in);
6510
6511 if (slp_node)
6512 ncopies = 1;
6513 else
6514 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6515
6516 use_operand_p use_p;
6517 gimple *use_stmt;
6518 if (ncopies > 1
6519 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6520 <= vect_used_only_live)
6521 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6522 && (use_stmt == reduc_stmt
6523 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6524 == reduc_stmt)))
6525 single_defuse_cycle = true;
6526
6527 /* Create the destination vector */
6528 scalar_dest = gimple_assign_lhs (reduc_stmt);
6529 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6530
6531 if (slp_node)
6532 /* The size vect_schedule_slp_instance computes is off for us. */
6533 vec_num = vect_get_num_vectors
6534 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6535 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6536 vectype_in);
6537 else
6538 vec_num = 1;
6539
6540 /* Generate the reduction PHIs upfront. */
6541 prev_phi_info = NULL;
6542 for (j = 0; j < ncopies; j++)
6543 {
6544 if (j == 0 || !single_defuse_cycle)
6545 {
6546 for (i = 0; i < vec_num; i++)
6547 {
6548 /* Create the reduction-phi that defines the reduction
6549 operand. */
6550 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6551 set_vinfo_for_stmt (new_phi,
6552 new_stmt_vec_info (new_phi, loop_vinfo));
6553
6554 if (slp_node)
6555 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6556 else
6557 {
6558 if (j == 0)
6559 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6560 else
6561 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6562 prev_phi_info = vinfo_for_stmt (new_phi);
6563 }
6564 }
6565 }
6566 }
6567
6568 return true;
6569 }
6570
6571 /* 1. Is vectorizable reduction? */
6572 /* Not supportable if the reduction variable is used in the loop, unless
6573 it's a reduction chain. */
6574 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6575 && !GROUP_FIRST_ELEMENT (stmt_info))
6576 return false;
6577
6578 /* Reductions that are not used even in an enclosing outer-loop,
6579 are expected to be "live" (used out of the loop). */
6580 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6581 && !STMT_VINFO_LIVE_P (stmt_info))
6582 return false;
6583
6584 /* 2. Has this been recognized as a reduction pattern?
6585
6586 Check if STMT represents a pattern that has been recognized
6587 in earlier analysis stages. For stmts that represent a pattern,
6588 the STMT_VINFO_RELATED_STMT field records the last stmt in
6589 the original sequence that constitutes the pattern. */
6590
6591 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6592 if (orig_stmt)
6593 {
6594 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6595 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6596 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6597 }
6598
6599 /* 3. Check the operands of the operation. The first operands are defined
6600 inside the loop body. The last operand is the reduction variable,
6601 which is defined by the loop-header-phi. */
6602
6603 gcc_assert (is_gimple_assign (stmt));
6604
6605 /* Flatten RHS. */
6606 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6607 {
6608 case GIMPLE_BINARY_RHS:
6609 code = gimple_assign_rhs_code (stmt);
6610 op_type = TREE_CODE_LENGTH (code);
6611 gcc_assert (op_type == binary_op);
6612 ops[0] = gimple_assign_rhs1 (stmt);
6613 ops[1] = gimple_assign_rhs2 (stmt);
6614 break;
6615
6616 case GIMPLE_TERNARY_RHS:
6617 code = gimple_assign_rhs_code (stmt);
6618 op_type = TREE_CODE_LENGTH (code);
6619 gcc_assert (op_type == ternary_op);
6620 ops[0] = gimple_assign_rhs1 (stmt);
6621 ops[1] = gimple_assign_rhs2 (stmt);
6622 ops[2] = gimple_assign_rhs3 (stmt);
6623 break;
6624
6625 case GIMPLE_UNARY_RHS:
6626 return false;
6627
6628 default:
6629 gcc_unreachable ();
6630 }
6631
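/* COND_EXPR reductions are not supported with SLP.  */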
6632 if (code == COND_EXPR && slp_node)
6633 return false;
6634
6635 scalar_dest = gimple_assign_lhs (stmt);
6636 scalar_type = TREE_TYPE (scalar_dest);
6637 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6638 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6639 return false;
6640
6641 /* Do not try to vectorize bit-precision reductions. */
6642 if (!type_has_mode_precision_p (scalar_type))
6643 return false;
6644
6645 /* All uses but the last are expected to be defined in the loop.
6646 The last use is the reduction variable. In case of nested cycle this
6647 assumption is not true: we use reduc_index to record the index of the
6648 reduction variable. */
6649 gimple *reduc_def_stmt = NULL;
6650 int reduc_index = -1;
6651 for (i = 0; i < op_type; i++)
6652 {
6653 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6654 if (i == 0 && code == COND_EXPR)
6655 continue;
6656
6657 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6658 &def_stmt, &dts[i], &tem);
6659 dt = dts[i];
6660 gcc_assert (is_simple_use);
6661 if (dt == vect_reduction_def)
6662 {
6663 reduc_def_stmt = def_stmt;
6664 reduc_index = i;
6665 continue;
6666 }
6667 else if (tem)
6668 {
6669 /* To properly compute ncopies we are interested in the widest
6670 input type in case we're looking at a widening accumulation. */
6671 if (!vectype_in
6672 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6673 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6674 vectype_in = tem;
6675 }
6676
6677 if (dt != vect_internal_def
6678 && dt != vect_external_def
6679 && dt != vect_constant_def
6680 && dt != vect_induction_def
6681 && !(dt == vect_nested_cycle && nested_cycle))
6682 return false;
6683
6684 if (dt == vect_nested_cycle)
6685 {
6686 found_nested_cycle_def = true;
6687 reduc_def_stmt = def_stmt;
6688 reduc_index = i;
6689 }
6690
6691 if (i == 1 && code == COND_EXPR)
6692 {
6693 /* Record how the value of the COND_EXPR is defined.  */
6694 if (dt == vect_constant_def)
6695 {
6696 cond_reduc_dt = dt;
6697 cond_reduc_val = ops[i];
6698 }
6699 if (dt == vect_induction_def
6700 && def_stmt != NULL
6701 && is_nonwrapping_integer_induction (def_stmt, loop))
6702 {
6703 cond_reduc_dt = dt;
6704 cond_reduc_def_stmt = def_stmt;
6705 }
6706 }
6707 }
6708
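/* If none of the operands determined an input vector type, use the
   output vector type.  */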
6709 if (!vectype_in)
6710 vectype_in = vectype_out;
6711
6712 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6713 directly used in stmt. */
6714 if (reduc_index == -1)
6715 {
6716 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6717 {
6718 if (dump_enabled_p ())
6719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6720 "in-order reduction chain without SLP.\n");
6721 return false;
6722 }
6723
6724 if (orig_stmt)
6725 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6726 else
6727 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6728 }
6729
6730 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6731 return false;
6732
6733 if (!(reduc_index == -1
6734 || dts[reduc_index] == vect_reduction_def
6735 || dts[reduc_index] == vect_nested_cycle
6736 || ((dts[reduc_index] == vect_internal_def
6737 || dts[reduc_index] == vect_external_def
6738 || dts[reduc_index] == vect_constant_def
6739 || dts[reduc_index] == vect_induction_def)
6740 && nested_cycle && found_nested_cycle_def)))
6741 {
6742 /* For pattern recognized stmts, orig_stmt might be a reduction,
6743 but some helper statements for the pattern might not, or
6744 might be COND_EXPRs with reduction uses in the condition. */
6745 gcc_assert (orig_stmt);
6746 return false;
6747 }
6748
6749 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6750 enum vect_reduction_type v_reduc_type
6751 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6752 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6753
6754 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6755 /* If we have a condition reduction, see if we can simplify it further. */
6756 if (v_reduc_type == COND_REDUCTION)
6757 {
6758 /* TODO: We can't yet handle reduction chains, since we need to treat
6759 each COND_EXPR in the chain specially, not just the last one.
6760 E.g. for:
6761
6762 x_1 = PHI <x_3, ...>
6763 x_2 = a_2 ? ... : x_1;
6764 x_3 = a_3 ? ... : x_2;
6765
6766 we're interested in the last element in x_3 for which a_2 || a_3
6767 is true, whereas the current reduction chain handling would
6768 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6769 as a reduction operation. */
6770 if (reduc_index == -1)
6771 {
6772 if (dump_enabled_p ())
6773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6774 "conditional reduction chains not supported\n");
6775 return false;
6776 }
6777
6778 /* vect_is_simple_reduction ensured that operand 2 is the
6779 loop-carried operand. */
6780 gcc_assert (reduc_index == 2);
6781
6782 /* Loop peeling modifies the initial value of the reduction PHI, which
6783 makes the reduction stmt to be transformed different from the
6784 original stmt analyzed. We therefore record the reduction code for
6785 CONST_COND_REDUCTION type reductions at the analysis stage, so that
6786 it can be used directly at the transform stage. */
6787 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6788 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6789 {
6790 /* Also set the reduction type to CONST_COND_REDUCTION. */
6791 gcc_assert (cond_reduc_dt == vect_constant_def);
6792 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6793 }
6794 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6795 vectype_in, OPTIMIZE_FOR_SPEED))
6796 {
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "optimizing condition reduction with"
6800 " FOLD_EXTRACT_LAST.\n");
6801 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6802 }
6803 else if (cond_reduc_dt == vect_induction_def)
6804 {
6805 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6806 tree base
6807 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6808 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6809
6810 gcc_assert (TREE_CODE (base) == INTEGER_CST
6811 && TREE_CODE (step) == INTEGER_CST);
6812 cond_reduc_val = NULL_TREE;
6813 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6814 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6815 ;
6816 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6817 above base; punt if base is the minimum value of the type for
6818 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6819 else if (tree_int_cst_sgn (step) == -1)
6820 {
6821 cond_reduc_op_code = MIN_EXPR;
6822 if (tree_int_cst_sgn (base) == -1)
6823 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6824 else if (tree_int_cst_lt (base,
6825 TYPE_MAX_VALUE (TREE_TYPE (base))))
6826 cond_reduc_val
6827 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6828 }
6829 else
6830 {
6831 cond_reduc_op_code = MAX_EXPR;
6832 if (tree_int_cst_sgn (base) == 1)
6833 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6834 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6835 base))
6836 cond_reduc_val
6837 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6838 }
6839 if (cond_reduc_val)
6840 {
6841 if (dump_enabled_p ())
6842 dump_printf_loc (MSG_NOTE, vect_location,
6843 "condition expression based on "
6844 "integer induction.\n");
6845 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6846 = INTEGER_INDUC_COND_REDUCTION;
6847 }
6848 }
6849 else if (cond_reduc_dt == vect_constant_def)
6850 {
6851 enum vect_def_type cond_initial_dt;
6852 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6853 tree cond_initial_val
6854 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6855
6856 gcc_assert (cond_reduc_val != NULL_TREE);
6857 vect_is_simple_use (cond_initial_val, loop_vinfo,
6858 &def_stmt, &cond_initial_dt);
6859 if (cond_initial_dt == vect_constant_def
6860 && types_compatible_p (TREE_TYPE (cond_initial_val),
6861 TREE_TYPE (cond_reduc_val)))
6862 {
6863 tree e = fold_binary (LE_EXPR, boolean_type_node,
6864 cond_initial_val, cond_reduc_val);
6865 if (e && (integer_onep (e) || integer_zerop (e)))
6866 {
6867 if (dump_enabled_p ())
6868 dump_printf_loc (MSG_NOTE, vect_location,
6869 "condition expression based on "
6870 "compile time constant.\n");
6871 /* Record reduction code at analysis stage. */
6872 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6873 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6874 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6875 = CONST_COND_REDUCTION;
6876 }
6877 }
6878 }
6879 }
6880
6881 if (orig_stmt)
6882 gcc_assert (tmp == orig_stmt
6883 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6884 else
6885 /* We changed STMT to be the first stmt in reduction chain, hence we
6886 check that in this case the first element in the chain is STMT. */
6887 gcc_assert (stmt == tmp
6888 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6889
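/* The reduction PHI result must not itself be live (used outside the
   loop); that case is not handled here.  */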
6890 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6891 return false;
6892
6893 if (slp_node)
6894 ncopies = 1;
6895 else
6896 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6897
6898 gcc_assert (ncopies >= 1);
6899
6900 vec_mode = TYPE_MODE (vectype_in);
6901 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6902
6903 if (code == COND_EXPR)
6904 {
6905 /* Only call during the analysis stage, otherwise we'll lose
6906 STMT_VINFO_TYPE. */
6907 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6908 ops[reduc_index], 0, NULL))
6909 {
6910 if (dump_enabled_p ())
6911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6912 "unsupported condition in reduction\n");
6913 return false;
6914 }
6915 }
6916 else
6917 {
6918 /* 4. Supportable by target? */
6919
6920 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6921 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6922 {
6923 /* Shifts and rotates are only supported by vectorizable_shifts,
6924 not vectorizable_reduction. */
6925 if (dump_enabled_p ())
6926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6927 "unsupported shift or rotation.\n");
6928 return false;
6929 }
6930
6931 /* 4.1. check support for the operation in the loop */
6932 optab = optab_for_tree_code (code, vectype_in, optab_default);
6933 if (!optab)
6934 {
6935 if (dump_enabled_p ())
6936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6937 "no optab.\n");
6938
6939 return false;
6940 }
6941
6942 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6943 {
6944 if (dump_enabled_p ())
6945 dump_printf (MSG_NOTE, "op not supported by target.\n");
6946
6947 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6948 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6949 return false;
6950
6951 if (dump_enabled_p ())
6952 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6953 }
6954
6955 /* Worthwhile without SIMD support? */
6956 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6957 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6958 {
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6961 "not worthwhile without SIMD support.\n");
6962
6963 return false;
6964 }
6965 }
6966
6967 /* 4.2. Check support for the epilog operation.
6968
6969 If STMT represents a reduction pattern, then the type of the
6970 reduction variable may be different than the type of the rest
6971 of the arguments. For example, consider the case of accumulation
6972 of shorts into an int accumulator; The original code:
6973 S1: int_a = (int) short_a;
6974 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6975
6976 was replaced with:
6977 STMT: int_acc = widen_sum <short_a, int_acc>
6978
6979 This means that:
6980 1. The tree-code that is used to create the vector operation in the
6981 epilog code (that reduces the partial results) is not the
6982 tree-code of STMT, but is rather the tree-code of the original
6983 stmt from the pattern that STMT is replacing. I.e, in the example
6984 above we want to use 'widen_sum' in the loop, but 'plus' in the
6985 epilog.
6986 2. The type (mode) we use to check available target support
6987 for the vector operation to be created in the *epilog*, is
6988 determined by the type of the reduction variable (in the example
6989 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6990 However the type (mode) we use to check available target support
6991 for the vector operation to be created *inside the loop*, is
6992 determined by the type of the other arguments to STMT (in the
6993 example we'd check this: optab_handler (widen_sum_optab,
6994 vect_short_mode)).
6995
6996 This is contrary to "regular" reductions, in which the types of all
6997 the arguments are the same as the type of the reduction variable.
6998 For "regular" reductions we can therefore use the same vector type
6999 (and also the same tree-code) when generating the epilog code and
7000 when generating the code inside the loop. */
7001
7002 vect_reduction_type reduction_type
7003 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7004 if (orig_stmt
7005 && (reduction_type == TREE_CODE_REDUCTION
7006 || reduction_type == FOLD_LEFT_REDUCTION))
7007 {
7008 /* This is a reduction pattern: get the vectype from the type of the
7009 reduction variable, and get the tree-code from orig_stmt. */
7010 orig_code = gimple_assign_rhs_code (orig_stmt);
7011 gcc_assert (vectype_out);
7012 vec_mode = TYPE_MODE (vectype_out);
7013 }
7014 else
7015 {
7016 /* Regular reduction: the same vectype and tree-code as used for
7017 the vector code inside the loop can be used for the epilog code. */
7018 orig_code = code;
7019
7020 if (code == MINUS_EXPR)
7021 orig_code = PLUS_EXPR;
7022
7023 /* For simple condition reductions, replace with the actual expression
7024 we want to base our reduction around. */
7025 if (reduction_type == CONST_COND_REDUCTION)
7026 {
7027 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7028 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7029 }
7030 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7031 orig_code = cond_reduc_op_code;
7032 }
7033
7034 if (nested_cycle)
7035 {
7036 def_bb = gimple_bb (reduc_def_stmt);
7037 def_stmt_loop = def_bb->loop_father;
7038 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7039 loop_preheader_edge (def_stmt_loop));
7040 if (TREE_CODE (def_arg) == SSA_NAME
7041 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7042 && gimple_code (def_arg_stmt) == GIMPLE_PHI
7043 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7044 && vinfo_for_stmt (def_arg_stmt)
7045 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7046 == vect_double_reduction_def)
7047 double_reduc = true;
7048 }
7049
7050 reduc_fn = IFN_LAST;
7051
7052 if (reduction_type == TREE_CODE_REDUCTION
7053 || reduction_type == FOLD_LEFT_REDUCTION
7054 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7055 || reduction_type == CONST_COND_REDUCTION)
7056 {
7057 if (reduction_type == FOLD_LEFT_REDUCTION
7058 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7059 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7060 {
7061 if (reduc_fn != IFN_LAST
7062 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7063 OPTIMIZE_FOR_SPEED))
7064 {
7065 if (dump_enabled_p ())
7066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7067 "reduc op not supported by target.\n");
7068
7069 reduc_fn = IFN_LAST;
7070 }
7071 }
7072 else
7073 {
7074 if (!nested_cycle || double_reduc)
7075 {
7076 if (dump_enabled_p ())
7077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7078 "no reduc code for scalar code.\n");
7079
7080 return false;
7081 }
7082 }
7083 }
7084 else if (reduction_type == COND_REDUCTION)
7085 {
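/* A COND_REDUCTION tracks the index of the last matching iteration in
   a separate vector whose elements are unsigned values of the same
   precision as the data elements; the epilogue reduces that vector
   with REDUC_MAX if the target supports it.  */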
7086 int scalar_precision
7087 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7088 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7089 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7090 nunits_out);
7091
7092 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7093 OPTIMIZE_FOR_SPEED))
7094 reduc_fn = IFN_REDUC_MAX;
7095 }
7096
7097 if (reduction_type != EXTRACT_LAST_REDUCTION
7098 && reduc_fn == IFN_LAST
7099 && !nunits_out.is_constant ())
7100 {
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 "missing target support for reduction on"
7104 " variable-length vectors.\n");
7105 return false;
7106 }
7107
7108 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7109 && ncopies > 1)
7110 {
7111 if (dump_enabled_p ())
7112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7113 "multiple types in double reduction or condition "
7114 "reduction.\n");
7115 return false;
7116 }
7117
7118 /* For SLP reductions, see if there is a neutral value we can use. */
7119 tree neutral_op = NULL_TREE;
7120 if (slp_node)
7121 neutral_op
7122 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7123 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7124
7125 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7126 {
7127 /* We can't support in-order reductions of code such as this:
7128
7129 for (int i = 0; i < n1; ++i)
7130 for (int j = 0; j < n2; ++j)
7131 l += a[j];
7132
7133 since GCC effectively transforms the loop when vectorizing:
7134
7135 for (int i = 0; i < n1 / VF; ++i)
7136 for (int j = 0; j < n2; ++j)
7137 for (int k = 0; k < VF; ++k)
7138 l += a[j];
7139
7140 which is a reassociation of the original operation. */
7141 if (dump_enabled_p ())
7142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143 "in-order double reduction not supported.\n");
7144
7145 return false;
7146 }
7147
7148 if (reduction_type == FOLD_LEFT_REDUCTION
7149 && slp_node
7150 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7151 {
7152 /* We cannot use in-order reductions in this case because there is
7153 an implicit reassociation of the operations involved. */
7154 if (dump_enabled_p ())
7155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7156 "in-order unchained SLP reductions not supported.\n");
7157 return false;
7158 }
7159
7160 /* For double reductions, and for SLP reductions with a neutral value,
7161 we construct a variable-length initial vector by loading a vector
7162 full of the neutral value and then shift-and-inserting the start
7163 values into the low-numbered elements. */
7164 if ((double_reduc || neutral_op)
7165 && !nunits_out.is_constant ()
7166 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7167 vectype_out, OPTIMIZE_FOR_SPEED))
7168 {
7169 if (dump_enabled_p ())
7170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7171 "reduction on variable-length vectors requires"
7172 " target support for a vector-shift-and-insert"
7173 " operation.\n");
7174 return false;
7175 }
7176
7177 /* Check extra constraints for variable-length unchained SLP reductions. */
7178 if (STMT_SLP_TYPE (stmt_info)
7179 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7180 && !nunits_out.is_constant ())
7181 {
7182 /* We checked above that we could build the initial vector when
7183 there's a neutral element value. Check here for the case in
7184 which each SLP statement has its own initial value and in which
7185 that value needs to be repeated for every instance of the
7186 statement within the initial vector. */
7187 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7188 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7189 if (!neutral_op
7190 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7191 {
7192 if (dump_enabled_p ())
7193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7194 "unsupported form of SLP reduction for"
7195 " variable-length vectors: cannot build"
7196 " initial vector.\n");
7197 return false;
7198 }
7199 /* The epilogue code relies on the number of elements being a multiple
7200 of the group size. The duplicate-and-interleave approach to setting
7201 up the initial vector does too. */
7202 if (!multiple_p (nunits_out, group_size))
7203 {
7204 if (dump_enabled_p ())
7205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7206 "unsupported form of SLP reduction for"
7207 " variable-length vectors: the vector size"
7208 " is not a multiple of the number of results.\n");
7209 return false;
7210 }
7211 }
7212
7213 /* In case of widening multiplication by a constant, we update the type
7214 of the constant to be the type of the other operand. We check that the
7215 constant fits the type in the pattern recognition pass. */
7216 if (code == DOT_PROD_EXPR
7217 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7218 {
7219 if (TREE_CODE (ops[0]) == INTEGER_CST)
7220 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7221 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7222 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7223 else
7224 {
7225 if (dump_enabled_p ())
7226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7227 "invalid types in dot-prod\n");
7228
7229 return false;
7230 }
7231 }
7232
7233 if (reduction_type == COND_REDUCTION)
7234 {
7235 widest_int ni;
7236
7237 if (! max_loop_iterations (loop, &ni))
7238 {
7239 if (dump_enabled_p ())
7240 dump_printf_loc (MSG_NOTE, vect_location,
7241 "loop count not known, cannot create cond "
7242 "reduction.\n");
7243 return false;
7244 }
7245 /* Convert backedges to iterations. */
7246 ni += 1;
7247
7248 /* The additional index will be the same type as the condition. Check
7249 that the loop iteration count fits into this type less one (because
7250 we'll use up the zero slot for when there are no matches). */
7251 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7252 if (wi::geu_p (ni, wi::to_widest (max_index)))
7253 {
7254 if (dump_enabled_p ())
7255 dump_printf_loc (MSG_NOTE, vect_location,
7256 "loop size is greater than data size.\n");
7257 return false;
7258 }
7259 }
7260
7261 /* In case the vectorization factor (VF) is bigger than the number
7262 of elements that we can fit in a vectype (nunits), we have to generate
7263 more than one vector stmt - i.e - we need to "unroll" the
7264 vector stmt by a factor VF/nunits. For more details see documentation
7265 in vectorizable_operation. */
7266
7267 /* If the reduction is used in an outer loop we need to generate
7268 VF intermediate results, like so (e.g. for ncopies=2):
7269 r0 = phi (init, r0)
7270 r1 = phi (init, r1)
7271 r0 = x0 + r0;
7272 r1 = x1 + r1;
7273 (i.e. we generate VF results in 2 registers).
7274 In this case we have a separate def-use cycle for each copy, and therefore
7275 for each copy we get the vector def for the reduction variable from the
7276 respective phi node created for this copy.
7277
7278 Otherwise (the reduction is unused in the loop nest), we can combine
7279 together intermediate results, like so (e.g. for ncopies=2):
7280 r = phi (init, r)
7281 r = x0 + r;
7282 r = x1 + r;
7283 (i.e. we generate VF/2 results in a single register).
7284 In this case for each copy we get the vector def for the reduction variable
7285 from the vectorized reduction operation generated in the previous iteration.
7286
7287 This only works when we see both the reduction PHI and its only consumer
7288 in vectorizable_reduction and there are no intermediate stmts
7289 participating. */
7290 use_operand_p use_p;
7291 gimple *use_stmt;
7292 if (ncopies > 1
7293 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7294 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7295 && (use_stmt == stmt
7296 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7297 {
7298 single_defuse_cycle = true;
7299 epilog_copies = 1;
7300 }
7301 else
7302 epilog_copies = ncopies;
7303
7304 /* If the reduction stmt is one of the patterns that have lane
7305 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7306 if ((ncopies > 1
7307 && ! single_defuse_cycle)
7308 && (code == DOT_PROD_EXPR
7309 || code == WIDEN_SUM_EXPR
7310 || code == SAD_EXPR))
7311 {
7312 if (dump_enabled_p ())
7313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7314 "multi def-use cycle not possible for lane-reducing "
7315 "reduction operation\n");
7316 return false;
7317 }
7318
7319 if (slp_node)
7320 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7321 else
7322 vec_num = 1;
7323
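/* For a fully-masked loop we need a conditional form of the reduction
   operation so that inactive lanes do not contribute to the result.  */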
7324 internal_fn cond_fn = get_conditional_internal_fn (code);
7325 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7326
7327 if (!vec_stmt) /* transformation not required. */
7328 {
7329 if (first_p)
7330 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7331 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7332 {
7333 if (reduction_type != FOLD_LEFT_REDUCTION
7334 && (cond_fn == IFN_LAST
7335 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7336 OPTIMIZE_FOR_SPEED)))
7337 {
7338 if (dump_enabled_p ())
7339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7340 "can't use a fully-masked loop because no"
7341 " conditional operation is available.\n");
7342 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7343 }
7344 else if (reduc_index == -1)
7345 {
7346 if (dump_enabled_p ())
7347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7348 "can't use a fully-masked loop for chained"
7349 " reductions.\n");
7350 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7351 }
7352 else
7353 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7354 vectype_in);
7355 }
7356 if (dump_enabled_p ()
7357 && reduction_type == FOLD_LEFT_REDUCTION)
7358 dump_printf_loc (MSG_NOTE, vect_location,
7359 "using an in-order (fold-left) reduction.\n");
7360 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7361 return true;
7362 }
7363
7364 /* Transform. */
7365
7366 if (dump_enabled_p ())
7367 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7368
7369 /* FORNOW: Multiple types are not supported for condition. */
7370 if (code == COND_EXPR)
7371 gcc_assert (ncopies == 1);
7372
7373 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7374
7375 if (reduction_type == FOLD_LEFT_REDUCTION)
7376 return vectorize_fold_left_reduction
7377 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7378 reduc_fn, ops, vectype_in, reduc_index, masks);
7379
7380 if (reduction_type == EXTRACT_LAST_REDUCTION)
7381 {
7382 gcc_assert (!slp_node);
7383 return vectorizable_condition (stmt, gsi, vec_stmt,
7384 NULL, reduc_index, NULL);
7385 }
7386
7387 /* Create the destination vector */
7388 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7389
7390 prev_stmt_info = NULL;
7391 prev_phi_info = NULL;
7392 if (!slp_node)
7393 {
7394 vec_oprnds0.create (1);
7395 vec_oprnds1.create (1);
7396 if (op_type == ternary_op)
7397 vec_oprnds2.create (1);
7398 }
7399
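/* PHIS collects the reduction PHIs and VECT_DEFS the vector defs
   computed in the loop body; both feed the epilogue generation below.  */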
7400 phis.create (vec_num);
7401 vect_defs.create (vec_num);
7402 if (!slp_node)
7403 vect_defs.quick_push (NULL_TREE);
7404
7405 if (slp_node)
7406 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7407 else
7408 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7409
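/* Generate the vectorized reduction stmts, one copy at a time: the first
   copy takes the vector defs of the scalar operands, later copies chain
   from the defs of the previous copy (or from the new stmt itself in a
   single def-use cycle).  */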
7410 for (j = 0; j < ncopies; j++)
7411 {
7412 if (code == COND_EXPR)
7413 {
7414 gcc_assert (!slp_node);
7415 vectorizable_condition (stmt, gsi, vec_stmt,
7416 PHI_RESULT (phis[0]),
7417 reduc_index, NULL);
7418 /* Multiple types are not supported for condition. */
7419 break;
7420 }
7421
7422 /* Handle uses. */
7423 if (j == 0)
7424 {
7425 if (slp_node)
7426 {
7427 /* Get vec defs for all the operands except the reduction index,
7428 ensuring the ordering of the ops in the vector is kept. */
7429 auto_vec<tree, 3> slp_ops;
7430 auto_vec<vec<tree>, 3> vec_defs;
7431
7432 slp_ops.quick_push (ops[0]);
7433 slp_ops.quick_push (ops[1]);
7434 if (op_type == ternary_op)
7435 slp_ops.quick_push (ops[2]);
7436
7437 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7438
7439 vec_oprnds0.safe_splice (vec_defs[0]);
7440 vec_defs[0].release ();
7441 vec_oprnds1.safe_splice (vec_defs[1]);
7442 vec_defs[1].release ();
7443 if (op_type == ternary_op)
7444 {
7445 vec_oprnds2.safe_splice (vec_defs[2]);
7446 vec_defs[2].release ();
7447 }
7448 }
7449 else
7450 {
7451 vec_oprnds0.quick_push
7452 (vect_get_vec_def_for_operand (ops[0], stmt));
7453 vec_oprnds1.quick_push
7454 (vect_get_vec_def_for_operand (ops[1], stmt));
7455 if (op_type == ternary_op)
7456 vec_oprnds2.quick_push
7457 (vect_get_vec_def_for_operand (ops[2], stmt));
7458 }
7459 }
7460 else
7461 {
7462 if (!slp_node)
7463 {
7464 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7465
7466 if (single_defuse_cycle && reduc_index == 0)
7467 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7468 else
7469 vec_oprnds0[0]
7470 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7471 if (single_defuse_cycle && reduc_index == 1)
7472 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7473 else
7474 vec_oprnds1[0]
7475 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7476 if (op_type == ternary_op)
7477 {
7478 if (single_defuse_cycle && reduc_index == 2)
7479 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7480 else
7481 vec_oprnds2[0]
7482 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7483 }
7484 }
7485 }
7486
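/* Emit one vector reduction stmt per vector def of the first operand.
   In a fully-masked loop use the conditional internal function, keeping
   the accumulator as operand 0 so that inactive lanes retain the
   accumulator value.  */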
7487 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7488 {
7489 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7490 if (masked_loop_p)
7491 {
7492 /* Make sure that the reduction accumulator is vop[0]. */
7493 if (reduc_index == 1)
7494 {
7495 gcc_assert (commutative_tree_code (code));
7496 std::swap (vop[0], vop[1]);
7497 }
7498 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7499 vectype_in, i * ncopies + j);
7500 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7501 vop[0], vop[1]);
7502 new_temp = make_ssa_name (vec_dest, call);
7503 gimple_call_set_lhs (call, new_temp);
7504 gimple_call_set_nothrow (call, true);
7505 new_stmt = call;
7506 }
7507 else
7508 {
7509 if (op_type == ternary_op)
7510 vop[2] = vec_oprnds2[i];
7511
7512 new_temp = make_ssa_name (vec_dest, new_stmt);
7513 new_stmt = gimple_build_assign (new_temp, code,
7514 vop[0], vop[1], vop[2]);
7515 }
7516 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7517
7518 if (slp_node)
7519 {
7520 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7521 vect_defs.quick_push (new_temp);
7522 }
7523 else
7524 vect_defs[0] = new_temp;
7525 }
7526
7527 if (slp_node)
7528 continue;
7529
7530 if (j == 0)
7531 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7532 else
7533 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7534
7535 prev_stmt_info = vinfo_for_stmt (new_stmt);
7536 }
7537
7538 /* Finalize the reduction-phi (set its arguments) and create the
7539 epilog reduction code. */
7540 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7541 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7542
7543 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7544 epilog_copies, reduc_fn, phis,
7545 double_reduc, slp_node, slp_node_instance,
7546 cond_reduc_val, cond_reduc_op_code,
7547 neutral_op);
7548
7549 return true;
7550 }
7551
7552 /* Function vect_min_worthwhile_factor.
7553
7554 For a loop where we could vectorize the operation indicated by CODE,
7555 return the minimum vectorization factor that makes it worthwhile
7556 to use generic vectors. */
7557 static unsigned int
7558 vect_min_worthwhile_factor (enum tree_code code)
7559 {
7560 switch (code)
7561 {
7562 case PLUS_EXPR:
7563 case MINUS_EXPR:
7564 case NEGATE_EXPR:
7565 return 4;
7566
7567 case BIT_AND_EXPR:
7568 case BIT_IOR_EXPR:
7569 case BIT_XOR_EXPR:
7570 case BIT_NOT_EXPR:
7571 return 2;
7572
7573 default:
7574 return INT_MAX;
7575 }
7576 }
7577
7578 /* Return true if VINFO indicates we are doing loop vectorization and if
7579 it is worth decomposing CODE operations into scalar operations for
7580 that loop's vectorization factor. */
7581
7582 bool
7583 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7584 {
7585 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7586 unsigned HOST_WIDE_INT value;
7587 return (loop_vinfo
7588 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7589 && value >= vect_min_worthwhile_factor (code));
7590 }
7591
7592 /* Function vectorizable_induction
7593
7594 Check if PHI performs an induction computation that can be vectorized.
7595 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7596 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7597 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7598
7599 bool
7600 vectorizable_induction (gimple *phi,
7601 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7602 gimple **vec_stmt, slp_tree slp_node)
7603 {
7604 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7605 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7606 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7607 unsigned ncopies;
7608 bool nested_in_vect_loop = false;
7609 struct loop *iv_loop;
7610 tree vec_def;
7611 edge pe = loop_preheader_edge (loop);
7612 basic_block new_bb;
7613 tree new_vec, vec_init, vec_step, t;
7614 tree new_name;
7615 gimple *new_stmt;
7616 gphi *induction_phi;
7617 tree induc_def, vec_dest;
7618 tree init_expr, step_expr;
7619 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7620 unsigned i;
7621 tree expr;
7622 gimple_seq stmts;
7623 imm_use_iterator imm_iter;
7624 use_operand_p use_p;
7625 gimple *exit_phi;
7626 edge latch_e;
7627 tree loop_arg;
7628 gimple_stmt_iterator si;
7629 basic_block bb = gimple_bb (phi);
7630
7631 if (gimple_code (phi) != GIMPLE_PHI)
7632 return false;
7633
7634 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7635 return false;
7636
7637 /* Make sure it was recognized as induction computation. */
7638 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7639 return false;
7640
7641 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7642 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7643
7644 if (slp_node)
7645 ncopies = 1;
7646 else
7647 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7648 gcc_assert (ncopies >= 1);
7649
7650 /* FORNOW. These restrictions should be relaxed. */
7651 if (nested_in_vect_loop_p (loop, phi))
7652 {
7653 imm_use_iterator imm_iter;
7654 use_operand_p use_p;
7655 gimple *exit_phi;
7656 edge latch_e;
7657 tree loop_arg;
7658
7659 if (ncopies > 1)
7660 {
7661 if (dump_enabled_p ())
7662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7663 "multiple types in nested loop.\n");
7664 return false;
7665 }
7666
7667 /* FORNOW: outer loop induction with SLP not supported. */
7668 if (STMT_SLP_TYPE (stmt_info))
7669 return false;
7670
7671 exit_phi = NULL;
7672 latch_e = loop_latch_edge (loop->inner);
7673 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7674 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7675 {
7676 gimple *use_stmt = USE_STMT (use_p);
7677 if (is_gimple_debug (use_stmt))
7678 continue;
7679
7680 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7681 {
7682 exit_phi = use_stmt;
7683 break;
7684 }
7685 }
7686 if (exit_phi)
7687 {
7688 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7689 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7690 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7691 {
7692 if (dump_enabled_p ())
7693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7694 "inner-loop induction only used outside "
7695 "of the outer vectorized loop.\n");
7696 return false;
7697 }
7698 }
7699
7700 nested_in_vect_loop = true;
7701 iv_loop = loop->inner;
7702 }
7703 else
7704 iv_loop = loop;
7705 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7706
7707 if (slp_node && !nunits.is_constant ())
7708 {
7709 /* The current SLP code creates the initial value element-by-element. */
7710 if (dump_enabled_p ())
7711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7712 "SLP induction not supported for variable-length"
7713 " vectors.\n");
7714 return false;
7715 }
7716
7717 if (!vec_stmt) /* transformation not required. */
7718 {
7719 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7720 if (dump_enabled_p ())
7721 dump_printf_loc (MSG_NOTE, vect_location,
7722 "=== vectorizable_induction ===\n");
7723 vect_model_induction_cost (stmt_info, ncopies);
7724 return true;
7725 }
7726
7727 /* Transform. */
7728
7729 /* Compute a vector variable, initialized with the first VF values of
7730 the induction variable. E.g., for an iv with IV_PHI='X' and
7731 evolution S, for a vector of 4 units, we want to compute:
7732 [X, X + S, X + 2*S, X + 3*S]. */
7733
7734 if (dump_enabled_p ())
7735 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7736
7737 latch_e = loop_latch_edge (iv_loop);
7738 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7739
7740 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7741 gcc_assert (step_expr != NULL_TREE);
7742
7743 pe = loop_preheader_edge (iv_loop);
7744 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7745 loop_preheader_edge (iv_loop));
7746
7747 stmts = NULL;
7748 if (!nested_in_vect_loop)
7749 {
7750 /* Convert the initial value to the desired type. */
7751 tree new_type = TREE_TYPE (vectype);
7752 init_expr = gimple_convert (&stmts, new_type, init_expr);
7753
7754 /* If we are using the loop mask to "peel" for alignment then we need
7755 to adjust the start value here. */
7756 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7757 if (skip_niters != NULL_TREE)
7758 {
7759 if (FLOAT_TYPE_P (vectype))
7760 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7761 skip_niters);
7762 else
7763 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7764 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7765 skip_niters, step_expr);
7766 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7767 init_expr, skip_step);
7768 }
7769 }
7770
7771 /* Convert the step to the desired type. */
7772 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7773
7774 if (stmts)
7775 {
7776 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7777 gcc_assert (!new_bb);
7778 }
7779
7780 /* Find the first insertion point in the BB. */
7781 si = gsi_after_labels (bb);
7782
7783 /* For SLP induction we have to generate several IVs; for example,
7784 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7785 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7786 [VF*S, VF*S, VF*S, VF*S] for all. */
7787 if (slp_node)
7788 {
7789 /* Enforced above. */
7790 unsigned int const_nunits = nunits.to_constant ();
7791
7792 /* Generate [VF*S, VF*S, ... ]. */
7793 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7794 {
7795 expr = build_int_cst (integer_type_node, vf);
7796 expr = fold_convert (TREE_TYPE (step_expr), expr);
7797 }
7798 else
7799 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7800 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7801 expr, step_expr);
7802 if (! CONSTANT_CLASS_P (new_name))
7803 new_name = vect_init_vector (phi, new_name,
7804 TREE_TYPE (step_expr), NULL);
7805 new_vec = build_vector_from_val (vectype, new_name);
7806 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7807
7808 /* Now generate the IVs. */
7809 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7810 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7811 unsigned elts = const_nunits * nvects;
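/* NIVS initial vectors cover lcm (group_size, nunits) elements, after
   which the element pattern repeats; the remaining vectors are derived
   from these below by adding a constant multiple of the step.  */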
7812 unsigned nivs = least_common_multiple (group_size,
7813 const_nunits) / const_nunits;
7814 gcc_assert (elts % group_size == 0);
7815 tree elt = init_expr;
7816 unsigned ivn;
7817 for (ivn = 0; ivn < nivs; ++ivn)
7818 {
7819 tree_vector_builder elts (vectype, const_nunits, 1);
7820 stmts = NULL;
7821 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7822 {
7823 if (ivn*const_nunits + eltn >= group_size
7824 && (ivn * const_nunits + eltn) % group_size == 0)
7825 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7826 elt, step_expr);
7827 elts.quick_push (elt);
7828 }
7829 vec_init = gimple_build_vector (&stmts, &elts);
7830 if (stmts)
7831 {
7832 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7833 gcc_assert (!new_bb);
7834 }
7835
7836 /* Create the induction-phi that defines the induction-operand. */
7837 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7838 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7839 set_vinfo_for_stmt (induction_phi,
7840 new_stmt_vec_info (induction_phi, loop_vinfo));
7841 induc_def = PHI_RESULT (induction_phi);
7842
7843 /* Create the iv update inside the loop */
7844 vec_def = make_ssa_name (vec_dest);
7845 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7846 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7847 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7848
7849 /* Set the arguments of the phi node: */
7850 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7851 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7852 UNKNOWN_LOCATION);
7853
7854 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7855 }
7856
7857 /* Re-use IVs when we can. */
7858 if (ivn < nvects)
7859 {
7860 unsigned vfp
7861 = least_common_multiple (group_size, const_nunits) / group_size;
7862 /* Generate [VF'*S, VF'*S, ... ]. */
7863 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7864 {
7865 expr = build_int_cst (integer_type_node, vfp);
7866 expr = fold_convert (TREE_TYPE (step_expr), expr);
7867 }
7868 else
7869 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7870 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7871 expr, step_expr);
7872 if (! CONSTANT_CLASS_P (new_name))
7873 new_name = vect_init_vector (phi, new_name,
7874 TREE_TYPE (step_expr), NULL);
7875 new_vec = build_vector_from_val (vectype, new_name);
7876 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7877 for (; ivn < nvects; ++ivn)
7878 {
7879 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7880 tree def;
7881 if (gimple_code (iv) == GIMPLE_PHI)
7882 def = gimple_phi_result (iv);
7883 else
7884 def = gimple_assign_lhs (iv);
7885 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7886 PLUS_EXPR,
7887 def, vec_step);
7888 if (gimple_code (iv) == GIMPLE_PHI)
7889 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7890 else
7891 {
7892 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7893 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7894 }
7895 set_vinfo_for_stmt (new_stmt,
7896 new_stmt_vec_info (new_stmt, loop_vinfo));
7897 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7898 }
7899 }
7900
7901 return true;
7902 }
7903
7904 /* Create the vector that holds the initial_value of the induction. */
7905 if (nested_in_vect_loop)
7906 {
7907 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7908 been created during vectorization of previous stmts. We obtain it
7909 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7910 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7911 /* If the initial value is not of proper type, convert it. */
7912 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7913 {
7914 new_stmt
7915 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7916 vect_simple_var,
7917 "vec_iv_"),
7918 VIEW_CONVERT_EXPR,
7919 build1 (VIEW_CONVERT_EXPR, vectype,
7920 vec_init));
7921 vec_init = gimple_assign_lhs (new_stmt);
7922 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7923 new_stmt);
7924 gcc_assert (!new_bb);
7925 set_vinfo_for_stmt (new_stmt,
7926 new_stmt_vec_info (new_stmt, loop_vinfo));
7927 }
7928 }
7929 else
7930 {
7931 /* iv_loop is the loop to be vectorized. Create:
7932 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7933 stmts = NULL;
7934 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7935
7936 unsigned HOST_WIDE_INT const_nunits;
7937 if (nunits.is_constant (&const_nunits))
7938 {
7939 tree_vector_builder elts (vectype, const_nunits, 1);
7940 elts.quick_push (new_name);
7941 for (i = 1; i < const_nunits; i++)
7942 {
7943 /* Create: new_name_i = new_name + step_expr */
7944 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7945 new_name, step_expr);
7946 elts.quick_push (new_name);
7947 }
7948 /* Create a vector from [new_name_0, new_name_1, ...,
7949 new_name_nunits-1] */
7950 vec_init = gimple_build_vector (&stmts, &elts);
7951 }
7952 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7953 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7954 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7955 new_name, step_expr);
7956 else
7957 {
7958 /* Build:
7959 [base, base, base, ...]
7960 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7961 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7962 gcc_assert (flag_associative_math);
7963 tree index = build_index_vector (vectype, 0, 1);
7964 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7965 new_name);
7966 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7967 step_expr);
7968 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7969 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7970 vec_init, step_vec);
7971 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7972 vec_init, base_vec);
7973 }
7974
7975 if (stmts)
7976 {
7977 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7978 gcc_assert (!new_bb);
7979 }
7980 }
7981
7982
7983 /* Create the vector that holds the step of the induction. */
7984 if (nested_in_vect_loop)
7985 /* iv_loop is nested in the loop to be vectorized. Generate:
7986 vec_step = [S, S, S, S] */
7987 new_name = step_expr;
7988 else
7989 {
7990 /* iv_loop is the loop to be vectorized. Generate:
7991 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7992 gimple_seq seq = NULL;
7993 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7994 {
7995 expr = build_int_cst (integer_type_node, vf);
7996 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7997 }
7998 else
7999 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8000 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8001 expr, step_expr);
8002 if (seq)
8003 {
8004 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8005 gcc_assert (!new_bb);
8006 }
8007 }
8008
8009 t = unshare_expr (new_name);
8010 gcc_assert (CONSTANT_CLASS_P (new_name)
8011 || TREE_CODE (new_name) == SSA_NAME);
8012 new_vec = build_vector_from_val (vectype, t);
8013 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8014
8015
8016 /* Create the following def-use cycle:
8017 loop prolog:
8018 vec_init = ...
8019 vec_step = ...
8020 loop:
8021 vec_iv = PHI <vec_init, vec_loop>
8022 ...
8023 STMT
8024 ...
8025 vec_loop = vec_iv + vec_step; */
8026
8027 /* Create the induction-phi that defines the induction-operand. */
8028 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8029 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8030 set_vinfo_for_stmt (induction_phi,
8031 new_stmt_vec_info (induction_phi, loop_vinfo));
8032 induc_def = PHI_RESULT (induction_phi);
8033
8034 /* Create the iv update inside the loop */
8035 vec_def = make_ssa_name (vec_dest);
8036 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8037 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8038 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8039
8040 /* Set the arguments of the phi node: */
8041 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8042 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8043 UNKNOWN_LOCATION);
8044
8045 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8046
8047 /* In case the vectorization factor (VF) is bigger than the number
8048 of elements that we can fit in a vectype (nunits), we have to generate
8049 more than one vector stmt - i.e - we need to "unroll" the
8050 vector stmt by a factor VF/nunits. For more details see documentation
8051 in vectorizable_operation. */
8052
8053 if (ncopies > 1)
8054 {
8055 gimple_seq seq = NULL;
8056 stmt_vec_info prev_stmt_vinfo;
8057 /* FORNOW. This restriction should be relaxed. */
8058 gcc_assert (!nested_in_vect_loop);
8059
8060 /* Create the vector that holds the step of the induction. */
8061 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8062 {
8063 expr = build_int_cst (integer_type_node, nunits);
8064 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8065 }
8066 else
8067 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8068 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8069 expr, step_expr);
8070 if (seq)
8071 {
8072 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8073 gcc_assert (!new_bb);
8074 }
8075
8076 t = unshare_expr (new_name);
8077 gcc_assert (CONSTANT_CLASS_P (new_name)
8078 || TREE_CODE (new_name) == SSA_NAME);
8079 new_vec = build_vector_from_val (vectype, t);
8080 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8081
8082 vec_def = induc_def;
8083 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8084 for (i = 1; i < ncopies; i++)
8085 {
8086 /* vec_i = vec_prev + vec_step */
8087 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8088 vec_def, vec_step);
8089 vec_def = make_ssa_name (vec_dest, new_stmt);
8090 gimple_assign_set_lhs (new_stmt, vec_def);
8091
8092 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8093 set_vinfo_for_stmt (new_stmt,
8094 new_stmt_vec_info (new_stmt, loop_vinfo));
8095 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8096 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8097 }
8098 }
8099
8100 if (nested_in_vect_loop)
8101 {
8102 /* Find the loop-closed exit-phi of the induction, and record
8103 the final vector of induction results: */
8104 exit_phi = NULL;
8105 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8106 {
8107 gimple *use_stmt = USE_STMT (use_p);
8108 if (is_gimple_debug (use_stmt))
8109 continue;
8110
8111 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8112 {
8113 exit_phi = use_stmt;
8114 break;
8115 }
8116 }
8117 if (exit_phi)
8118 {
8119 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8120 /* FORNOW. We do not yet support the case in which an inner-loop induction
8121 is not used in the outer-loop (i.e. is used only outside the outer-loop). */
8122 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8123 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8124
8125 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8126 if (dump_enabled_p ())
8127 {
8128 dump_printf_loc (MSG_NOTE, vect_location,
8129 "vector of inductions after inner-loop:");
8130 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8131 }
8132 }
8133 }
8134
8135
8136 if (dump_enabled_p ())
8137 {
8138 dump_printf_loc (MSG_NOTE, vect_location,
8139 "transform induction: created def-use cycle: ");
8140 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8141 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8142 SSA_NAME_DEF_STMT (vec_def), 0);
8143 }
8144
8145 return true;
8146 }
8147
8148 /* Function vectorizable_live_operation.
8149
8150 STMT computes a value that is used outside the loop. Check if
8151 it can be supported. */
8152
8153 bool
8154 vectorizable_live_operation (gimple *stmt,
8155 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8156 slp_tree slp_node, int slp_index,
8157 gimple **vec_stmt)
8158 {
8159 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8160 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8162 imm_use_iterator imm_iter;
8163 tree lhs, lhs_type, bitsize, vec_bitsize;
8164 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8165 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8166 int ncopies;
8167 gimple *use_stmt;
8168 auto_vec<tree> vec_oprnds;
8169 int vec_entry = 0;
8170 poly_uint64 vec_index = 0;
8171
8172 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8173
8174 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8175 return false;
8176
8177 /* FORNOW. CHECKME. */
8178 if (nested_in_vect_loop_p (loop, stmt))
8179 return false;
8180
8181 /* If STMT is not relevant and it is a simple assignment and its inputs are
8182 invariant then it can remain in place, unvectorized. The original last
8183 scalar value that it computes will be used. */
8184 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8185 {
8186 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8187 if (dump_enabled_p ())
8188 dump_printf_loc (MSG_NOTE, vect_location,
8189 "statement is simple and uses invariant. Leaving in "
8190 "place.\n");
8191 return true;
8192 }
8193
8194 if (slp_node)
8195 ncopies = 1;
8196 else
8197 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8198
8199 if (slp_node)
8200 {
8201 gcc_assert (slp_index >= 0);
8202
8203 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8204 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8205
8206 /* Get the last occurrence of the scalar index from the concatenation of
8207 all the slp vectors. Calculate which slp vector it is and the index
8208 within. */
8209 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8210
8211 /* Calculate which vector contains the result, and which lane of
8212 that vector we need. */
8213 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8214 {
8215 if (dump_enabled_p ())
8216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8217 "Cannot determine which vector holds the"
8218 " final result.\n");
8219 return false;
8220 }
8221 }
8222
8223 if (!vec_stmt)
8224 {
8225 /* No transformation required. */
8226 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8227 {
8228 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8229 OPTIMIZE_FOR_SPEED))
8230 {
8231 if (dump_enabled_p ())
8232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8233 "can't use a fully-masked loop because "
8234 "the target doesn't support extract last "
8235 "reduction.\n");
8236 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8237 }
8238 else if (slp_node)
8239 {
8240 if (dump_enabled_p ())
8241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8242 "can't use a fully-masked loop because an "
8243 "SLP statement is live after the loop.\n");
8244 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8245 }
8246 else if (ncopies > 1)
8247 {
8248 if (dump_enabled_p ())
8249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8250 "can't use a fully-masked loop because"
8251 " ncopies is greater than 1.\n");
8252 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8253 }
8254 else
8255 {
8256 gcc_assert (ncopies == 1 && !slp_node);
8257 vect_record_loop_mask (loop_vinfo,
8258 &LOOP_VINFO_MASKS (loop_vinfo),
8259 1, vectype);
8260 }
8261 }
8262 return true;
8263 }
8264
8265 /* If stmt has a related stmt, then use that for getting the lhs. */
8266 if (is_pattern_stmt_p (stmt_info))
8267 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8268
8269 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8270 : gimple_get_lhs (stmt);
8271 lhs_type = TREE_TYPE (lhs);
8272
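/* Number of bits in one element (the element precision for boolean
   vectors) and in the whole vector.  */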
8273 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8274 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8275 : TYPE_SIZE (TREE_TYPE (vectype)));
8276 vec_bitsize = TYPE_SIZE (vectype);
8277
8278 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8279 tree vec_lhs, bitstart;
8280 if (slp_node)
8281 {
8282 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8283
8284 /* Get the correct slp vectorized stmt. */
8285 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8286 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8287 vec_lhs = gimple_phi_result (phi);
8288 else
8289 vec_lhs = gimple_get_lhs (vec_stmt);
8290
8291 /* Get entry to use. */
8292 bitstart = bitsize_int (vec_index);
8293 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8294 }
8295 else
8296 {
8297 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8298 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8299 gcc_checking_assert (ncopies == 1
8300 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8301
8302 /* For multiple copies, get the last copy. */
8303 for (int i = 1; i < ncopies; ++i)
8304 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8305 vec_lhs);
8306
8307 /* Get the last lane in the vector. */
8308 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8309 }
8310
8311 gimple_seq stmts = NULL;
8312 tree new_tree;
8313 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8314 {
8315 /* Emit:
8316
8317 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8318
8319 where VEC_LHS is the vectorized live-out result and MASK is
8320 the loop mask for the final iteration. */
8321 gcc_assert (ncopies == 1 && !slp_node);
8322 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8323 tree scalar_res = make_ssa_name (scalar_type);
8324 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8325 1, vectype, 0);
8326 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8327 2, mask, vec_lhs);
8328 gimple_call_set_lhs (new_stmt, scalar_res);
8329 gimple_seq_add_stmt (&stmts, new_stmt);
8330
8331 /* Convert the extracted vector element to the required scalar type. */
8332 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8333 }
8334 else
8335 {
8336 tree bftype = TREE_TYPE (vectype);
8337 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8338 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8339 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8340 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8341 &stmts, true, NULL_TREE);
8342 }
8343
8344 if (stmts)
8345 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8346
8347 /* Replace use of lhs with newly computed result. If the use stmt is a
8348      single-arg PHI, just replace all uses of the PHI result.  This is necessary
8349      because the lcssa PHI defining lhs may occur before the newly inserted stmt.  */
8350 use_operand_p use_p;
8351 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8352 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8353 && !is_gimple_debug (use_stmt))
8354 {
8355 if (gimple_code (use_stmt) == GIMPLE_PHI
8356 && gimple_phi_num_args (use_stmt) == 1)
8357 {
8358 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8359 }
8360 else
8361 {
8362 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8363 SET_USE (use_p, new_tree);
8364 }
8365 update_stmt (use_stmt);
8366 }
8367
8368 return true;
8369 }
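/* Illustrative sketch (not literal output of the code above): a scalar
   value that is live after the loop, e.g.

     for (i = 0; i < n; i++)
       last = a[i];
     ... use (last) after the loop ...

   is recovered from the final vector copy either with a BIT_FIELD_REF of
   the last lane, e.g. for a hypothetical V4SI vector

     last_1 = BIT_FIELD_REF <vect_a, 32, 96>;

   or, in a fully-masked loop, with

     last_1 = .EXTRACT_LAST (loop_mask, vect_a);

   so that the value of the last scalar iteration reaches the uses outside
   the loop.  The SSA names and lane offsets here are made up.  */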
8370
8371 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8372
8373 static void
8374 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8375 {
8376 ssa_op_iter op_iter;
8377 imm_use_iterator imm_iter;
8378 def_operand_p def_p;
8379 gimple *ustmt;
8380
8381 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8382 {
8383 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8384 {
8385 basic_block bb;
8386
8387 if (!is_gimple_debug (ustmt))
8388 continue;
8389
8390 bb = gimple_bb (ustmt);
8391
8392 if (!flow_bb_inside_loop_p (loop, bb))
8393 {
8394 if (gimple_debug_bind_p (ustmt))
8395 {
8396 if (dump_enabled_p ())
8397 dump_printf_loc (MSG_NOTE, vect_location,
8398 "killing debug use\n");
8399
8400 gimple_debug_bind_reset_value (ustmt);
8401 update_stmt (ustmt);
8402 }
8403 else
8404 gcc_unreachable ();
8405 }
8406 }
8407 }
8408 }
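/* For illustration: if STMT defines i_7 inside LOOP and a debug bind
   outside the loop still refers to it, e.g.

     # DEBUG i => i_7

   the bind's value is reset so that no stale debug location survives the
   transformation.  The SSA name here is made up.  */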
8409
8410 /* Given loop represented by LOOP_VINFO, return true if computation of
8411 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8412 otherwise. */
8413
8414 static bool
8415 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8416 {
8417 /* Constant case. */
8418 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8419 {
8420 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8421 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8422
8423 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8424 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8425 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8426 return true;
8427 }
8428
8429 widest_int max;
8430 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8431 /* Check the upper bound of loop niters. */
8432 if (get_max_loop_iterations (loop, &max))
8433 {
8434 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8435 signop sgn = TYPE_SIGN (type);
8436 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8437 if (max < type_max)
8438 return true;
8439 }
8440 return false;
8441 }
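/* Example: for a loop driven by a 32-bit unsigned counter that runs
   exactly 2^32 times, NITERSM1 is 0xffffffff and NITERSM1 + 1 wraps to 0,
   so the function above returns false; for any smaller known or bounded
   iteration count it returns true.  */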
8442
8443 /* Return a mask type with half as many elements as TYPE.  */
8444
8445 tree
8446 vect_halve_mask_nunits (tree type)
8447 {
8448 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8449 return build_truth_vector_type (nunits, current_vector_size);
8450 }
8451
8452 /* Return a mask type with twice as many elements as TYPE. */
8453
8454 tree
8455 vect_double_mask_nunits (tree type)
8456 {
8457 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8458 return build_truth_vector_type (nunits, current_vector_size);
8459 }
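/* For example, given a mask type with 8 boolean elements,
   vect_halve_mask_nunits yields a 4-element mask type of the same total
   vector size, and vect_double_mask_nunits applied to that 4-element type
   yields an 8-element one again (the element counts are illustrative).  */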
8460
8461 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8462 contain a sequence of NVECTORS masks that each control a vector of type
8463 VECTYPE. */
8464
8465 void
8466 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8467 unsigned int nvectors, tree vectype)
8468 {
8469 gcc_assert (nvectors != 0);
8470 if (masks->length () < nvectors)
8471 masks->safe_grow_cleared (nvectors);
8472 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8473 /* The number of scalars per iteration and the number of vectors are
8474 both compile-time constants. */
8475 unsigned int nscalars_per_iter
8476 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8477 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8478 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8479 {
8480 rgm->max_nscalars_per_iter = nscalars_per_iter;
8481 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8482 }
8483 }
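/* For example, a statement that needs NVECTORS == 2 masks per copy records
   its requirement in (*masks)[1]; each rgroup keeps the largest number of
   scalars per iteration seen for that vector count, so a single mask type
   can serve every statement sharing the rgroup.  */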
8484
8485 /* Given a complete set of masks MASKS, extract mask number INDEX
8486 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8487 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8488
8489 See the comment above vec_loop_masks for more details about the mask
8490 arrangement. */
8491
8492 tree
8493 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8494 unsigned int nvectors, tree vectype, unsigned int index)
8495 {
8496 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8497 tree mask_type = rgm->mask_type;
8498
8499 /* Populate the rgroup's mask array, if this is the first time we've
8500 used it. */
8501 if (rgm->masks.is_empty ())
8502 {
8503 rgm->masks.safe_grow_cleared (nvectors);
8504 for (unsigned int i = 0; i < nvectors; ++i)
8505 {
8506 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8507 /* Provide a dummy definition until the real one is available. */
8508 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8509 rgm->masks[i] = mask;
8510 }
8511 }
8512
8513 tree mask = rgm->masks[index];
8514 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8515 TYPE_VECTOR_SUBPARTS (vectype)))
8516 {
8517 /* A loop mask for data type X can be reused for data type Y
8518 if X has N times more elements than Y and if Y's elements
8519 are N times bigger than X's. In this case each sequence
8520 of N elements in the loop mask will be all-zero or all-one.
8521 We can then view-convert the mask so that each sequence of
8522 N elements is replaced by a single element. */
8523 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8524 TYPE_VECTOR_SUBPARTS (vectype)));
8525 gimple_seq seq = NULL;
8526 mask_type = build_same_sized_truth_vector_type (vectype);
8527 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8528 if (seq)
8529 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8530 }
8531 return mask;
8532 }
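/* As a concrete (illustrative) case: a loop mask built for 16 one-byte
   elements can be reused for 8 two-byte elements, because each pair of
   byte-mask elements is known to be all-zero or all-one; the
   VIEW_CONVERT_EXPR emitted above collapses each such pair into a single
   wider mask element.  */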
8533
8534 /* Scale profiling counters by estimation for LOOP which is vectorized
8535 by factor VF. */
8536
8537 static void
8538 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8539 {
8540 edge preheader = loop_preheader_edge (loop);
8541 /* Reduce loop iterations by the vectorization factor. */
8542 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8543 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8544
8545 if (freq_h.nonzero_p ())
8546 {
8547 profile_probability p;
8548
8549 /* Avoid dropping loop body profile counter to 0 because of zero count
8550 in loop's preheader. */
8551 if (!(freq_e == profile_count::zero ()))
8552 freq_e = freq_e.force_nonzero ();
8553 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8554 scale_loop_frequencies (loop, p);
8555 }
8556
8557 edge exit_e = single_exit (loop);
8558 exit_e->probability = profile_probability::always ()
8559 .apply_scale (1, new_est_niter + 1);
8560
8561 edge exit_l = single_pred_edge (loop->latch);
8562 profile_probability prob = exit_l->probability;
8563 exit_l->probability = exit_e->probability.invert ();
8564 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8565 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8566 }
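/* Example: if the profile corresponds to roughly 1000 scalar iterations
   and VF is 4, new_est_niter is about 250, so the loop body counts are
   rescaled accordingly and the exit edge is given probability
   1/(250 + 1).  The numbers are illustrative.  */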
8567
8568 /* Function vect_transform_loop.
8569
8570 The analysis phase has determined that the loop is vectorizable.
8571    Vectorize the loop - create vectorized stmts to replace the scalar
8572 stmts in the loop, and update the loop exit condition.
8573 Returns scalar epilogue loop if any. */
8574
8575 struct loop *
8576 vect_transform_loop (loop_vec_info loop_vinfo)
8577 {
8578 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8579 struct loop *epilogue = NULL;
8580 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8581 int nbbs = loop->num_nodes;
8582 int i;
8583 tree niters_vector = NULL_TREE;
8584 tree step_vector = NULL_TREE;
8585 tree niters_vector_mult_vf = NULL_TREE;
8586 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8587 unsigned int lowest_vf = constant_lower_bound (vf);
8588 bool grouped_store;
8589 bool slp_scheduled = false;
8590 gimple *stmt, *pattern_stmt;
8591 gimple_seq pattern_def_seq = NULL;
8592 gimple_stmt_iterator pattern_def_si = gsi_none ();
8593 bool transform_pattern_stmt = false;
8594 bool check_profitability = false;
8595 unsigned int th;
8596
8597 if (dump_enabled_p ())
8598 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8599
8600   /* Use the more conservative vectorization threshold.  If the number
8601      of iterations is constant, assume the cost check has been performed
8602      by our caller.  If the threshold makes all loops profitable that
8603      run at least the (estimated) vectorization factor number of times,
8604      checking is pointless, too.  */
8605 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8606 if (th >= vect_vf_for_cost (loop_vinfo)
8607 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8608 {
8609 if (dump_enabled_p ())
8610 dump_printf_loc (MSG_NOTE, vect_location,
8611 "Profitability threshold is %d loop iterations.\n",
8612 th);
8613 check_profitability = true;
8614 }
8615
8616 /* Make sure there exists a single-predecessor exit bb. Do this before
8617 versioning. */
8618 edge e = single_exit (loop);
8619 if (! single_pred_p (e->dest))
8620 {
8621 split_loop_exit_edge (e);
8622 if (dump_enabled_p ())
8623 dump_printf (MSG_NOTE, "split exit edge\n");
8624 }
8625
8626 /* Version the loop first, if required, so the profitability check
8627 comes first. */
8628
8629 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8630 {
8631 poly_uint64 versioning_threshold
8632 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8633 if (check_profitability
8634 && ordered_p (poly_uint64 (th), versioning_threshold))
8635 {
8636 versioning_threshold = ordered_max (poly_uint64 (th),
8637 versioning_threshold);
8638 check_profitability = false;
8639 }
8640 vect_loop_versioning (loop_vinfo, th, check_profitability,
8641 versioning_threshold);
8642 check_profitability = false;
8643 }
8644
8645 /* Make sure there exists a single-predecessor exit bb also on the
8646 scalar loop copy. Do this after versioning but before peeling
8647 so CFG structure is fine for both scalar and if-converted loop
8648 to make slpeel_duplicate_current_defs_from_edges face matched
8649 loop closed PHI nodes on the exit. */
8650 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8651 {
8652 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8653 if (! single_pred_p (e->dest))
8654 {
8655 split_loop_exit_edge (e);
8656 if (dump_enabled_p ())
8657 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8658 }
8659 }
8660
8661 tree niters = vect_build_loop_niters (loop_vinfo);
8662 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8663 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8664 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8665 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8666 &step_vector, &niters_vector_mult_vf, th,
8667 check_profitability, niters_no_overflow);
8668
8669 if (niters_vector == NULL_TREE)
8670 {
8671 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8672 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8673 && known_eq (lowest_vf, vf))
8674 {
8675 niters_vector
8676 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8677 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8678 step_vector = build_one_cst (TREE_TYPE (niters));
8679 }
8680 else
8681 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8682 &step_vector, niters_no_overflow);
8683 }
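  /* For instance, with a compile-time iteration count of 100 and a
     vectorization factor of 4, the block above produces
     niters_vector == 25 and step_vector == 1; in the other cases the
     division is emitted as gimple by vect_gen_vector_loop_niters.  */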
8684
8685 /* 1) Make sure the loop header has exactly two entries
8686 2) Make sure we have a preheader basic block. */
8687
8688 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8689
8690 split_edge (loop_preheader_edge (loop));
8691
8692 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8693 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8694 /* This will deal with any possible peeling. */
8695 vect_prepare_for_masked_peels (loop_vinfo);
8696
8697   /* FORNOW: the vectorizer supports only loops whose body consists
8698      of one basic block (header + empty latch).  When the vectorizer
8699      supports more involved loop forms, the order in which the BBs are
8700      traversed needs to be reconsidered.  */
8701
8702 for (i = 0; i < nbbs; i++)
8703 {
8704 basic_block bb = bbs[i];
8705 stmt_vec_info stmt_info;
8706
8707 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8708 gsi_next (&si))
8709 {
8710 gphi *phi = si.phi ();
8711 if (dump_enabled_p ())
8712 {
8713 dump_printf_loc (MSG_NOTE, vect_location,
8714 "------>vectorizing phi: ");
8715 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8716 }
8717 stmt_info = vinfo_for_stmt (phi);
8718 if (!stmt_info)
8719 continue;
8720
8721 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8722 vect_loop_kill_debug_uses (loop, phi);
8723
8724 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8725 && !STMT_VINFO_LIVE_P (stmt_info))
8726 continue;
8727
8728 if (STMT_VINFO_VECTYPE (stmt_info)
8729 && (maybe_ne
8730 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8731 && dump_enabled_p ())
8732 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8733
8734 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8735 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8736 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8737 && ! PURE_SLP_STMT (stmt_info))
8738 {
8739 if (dump_enabled_p ())
8740 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8741 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8742 }
8743 }
8744
8745 pattern_stmt = NULL;
8746 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8747 !gsi_end_p (si) || transform_pattern_stmt;)
8748 {
8749 bool is_store;
8750
8751 if (transform_pattern_stmt)
8752 stmt = pattern_stmt;
8753 else
8754 {
8755 stmt = gsi_stmt (si);
8756 /* During vectorization remove existing clobber stmts. */
8757 if (gimple_clobber_p (stmt))
8758 {
8759 unlink_stmt_vdef (stmt);
8760 gsi_remove (&si, true);
8761 release_defs (stmt);
8762 continue;
8763 }
8764 }
8765
8766 if (dump_enabled_p ())
8767 {
8768 dump_printf_loc (MSG_NOTE, vect_location,
8769 "------>vectorizing statement: ");
8770 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8771 }
8772
8773 stmt_info = vinfo_for_stmt (stmt);
8774
8775 /* vector stmts created in the outer-loop during vectorization of
8776 stmts in an inner-loop may not have a stmt_info, and do not
8777 need to be vectorized. */
8778 if (!stmt_info)
8779 {
8780 gsi_next (&si);
8781 continue;
8782 }
8783
8784 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8785 vect_loop_kill_debug_uses (loop, stmt);
8786
8787 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8788 && !STMT_VINFO_LIVE_P (stmt_info))
8789 {
8790 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8791 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8792 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8793 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8794 {
8795 stmt = pattern_stmt;
8796 stmt_info = vinfo_for_stmt (stmt);
8797 }
8798 else
8799 {
8800 gsi_next (&si);
8801 continue;
8802 }
8803 }
8804 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8805 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8806 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8807 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8808 transform_pattern_stmt = true;
8809
8810 /* If pattern statement has def stmts, vectorize them too. */
8811 if (is_pattern_stmt_p (stmt_info))
8812 {
8813 if (pattern_def_seq == NULL)
8814 {
8815 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8816 pattern_def_si = gsi_start (pattern_def_seq);
8817 }
8818 else if (!gsi_end_p (pattern_def_si))
8819 gsi_next (&pattern_def_si);
8820 if (pattern_def_seq != NULL)
8821 {
8822 gimple *pattern_def_stmt = NULL;
8823 stmt_vec_info pattern_def_stmt_info = NULL;
8824
8825 while (!gsi_end_p (pattern_def_si))
8826 {
8827 pattern_def_stmt = gsi_stmt (pattern_def_si);
8828 pattern_def_stmt_info
8829 = vinfo_for_stmt (pattern_def_stmt);
8830 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8831 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8832 break;
8833 gsi_next (&pattern_def_si);
8834 }
8835
8836 if (!gsi_end_p (pattern_def_si))
8837 {
8838 if (dump_enabled_p ())
8839 {
8840 dump_printf_loc (MSG_NOTE, vect_location,
8841 "==> vectorizing pattern def "
8842 "stmt: ");
8843 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8844 pattern_def_stmt, 0);
8845 }
8846
8847 stmt = pattern_def_stmt;
8848 stmt_info = pattern_def_stmt_info;
8849 }
8850 else
8851 {
8852 pattern_def_si = gsi_none ();
8853 transform_pattern_stmt = false;
8854 }
8855 }
8856 else
8857 transform_pattern_stmt = false;
8858 }
8859
8860 if (STMT_VINFO_VECTYPE (stmt_info))
8861 {
8862 poly_uint64 nunits
8863 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8864 if (!STMT_SLP_TYPE (stmt_info)
8865 && maybe_ne (nunits, vf)
8866 && dump_enabled_p ())
8867 	    /* For SLP, VF is set according to the unrolling factor, not
8868 	       the vector size, hence this print is not valid for SLP.  */
8869 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8870 }
8871
8872 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8873 reached. */
8874 if (STMT_SLP_TYPE (stmt_info))
8875 {
8876 if (!slp_scheduled)
8877 {
8878 slp_scheduled = true;
8879
8880 if (dump_enabled_p ())
8881 dump_printf_loc (MSG_NOTE, vect_location,
8882 "=== scheduling SLP instances ===\n");
8883
8884 vect_schedule_slp (loop_vinfo);
8885 }
8886
8887 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8888 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8889 {
8890 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8891 {
8892 pattern_def_seq = NULL;
8893 gsi_next (&si);
8894 }
8895 continue;
8896 }
8897 }
8898
8899 /* -------- vectorize statement ------------ */
8900 if (dump_enabled_p ())
8901 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8902
8903 grouped_store = false;
8904 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8905 if (is_store)
8906 {
8907 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8908 {
8909 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8910 interleaving chain was completed - free all the stores in
8911 the chain. */
8912 gsi_next (&si);
8913 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8914 }
8915 else
8916 {
8917 /* Free the attached stmt_vec_info and remove the stmt. */
8918 gimple *store = gsi_stmt (si);
8919 free_stmt_vec_info (store);
8920 unlink_stmt_vdef (store);
8921 gsi_remove (&si, true);
8922 release_defs (store);
8923 }
8924
8925 /* Stores can only appear at the end of pattern statements. */
8926 gcc_assert (!transform_pattern_stmt);
8927 pattern_def_seq = NULL;
8928 }
8929 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8930 {
8931 pattern_def_seq = NULL;
8932 gsi_next (&si);
8933 }
8934 } /* stmts in BB */
8935
8936 /* Stub out scalar statements that must not survive vectorization.
8937 Doing this here helps with grouped statements, or statements that
8938 are involved in patterns. */
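      /* For example, a leftover scalar call such as

	   _5 = .MASK_LOAD (ptr_2, 0B, mask_3);

	 whose result has already been provided by the vectorized code is
	 replaced by  _5 = 0;  so that the scalar statement does not survive
	 vectorization.  The SSA names here are made up.  */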
8939 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8940 !gsi_end_p (gsi); gsi_next (&gsi))
8941 {
8942 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8943 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8944 {
8945 tree lhs = gimple_get_lhs (call);
8946 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8947 {
8948 tree zero = build_zero_cst (TREE_TYPE (lhs));
8949 gimple *new_stmt = gimple_build_assign (lhs, zero);
8950 gsi_replace (&gsi, new_stmt, true);
8951 }
8952 }
8953 }
8954 } /* BBs in loop */
8955
8956   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8957      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8958 if (integer_onep (step_vector))
8959 niters_no_overflow = true;
8960 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8961 niters_vector_mult_vf, !niters_no_overflow);
8962
8963 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8964 scale_profile_for_vect_loop (loop, assumed_vf);
8965
8966 /* True if the final iteration might not handle a full vector's
8967 worth of scalar iterations. */
8968 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8969 /* The minimum number of iterations performed by the epilogue. This
8970 is 1 when peeling for gaps because we always need a final scalar
8971 iteration. */
8972 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8973 /* +1 to convert latch counts to loop iteration counts,
8974 -min_epilogue_iters to remove iterations that cannot be performed
8975 by the vector code. */
8976 int bias_for_lowest = 1 - min_epilogue_iters;
8977 int bias_for_assumed = bias_for_lowest;
8978 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8979 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8980 {
8981 /* When the amount of peeling is known at compile time, the first
8982 iteration will have exactly alignment_npeels active elements.
8983 In the worst case it will have at least one. */
8984 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8985 bias_for_lowest += lowest_vf - min_first_active;
8986 bias_for_assumed += assumed_vf - min_first_active;
8987 }
8988 /* In these calculations the "- 1" converts loop iteration counts
8989 back to latch counts. */
8990 if (loop->any_upper_bound)
8991 loop->nb_iterations_upper_bound
8992 = (final_iter_may_be_partial
8993 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8994 lowest_vf) - 1
8995 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8996 lowest_vf) - 1);
8997 if (loop->any_likely_upper_bound)
8998 loop->nb_iterations_likely_upper_bound
8999 = (final_iter_may_be_partial
9000 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9001 + bias_for_lowest, lowest_vf) - 1
9002 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9003 + bias_for_lowest, lowest_vf) - 1);
9004 if (loop->any_estimate)
9005 loop->nb_iterations_estimate
9006 = (final_iter_may_be_partial
9007 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9008 assumed_vf) - 1
9009 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9010 assumed_vf) - 1);
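  /* Worked example: with a scalar latch-count upper bound of 102
     (i.e. 103 iterations), no peeling for gaps and lowest_vf == 4, the
     non-masked case above gives floor ((102 + 1) / 4) - 1 = 24, i.e. at
     most 25 vector iterations, the rest being left for the epilogue.  */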
9011
9012 if (dump_enabled_p ())
9013 {
9014 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9015 {
9016 dump_printf_loc (MSG_NOTE, vect_location,
9017 "LOOP VECTORIZED\n");
9018 if (loop->inner)
9019 dump_printf_loc (MSG_NOTE, vect_location,
9020 "OUTER LOOP VECTORIZED\n");
9021 dump_printf (MSG_NOTE, "\n");
9022 }
9023 else
9024 {
9025 dump_printf_loc (MSG_NOTE, vect_location,
9026 "LOOP EPILOGUE VECTORIZED (VS=");
9027 dump_dec (MSG_NOTE, current_vector_size);
9028 dump_printf (MSG_NOTE, ")\n");
9029 }
9030 }
9031
9032 /* Free SLP instances here because otherwise stmt reference counting
9033 won't work. */
9034 slp_instance instance;
9035 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9036 vect_free_slp_instance (instance);
9037 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9038   /* Clear the safelen field since its value is invalid after vectorization:
9039      the vectorized loop can now have loop-carried dependencies.  */
9040 loop->safelen = 0;
9041
9042 /* Don't vectorize epilogue for epilogue. */
9043 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9044 epilogue = NULL;
9045
9046 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9047 epilogue = NULL;
9048
9049 if (epilogue)
9050 {
9051 auto_vector_sizes vector_sizes;
9052 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9053 unsigned int next_size = 0;
9054
9055 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9056 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9057 && known_eq (vf, lowest_vf))
9058 {
9059 unsigned int eiters
9060 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9061 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9062 eiters = eiters % lowest_vf;
9063 epilogue->nb_iterations_upper_bound = eiters - 1;
9064
9065 unsigned int ratio;
9066 while (next_size < vector_sizes.length ()
9067 && !(constant_multiple_p (current_vector_size,
9068 vector_sizes[next_size], &ratio)
9069 && eiters >= lowest_vf / ratio))
9070 next_size += 1;
9071 }
9072 else
9073 while (next_size < vector_sizes.length ()
9074 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9075 next_size += 1;
9076
9077 if (next_size == vector_sizes.length ())
9078 epilogue = NULL;
9079 }
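      /* For example, on a target offering 64-, 32- and 16-byte vectors, a
	 main loop using 64-byte vectors of ints (lowest_vf == 16) with six
	 scalar iterations left over would skip the 64- and 32-byte
	 candidates and keep the epilogue for a later attempt with 16-byte
	 vectors; if no candidate fits, the epilogue is dropped.  The sizes
	 are illustrative.  */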
9080
9081 if (epilogue)
9082 {
9083 epilogue->force_vectorize = loop->force_vectorize;
9084 epilogue->safelen = loop->safelen;
9085 epilogue->dont_vectorize = false;
9086
9087 /* We may need to if-convert epilogue to vectorize it. */
9088 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9089 tree_if_conversion (epilogue);
9090 }
9091
9092 return epilogue;
9093 }
9094
9095 /* The code below performs a simple optimization - it reverts
9096    if-conversion for masked stores: if the mask of a store is all-zero,
9097    the store is not performed and, where possible, neither are the producers
   of the stored values.
9098 For example,
9099 for (i=0; i<n; i++)
9100 if (c[i])
9101 {
9102 p1[i] += 1;
9103 p2[i] = p3[i] +2;
9104 }
9105 this transformation will produce the following semi-hammock:
9106
9107 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9108 {
9109 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9110 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9111 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9112 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9113 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9114 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9115 }
9116 */
9117
9118 void
9119 optimize_mask_stores (struct loop *loop)
9120 {
9121 basic_block *bbs = get_loop_body (loop);
9122 unsigned nbbs = loop->num_nodes;
9123 unsigned i;
9124 basic_block bb;
9125 struct loop *bb_loop;
9126 gimple_stmt_iterator gsi;
9127 gimple *stmt;
9128 auto_vec<gimple *> worklist;
9129
9130 vect_location = find_loop_location (loop);
9131 /* Pick up all masked stores in loop if any. */
9132 for (i = 0; i < nbbs; i++)
9133 {
9134 bb = bbs[i];
9135 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9136 gsi_next (&gsi))
9137 {
9138 stmt = gsi_stmt (gsi);
9139 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9140 worklist.safe_push (stmt);
9141 }
9142 }
9143
9144 free (bbs);
9145 if (worklist.is_empty ())
9146 return;
9147
9148 /* Loop has masked stores. */
9149 while (!worklist.is_empty ())
9150 {
9151 gimple *last, *last_store;
9152 edge e, efalse;
9153 tree mask;
9154 basic_block store_bb, join_bb;
9155 gimple_stmt_iterator gsi_to;
9156 tree vdef, new_vdef;
9157 gphi *phi;
9158 tree vectype;
9159 tree zero;
9160
9161 last = worklist.pop ();
9162 mask = gimple_call_arg (last, 2);
9163 bb = gimple_bb (last);
9164       /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
9165 	 the same loop as if_bb.  It may differ from LOOP when a two-level
9166 	 loop nest is vectorized and the mask_store belongs to the inner
9167 	 one.  */
9168 e = split_block (bb, last);
9169 bb_loop = bb->loop_father;
9170 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9171 join_bb = e->dest;
9172 store_bb = create_empty_bb (bb);
9173 add_bb_to_loop (store_bb, bb_loop);
9174 e->flags = EDGE_TRUE_VALUE;
9175 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9176       /* Put STORE_BB on the unlikely path.  */
9177 efalse->probability = profile_probability::unlikely ();
9178 store_bb->count = efalse->count ();
9179 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9180 if (dom_info_available_p (CDI_DOMINATORS))
9181 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9182 if (dump_enabled_p ())
9183 dump_printf_loc (MSG_NOTE, vect_location,
9184 "Create new block %d to sink mask stores.",
9185 store_bb->index);
9186 /* Create vector comparison with boolean result. */
9187 vectype = TREE_TYPE (mask);
9188 zero = build_zero_cst (vectype);
9189 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9190 gsi = gsi_last_bb (bb);
9191 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9192 /* Create new PHI node for vdef of the last masked store:
9193 .MEM_2 = VDEF <.MEM_1>
9194 will be converted to
9195 .MEM.3 = VDEF <.MEM_1>
9196 and new PHI node will be created in join bb
9197 .MEM_2 = PHI <.MEM_1, .MEM_3>
9198 */
9199 vdef = gimple_vdef (last);
9200 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9201 gimple_set_vdef (last, new_vdef);
9202 phi = create_phi_node (vdef, join_bb);
9203 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9204
9205 /* Put all masked stores with the same mask to STORE_BB if possible. */
9206 while (true)
9207 {
9208 gimple_stmt_iterator gsi_from;
9209 gimple *stmt1 = NULL;
9210
9211 /* Move masked store to STORE_BB. */
9212 last_store = last;
9213 gsi = gsi_for_stmt (last);
9214 gsi_from = gsi;
9215 /* Shift GSI to the previous stmt for further traversal. */
9216 gsi_prev (&gsi);
9217 gsi_to = gsi_start_bb (store_bb);
9218 gsi_move_before (&gsi_from, &gsi_to);
9219 	  /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9220 gsi_to = gsi_start_bb (store_bb);
9221 if (dump_enabled_p ())
9222 {
9223 dump_printf_loc (MSG_NOTE, vect_location,
9224 "Move stmt to created bb\n");
9225 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9226 }
9227 /* Move all stored value producers if possible. */
9228 while (!gsi_end_p (gsi))
9229 {
9230 tree lhs;
9231 imm_use_iterator imm_iter;
9232 use_operand_p use_p;
9233 bool res;
9234
9235 /* Skip debug statements. */
9236 if (is_gimple_debug (gsi_stmt (gsi)))
9237 {
9238 gsi_prev (&gsi);
9239 continue;
9240 }
9241 stmt1 = gsi_stmt (gsi);
9242 /* Do not consider statements writing to memory or having
9243 volatile operand. */
9244 if (gimple_vdef (stmt1)
9245 || gimple_has_volatile_ops (stmt1))
9246 break;
9247 gsi_from = gsi;
9248 gsi_prev (&gsi);
9249 lhs = gimple_get_lhs (stmt1);
9250 if (!lhs)
9251 break;
9252
9253 /* LHS of vectorized stmt must be SSA_NAME. */
9254 if (TREE_CODE (lhs) != SSA_NAME)
9255 break;
9256
9257 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9258 {
9259 /* Remove dead scalar statement. */
9260 if (has_zero_uses (lhs))
9261 {
9262 gsi_remove (&gsi_from, true);
9263 continue;
9264 }
9265 }
9266
9267 /* Check that LHS does not have uses outside of STORE_BB. */
9268 res = true;
9269 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9270 {
9271 gimple *use_stmt;
9272 use_stmt = USE_STMT (use_p);
9273 if (is_gimple_debug (use_stmt))
9274 continue;
9275 if (gimple_bb (use_stmt) != store_bb)
9276 {
9277 res = false;
9278 break;
9279 }
9280 }
9281 if (!res)
9282 break;
9283
9284 if (gimple_vuse (stmt1)
9285 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9286 break;
9287
9288 /* Can move STMT1 to STORE_BB. */
9289 if (dump_enabled_p ())
9290 {
9291 dump_printf_loc (MSG_NOTE, vect_location,
9292 "Move stmt to created bb\n");
9293 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9294 }
9295 gsi_move_before (&gsi_from, &gsi_to);
9296 /* Shift GSI_TO for further insertion. */
9297 gsi_prev (&gsi_to);
9298 }
9299 /* Put other masked stores with the same mask to STORE_BB. */
9300 if (worklist.is_empty ()
9301 || gimple_call_arg (worklist.last (), 2) != mask
9302 || worklist.last () != stmt1)
9303 break;
9304 last = worklist.pop ();
9305 }
9306 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9307 }
9308 }
9309